NCBI C++ ToolKit
gap_stats.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gap_stats.cpp 92178 2020-12-22 18:06:45Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Michael Kornbluh
27  *
28  * File Description:
29  * Compute gap statistics.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 #include <corelib/ncbienv.hpp>
36 #include <corelib/ncbiargs.hpp>
37 #include <corelib/ncbifile.hpp>
38 
39 #include <objmgr/scope.hpp>
41 
43 
46 
49 
50 #include <util/format_guess.hpp>
51 #include <util/table_printer.hpp>
52 
53 #include <serial/objistr.hpp>
54 
56 
59 
60 namespace {
61  typedef CTablePrinter::SEndOfCell CellEnd;
62 
63  // some types used so often that we give them an abbreviation
64  typedef CGapAnalysis GA;
65  typedef CFastaReader FR;
66 
/// Streams a pair as "(first, second)".
///
/// @param ostr   destination stream
/// @param a_pair pair whose two members are written with their own operator<<
/// @return ostr, to allow chaining
template<typename T1, typename T2>
ostream & operator << (ostream & ostr, const pair<T1, T2> & a_pair )
{
    return ostr << '(' << a_pair.first << ", " << a_pair.second << ')';
}
73 
// Looks up `key` in `a_map` and returns a const reference to the mapped value.
// When the key is absent, the not-found branch builds the message
// "Could not find map key: <key>".
// NOTE(review): the line that opens the not-found statement is missing from
// this listing -- presumably a throwing macro; confirm against the repository.
74  template<typename TMap>
75  const typename TMap::mapped_type &
76  find_attr_or_die(
77  const TMap & a_map, const typename TMap::key_type & key )
78  {
79  typename TMap::const_iterator find_it =
80  a_map.find(key);
81  if( find_it == a_map.end() ) {
83  "Could not find map key: " << key);
84  }
    // key was found, so dereferencing the iterator is safe here
85  return find_it->second;
86  }
87 
// Looks up the attribute named `key` in `attribs` and returns its value.
// When the attribute is absent, the not-found branch builds the message
// "Could not find map key: <key>".
// NOTE(review): the line that opens the not-found statement is missing from
// this listing -- presumably a throwing macro; confirm against the repository.
88  const CTempString
89  find_attrib_attr_or_die(
90  const xml::attributes & attribs, const CTempString & key)
91  {
    // NOTE(review): key.data() is passed as a plain character pointer; this
    // assumes the CTempString's buffer is NUL-terminated -- TODO confirm.
92  xml::attributes::const_iterator find_it = attribs.find(key.data());
93  if( find_it == attribs.end() ) {
95  "Could not find map key: " << key);
96  }
97  return find_it->get_value();
98  }
99 
100  const CTempString
101  find_node_attr_or_die(
102  const xml::node & node, const CTempString & key )
103  {
104  return find_attrib_attr_or_die(node.get_attributes(), key);
105  }
106 
107  Uint8 to_uint8(const CTempString & str_of_num)
108  {
109  return NStr::StringToUInt8(str_of_num);
110  }
111 
112  // Translate gap types to the human-readable names used in ASCII output.
    // Each table row pairs a GA::EGapType with its display name; the table is
    // exposed as the map `sc_gaptypename` of type TGapTypeNameMap.
113  typedef SStaticPair<GA::EGapType, const char *> TGapTypeNameElem;
114  static const TGapTypeNameElem sc_gaptypename_map[] = {
115  { GA::eGapType_All, "Any Kind"},
116  { GA::eGapType_SeqGap, "Seq Gaps"},
117  { GA::eGapType_UnknownBases, "Runs of Ns"},
118  };
119  typedef CStaticArrayMap<
120  GA::EGapType, const char *> TGapTypeNameMap;
    // NOTE(review): the line that opens the map-defining statement is missing
    // from this listing; only its argument list is visible below.
122  TGapTypeNameMap, sc_gaptypename, sc_gaptypename_map);
123 
124  // sc_gaptypename is iterated in many places, and several of those
125  // places assume the gap types are visited in the same order every
126  // time, so all such loops go through this one macro.
    // `iter_name` names the loop iterator; iter_name->first is the
    // GA::EGapType and iter_name->second is its display name.
127  #define ITERATE_GAP_TYPES(iter_name) \
128  ITERATE(TGapTypeNameMap, iter_name, sc_gaptypename)
129 
130  // map each flag value to an enum for CGapAnalysis
131  struct SGapRelatedInfo {
132  GA::EGapType gap_type;
133  GA::TAddFlag gap_add_flag;
134  FR::TFlags fasta_flag;
135  };
// Table mapping each gap-type name (as spelled in the comma-separated
// -gap-types argument) to its SGapRelatedInfo. It is exposed as the map
// `sc_addgaptypename` of type TAddGapTypeMap, keyed by C string using the
// PCase_CStr comparator.
136  typedef SStaticPair<const char *, SGapRelatedInfo> TAddGapTypeElem;
137  static const TAddGapTypeElem sc_addgaptypename_map[] = {
    // "all" enables both kinds of gap detection
138  { "all",
139  { GA::eGapType_All,
140  ( GA::fAddFlag_IncludeSeqGaps | GA::fAddFlag_IncludeUnknownBases),
141  ( FR::fParseGaps | FR::fLetterGaps) } },
142  { "seq-gaps",
143  { GA::eGapType_SeqGap,
144  GA::fAddFlag_IncludeSeqGaps, FR::fParseGaps } },
145  { "unknown-bases",
146  { GA::eGapType_UnknownBases,
147  GA::fAddFlag_IncludeUnknownBases, FR::fLetterGaps } },
148  };
149  typedef CStaticArrayMap<
150  const char *, SGapRelatedInfo, PCase_CStr> TAddGapTypeMap;
    // NOTE(review): the line that opens the map-defining statement is missing
    // from this listing; only its argument list is visible below.
152  TAddGapTypeMap, sc_addgaptypename, sc_addgaptypename_map);
153 
/// Prints start_str (plus a line end) to cout when constructed and
/// end_str (plus a line end) to cout when destroyed. Example usage would be
/// to print the start and end tags of XML.
///
/// Copying is disabled: every copy would print end_str again when it is
/// destroyed, producing a duplicated closing string.
class CBeginEndStrToCoutRAII
{
public:
    /// @param start_str text written to cout immediately
    /// @param end_str   text stored and written to cout by the destructor
    CBeginEndStrToCoutRAII(
        const string & start_str, const string & end_str)
        : m_end_str(end_str)
    {
        cout << start_str << endl;
    }

    ~CBeginEndStrToCoutRAII()
    {
        cout << m_end_str << endl;
    }

    // exactly one object must be responsible for printing the end string
    CBeginEndStrToCoutRAII(const CBeginEndStrToCoutRAII &) = delete;
    CBeginEndStrToCoutRAII & operator=(const CBeginEndStrToCoutRAII &) = delete;

private:
    const string m_end_str;
};
175 
176  /// Holds information about an issue that occurred that
177  /// we need to output as a message.
178  ///
179  /// Normally this would be printed via WriteAsXML
180  /// or WriteAsText, but
181  /// this can be thrown to indicate that we're giving
182  /// up on a given file or accn.
183  struct SOutMessage : public std::runtime_error {
184  public:
185  const static string kErrorStr;
186  const static string kFatalStr;
187 
188  SOutMessage(
189  const string & file_or_accn,
190  const string & level,
191  const string & code,
192  const string & text) :
193  std::runtime_error(
194  x_CalcWhat(file_or_accn, level, code, text)),
195  m_file_or_accn_basename(
196  x_CalcFileBaseNameOrAccn(file_or_accn)),
197  m_level(level),
198  m_code(code),
199  m_text(text)
200  { }
201 
202  // parent has no-throw dtor, so we must too
203  ~SOutMessage() throw() { }
204 
205  const string m_file_or_accn_basename;
206  const string m_level;
207  const string m_code;
208  const string m_text;
209 
210  void WriteAsXML(CNcbiOstream & out_strm) const;
211  void WriteAsText(CNcbiOstream & out_strm) const;
212 
213  private:
214  static string x_CalcFileBaseNameOrAccn(
215  const string & file_or_accn);
216 
217  static string x_CalcWhat(
218  const string & file_or_accn,
219  const string & level,
220  const string & code,
221  const string & text);
222  };
223 
224  const string SOutMessage::kErrorStr("ERROR");
225  const string SOutMessage::kFatalStr("FATAL");
226 
227  void SOutMessage::WriteAsXML(CNcbiOstream & out_strm) const
228  {
229  xml::document expn_doc("message");
230  xml::node & expn_root_node = expn_doc.get_root_node();
231 
232  xml::attributes & expn_root_attribs =
233  expn_root_node.get_attributes();
234  if( ! m_file_or_accn_basename.empty() ) {
235  expn_root_attribs.insert(
236  "file_or_accn", m_file_or_accn_basename.c_str());
237  }
238  expn_root_attribs.insert("severity", m_level.c_str());
239  expn_root_attribs.insert("code", m_code.c_str());
240 
241  expn_root_node.set_content(m_text.c_str());
242 
243  expn_doc.save_to_stream(out_strm, xml::save_op_no_decl);
244  }
245 
246  void SOutMessage::WriteAsText(CNcbiOstream & out_strm) const
247  {
248  out_strm << what() << endl;
249  }
250 
251  string SOutMessage::x_CalcFileBaseNameOrAccn(
252  const string & file_or_accn)
253  {
254  CFile maybe_file(file_or_accn);
255  if( maybe_file.IsFile() ) {
256  // if file, return basename
257  return maybe_file.GetName();
258  } else {
259  // if accn, return as-is
260  return file_or_accn;
261  }
262  }
263 
264  string SOutMessage::x_CalcWhat(
265  const string & file_or_accn,
266  const string & level,
267  const string & code,
268  const string & text)
269  {
270  // build answer here
271  CNcbiOstrstream answer_str_strm;
272 
273  const string file_or_accn_basename =
274  x_CalcFileBaseNameOrAccn(file_or_accn);
275 
276  answer_str_strm << level << ": [" << code << "] ";
277  if( ! file_or_accn_basename.empty() ) {
278  answer_str_strm << file_or_accn_basename << ": ";
279  }
280  answer_str_strm << text;
281 
282  return CNcbiOstrstreamToString(answer_str_strm);
283  }
284 
285 }
286 
287 /////////////////////////////////////////////////////////////////////////////
288 // CGapStatsApplication::
289 
290 
292 {
    // Application class for the gap statistics program.
    // NOTE(review): the class header and several member declarations are
    // missing from this listing (among them the members used elsewhere as
    // m_MolFilter, m_MaxResolveCount, m_IncludedGapTypes, m_eOutFormat and
    // m_gapAnalysis); confirm against the repository.
293 public:
294 
295  CGapStatsApplication(void);
296 
    // Init registers the command-line arguments; Run is the entry point
    // that wraps RunNoCatch in exception handling.
297  virtual void Init(void);
298  virtual int Run(void);
299 private:
    // sort key and direction passed to GetGapLengthSummary
303  GA::ESortGapLength m_eSort;
304  GA::ESortDir m_eSortDir;
305 
    /// True if eGapType is in the set of gap types selected for output.
308  bool x_IncludeGapType(GA::EGapType eGapType) const;
    // flag sets accumulated from the selected gap types
309  GA::TAddFlag m_fGapAddFlags;
310  FR::TFlags m_fFastaFlags;
311 
312  enum EOutFormat {
315  };
317 
318  /// "Run" will catch all exceptions and try to do something reasonable,
319  /// and calls RunNoCatch where the real work happens.
320  ///
321  /// (This is just to avoid having to surround an entire function with
322  /// a try-catch block)
323  int RunNoCatch(void);
324 
    /// Returns the shared scope, creating it on first use.
325  CRef<CScope> x_GetScope(void);
    /// Turns a gap display name into a lower-case, space-free name
    /// used in the XML output.
326  static string x_GapNameToGapXMLNodeName(const CTempString & gap_name);
327 
328  typedef vector<GA::TGapLength> TGapLengthVec;
329  /// Returns a vector of all possible gap lengths we've seen
331 
332 
333  /// Reads and loads into m_gapAnalysis
334  void x_ReadFileOrAccn(const string & sFileOrAccn);
335  void x_PrintSummaryView(void);
336  void x_PrintSeqsForGapLengths(void);
337  void x_PrintHistogram(Uint8 num_bins,
338  CHistogramBinning::EHistAlgo eHistAlgo);
    /// Outputs out_message to out_strm.
    // NOTE(review): presumably picks XML or text according to the output
    // format; the definition is not visible here -- confirm.
339  void x_PrintOutMessage(
340  const SOutMessage &out_message, CNcbiOstream & out_strm) const;
341 };
342 
343 /////////////////////////////////////////////////////////////////////////////
344 // Constructor
345 
347  m_MolFilter(CSeq_inst::eMol_na),
348  m_MaxResolveCount(kMax_Int),
349  // default to all unless user adds the gap types manually
350  m_fGapAddFlags(
351  // all by default
352  GA::fAddFlag_All),
353  m_fFastaFlags(
354  // all by default
355  FR::fParseGaps |
356  FR::fLetterGaps),
357  m_eOutFormat(eOutFormat_ASCIITable)
358 {
    // Defaults set above: nucleic-acid filter, maximum resolve count of
    // kMax_Int, all gap add-flags, both FASTA gap flags, ASCII table output.
    // RunNoCatch overrides these from the command-line arguments.
    // The combined "all" gap type is the only one included by default.
359  m_IncludedGapTypes.insert(GA::eGapType_All);
    // NOTE(review): one statement of this body is missing from this listing.
361 }
362 
363 /////////////////////////////////////////////////////////////////////////////
364 // Init test for all different types of arguments
365 
366 
368 {
    // Declares every command-line argument of the program, grouped as
    // BASIC, RESOLVING SEQUENCES and HISTOGRAM, then hands the
    // descriptions over via SetupArgDescriptions.
    // NOTE(review): several argument lines (the ones between a description
    // string and its default value, and the tail of AddExtra/AddOptionalKey)
    // are missing from this listing -- presumably the argument value types.
369  // Create command-line argument descriptions class
370  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
371 
372  // Specify USAGE context
373  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
374  "Gap analysis program", false);
375 
376  // Describe the expected command-line arguments
377 
378  arg_desc->SetCurrentGroup("BASIC");
379 
    // positional arguments: at least one file or accession is required
380  arg_desc->AddExtra(
381  1, kMax_UInt,
382  "The files or accessions to do gap analysis on. "
383  "ASN.1, XML, and FASTA are some of the supported file formats.",
385 
    // collect the known gap-type names so the help text can list them
386  vector<string> gap_types;
387  ITERATE(TAddGapTypeMap, add_gap_type_it, sc_addgaptypename) {
388  gap_types.push_back(add_gap_type_it->first);
389  }
390  // special gap-types
391  arg_desc->AddOptionalKey(
392  "gap-types",
393  "GapTypes",
394  "A comma-separated list of types of gaps we look at. "
395  "If none are specified, all types will be shown. Possibilities: " +
396  NStr::Join(gap_types, ", ") + " but note that 'all' means adding "
397  "one column that combines all types. Also, there is "
398  "the gap-type 'both' as a shortcut for showing a column "
399  "for every possible gap type "
400  "(except 'all', unless you also add that to -gap-types)",
402 
403  arg_desc->AddDefaultKey(
404  "out-format", "Format",
405  "Specifies how the results should be printed.",
407  "ascii-table");
408  arg_desc->SetConstraint("out-format", &(*new CArgAllow_Strings,
409  "ascii-table", "xml"));
410 
411  arg_desc->AddDefaultKey("sort-on", "HowToSortResults",
412  "Specify the order of the summary. length sorts on the gap length. "
413  "num_seqs sorts on the number of sequences "
414  "each gap length appears on. "
415  "num_gaps sorts on the total number of times a gap "
416  "of each gap length appears. ",
418  "length");
419  arg_desc->SetConstraint("sort-on", &(*new CArgAllow_Strings,
420  "length", "num_seqs", "num_gaps"));
421 
422  arg_desc->AddFlag("rev-sort",
423  "Set this to reverse the sorting order.");
424 
425  arg_desc->AddFlag("show-seqs-for-gap-lengths",
426  "This will show the sequences that each gap size has. It "
427  "is not affected by the sorting options.");
428 
429  arg_desc->AddDefaultKey("mol", "MolTypesToLookAt",
430  "Specify whether you just want to look at sequences which are "
431  "nucleic acids (na), amino acids (aa), or any.",
433  "na" );
434  arg_desc->SetConstraint("mol", &(*new CArgAllow_Strings,
435  "na", "aa", "any"));
436 
437  arg_desc->AddDefaultKey("assume-mol", "AssumedMolType",
438  "If unable to determine mol of sequence from ASN.1, "
439  "FASTA mods, etc. this specifies what mol we assume it is.",
441  "na" );
442  arg_desc->SetConstraint("assume-mol", &(*new CArgAllow_Strings,
443  "na", "aa"));
444 
445  arg_desc->SetCurrentGroup("RESOLVING SEQUENCES");
446 
447  arg_desc->AddFlag(
448  "no-gbload", "If set, avoids using the GenBank data loader");
449 
450  arg_desc->AddDefaultKey(
451  "max-resolve-count", "MAX_COUNT",
452  "How deep to resolve when following far references. Note that being "
453  "unable to resolve a far reference whether due to reaching max "
454  "resolve count or any other reason will result in an error since "
455  "unable to calculate 'gaps' if cannot get sequence. Zero disables "
456  "all far reference resolution.",
458  NStr::NumericToString(kMax_Int) // Default is "effectively unlimited"
459  );
460  arg_desc->SetConstraint(
461  "max-resolve-count", new CArgAllow_Integers(0, kMax_Int));
462 
463  arg_desc->SetCurrentGroup("HISTOGRAM");
464 
465  arg_desc->AddFlag("show-hist",
466  "Set this flag to see a histogram of gap data");
467 
    // both histogram tuning options are declared to require -show-hist
468  arg_desc->AddDefaultKey("hist-bins", "NumBins",
469  "Set the number of histogram bins to aim for (not guaranteed "
470  "to be that exact number). Default is 0, which means to "
471  "automatically pick a reasonable number of bins",
473  "0" );
474  arg_desc->SetConstraint("hist-bins",
475  new CArgAllow_Int8s(0, kMax_I8) );
476  arg_desc->SetDependency(
477  "hist-bins", CArgDescriptions::eRequires, "show-hist" );
478 
479  arg_desc->AddDefaultKey("hist-algo", "HistogramAlgorithm",
480  "Set this if you want the histogram binner to try to "
481  "use a different binning algorithm. The default should "
482  "be fine for most people.",
484  "cluster" );
485  arg_desc->SetConstraint("hist-algo", &(*new CArgAllow_Strings,
486  "cluster", "even_bins"));
487  arg_desc->SetDependency(
488  "hist-algo", CArgDescriptions::eRequires, "show-hist" );
489 
490  // hide debugging args, but leave in source control, at least for now
491  // arg_desc->SetCurrentGroup("DEBUGGING");
492  // arg_desc->AddFlag("trigger-internal-error",
493  // "Since this program should never trigger an "
494  // "internal error (hopefully), this flag causes one "
495  // "to happen for testing purposes");
496 
497  // Setup arg.descriptions for this application
498  arg_desc->SetCurrentGroup(kEmptyStr);
    // release(): the unique_ptr gives up the descriptions object here
499  SetupArgDescriptions(arg_desc.release());
500 }
501 
502 /////////////////////////////////////////////////////////////////////////////
503 // Run
504 
505 
507 {
    // Entry point: selects the output format, runs RunNoCatch, and turns any
    // exception that escapes it into a FATAL "INTERNAL_ERROR" message on cerr.
    // Returns RunNoCatch's result, or 1 if an exception escaped.
508  // Get arguments
509  const CArgs & args = GetArgs();
510 
511  // must check out-format arg _first_ because it may set up start and end
512  // strings which must occur.
513  AutoPtr<CBeginEndStrToCoutRAII> pResultBeginEndStr;
514  const string & sOutFormat = args["out-format"].AsString();
    // NOTE(review): the statement inside each branch (presumably the
    // assignment of m_eOutFormat) is missing from this listing.
515  if( "ascii-table" == sOutFormat ) {
517  } else if( "xml" == sOutFormat ) {
519  // outermost XML node to hold everything. Use AutoPtr to be
520  // sure the closing tag is printed at the end.
521  pResultBeginEndStr.reset(
522  new CBeginEndStrToCoutRAII("<result>", "</result>"));
523  } else {
524  _TROUBLE;
525  }
526 
527  // of course, this is only used if there was an exception
528  AutoPtr<SOutMessage> p_out_message;
529 
530  try {
531  // almost all work happens in RunNoCatch
532  return RunNoCatch();
533  } catch (const std::exception & ex ) {
534  p_out_message.reset(
535  new SOutMessage(
536  kEmptyStr, SOutMessage::kFatalStr,
537  "INTERNAL_ERROR", ex.what()));
538  } catch(...) {
539  p_out_message.reset(
540  new SOutMessage(
541  kEmptyStr, SOutMessage::kFatalStr,
542  "INTERNAL_ERROR", "(---UNKNOWN INTERNAL ERROR TYPE---)"));
543  }
544 
545  // if we're here, there was a fatal exception, which we now output
546  _ASSERT(p_out_message);
547  x_PrintOutMessage(*p_out_message, cerr);
548 
549  // failure
550  return 1;
551 }
552 
554 {
    // Does the real work of the program: loads settings from the arguments,
    // reads every file/accession given as an extra argument, then prints the
    // requested views. Returns 0, or 1 if any input reported an error.
    // NOTE(review): a number of statements are missing from this listing
    // (e.g. inside the "mol" branches and before the views are printed).
555  int exit_code = 0;
556 
557  // Get arguments
558  const CArgs & args = GetArgs();
559 
560  // hide debugging args, but leave in source control, at least for now
561  // // if requested, trigger an internal error for testing
562  // // purposes.
563  // if( args["trigger-internal-error"] ) {
564  // throw std::runtime_error(
565  // "This runtime_error was specifically requested "
566  // "via a program parameter");
567  // }
568 
569  // load variables set by args
570  const string & sMol = args["mol"].AsString();
571  if( sMol == "na" ) {
573  } else if( sMol == "aa" ) {
575  } else if( sMol == "any" ) {
577  } else {
578  // shouldn't happen
579  NCBI_USER_THROW_FMT("Unsupported mol: " << sMol);
580  }
581 
582  m_MaxResolveCount = args["max-resolve-count"].AsInteger();
583 
584  const string & sAssumeMol = args["assume-mol"].AsString();
585  if( sAssumeMol == "na" ) {
586  m_fFastaFlags |= FR::fAssumeNuc;
587  } else if( sAssumeMol == "aa" ) {
588  m_fFastaFlags |= FR::fAssumeProt;
589  } else {
590  // shouldn't happen
591  NCBI_USER_THROW_FMT("Unsupported assume-mol: " << sAssumeMol);
592  }
593 
594  // if gap-types specified, use those instead of defaulting to "all"
595  if( args["gap-types"].HasValue() ) {
    // NOTE(review): gap_type_arg is never read in the visible code; the
    // argument is fetched again for NStr::Split below.
596  string gap_type_arg = args["gap-types"].AsString();
597 
599  m_fGapAddFlags = 0;
    // NOTE(review): this reset also discards the fAssumeNuc/fAssumeProt bit
    // OR-ed in from -assume-mol just above, and the loop below only adds the
    // per-gap-type fasta flags back. So -assume-mol appears to have no effect
    // whenever -gap-types is given -- confirm whether that is intended.
600  m_fFastaFlags = 0;
601 
602  vector< string > raw_gap_types;
603  NStr::Split(args["gap-types"].AsString(), ",", raw_gap_types, 0);
604  if( raw_gap_types.empty() ) {
605  throw SOutMessage(
606  kEmptyStr, SOutMessage::kFatalStr,
607  "NO_GAP_TYPES_GIVEN",
608  "-gap-types must be given at least one gap");
609  }
610 
611  // handle each part of the comma-separated gap-types arg
612  ITERATE(vector< string >, raw_gap_types_it, raw_gap_types)
613  {
614  const string & raw_gap_type_str = *raw_gap_types_it;
615 
616  // unpack gap-types that actually represent multiple gap
617  // types (e.g. the choice "both") and put them all into
618  // gap_types_from_this_arg. Plain gap-types remain unchanged
619  // and aren't "unpacked" per se.
620  vector<string> gap_types_from_this_arg;
621  if( raw_gap_type_str == "both" ) {
622  // a special case
623 
    // "both" expands to every table entry except "all"
624  const size_t old_num_gap_types =
625  gap_types_from_this_arg.size();
626  ITERATE(TAddGapTypeMap, add_gap_type_it, sc_addgaptypename) {
627  const string & gap_type_str = add_gap_type_it->first;
628  if( gap_type_str != "all" ) {
629  gap_types_from_this_arg.push_back(gap_type_str);
630  }
631  }
632  const size_t new_num_gap_types =
633  gap_types_from_this_arg.size();
634 
635  if( (old_num_gap_types + 2) != new_num_gap_types ) {
636  // "both" doesn't make sense unless exactly 2 choices
637  // are added.
638  _TROUBLE;
639  }
640  } else {
641  // generally a gap type string just represents itself as-is
642  // and isn't "unpacked" per se.
643  gap_types_from_this_arg.push_back(raw_gap_type_str);
644  }
645 
    // look up each requested name and accumulate its type and flags
646  ITERATE(vector<string>, gap_type_str_it, gap_types_from_this_arg)
647  {
648  const string & gap_type_str = *gap_type_str_it;
649  TAddGapTypeMap::const_iterator find_it =
650  sc_addgaptypename.find(gap_type_str.c_str());
651  if( find_it == sc_addgaptypename.end() ) {
652  throw SOutMessage(
653  kEmptyStr,
654  SOutMessage::kFatalStr,
655  "UNKNOWN_GAP_TYPE",
656  "This gap type is not recognized: '" +
657  gap_type_str + "'");
658  }
659 
660  const SGapRelatedInfo & gap_related_flags =
661  find_it->second;
662  m_IncludedGapTypes.insert(gap_related_flags.gap_type);
663  m_fGapAddFlags |= gap_related_flags.gap_add_flag;
664  m_fFastaFlags |= gap_related_flags.fasta_flag;
665  }
666  }
667  }
669  _ASSERT(m_fGapAddFlags != 0);
670  _ASSERT(m_fFastaFlags != 0);
671 
672  m_eSort = GA::eSortGapLength_Length;
673  const string sSortOn = args["sort-on"].AsString();
674  if( "length" == sSortOn ) {
675  m_eSort = GA::eSortGapLength_Length;
676  } else if( "num_seqs" == sSortOn ) {
677  m_eSort = GA::eSortGapLength_NumSeqs;
678  } else if( "num_gaps" == sSortOn ) {
679  m_eSort = GA::eSortGapLength_NumGaps;
680  } else {
681  // shouldn't happen
682  NCBI_USER_THROW_FMT("Unsupported sort-on: " << sSortOn);
683  }
684 
685  m_eSortDir = (
686  args["rev-sort"] ?
687  GA::eSortDir_Descending :
688  GA::eSortDir_Ascending );
689 
690  // load given data into m_gapAnalysis
691  // (Note that extra-arg indexing is 1-based )
692  for(size_t ii = 1; ii <= args.GetNExtra(); ++ii ) {
693  const string sFileOrAccn = args[ii].AsString();
694  try {
695  x_ReadFileOrAccn(sFileOrAccn);
696  }
697  catch (const ncbi::objects::CObjMgrException& ex) {
    // NOTE(review): only the "duplicate Bioseq id" add-data error is
    // reported here. Any other CObjMgrException is swallowed silently:
    // no message is printed and exit_code stays unchanged. Confirm whether
    // those should be reported like the generic CException case below.
698  if (ex.GetErrCode() == ncbi::objects::CObjMgrException::eAddDataError
699  && ex.GetMsg().find("duplicate Bioseq id") == 0) {
700  SOutMessage out_message(
701  sFileOrAccn, SOutMessage::kErrorStr,
702  ex.GetErrCodeString(), ex.GetMsg());
703  x_PrintOutMessage(out_message, cerr);
704  exit_code = 1;
705  }
706  }
707  catch (const SOutMessage & out_message) {
708  // a thrown SOutMessage indicates we give up on this file_or_accn.
709  // (Note that a non-thrown SOutMessage would just be printed
710  // but would not halt processing of the file_or_accn)
711  x_PrintOutMessage(out_message, cerr);
712  exit_code = 1;
713  }
714  catch (const ncbi::CException& ex) {
    // other toolkit exceptions: report as FATAL for this input, then
    // continue with the next file or accession
715  SOutMessage out_message(
716  sFileOrAccn, SOutMessage::kFatalStr,
717  ex.GetErrCodeString(), ex.GetMsg());
718  x_PrintOutMessage(out_message, cerr);
719  exit_code = 1;
720  }
721  catch (...) {
722  // Unexpected exceptions make us give up without processing
723  // further files-or-accns.
724 
725  // print a message in case higher-up catch clauses are
726  // unable to determine the file-or-accn under which this
727  // occurred
728  SOutMessage out_message(
729  sFileOrAccn, SOutMessage::kFatalStr,
730  "INTERNAL_ERROR", "Unexpected exception");
731  x_PrintOutMessage(out_message, cerr);
732 
733  throw;
734  }
735  }
736 
737  // summary view is always shown
739 
740  if( args["show-seqs-for-gap-lengths"] ) {
742  }
743 
744  if( args["show-hist"] ) {
745  const Uint8 num_bins = args["hist-bins"].AsInt8();
746  const string & sHistAlgo = args["hist-algo"].AsString();
747  CHistogramBinning::EHistAlgo eHistAlgo =
749  if( "cluster" == sHistAlgo ) {
751  } else if( "even_bins" == sHistAlgo ) {
753  } else {
754  // shouldn't happen
755  NCBI_USER_THROW_FMT("Histogram algorithm not supported yet: "
756  << sHistAlgo );
757  }
758 
759  x_PrintHistogram(num_bins, eHistAlgo);
760  }
761 
762  return exit_code;
763 }
764 
765 /////////////////////////////////////////////////////////////////////////////
766 // x_GetScope
767 
770 {
    // Returns the single CScope shared by all callers, creating it on the
    // first call. A function-static mutex is held for the whole call.
771  DEFINE_STATIC_FAST_MUTEX(s_scope_mtx);
772  CFastMutexGuard guard(s_scope_mtx);
773 
774  static CRef<CScope> s_scope;
775  if( ! s_scope ) {
776  // set up singleton scope
    // NOTE(review): the lines that obtain pObjMgr and the statement run
    // when -no-gbload is absent are missing from this listing.
778  if( ! GetArgs()["no-gbload"] ) {
780  }
781  s_scope.Reset( new CScope(*pObjMgr) );
782  s_scope->AddDefaults();
783  }
784 
785  return s_scope;
786 }
787 
789  const CTempString & gap_name)
790 {
    // Derives an XML-friendly name from a gap display name: spaces are
    // replaced with underscores and the result is passed to NStr::ToLower
    // (e.g. "Runs of Ns" is expected to become "runs_of_ns").
791  string answer = gap_name;
792  NStr::ReplaceInPlace(answer, " ", "_");
793  NStr::ToLower(answer);
794  return answer;
795 }
796 
797 bool
798 CGapStatsApplication::x_IncludeGapType(GA::EGapType eGapType) const
799 {
800  if( m_IncludedGapTypes.find(eGapType) != m_IncludedGapTypes.end() ) {
801  return true;
802  } else {
803  return false;
804  }
805 }
806 
807 /////////////////////////////////////////////////////////////////////////////
808 // x_CalcAllGapLens
809 
812 {
    // Collects the gap_length of every entry in the eGapType_All summary
    // and returns the lengths sorted in ascending order.
813  AutoPtr<TGapLengthVec> all_gap_lengths_list(new TGapLengthVec);
814  {
815  AutoPtr<GA::TVectorGapLengthSummary> pGapLenSummary(
816  m_gapAnalysis.GetGapLengthSummary(
817  GA::eGapType_All, m_eSort, m_eSortDir) );
818  ITERATE( GA::TVectorGapLengthSummary,
819  summary_unit_it, *pGapLenSummary )
820  {
821  all_gap_lengths_list->push_back((*summary_unit_it)->gap_length);
822  }
    // re-sort by length regardless of the order requested via m_eSort
823  sort(BEGIN_COMMA_END(*all_gap_lengths_list));
824  // make sure unique
    // NOTE(review): unique() is a mutating call placed inside _ASSERT. It
    // is a no-op only if the lengths really are unique, and presumably is
    // not evaluated at all in builds where _ASSERT is disabled -- confirm.
825  _ASSERT(
826  unique(BEGIN_COMMA_END(*all_gap_lengths_list))
827  == all_gap_lengths_list->end());
828  }
829 
830  return all_gap_lengths_list;
831 }
832 
833 /////////////////////////////////////////////////////////////////////////////
834 // x_ReadFileOrAccn
835 
// Loads one input into m_gapAnalysis.
//
// If sFileOrAccn exists on disk it must be a plain file; its format is
// guessed and it is read as FASTA, or as a serialized Seq-submit,
// Seq-entry or Bioseq (tried in that order). Otherwise sFileOrAccn is
// treated as an accession and looked up through the scope.
//
// Throws SOutMessage (severity ERROR) with one of the codes NON_FILE_INPUT,
// UNSUPPORTED_FORMAT, INVALID_FORMAT_OR_BAD_OBJ_TYPE, BAD_ACCESSION or
// ACCESSION_NOT_FOUND when the input cannot be used.
// NOTE(review): several lines are missing from this listing (the rest of
// the preferred-format list, two case labels, some AddSeqEntryGaps
// arguments and the final statement).
836 void CGapStatsApplication::x_ReadFileOrAccn(const string & sFileOrAccn)
837 {
838  CSeq_entry_Handle entry_h;
839 
840  // if file exists, load from that
841  if( CDirEntry(sFileOrAccn).Exists() ) {
842 
843  if( ! CDirEntry(sFileOrAccn).IsFile() ) {
844  throw SOutMessage(
845  sFileOrAccn,
846  SOutMessage::kErrorStr,
847  "NON_FILE_INPUT",
848  FORMAT("This exists but is not a plain file: "
849  << sFileOrAccn));
850  }
851 
852  // auto-detect format and object type
853  CNcbiIfstream in_file(sFileOrAccn.c_str(), ios::in | ios::binary );
854 
855  ESerialDataFormat eSerialDataFormat = eSerial_None;
856 
857  CRef<CSeq_entry> pSeqEntry;
858 
859  CFormatGuess format_guesser(in_file);
860  // prefer formats that we support
861  for( auto a_format : {CFormatGuess::eBinaryASN,
865  {
866  format_guesser.GetFormatHints().AddPreferredFormat(a_format);
867  }
868 
869  CFormatGuess::EFormat eFormat = format_guesser.GuessFormat();
    // serialized formats only record the format here and are parsed
    // further below; FASTA is read right away
870  switch(eFormat) {
872  eSerialDataFormat = eSerial_AsnBinary;
873  break;
875  eSerialDataFormat = eSerial_AsnText;
876  break;
877  case CFormatGuess::eXml:
878  eSerialDataFormat = eSerial_Xml;
879  break;
880  case CFormatGuess::eFasta: {
881  CFastaReader fasta_reader(in_file,
882  m_fFastaFlags );
883  pSeqEntry = fasta_reader.ReadSet();
884  break;
885  }
886  default:
887 
888  throw SOutMessage(
889  sFileOrAccn,
890  SOutMessage::kErrorStr,
891  "UNSUPPORTED_FORMAT",
892  FORMAT("This format is not yet supported: "
893  << format_guesser.GetFormatName(eFormat)));
894  }
895 
896  _ASSERT(eSerialDataFormat != eSerial_None ||
897  eFormat == CFormatGuess::eFasta);
898  if( ! pSeqEntry ) {
899  // try to parse as Seq-submit
    // NOTE(review): the stream is rewound with seekg(0) before each
    // attempt without an explicit clear(); confirm the stream state is
    // usable after a failed parse.
900  in_file.seekg(0);
901  CRef<CSeq_submit> pSeqSubmit( new CSeq_submit );
902  try {
903  in_file >> MSerial_Format(eSerialDataFormat)
904  >> *pSeqSubmit;
905 
906  if( ! pSeqSubmit->IsEntrys() ||
907  pSeqSubmit->GetData().GetEntrys().size() != 1 )
908  {
    // NOTE(review): this SOutMessage is thrown inside the try
    // block and is therefore caught by the catch(...) below, so
    // the SEQ_SUBMIT_MULTIPLE_SEQ_ENTRIES message never reaches
    // the caller -- confirm whether that is intended.
909  throw SOutMessage(
910  sFileOrAccn,
911  SOutMessage::kErrorStr,
912  "SEQ_SUBMIT_MULTIPLE_SEQ_ENTRIES",
913  FORMAT(
914  "Only Seq-submits with exactly "
915  "one Seq-entry "
916  "inside are supported."));
917  }
918  pSeqEntry = *pSeqSubmit->SetData().SetEntrys().begin();
919  } catch(...) {
920  // keep going and try to parse another way
921  }
922  }
923 
924  if( ! pSeqEntry ) {
925  // try to parse as CSeq_entry
926  try {
927  in_file.seekg(0);
928  CRef<CSeq_entry> pNewSeqEntry( new CSeq_entry );
929  in_file >> MSerial_Format(eSerialDataFormat)
930  >> *pNewSeqEntry;
931  pSeqEntry = pNewSeqEntry;
932  } catch(...) {
933  // keep going and try to parse another way
934  }
935  }
936 
937  if( ! pSeqEntry ) {
938  // try to parse as CBioseq
939  try {
940  in_file.seekg(0);
941  CRef<CBioseq> pBioseq( new CBioseq );
942  in_file >> MSerial_Format(eSerialDataFormat)
943  >> *pBioseq;
    // wrap the lone Bioseq in a fresh Seq-entry
944  pSeqEntry.Reset( new CSeq_entry );
945  pSeqEntry->SetSeq( *pBioseq );
946  } catch(...) {
    // last attempt failed; the check below reports the error
947  }
948  }
949 
950  if( ! pSeqEntry ) {
951  throw SOutMessage(
952  sFileOrAccn,
953  SOutMessage::kErrorStr,
954  "INVALID_FORMAT_OR_BAD_OBJ_TYPE",
955  FORMAT("Invalid ASN.1 or unsupported object type"));
956  }
957 
958  entry_h = x_GetScope()->AddTopLevelSeqEntry(*pSeqEntry);
959  } else {
960 
961  // fall back on trying to load it as an accession
962  CRef<CSeq_id> pSeqId;
963  try {
964  pSeqId.Reset( new CSeq_id(sFileOrAccn) );
965  } catch(const CSeqIdException & ex) {
966  // malformed seq-id
967  throw SOutMessage(
968  sFileOrAccn,
969  SOutMessage::kErrorStr,
970  "BAD_ACCESSION",
971  FORMAT(ex.what()));
972  }
973 
974  CBioseq_Handle bioseq_h = x_GetScope()->GetBioseqHandle(*pSeqId);
975  if( ! bioseq_h ) {
976  throw SOutMessage(
977  sFileOrAccn,
978  SOutMessage::kErrorStr,
979  "ACCESSION_NOT_FOUND",
980  FORMAT("Accession could not be found"));
981  }
982  entry_h = bioseq_h.GetParentEntry();
983  }
984 
985  _ASSERT(entry_h);
986 
    // record the gaps of everything under the loaded entry
987  m_gapAnalysis.AddSeqEntryGaps(
988  entry_h,
989  m_MolFilter,
992  0, // CGapAnalysis::TFlags
994  );
995 
996  // conserve memory
998 }
999 
1000 /////////////////////////////////////////////////////////////////////////////
1001 // x_PrintSummaryView
1002 
1004 {
1005  // turn the data into XML, then into whatever output format is
1006  // appropriate
1007  xml::document gap_info_doc("summary");
1008  xml::node & gap_info_root_node = gap_info_doc.get_root_node();
1009 
1010  xml::node gap_len_infos_node("gap_len_infos");
1011 
1012  // map pair of (gap-len, gap-type) to GA::SOneGapLengthSummary for
1013  // all gap types.
1014  typedef pair<GA::TGapLength, GA::EGapType>
1015  TGapLenTypeKey;
1017  TGapLenTypeToSummaryMap;
1018  TGapLenTypeToSummaryMap gap_len_type_to_summary_map;
1019 
1020  // loop loads gap_len_type_to_summary_map
1021  ITERATE_GAP_TYPES(gap_type_name_it) {
1022  const GA::EGapType eGapType = gap_type_name_it->first;
1023 
1024  if( ! x_IncludeGapType(eGapType) ) {
1025  continue;
1026  }
1027 
1028  AutoPtr<GA::TVectorGapLengthSummary> p_gap_len_summary =
1029  m_gapAnalysis.GetGapLengthSummary(eGapType, m_eSort, m_eSortDir);
1030  ITERATE(GA::TVectorGapLengthSummary, gap_summary_it,
1031  *p_gap_len_summary )
1032  {
1033  CConstRef<GA::SOneGapLengthSummary> p_one_summary =
1034  *gap_summary_it;
1035  TGapLenTypeKey gap_len_type(p_one_summary->gap_length, eGapType);
1036  pair<TGapLenTypeToSummaryMap::iterator, bool> insert_ret =
1037  gap_len_type_to_summary_map.insert(
1038  make_pair(
1039  gap_len_type,
1040  p_one_summary));
1041  // there shouldn't be dups
1042  if( ! insert_ret.second ) {
1043  _TROUBLE;
1044  }
1045  }
1046  }
1047 
1048  // use eGapType_All to determine all possible gap lengths
1049  typedef vector<GA::TGapLength> TGapLengthVec;
1050  AutoPtr<TGapLengthVec> p_all_gap_lengths_list = x_CalcAllGapLens();
1051  // for convenience
1052  TGapLengthVec & all_gap_lengths_list = *p_all_gap_lengths_list;
1053 
1054  // each iteration creates an XML node for one gap length
1055  // with all the relevant info inside
1056  ITERATE( TGapLengthVec, gap_length_it, all_gap_lengths_list ) {
1057 
1058  const GA::TGapLength gap_len = *gap_length_it;
1059 
1060  xml::node one_gap_len_info("one_gap_len_info");
1061  xml::attributes & one_gap_len_attributes =
1062  one_gap_len_info.get_attributes();
1063 
1064  one_gap_len_attributes.insert(
1065  "len", NStr::NumericToString(gap_len).c_str());
1066 
1067  // get information about each kind of gap for the gap length
1068  // set by the loop above
1069  ITERATE_GAP_TYPES(gap_type_name_it) {
1070  const GA::EGapType eGapType = gap_type_name_it->first;
1071  const CTempString pchGapName = gap_type_name_it->second;
1072 
1073  if( ! x_IncludeGapType(eGapType) ) {
1074  continue;
1075  }
1076 
1077  // get the info for this gap type
1079  {
1080  TGapLenTypeToSummaryMap::const_iterator find_it =
1081  gap_len_type_to_summary_map.find(
1082  make_pair(gap_len, eGapType));
1083  if( find_it != gap_len_type_to_summary_map.end() ) {
1084  p_one_summary = find_it->second;
1085  } else {
1086  p_one_summary.Reset(
1087  new GA::SOneGapLengthSummary(
1088  // make sure no one uses that first arg
1090  0, 0));
1091  }
1092  }
1093  _ASSERT(p_one_summary);
1094 
1095  // XML for info about just this gap type
1096  // (convert gap name to reasonable XML node name)
1097  xml::node one_gap_len_type_info(
1098  "one_gap_len_type_info");
1099  xml::attributes & one_gap_len_type_info_attributes =
1100  one_gap_len_type_info.get_attributes();
1101 
1102  one_gap_len_type_info_attributes.insert(
1103  "gap_type",
1104  x_GapNameToGapXMLNodeName(pchGapName).c_str());
1105  one_gap_len_type_info_attributes.insert(
1106  "num_seqs",
1107  NStr::NumericToString(p_one_summary->num_seqs).c_str());
1108  one_gap_len_type_info_attributes.insert(
1109  "num_gaps",
1110  NStr::NumericToString(p_one_summary->num_gaps).c_str());
1111 
1112  one_gap_len_info.insert(one_gap_len_type_info);
1113  }
1114 
1115  gap_len_infos_node.insert(one_gap_len_info);
1116  }
1117  gap_info_root_node.insert(gap_len_infos_node);
1118 
1119  // output to summary cout
1120  if( m_eOutFormat == eOutFormat_XML ) {
1121  // XML case is trivial since we've already formed it
1122  gap_info_doc.save_to_stream(cout, xml::save_op_no_decl);
1123  } else if( m_eOutFormat == eOutFormat_ASCIITable) {
1124  // turn XML into an ASCII table
1125  cout << "SUMMARY:" << endl;
1126 
1127  bool bAnyGapOfLenZero = false;
1128 
1129  const size_t kDigitsInUint8 = numeric_limits<Uint8>::digits10;
1130  CTablePrinter::SColInfoVec vecColInfos;
1131  vecColInfos.AddCol("Gap Length", kDigitsInUint8,
1133 
1134  ITERATE_GAP_TYPES(gap_type_name_it) {
1135  const GA::EGapType eGapType = gap_type_name_it->first;
1136  string pchGapName = gap_type_name_it->second;
1137 
1138  if( ! x_IncludeGapType(eGapType) ) {
1139  continue;
1140  }
1141 
1142  vecColInfos.AddCol(
1143  "#Seqs with " + pchGapName, kDigitsInUint8,
1145  vecColInfos.AddCol(
1146  "# of " + pchGapName, kDigitsInUint8,
1148  }
1149 
1150  CTablePrinter table_printer(vecColInfos, cout);
1151 
1152  ITERATE(xml::node, one_gap_len_it, gap_len_infos_node) {
1153  const xml::node & one_gap_len_node = *one_gap_len_it;
1154  const xml::attributes & one_gap_len_info =
1155  one_gap_len_node.get_attributes();
1156 
1157  const GA::TGapLength gap_len =
1158  to_uint8(find_attrib_attr_or_die(one_gap_len_info, "len"));
1159  table_printer << gap_len << CellEnd();
1160  if( 0 == gap_len ) {
1161  bAnyGapOfLenZero = true;
1162  }
1163 
1164  // children of one_gap_len_info represent the info for
1165  // each gap type
1166  ITERATE( xml::node, child_it, one_gap_len_node ) {
1167  const xml::node & gap_len_summary_node = *child_it;
1168  const Uint8 num_seqs = to_uint8(
1169  find_node_attr_or_die(
1170  gap_len_summary_node, "num_seqs"));
1171  table_printer << num_seqs << CellEnd();
1172 
1173  const Uint8 num_gaps = to_uint8(find_node_attr_or_die(
1174  gap_len_summary_node, "num_gaps"));
1175  table_printer << num_gaps << CellEnd();
1176  }
1177  }
1178  cout << endl;
1179 
1180  // print a note if any gaps are of length 0, which
1181  // means unknown
1182  if( bAnyGapOfLenZero ) {
1183  cout << "* Note: Gap of length zero means "
1184  << "'completely unknown length'." << endl;
1185  }
1186  } else {
1187  _TROUBLE;
1188  }
1189 }
1190 
1191 /////////////////////////////////////////////////////////////////////////////
1192 // x_PrintSeqsForGapLengths
1193 
1195 {
1196  AutoPtr<TGapLengthVec> p_all_gap_lengths_list = x_CalcAllGapLens();
1197 
1198  // turn into XML
1199 
1200  xml::document gap_seqs_doc("seqs_for_gap_lens");
1201  xml::node & gap_seqs_root_node = gap_seqs_doc.get_root_node();
1202 
1203  // each loop iteration handles one gap length (all gap types)
1204  ITERATE(TGapLengthVec, all_gap_lens_it, *p_all_gap_lengths_list ) {
1205  const GA::TGapLength gap_len = *all_gap_lens_it;
1206 
1207  xml::node gap_seqs_one_len_node("gap_length_info");
1208  gap_seqs_one_len_node.get_attributes().insert(
1209  "len", NStr::NumericToString(gap_len).c_str());
1210 
1211  ITERATE_GAP_TYPES(gap_type_name_it) {
1212  const GA::EGapType eGapType = gap_type_name_it->first;
1213  const char * pchGapName = gap_type_name_it->second;
1214 
1215  if( ! x_IncludeGapType(eGapType) ) {
1216  continue;
1217  }
1218 
1219  xml::node gap_seqs_one_len_and_gap_type("gap_type_seq_ids");
1220  gap_seqs_one_len_and_gap_type.get_attributes().insert(
1221  "gap_type", pchGapName);
1222 
1223  const GA::TMapGapLengthToSeqIds & map_len_to_seq_ids =
1224  m_gapAnalysis.GetGapLengthSeqIds(eGapType);
1225 
1226  GA::TMapGapLengthToSeqIds::const_iterator find_seq_ids_it =
1227  map_len_to_seq_ids.find(gap_len);
1228 
1229  // add a node for each seq_id for this gap type
1230  if( find_seq_ids_it != map_len_to_seq_ids.end() ) {
1231  const GA::TSetSeqIdConstRef & set_seq_id_const_ref =
1232  find_seq_ids_it->second;
1233  ITERATE(
1234  GA::TSetSeqIdConstRef, seq_id_ref_it, set_seq_id_const_ref)
1235  {
1236  xml::node one_seq_node("seq_info");
1237  one_seq_node.get_attributes().insert(
1238  "seq_id", (*seq_id_ref_it)->AsFastaString().c_str());
1239 
1240  gap_seqs_one_len_and_gap_type.push_back(one_seq_node);
1241  }
1242  }
1243 
1244  gap_seqs_one_len_node.push_back(gap_seqs_one_len_and_gap_type);
1245  }
1246 
1247  gap_seqs_root_node.push_back(gap_seqs_one_len_node);
1248  }
1249 
1250  // output
1251  if( m_eOutFormat == eOutFormat_XML ) {
1252  // TODO: give example output
1253 
1254  // trivial since already XML
1255  gap_seqs_doc.save_to_stream(cout, xml::save_op_no_decl);
1256  } else if ( m_eOutFormat == eOutFormat_ASCIITable ) {
1257  // convert XML to ASCII table
1258 
1259  // example output:
1260  // SEQ-IDS FOR EACH GAP-LENGTH:
1261  // Seq-ids with a gap of length 10:
1262  // Seq gaps:
1263  // lcl|scaffold17
1264  // lcl|scaffold33
1265  // lcl|scaffold35
1266  // lcl|scaffold37
1267  // Run of Unknown Bases:
1268  // lcl|scaffold40
1269  // lcl|scaffold41
1270  // lcl|scaffold43
1271  // lcl|scaffold5
1272  // lcl|scaffold6
1273  // Seq-ids with a gap of length 68:
1274  // Seq gaps:
1275  // lcl|scaffold6
1276  // Seq-ids with a gap of length 72:
1277  // Seq gaps:
1278  // lcl|scaffold43
1279  // lcl|scaffold88
1280  // Run of Unknown Bases:
1281  // lcl|scaffold88
1282 
1283  cout << "SEQ-IDS FOR EACH GAP-LENGTH:" << endl;
1284 
1285  ITERATE(xml::node, gap_len_node_it, gap_seqs_root_node) {
1286  const GA::TGapLength iGapLength =
1287  to_uint8(find_node_attr_or_die(
1288  *gap_len_node_it, "len"));
1289  cout << "\tSeq-ids with a gap of length "
1290  << iGapLength << ':' << endl;
1291 
1292  ITERATE(xml::node, gap_type_seq_ids_it, *gap_len_node_it) {
1293  const CTempString pchGapName = find_node_attr_or_die(
1294  *gap_type_seq_ids_it, "gap_type");
1295  cout << "\t\t" << pchGapName << ":" << endl;
1296 
1297  if( gap_type_seq_ids_it->size() ) {
1298  ITERATE(xml::node, seq_info_it, *gap_type_seq_ids_it) {
1299  cout << "\t\t\t"
1300  << find_node_attr_or_die(*seq_info_it, "seq_id")
1301  << endl;
1302  }
1303  } else {
1304  cout << "\t\t\t(NONE)" << endl;
1305  }
1306  }
1307  }
1308  cout << endl;
1309 
1310  } else {
1311  _TROUBLE;
1312  }
1313 }
1314 
1315 /////////////////////////////////////////////////////////////////////////////
1316 // x_PrintHistogram
1317 
1319  Uint8 num_bins,
1320  CHistogramBinning::EHistAlgo eHistAlgo)
1321 {
1322  // convert histograms into XML
1323  xml::document hist_doc("histogram_list");
1324  xml::node & hist_root_node = hist_doc.get_root_node();
1325 
1326  // build the histogram for each gap type
1327  ITERATE_GAP_TYPES(gap_type_it) {
1328  const GA::EGapType eGapType = gap_type_it->first;
1329  const char * pchGapName = gap_type_it->second;
1330 
1331  if( ! x_IncludeGapType(eGapType) ) {
1332  continue;
1333  }
1334 
1335  xml::node histogram_node("histogram");
1336  xml::attributes & histogram_node_attrs =
1337  histogram_node.get_attributes();
1338  histogram_node_attrs.insert("gap_type", pchGapName);
1339 
1341  m_gapAnalysis.GetGapHistogram(eGapType, num_bins, eHistAlgo));
1342 
1343  // load each histogram bin into the histogram_node
1344  ITERATE( CHistogramBinning::TListOfBins, bin_iter, *pListOfBins ) {
1345  const CHistogramBinning::SBin & bin = *bin_iter;
1346 
1347  xml::node bin_node("bin");
1348  xml::attributes & bin_node_attrs =
1349  bin_node.get_attributes();
1350  bin_node_attrs.insert(
1351  "start_inclusive",
1352  NStr::NumericToString(bin.first_number).c_str());
1353  bin_node_attrs.insert(
1354  "end_inclusive",
1355  NStr::NumericToString(bin.last_number).c_str());
1356  bin_node_attrs.insert(
1357  "num_appearances",
1359 
1360  histogram_node.insert(bin_node);
1361  }
1362 
1363  hist_root_node.insert(histogram_node);
1364  }
1365 
1366  // output
1367  if( m_eOutFormat == eOutFormat_XML ) {
1368  // trivial since already XML
1369  hist_doc.save_to_stream(cout, xml::save_op_no_decl);
1370  } else if ( m_eOutFormat == eOutFormat_ASCIITable ) {
1371  // convert XML to ASCII table
1372 
1373  const size_t kDigitsInUint8 = numeric_limits<Uint8>::digits10;
1374 
1375  // a histogram for each gap type
1376  _ASSERT(hist_root_node.get_name() == CTempString("histogram_list"));
1377  ITERATE(xml::node, hist_node_it, hist_root_node) {
1378  const xml::node & hist_node = *hist_node_it;
1379  _ASSERT(hist_node.get_name() == CTempString("histogram"));
1380  const xml::attributes & hist_node_attrs =
1381  hist_node_it->get_attributes();
1382 
1383  const char * pchGapName = find_attrib_attr_or_die(
1384  hist_node_attrs, "gap_type").data();
1385 
1386  CTablePrinter::SColInfoVec vecColInfos;
1387  vecColInfos.AddCol("Range", 1 + 2*kDigitsInUint8);
1388  vecColInfos.AddCol("Number in Range", kDigitsInUint8,
1390  CTablePrinter table_printer(vecColInfos, cout);
1391 
1392  cout << "HISTOGRAM FOR " << pchGapName << ":" << endl;
1393 
1394  ITERATE(xml::node, bin_node_it, hist_node) {
1395  _ASSERT(bin_node_it->get_name() == CTempString("bin"));
1396  const xml::attributes & bin_node_attrs =
1397  bin_node_it->get_attributes();
1398 
1399  const Uint8 start = to_uint8(find_attrib_attr_or_die(
1400  bin_node_attrs, "start_inclusive"));
1401  const Uint8 end = to_uint8(find_attrib_attr_or_die(
1402  bin_node_attrs, "end_inclusive"));
1403  const Uint8 num_appearances = to_uint8(find_attrib_attr_or_die(
1404  bin_node_attrs, "num_appearances"));
1405 
1406  table_printer << start << '-' << end << CellEnd();
1407  table_printer << num_appearances << CellEnd();
1408  }
1409  }
1410  }
1411 }
1412 
1413 /////////////////////////////////////////////////////////////////////////////
1414 /// x_PrintOutMessage
1415 
1417  const SOutMessage &out_message, CNcbiOstream & out_strm) const
1418 {
1419  if( m_eOutFormat == eOutFormat_XML ) {
1420  // yes, cout not cerr because everything should go to cout
1421  out_message.WriteAsXML(cout);
1422  } else if ( m_eOutFormat == eOutFormat_ASCIITable ) {
1423  // yes, cout not cerr because everything should go to cout
1424  out_message.WriteAsText(out_strm);
1425  } else {
1426  _TROUBLE;
1427  }
1428 }
1429 
1430 /////////////////////////////////////////////////////////////////////////////
1431 // MAIN
1432 
1433 int main(int argc, const char* argv[])
1434 {
1435  // Execute main application function
1436  return CGapStatsApplication().AppMain(argc, argv);
1437 }
1438 
1439 // just for debugging purposes
1441 {
1442  string as_str;
1443  a_node->save_to_string(as_str);
1444  cerr << as_str << endl;
1445 }
1446 
User-defined methods of the data storage class.
AutoPtr –.
Definition: ncbimisc.hpp:401
CArgAllow_Int8s –.
Definition: ncbiargs.hpp:1706
CArgAllow_Integers –.
Definition: ncbiargs.hpp:1751
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CBioseq_Handle –.
CConstRef –.
Definition: ncbiobj.hpp:1266
CDirEntry –.
Definition: ncbifile.hpp:262
Base class for reading FASTA sequences.
Definition: fasta.hpp:80
CFile –.
Definition: ncbifile.hpp:1604
CFormatHints & AddPreferredFormat(TFormat fmt)
Mark the format as preferred.
Class implements different ad-hoc unreliable file format identifications.
CFormatHints & GetFormatHints(void)
Get format hints.
EFormat
The formats are checked in the same order as declared here.
@ eBinaryASN
Binary ASN.1.
@ eFasta
FASTA format sequence record, CFastaReader.
@ eTextASN
Text ASN.1.
EFormat GuessFormat(EMode)
static const char * GetFormatName(EFormat format)
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: gbloader.cpp:366
Give this gaps, or handles containing gaps and then you can get statistics on those gaps.
void x_PrintSeqsForGapLengths(void)
Definition: gap_stats.cpp:1194
GA::ESortGapLength m_eSort
Definition: gap_stats.cpp:303
vector< GA::TGapLength > TGapLengthVec
Definition: gap_stats.cpp:328
virtual int Run(void)
Run the application.
Definition: gap_stats.cpp:506
CRef< CScope > x_GetScope(void)
Definition: gap_stats.cpp:769
TGapTypeCont m_IncludedGapTypes
Definition: gap_stats.cpp:307
int RunNoCatch(void)
"Run" will catch all exceptions and try to do something reasonable, and calls RunNoCatch where the re...
Definition: gap_stats.cpp:553
void x_ReadFileOrAccn(const string &sFileOrAccn)
Reads and loads into m_gapAnalysis.
Definition: gap_stats.cpp:836
void x_PrintHistogram(Uint8 num_bins, CHistogramBinning::EHistAlgo eHistAlgo)
Definition: gap_stats.cpp:1318
void x_PrintOutMessage(const SOutMessage &out_message, CNcbiOstream &out_strm) const
x_PrintOutMessage
Definition: gap_stats.cpp:1416
GA::ESortDir m_eSortDir
Definition: gap_stats.cpp:304
virtual void Init(void)
Initialize the application.
Definition: gap_stats.cpp:367
CSeq_inst::EMol m_MolFilter
Definition: gap_stats.cpp:300
AutoPtr< TGapLengthVec > x_CalcAllGapLens(void) const
Returns a vector of all possible gap lengths we've seen.
Definition: gap_stats.cpp:811
FR::TFlags m_fFastaFlags
Definition: gap_stats.cpp:310
GA::TAddFlag m_fGapAddFlags
Definition: gap_stats.cpp:309
static string x_GapNameToGapXMLNodeName(const CTempString &gap_name)
Definition: gap_stats.cpp:788
set< GA::EGapType > TGapTypeCont
Definition: gap_stats.cpp:306
void x_PrintSummaryView(void)
Definition: gap_stats.cpp:1003
bool x_IncludeGapType(GA::EGapType eGapType) const
Definition: gap_stats.cpp:798
EOutFormat m_eOutFormat
Definition: gap_stats.cpp:316
vector< SBin > TListOfBins
A histogram is given as a vector of bins.
EHistAlgo
Pick which binning algorithm to use when generating the histogram.
@ eHistAlgo_TryForSameNumDataInEachBin
This algorithm tries to make each bin roughly even in size, except the last bin which may be much sma...
@ eHistAlgo_IdentifyClusters
This algorithm tries to make each bin represent values that are clustered together.
@ eHistAlgo_Default
The default algorithm.
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CScope –.
Definition: scope.hpp:92
CSeqIdException –.
Definition: Seq_id.hpp:1001
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
class CStaticArrayMap<> provides access to a static array in much the same way as CStaticArraySet<>,...
Definition: static_map.hpp:175
This can be used to lay out neat ASCII data.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Definition: map.hpp:338
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
void clear()
Definition: set.hpp:153
bool empty() const
Definition: set.hpp:133
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
const char * get_value(void) const
Get the value of this attribute.
Definition: ait_impl.cpp:350
Const Iterator class for accessing attribute pairs.
Definition: attributes.hpp:320
The xml::attributes class is used to access all the attributes of one xml::node.
Definition: attributes.hpp:78
iterator find(const char *name, const ns *nspace=NULL)
Find the attribute with the given name and namespace.
Definition: attributes.cpp:288
iterator end(void)
Get an iterator that points one past the last attribute.
Definition: attributes.cpp:174
void insert(const char *name, const char *value, const ns *nspace=NULL)
Add an attribute to the attributes list.
Definition: attributes.cpp:188
The xml::document class is used to hold the XML tree and various bits of information about it.
Definition: document.hpp:80
void save_to_stream(std::ostream &stream, save_option_flags flags=save_op_default) const
Convert the XML document tree into XML text data and then insert it into the given stream.
Definition: document.cpp:922
const node & get_root_node(void) const
Get a reference to the root node of this document.
Definition: document.cpp:539
The xml::node class is used to hold information about one XML node.
Definition: node.hpp:106
const char * get_name(void) const
Get the name of this xml::node.
Definition: node.cpp:769
void push_back(const node &child)
Add a child xml::node to this node.
Definition: node.cpp:1194
void save_to_string(std::string &xml, save_option_flags flags=save_op_default) const
Convert the node and all its children into XML text and set the given string to that text.
Definition: node.cpp:1594
void set_content(const char *content)
Set the content of a node.
Definition: node.cpp:774
iterator insert(const node &n)
Insert a new child node.
Definition: node.cpp:1463
xml::attributes & get_attributes(void)
Get the list of attributes.
Definition: node.cpp:831
CNcbiOstream & operator<<(CNcbiOstream &out, const CEquivRange &range)
Definition: equiv_range.cpp:96
Operators to edit gaps in sequences.
Analyzes gaps and produces various statistics.
USING_SCOPE(objects)
#define ITERATE_GAP_TYPES(iter_name)
Definition: gap_stats.cpp:127
void print_xml_node(xml::node *a_node)
Definition: gap_stats.cpp:1440
int main(int argc, const char *argv[])
Definition: gap_stats.cpp:1433
USING_NCBI_SCOPE
Definition: gap_stats.cpp:57
void reset(element_type *p=0, EOwnership ownership=eTakeOwnership)
Reset will delete the old pointer (if owned), set content to the new value, and assume the ownership ...
Definition: ncbimisc.hpp:480
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:819
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1195
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
size_t GetNExtra(void) const
Get the number of unnamed positional (a.k.a. extra) args.
Definition: ncbiargs.hpp:422
@ eRequires
One argument requires another.
Definition: ncbiargs.hpp:956
@ eInt8
Convertible into an integer number (Int8 only)
Definition: ncbiargs.hpp:591
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define NCBI_USER_THROW_FMT(message)
Throw a "user exception" with message processed as output to ostream.
Definition: ncbiexpt.hpp:724
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
#define FORMAT(message)
Format message using iostreams library.
Definition: ncbiexpt.hpp:672
ESerialDataFormat
Data file format.
Definition: serialdef.hpp:71
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
@ eSerial_Xml
XML.
Definition: serialdef.hpp:75
@ eSerial_None
Definition: serialdef.hpp:72
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
CRef< CSeq_entry > ReadSet(int max_seqs=kMax_Int, ILineErrorListener *pMessageListener=nullptr)
Read multiple sequences (by default, as many as are available.)
Definition: fasta.cpp:442
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
void ResetDataAndHistory(void)
Clear all information in the scope except added data loaders.
Definition: scope.cpp:331
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
@ eLevel_All
Any bioseq.
Definition: bioseq_ci.hpp:73
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define kMax_Int
Definition: ncbi_limits.h:184
#define kMax_I8
Definition: ncbi_limits.h:221
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define kMax_UInt
Definition: ncbi_limits.h:185
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
#define kEmptyStr
Definition: ncbistr.hpp:123
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
Definition: ncbistr.hpp:2697
static Uint8 StringToUInt8(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to Uint8.
Definition: ncbistr.cpp:873
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3405
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
#define DEFINE_STATIC_FAST_MUTEX(id)
Define static fast mutex and initialize it.
Definition: ncbimtx.hpp:496
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
@ eMol_not_set
> cdna = rna
Definition: Seq_inst_.hpp:109
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
static void text(MDB_val *v)
Definition: mdb_dump.c:62
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::KEY key
@ save_op_no_decl
Drop the xml declaration.
Definition: xml_save.hpp:60
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
#define NCBI_APP_SET_VERSION_AUTO(major, minor)
Definition: ncbiapp.hpp:67
Defines command line argument related classes.
Defines unified interface to application:
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
T max(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
The Object manager core.
Generic utility macros and templates for exploring NCBI objects.
#define BEGIN_COMMA_END(container)
#define DEFINE_STATIC_ARRAY_MAP(Type, Var, Array)
Definition: static_set.hpp:888
Holds the information about a bin.
Uint8 total_appearances
The total number of data points in this bin for all values from first_number to last_number.
TValue first_number
The start range of the bin (inclusive)
TValue last_number
The end range of the bin (inclusive)
This holds the info about all columns for the table.
void AddCol(const string &sColName, Uint4 iColWidth=0, EJustify eJustify=eJustify_Left, EDataTooLong eDataTooLong=eDataTooLong_Default)
Stream an instance of this object into the CTablePrinter to have it write out the current table cell ...
Define Case-sensitive string comparison methods.
Definition: ncbistr.hpp:4864
Template structure SStaticPair is a simplified replacement for STL pair<>. Main reason for introducing this...
Definition: static_set.hpp:60
Definition: inftrees.h:24
#define _TROUBLE
#define _ASSERT
Modified on Sun Apr 21 03:41:08 2024 by modify_doxy.py rev. 669887