NCBI C++ ToolKit
rm_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: rm_reader.cpp 94568 2021-08-17 14:20:37Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig, Wratko Hlavina
27  *
28  * File Description:
29  * Repeat Masker file reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbithr.hpp>
36 #include <corelib/ncbiutil.hpp>
37 #include <corelib/ncbiexpt.hpp>
38 
39 #include <util/line_reader.hpp>
40 #include <util/value_convert.hpp>
41 #include <util/static_map.hpp>
42 
43 #include <serial/iterator.hpp>
44 #include <serial/objistrasn.hpp>
45 
51 
56 
61 
78 
83 #include <objtools/error_codes.hpp>
84 
85 #include <algorithm>
86 
87 
88 #define NCBI_USE_ERRCODE_X Objtools_Rd_RepMask
89 
91 BEGIN_objects_SCOPE
92 
94 {
96  CBioseq::TId ids;
97  CSeq_id::ParseFastaIds(ids, id);
99  if (best) result = CSeq_id_Handle::GetHandle(*best);
100  return result;
101 }
102 
104 {
105  string family(GetRptFamily());
106  if (family.empty()) {
107  return GetRptClass();
108  } else {
109  return GetRptClass() + '/' + family;
110  }
111 }
112 
113 /*
114 IRepeatRegion::TTaxId IRepeatRegion::GetRptSpecificity() const
115 {
116  return 0;
117 }
118 
119 string IRepeatRegion::GetRptSpecificityName() const
120 {
121  return kEmptyStr;
122 }
123 */
124 
126 {
128  if (location) {
129  result.Assign(*location);
130  } else {
131  result.Reset();
132  }
133 }
134 
136 {
137  return GetLocation()->GetId()->AsFastaString();
138 }
139 
141 {
143 }
144 
146 {
147  return GetLocation()->GetStop(eExtreme_Positional) + 1;
148 }
149 
151 {
152  return GetLocation()->IsReverseStrand();
153 }
154 
156 {
157  return query_location;
158 }
159 
161 {
163  result->SetLocal().SetId(GetRptId());
164  return result;
165 }
166 
167 /// Overridden version returns the original unparsed
168 /// sequence identifier, if it was set (non-empty).
169 ///
171 {
172  if (! query_sequence.empty()) return query_sequence;
173  return TParent::GetSeqIdString();
174 }
175 
176 // Implement IRepeatRegion interface. If it weren't for virtual
177 // methods, all of the following could be inlined.
178 
180 {
181  return matching_repeat;
182 }
183 
185 {
186  return rpt_family;
187 }
188 
190 {
191  return rpt_class;
192 }
193 
195  return 0;
196 }
197 
199  if (GetRptPosEnd() == kInvalidSeqPos ||
201  return GetRptPosEnd() + GetRptLeft();
202 }
203 
205  return kEmptyStr;
206 }
207 
209  return kEmptyStr;
210 }
211 
213 {
214  return rpt_id;
215 }
216 
218 {
219  return sw_score;
220 }
221 
223 {
224  return perc_div;
225 }
226 
228 {
229  return perc_del;
230 }
231 
233 {
234  return perc_ins;
235 }
236 
238 {
239  return rpt_pos_begin;
240 }
241 
243 {
244  return rpt_pos_end;
245 }
246 
248 {
249  return rpt_left;
250 }
251 
253 {
254  return query_left;
255 }
256 
258 {
259  return overlapped;
260 }
261 
262 bool CRepeatLibrary::Get(const string& name, TRepeat& dest) const
263 {
264  TMap::const_iterator it(m_Map.find(name));
265  if (it == m_Map.end()) return false;
266  dest = it->second;
267  return true;
268 }
269 
271 {
272  TRepeat repeat;
273  string line;
274  vector<string> tokens;
275 
276  while (! stream.eof()) {
277  NcbiGetlineEOL(stream, line);
278  if (NStr::StartsWith(line, "//")) {
279  // Perl equivalent from rpt_lib2repeat_q.pl:
280  // # Repeats with undefined specificity can be skipped because RepeatMasker
281  // # only uses repeats that have a "Species" set in searches. The
282  // # specificity will be undefined when the "Species:" field is empty.
283  // if ( ($class eq "Simple_repeat" || $class eq "Low_complexity")
284  // && $specificity eq "universal" && $family eq "") {
285  // $length = ""; # length of database sequence is arbitrary
286  // }
287  if ((repeat.m_RptClass == "Simple_repeat" ||
288  repeat.m_RptClass == "Low_complexity") &&
289  repeat.m_RptSpecificityName == "universal" &&
290  repeat.m_RptFamily == "") repeat.m_RptLength = kInvalidSeqPos;
291  m_Map[repeat.m_RptName] = repeat;
292  continue;
293  }
294 
295  // As per EMBL Release 3.4:
296  //
297  // Each line begins with a two-character line type code.
298  // This code is always followed by three blanks, so that the
299  // actual information in each line begins in character position 6.
300  if (line.length() < 6 || line.substr(2, 3) != " ") continue;
301  string code(line.substr(0, 2));
302  string value(line.substr(5));
304 
305  if (code == "ID") {
306  // NOTE: Violates specs as per EMBL Release 3.4.1.
307  // There should be 7 fields.
308  //
309  // Perl equivalent from rpt_lib2repeat_q.pl:
310  // if (m/^ID\s/) {
311  // die "Multiple ID lines found in one record.\nLine $.: $_\n" if defined $name or defined $length;
312  // ($name, $length) = m/^ID\s+(\S+).*\s([1-9][0-9]*) BP\.$/;
313  // die "Failed to extract a repeat name and length from line:\nLine $.: $_\n"
314  // unless defined $name and $name and defined $length and $length;
315  // }
316  repeat.m_RptName = value.substr(0, value.find_first_of(" ;"));
317  string bp(value.substr(value.rfind(';') + 1));
319  repeat.m_RptLength = Convert(bp.substr(0, bp.find(' ')));
320  } else if (code == "DE") {
321  // DE RepbaseID: ACROBAT1
322  if (NStr::StartsWith(value, "RepbaseID:")) {
323  repeat.m_RptRepbaseId = NStr::TruncateSpaces(value.substr(10));
324  }
325  } else if (code == "CC") {
326  if (NStr::MatchesMask(value, "RELEASE *;*")) {
327  m_Release = value.substr(8, value.find(';') - 8);
328  } else if (NStr::StartsWith(value, "Type:")) {
329  // Perl equivalent from rpt_lib2repeat_q.pl:
330  // if (m/^CC +Type:\s*((.*\S)*)\s*$/) {
331  // die "Multiple Type lines found in one record.\nLine $.: $_\n" if defined $class;
332  // $class = $1;
333  // die "Failed to extract a repeat class from line:\nLine $.: $_\n" unless defined $class and $class;
334  // }
335  repeat.m_RptClass = NStr::TruncateSpaces(value.substr(5));
336  if (repeat.m_RptClass.empty()) {
337  repeat.m_RptClass = "Unknown";
338  }
339  } else if (NStr::StartsWith(value, "SubType:")) {
340  // Perl equivalent from rpt_lib2repeat_q.pl:
341  // if (m/^CC +SubType:\s*((.*\S)*)\s*$/) {
342  // die "Multiple SubType lines found in one record.\nLine $.: $_\n" if defined $family;
343  // $family = $1 || ""; # NULL indicates unknown
344  // }
345  repeat.m_RptFamily = NStr::TruncateSpaces(value.substr(8));
346  } else if (NStr::StartsWith(value, "Species:")) {
347  // Perl equivalent from rpt_lib2repeat_q.pl:
348  // if (m/^CC +Species:\s*((.*\S)*)\s*$/) {
349  // die "Multiple Species lines found in one record.\nLine $.: $_\n" if defined $specificity;
350  // $specificity = $1;
351  // $specificity =~ s/_/ /g;
352  // $specificity = "universal" if $specificity eq "root";
353  // }
354  repeat.m_RptSpecificityName = NStr::TruncateSpaces(value.substr(8));
355  if (m_Taxonomy && repeat.m_RptSpecificityName.size()) {
356  pair<TSpecificity2Taxid::iterator, bool> i_specificity =
358  if (i_specificity.second) {
359  i_specificity.first->second = m_Taxonomy->GetTaxId(repeat.m_RptSpecificityName);
360  if (! i_specificity.first->second) {
362  << "RepeatMasker library species failed lookup to taxonomy: "
363  << repeat.m_RptSpecificityName);
364  }
365  }
366  repeat.m_RptSpecificity = i_specificity.first->second;
367  }
368  }
369  }
370  }
371 
372  // Don't need specificity to taxonomy lookups anymore.
374  }
375 
377  const string& name) const
378 {
379  return m_Taxonomy && m_Taxonomy->GetName(taxid) == name;
380 }
381 
/// Appends one qualifier named `qual` to `qual_list`.
///
/// The value is rendered to text with `Convert(val).operator string()`
/// and stored through SetVal(), so `val` may be of any type that
/// Convert() accepts.
///
/// @param qual_list  List that receives the new qualifier (push_back).
/// @param qual       Qualifier name, handed to SetQual().
/// @param val        Qualifier value; converted to a string before storing.
382 template <typename T>
383 static void s_SetQual(CSeq_feat::TQual& qual_list,
384  const string& qual, const T val)
385 {
// NOTE(review): the declaration of `result` is not visible in this listing;
// presumably a freshly allocated qualifier object -- confirm against the
// repository copy of this file.
387  result->SetQual(qual);
// Explicit conversion operator call: turn the typed value into its text form.
388  string s = Convert(val).operator string();
389  result->SetVal(s);
390  qual_list.push_back(result);
391 }
392 
393 /// Translate RepeatMasker output to INSDC standard
394 /// nomenclature for repeats. This includes remapping repeat
395 /// family to satellite and mobile element qualifiers, as
396 /// appropriate.
397 ///
398 /// Available INSDC qualifiers are:
399 /// rpt_family, rpt_type, rpt_unit_seq, satellite, standard_name
400 ///
401 static bool s_StandardizeNomenclature(const IRepeatRegion& repeat,
402  CSeq_feat::TQual& qual_list)
403 {
404  string val;
405 
406  string klass = repeat.GetRptClass();
407  string family = repeat.GetRptFamily();
408 
409  if (NStr::EqualNocase(klass, "Satellite")) {
410  val = "satellite:";
411  if (! family.empty()) val += family;
412  val += ' ';
413  val += repeat.GetRptName();
414  s_SetQual(qual_list, "satellite", val);
415  if (! family.empty()) s_SetQual(qual_list, "rpt_family", family);
416  return true;
417  }
418 
419  if (NStr::EqualNocase(klass, "Simple_repeat")) {
420  // Simple_repeat is the family in ASN.1, not the class, based on
421  // evidence of prior submissions to GenBank. For example:
422  // GI:45269107, although this is weak evidence (stuffing
423  // RepeatMasker into Genbank qualifiers without much
424  // effort at standardization).
425  //
426  // Do not expect Simple_repeat/xxx.
427  s_SetQual(qual_list, "rpt_family", klass);
428  s_SetQual(qual_list, "rpt_unit", repeat.GetRptName());
429  return true;
430  }
431 
432  if (NStr::EqualNocase(klass, "SINE") ||
433  NStr::EqualNocase(klass, "LINE") ||
434  NStr::EqualNocase(klass, "LTR")) {
435  // Other valid INSDC mobile elements:
436  // "transposon", "retrotransposon", "integron",
437  // "insertion sequence", "non-LTR retrotransposon",
438  // "MITE", "other"
439  val = klass;
440  val += ':';
441  val += repeat.GetRptName();
442  s_SetQual(qual_list, "mobile_element", val);
443  if (! family.empty()) s_SetQual(qual_list, "rpt_family", family);
444  return true;
445  }
446 
447  return false;
448 }
449 
452  TIdGenerator& ids)
453  : m_Flags(flags)
454  , m_Library(lib)
455  , m_Ids(&ids)
456 {
457 }
458 
460 {
461  m_Library.Reset();
462 }
463 
465 {
466  m_Library.Reset(&lib);
467 }
468 
470 {
471  m_Ids.Reset(new COrdinalFeatIdGenerator);
472 }
473 
475 {
476  m_Ids.Reset(&generator);
477 }
478 
480 {
481  // We can forget old IDs once references have been resolved.
482  m_IdMap.clear();
483 }
484 
486 {
487  CRef<CSeq_feat> feat(new CSeq_feat);
488 
489  // data:
490  CSeqFeatData& sfdata = feat->SetData();
491  CImp_feat_Base& imp = sfdata.SetImp();
492  imp.SetKey("repeat_region");
493 
494  CRef<CFeat_id> id(m_Ids->GenerateId());
495  feat->SetId(*id);
496  TIdMap::iterator id_it(m_IdMap.find(repeat.GetRptId()));
497  if (id_it == m_IdMap.end()) {
498  m_IdMap[repeat.GetRptId()] = id;
499  } else {
501  ref->SetId().Assign(*id_it->second);
502  feat->SetXref().push_back(ref);
503  }
504 
505  // location:
506  repeat.GetLocation(feat->SetLocation());
507 
508  // qualifiers & ext's.
509  if (m_Flags) {
510  // Record if attributes were modified to conform with INSDC standards.
511  bool standardized(false);
512 
514  if (m_Library) m_Library->Get(repeat.GetRptName(), extra);
515 
516  CSeq_feat::TQual& qual_list = feat->SetQual();
517 
520  standardized = s_StandardizeNomenclature(repeat, qual_list);
521  }
522 
523  if (! standardized) {
524  // Did not succeed in standardizing nomenclature
525  // from RepeatMasker to INSDC standards. Fall back to
526  // storing the class/family verbatim.
527  s_SetQual(qual_list, "rpt_family", repeat.GetRptClassFamily());
528  }
529  }
530 
531  if (m_Flags & fIncludeRepeatName && ! standardized) {
532  s_SetQual(qual_list, "standard_name", repeat.GetRptName());
533  }
534 
535  if (m_Flags & fIncludeRepeatPos) {
536  s_SetQual(qual_list, "rpt_unit_range",
538  ".." + NStr::IntToString(repeat.GetRptPosEnd()));
539  }
540 
541  // Get specificity and check it for redundancy (taxid vs name).
542  bool include_specificity_name(false);
544  const IRepeat::TTaxId specificity(extra.GetRptSpecificity());
545  const string specificity_name(extra.GetRptSpecificityName());
546  include_specificity_name = ! specificity_name.empty();
547  if (specificity) {
548  CRef<CDbtag> tag(new CDbtag);
549  // eDbtagType_taxon except the enum is almost useless,
550  // being available to only one function in the Dbtag API.
551  tag->SetDb("taxon");
552  tag->SetTag().SetId(specificity);
553  feat->SetDbxref().push_back(tag);
554  if ((m_Flags & fRemoveRedundancy) && m_Library &&
555  m_Library->TestSpecificityMatchesName(
556  specificity,
557  specificity_name)) {
558  // Name matches taxonomy exactly, so don't store both.
559  include_specificity_name=false;
560  }
561  }
562  }
563 
564  // Get repeat length and check it for redundancy with rpt_left.
565  TSeqPos rpt_length(extra.GetRptLength());
566  if (rpt_length == kInvalidSeqPos) {
567  rpt_length = repeat.GetRptPosEnd() +
568  repeat.GetRptLeft();
569  }
570  bool include_rpt_left(m_Flags & fIncludeCoreStatistics);
571  if ((m_Flags & fRemoveRedundancy) &&
573  (rpt_length == repeat.GetRptPosEnd() +
574  repeat.GetRptLeft())) {
575  // Do not store rpt_left if we know the repeat length,
576  // rpt_left matches it (so it's redundant), and we
577  // want to remove redundancy.
578  include_rpt_left = false;
579  }
580 
581  // Store anything beyond what is possible in INSDC-approved
582  // qualifiers using either non-standard qualifiers or user objects.
583  // There are two options.
584 
586  // Option 1: Use Genbank qualifiers beyond the INSDC-approved set.
587 
589  s_SetQual(qual_list, "sw_score", repeat.GetSwScore());
590  s_SetQual(qual_list, "perc_div", repeat.GetPercDiv());
591  s_SetQual(qual_list, "perc_del", repeat.GetPercDel());
592  s_SetQual(qual_list, "perc_ins", repeat.GetPercIns());
593  if (include_rpt_left) {
594  s_SetQual(qual_list, "rpt_left", repeat.GetRptLeft());
595  }
596  }
597 
599  if (! (m_Flags & fRemoveRedundancy)) {
600  // Query length is always redundant, since sequences
601  // have a bioseq length, and we know the location.
602  s_SetQual(qual_list, "query_length",
603  repeat.GetSeqPosEnd() + repeat.GetSeqLeft());
604  }
605  if (repeat.IsOverlapped()) {
606  s_SetQual(qual_list, "overlapped", true);
607  }
608  }
609 
610  if (m_Flags & fIncludeRepeatId) {
611  s_SetQual(qual_list, "rpt_id", repeat.GetRptId());
612  }
613 
615  s_SetQual(qual_list, "rpt_length", rpt_length);
616  }
617 
618  if (include_specificity_name) {
619  s_SetQual(qual_list, "specificity",
620  extra.GetRptSpecificityName());
621  }
622 
623  } else {
624  // Option 2: Use user objects.
625 
627  feat->SetExts().push_back(uo);
628  uo->SetType().SetStr("RepeatMasker");
629 
631  uo->AddField("sw_score", static_cast<double>(repeat.GetSwScore()));
632  uo->AddField("perc_div", repeat.GetPercDiv());
633  uo->AddField("perc_del", repeat.GetPercDel());
634  uo->AddField("perc_ins", repeat.GetPercIns());
635  if (include_rpt_left) {
636  uo->AddField("rpt_left", static_cast<int>(repeat.GetRptLeft()));
637  }
638  }
639 
641  if (! (m_Flags & fRemoveRedundancy)) {
642  // Query length is always redundant, since sequences
643  // have a bioseq length, and we know the location.
644  uo->AddField("query_length", static_cast<int>(
645  repeat.GetSeqPosEnd() + repeat.GetSeqLeft()));
646  }
647  if (repeat.IsOverlapped()) {
648  uo->AddField("overlapped", true);
649  }
650  }
651 
652  if (m_Flags & fIncludeRepeatId) {
653  uo->AddField("rpt_id", static_cast<int>(repeat.GetRptId()));
654  }
655 
657  uo->AddField("rpt_length", static_cast<int>(rpt_length));
658  }
659 
660  if (include_specificity_name) {
661  uo->AddField("specificity", extra.GetRptSpecificityName());
662  }
663 
664  // Clear out storage of empty user objects.
665  if (! uo->IsSetData()) feat->ResetExts();
666  }
667 
668  // Clear out storage if empty Genbank qualifier lists.
669  if (qual_list.empty()) feat->ResetQual();
670 
672  ! extra.GetRptRepbaseId().empty()) {
673  CRef<CDbtag> tag(new CDbtag);
674  tag->SetDb("REPBASE");
675  tag->SetTag().SetStr(extra.GetRptRepbaseId());
676  feat->SetDbxref().push_back(tag);
677  }
678 
679  if (m_Flags & fSetComment) {
680  // Redundantly, store comments with original information.
681  // The comment tries to stay close to RepeatMasker native
682  // nomenclature. For example, query_left is reported,
683  // rather than the normalized query_length as stored
684  // in user objects or Genbank qualifiers. To accommodate
685  // the possibility the annotation is remapped, the original
686  // query identifier is preserved.
687 
688  CNcbiOstrstream comment;
689  const char eq('='), sep(' ');
690 
691  comment << "source=RepeatMasker";
692  if (m_Flags & fIncludeRepeatName) {
693  comment << sep
694  << "rpt_name" << eq << repeat.GetRptName();
695  }
697  comment << sep
698  << "sw_score" << eq << repeat.GetSwScore() << sep
699  << "perc_div" << eq << repeat.GetPercDiv() << sep
700  << "perc_del" << eq << repeat.GetPercDel() << sep
701  << "perc_ins" << eq << repeat.GetPercIns() << sep
702  << "rpt_left" << eq << repeat.GetRptLeft();
703  }
705  comment << sep
706  << "query" << eq << repeat.GetSeqIdString() << sep
707  << "query_range" << eq;
708  bool reverse(repeat.IsReverseStrand());
709  if (reverse) comment << "complement(";
710  comment << repeat.GetSeqPosBegin()
711  << ".." << repeat.GetSeqPosEnd();
712  if (reverse) comment << ")";
713  comment << sep
714  << "query_left" << eq << repeat.GetSeqLeft();
715  }
716  if (m_Flags & fIncludeRepeatId) {
717  comment << sep
718  << "ID" << eq << repeat.GetRptId();
719  }
720  if (m_Flags & fIncludeExtraStatistics && repeat.IsOverlapped()) {
721  comment << " *";
722  }
723  if (! extra.GetRptSpecificityName().empty()) {
724  comment << sep
725  << "specificity" << eq << extra.GetRptSpecificityName();
726  }
727  if (extra.GetRptLength() != kInvalidSeqPos) {
728  comment << sep
729  << "rpt_length" << eq << extra.GetRptLength();
730  }
731  feat->SetComment(CNcbiOstrstreamToString(comment));
732  }
733  }
734 
735  return feat;
736 }
737 
740  const ISeqIdResolver& seqid_resolver,
741  TIdGenerator& ids)
742  : m_SeqIdResolver(&seqid_resolver)
743  , m_ToFeat(flags, lib, ids)
744 {
745 }
746 
748 {
749 }
750 
752 {
754 }
755 
757 {
758  m_SeqIdResolver.Reset(&seqid_resolver);
759 }
760 
762 {
763  return m_ToFeat;
764 }
765 
768 {
769  CRef<CSerialObject> object(
770  ReadSeqAnnot(lr, pMessageListener).ReleaseOrNull());
771  return object;
772 }
773 
776 {
777  CRef<CSeq_annot> annot(new CSeq_annot);
778  // CRef<CAnnot_descr> desc(new CAnnot_descr);
779  // annot->SetDesc(*desc);
780  CSeq_annot::C_Data::TFtable& ftable = annot->SetData().SetFtable();
781 
782  string line;
783  size_t record_counter = 0;
784 
785  while ( ! lr.AtEOF() ) {
786  line = *++lr;
787 
788  if ( IsHeaderLine( line ) || IsIgnoredLine( line ) ) {
789  continue;
790  }
791  ++record_counter;
792  //if (record_counter == 91555) {
793  // cerr << "";
794  //}
795 
796  SRepeatRegion mask_data;
797  if ( ! ParseRecord( line, mask_data ) ) {
800  eDiag_Error,
801  lr.GetLineNumber(),
802  "RepeatMasker Reader: Parse error in record = " + line) );
803  ProcessError(*pErr, pMessageListener);
804  continue;
805  }
806 
807  if ( ! VerifyData( mask_data ) ) {
810  eDiag_Error,
811  lr.GetLineNumber(),
812  "RepeatMasker Reader: Verification error in record = " + line) );
813  ProcessError(*pErr, pMessageListener);
814  continue;
815  }
816 
817  CRef<CSeq_feat> feat(m_ToFeat(mask_data));
818  if ( ! feat ) {
821  eDiag_Error,
822  lr.GetLineNumber(),
823  "RepeatMasker Reader: Aborting file import, "
824  "unable to create feature table for record = " + line) );
825  ProcessError(*pErr, pMessageListener);
826  // we don't tolerate even a few errors here!
827  break;
828  }
829 
830  ftable.push_back(feat);
831  }
832  // if (! record_counter) annot.Reset();
833  if (annot) {
834  xAddConversionInfo(*annot, pMessageListener);
835  }
836  return annot;
837 }
838 
839 
840 bool CRepeatMaskerReader::IsHeaderLine(const string& line)
841 {
842  string labels_1st_line[] = { "perc", "query", "position", "matching", "" };
843  string labels_2nd_line[] = { "score", "div.", "del.", "ins.", "sequence", "" };
844 
845  // try to identify 1st line of column labels:
846  size_t current_offset = 0;
847  size_t i = 0;
848  for ( ; labels_1st_line[i] != ""; ++i ) {
849  current_offset = NStr::FindCase( line, labels_1st_line[i], current_offset );
850  if ( NPOS == current_offset ) {
851  break;
852  }
853  }
854  if ( labels_1st_line[i] == "" ) {
855  return true;
856  }
857 
858  // try to identify 2nd line of column labels:
859  current_offset = 0;
860  i = 0;
861  for ( ; labels_2nd_line[i] != ""; ++i ) {
862  current_offset = NStr::FindCase( line, labels_2nd_line[i], current_offset );
863  if ( NPOS == current_offset ) {
864  return false;
865  }
866  }
867  return true;
868 }
869 
870 
871 bool CRepeatMaskerReader::IsIgnoredLine(const string& line)
872 {
873  if ( NStr::StartsWith(line, "There were no repetitive sequences detected in "))
874  return true;
875  if ( NStr::FindCase(line, "only contains ambiguous bases") != NPOS)
876  return true;
877  return ( NStr::TruncateSpaces( line ).length() == 0 );
878 }
879 
880 
/// Removes a surrounding pair of parentheses from s, in place.
///
/// Nothing happens unless s starts with '('. When it does, the opening
/// parenthesis is dropped, and a closing ')' at the very end is dropped
/// as well if present. An unmatched leading '(' is therefore still
/// removed, while a trailing ')' without a leading '(' is kept.
///
/// @param s  String to edit; may become empty.
static void StripParens(string& s)
{
    if (s.empty() || s[0] != '(') {
        return;
    }
    s.erase(0, 1);
    if (! s.empty() && s[s.size() - 1] == ')') {
        s.erase(s.size() - 1);
    }
}
894 
895 bool CRepeatMaskerReader::ParseRecord(const string& record, SRepeatRegion& mask_data)
896 {
897  const size_t MIN_VALUE_COUNT = 15;
898 
899  string line = NStr::TruncateSpaces( record );
900  list< string > values;
901  if ( NStr::Split( line, " \t", values, NStr::fSplit_Tokenize ).size() < MIN_VALUE_COUNT ) {
902  return false;
903  }
904 
905  try {
906  // 1: "SW score"
907  list<string>::iterator it = values.begin();
908  mask_data.sw_score = NStr::StringToUInt( *it );
909 
910  // 2: "perc div."
911  ++it;
912  mask_data.perc_div = NStr::StringToDouble( *it );
913 
914  // 3: "perc del."
915  ++it;
916  mask_data.perc_del = NStr::StringToDouble( *it );
917 
918  // 4: "perc ins."
919  ++it;
920  mask_data.perc_ins = NStr::StringToDouble( *it );
921 
922  // 5: "query sequence"
923  ++it;
924  mask_data.query_sequence = *it;
925  CSeq_id_Handle idh(m_SeqIdResolver->ResolveSeqId(mask_data.query_sequence));
927  if (! id) return false;
928  mask_data.query_location.Reset(new CSeq_loc);
930  location.SetId().Assign(*id);
931 
932  // 6: "position begin"
933  ++it;
934  TSeqPos pos_begin = NStr::StringToUInt(*it);
935  if (pos_begin == 0) return false;
936  location.SetFrom(pos_begin - 1);
937 
938  // 7: "in end"
939  ++it;
940  TSeqPos pos_end = NStr::StringToUInt(*it);
941  if (pos_end == 0 || pos_end < pos_begin) return false;
942  location.SetTo(pos_end - 1);
943 
944  // 8: "query (left)"
945  ++it;
946  StripParens(*it);
947  mask_data.query_left = NStr::StringToUInt( *it );
948 
949  // 9: "" (meaning "strand")
950  ++it;
951  // Having the strand, we now have all fields to populate the location.
952  location.SetStrand(*it == "C" ? eNa_strand_minus : eNa_strand_plus);
953 
954  // 10: "matching repeat"
955  ++it;
956  mask_data.matching_repeat = *it;
957 
958  // 11: "repeat class/family"
959  ++it;
960  string class_family = *it;
961  NStr::SplitInTwo(class_family, "/",
962  mask_data.rpt_class, mask_data.rpt_family);
963 
964  // 12: "position in"
965  ++it;
966  string field12 = *it;
967 
968  // 13: "in end"
969  ++it;
970  mask_data.rpt_pos_end = NStr::StringToUInt( *it );
971 
972  // 14: "repeat left"
973  ++it;
974  string field14 = *it;
975 
976  // fields position 12 and 14 flip depending on the strand value.
977  string rpt_left;
978  if (mask_data.IsReverseStrand()) {
979  mask_data.rpt_pos_begin = NStr::StringToInt( field14 );
980  rpt_left = field12;
981  } else {
982  mask_data.rpt_pos_begin = NStr::StringToInt( field12 );
983  rpt_left = field14;
984  }
985 
986  StripParens(rpt_left);
987  mask_data.rpt_left = NStr::StringToInt(rpt_left);
988 
989  // 15: "ID"
990  ++it;
991  mask_data.rpt_id = NStr::StringToUInt(*it);
992 
993  // 16: overlapped (higher score repeat overlaps)
994  ++it;
995  mask_data.overlapped = (it != values.end() && (*it) == "*");
996  }
997  catch( ... ) {
998  return false;
999  }
1000 
1001  return true;
1002 }
1003 
1005 {
1006  //
1007  // This would be the place for any higher level checks of the mask data
1008  // collected from the record ...
1009  //
1010  return true;
1011 }
1012 
1013 
/// Creates a reader bound to the given input stream.
///
/// The stream is stored in m_Istr through the member initializer; the
/// constructor body does no other work.
/// NOTE(review): m_Istr appears to be a reference member, so the stream
/// presumably has to outlive this reader -- confirm against the header.
///
/// @param istr  Stream holding the RepeatMasker output to be read.
1014 CRmReader::CRmReader(CNcbiIstream& istr) : m_Istr(istr)
1015 {
1016 }
1017 
1019 {
1020  //
1021  // This is the point to make sure we are dealing with the right file type and
1022  // to allocate the specialist reader for any subtype (OUT, HTML) we encounter.
1023  // When this function returns the file pointer should be past the file header
1024  // and at the beginning of the actual mask data.
1025  //
1026  // Note:
1027  // If something goes wrong during header processing then the file pointer will
1028  // still be modified. It's the caller's job to restore the file pointer if this
1029  // is possible for this type of stream.
1030  //
1031 
1032  //
1033  // 2006-03-31: Only supported file type at this time: RepeatMasker OUT.
1034  //
1035  return new CRmReader(istr);
1036 }
1037 
1039 {
1040  delete reader;
1041 }
1042 
1044  TFlags flags, size_t errors)
1045 {
1046  annot->Reset();
1048  CMessageListenerWithLog error_container(DIAG_COMPILE_INFO);
1049  CRef<CSeq_annot> result(impl.ReadSeqAnnot(m_Istr, &error_container));
1050  annot->Assign(*result, eShallow);
1051 }
1052 
1053 
1054 END_objects_SCOPE
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
User-defined methods of the data storage class.
User-defined methods of the data storage class.
AutoPtr –.
Definition: ncbimisc.hpp:401
Definition: Dbtag.hpp:53
Default implementation of a Seq-id resolver, which knows about FASTA-formatted sequence identifiers.
CSeq_id_Handle ResolveSeqId(const string &id) const
Returns a normalized representation of a sequence identifier, as Seq-id handle.
Definition: rm_reader.cpp:93
CFeat_id –.
Definition: Feat_id.hpp:66
@Gb_qual.hpp User-defined methods of the data storage class.
Definition: Gb_qual.hpp:61
*** Import *********************************************** * * Features imported from other databases...
Definition: Imp_feat_.hpp:77
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
Definition: line_error.cpp:194
Default implementation for a generator of identifiers, as integers, marshalled as CFeat_id objects.
void ProcessError(CObjReaderLineException &, ILineErrorListener *)
virtual void xAddConversionInfo(CSeq_annot &, ILineErrorListener *)
Class acting as an interface to a RepeatMasker library.
Definition: rm_reader.hpp:421
TSpecificity2Taxid m_Specificity2TaxId
Definition: rm_reader.hpp:449
CConstIRef< ITaxonomyResolver > m_Taxonomy
Definition: rm_reader.hpp:447
bool TestSpecificityMatchesName(TRepeat::TTaxId taxid, const string &name) const
Check if a given taxid's scientific name matches the original specificity string.
Definition: rm_reader.cpp:376
string m_Release
Definition: rm_reader.hpp:450
void Read(CNcbiIstream &stream)
Reads a library from the RepeatMaskerLib.embl-style input.
Definition: rm_reader.cpp:270
bool Get(const string &name, TRepeat &dest) const
Gets information about a given repeat, specified by name.
Definition: rm_reader.cpp:262
Implements a concrete class for reading RepeatMasker output from tabular form and rendering it as ASN...
Definition: rm_reader.hpp:690
void SetSeqIdResolver(ISeqIdResolver &seqid_resolver)
Use specified delegate for Seq-id resolution.
Definition: rm_reader.cpp:756
virtual CRef< CSerialObject > ReadObject(CNcbiIstream &istr, ILineErrorListener *pErrors=nullptr)
Read an object from a given input stream, render it as the most appropriate Genbank object.
virtual bool VerifyData(const SRepeatRegion &mask_data)
Definition: rm_reader.cpp:1004
CRepeatMaskerReader(TFlags flags=fDefaults, CConstRef< TRepeatLibrary > lib=null, const ISeqIdResolver &seqid_resolver= *(CConstIRef< ISeqIdResolver >(new CFastaIdsResolver)), TIdGenerator &ids= *(CIRef< TIdGenerator >(new COrdinalFeatIdGenerator)))
Implement CReaderBase.
Definition: rm_reader.cpp:738
virtual bool ParseRecord(const string &record, SRepeatRegion &mask_data)
Definition: rm_reader.cpp:895
virtual bool IsHeaderLine(const string &line)
Definition: rm_reader.cpp:840
TConverter & SetConverter()
Delegate for conversion from IRepeatRegion to ASN.1.
Definition: rm_reader.cpp:761
TConverter m_ToFeat
Definition: rm_reader.hpp:740
virtual bool IsIgnoredLine(const string &line)
Definition: rm_reader.cpp:871
void ResetSeqIdResolver()
Use default Seq-id resolution.
Definition: rm_reader.cpp:751
virtual CRef< CSeq_annot > ReadSeqAnnot(CNcbiIstream &istr, ILineErrorListener *pErrors=nullptr)
Read an object from a given input stream, render it as a single Seq-annot.
CConstIRef< ISeqIdResolver > m_SeqIdResolver
Definition: rm_reader.hpp:739
virtual ~CRepeatMaskerReader(void)
Definition: rm_reader.cpp:747
Class which, given an input IRepeatRegion, can generate an appropriate and normalized NCBI ASN....
Definition: rm_reader.hpp:628
void ResetRepeatLibrary()
Clear out any repeat library which may be used to add additional attributes to repeats.
Definition: rm_reader.cpp:459
CRepeatToFeat(TFlags flags=fDefaults, CConstRef< TRepeatLibrary > lib=null, TIdGenerator &ids= *(CIRef< TIdGenerator >(new COrdinalFeatIdGenerator)))
Definition: rm_reader.cpp:450
void SetRepeatLibrary(const TRepeatLibrary &lib)
Set a repeat library which may be used to add additional attributes to repeats.
Definition: rm_reader.cpp:464
TIdMap m_IdMap
Definition: rm_reader.hpp:682
CRef< CSeq_feat > operator()(const IRepeatRegion &repeat)
Transforms the input repeat into a repeat feature.
Definition: rm_reader.cpp:485
TFlags m_Flags
Definition: rm_reader.hpp:679
void ResetIdGenerator()
Reset the Feature-id generator, do use a default implementation which will generate unique integer lo...
Definition: rm_reader.cpp:469
void AssertReferencesResolved()
Asserts that all forward/backward references between any objects visited have now been resolved.
Definition: rm_reader.cpp:479
CIRef< TIdGenerator > m_Ids
Definition: rm_reader.hpp:681
void SetIdGenerator(TIdGenerator &generator)
Set the Feature-id generator which will be used to assign unique feature IDs.
Definition: rm_reader.cpp:474
CConstRef< TRepeatLibrary > m_Library
Definition: rm_reader.hpp:680
Deprecated, old API for loading RepeatMasker output.
Definition: rm_reader.hpp:748
void Read(CRef< CSeq_annot > annot, TFlags flags=fDefaults, size_t errors=kMax_UInt)
Definition: rm_reader.cpp:1043
static CRmReader * OpenReader(CNcbiIstream &istr)
Definition: rm_reader.cpp:1018
CRmReader(CNcbiIstream &istr)
Definition: rm_reader.cpp:1014
CNcbiIstream & m_Istr
Definition: rm_reader.hpp:757
static void CloseReader(CRmReader *reader)
Definition: rm_reader.cpp:1038
void SetImp(TImp &v)
CSeqFeatXref –.
Definition: SeqFeatXref.hpp:66
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:54
virtual TSeqPos GetRptPosBegin() const =0
virtual TPercent GetPercIns() const =0
virtual TRptId GetRptId() const =0
string GetRptClassFamily() const
Convenience function to get the class and family as one value, the way that RepeatMasker emits them.
Definition: rm_reader.cpp:103
unsigned int TRptId
Definition: rm_reader.hpp:176
unsigned long TScore
Definition: rm_reader.hpp:177
virtual bool IsOverlapped() const =0
Flag that there is a higher-scoring match whose domain partly (<80%) includes the domain of this matc...
virtual TPercent GetPercDiv() const =0
virtual TScore GetSwScore() const =0
virtual TSeqPos GetRptPosEnd() const =0
virtual TSeqPos GetRptLeft() const =0
virtual TSeqPos GetSeqLeft() const =0
virtual TPercent GetPercDel() const =0
Interface defining a read-only RepeatMasker repeat feature.
Definition: rm_reader.hpp:230
virtual bool IsReverseStrand() const
Convenience function that gets the strand on the sequence, without dealing with a Seq-loc.
Definition: rm_reader.cpp:150
virtual string GetSeqIdString() const
Gets the sequence from the location of the repeat, without dealing with a Seq-loc.
Definition: rm_reader.cpp:135
virtual TSeqPos GetSeqPosBegin() const
Convenience function that gets the position start on the sequence, without dealing with a Seq-loc.
Definition: rm_reader.cpp:140
virtual TSeqPos GetSeqPosEnd() const
Convenience function that gets the position end on the sequence, without dealing with a Seq-loc.
Definition: rm_reader.cpp:145
virtual CConstRef< CSeq_loc > GetLocation(void) const =0
Gets the location of this repeat.
virtual string GetRptFamily() const =0
Gets repeat family, or empty string if not known.
virtual string GetRptClass() const =0
Gets repeat class, or empty string if not known.
virtual string GetRptName() const =0
Gets repeat name.
ITaxonomyResolver::TTaxId TTaxId
Definition: rm_reader.hpp:90
@ fIncludeRepeatId
Store original RepeatMasker repeat_id.
Definition: rm_reader.hpp:545
@ fIncludeRepeatSpecificity
Store the specificity from the RepeatMasker library, if provided.
Definition: rm_reader.hpp:550
@ fIncludeRepeatPos
Store the repeat position, that is, the interval on the repeat sequence.
Definition: rm_reader.hpp:541
@ fRemoveRedundancy
Removes redundant fields.
Definition: rm_reader.hpp:482
@ fAllowNonstandardQualifiers
Avoid user objects and instead, put selected information in non-standard and invalid GenBank qualifie...
Definition: rm_reader.hpp:497
@ fStandardizeNomenclature
Translate RepeatMasker output to INSDC standard nomenclature for repeats.
Definition: rm_reader.hpp:475
@ fIncludeCoreStatistics
Store core statistics, which include the scores of sw_score, perc_div, perc_del, perc_ins,...
Definition: rm_reader.hpp:511
@ fIncludeRepeatRepbaseId
Store the RepbaseID from the RepeatMasker library, if provided.
Definition: rm_reader.hpp:559
@ fIncludeRepeatLength
Store the repeat length as reported in the library.
Definition: rm_reader.hpp:554
@ fSetComment
Selected attributes beyond what is stored in GenBank standard qualifiers will be included as comments...
Definition: rm_reader.hpp:505
@ fIncludeExtraStatistics
Store extra statistics, which includes the length of the query (or query_left, equivalently),...
Definition: rm_reader.hpp:517
Interface for resolving a sequence identifier given a textual representation.
Structure implementing the IRepeatRegion API as a simple store of data members.
Definition: rm_reader.hpp:351
string rpt_family
Definition: rm_reader.hpp:409
string GetRptClass() const
Gets repeat class, or empty string if not known.
Definition: rm_reader.cpp:189
TPercent perc_del
Definition: rm_reader.hpp:403
TPercent GetPercDiv() const
Definition: rm_reader.cpp:222
TSeqPos query_left
Definition: rm_reader.hpp:401
TSeqPos rpt_pos_begin
Definition: rm_reader.hpp:410
TSeqPos rpt_pos_end
Definition: rm_reader.hpp:411
TPercent GetPercDel() const
Definition: rm_reader.cpp:227
string GetSeqIdString() const
Overridden version returns the original unparsed sequence identifier, if it was set (non-empty).
Definition: rm_reader.cpp:170
bool IsOverlapped() const
Flag that there is a higher-scoring match whose domain partly (<80%) includes the domain of this matc...
Definition: rm_reader.cpp:257
TPercent GetPercIns() const
Definition: rm_reader.cpp:232
string rpt_class
Definition: rm_reader.hpp:408
TRptId GetRptId() const
Definition: rm_reader.cpp:212
TSeqPos GetSeqLeft() const
Definition: rm_reader.cpp:252
string GetRptName() const
Gets repeat name.
Definition: rm_reader.cpp:179
string GetRptFamily() const
Gets repeat family, or empty string if not known.
Definition: rm_reader.cpp:184
TSeqPos GetRptLeft() const
Definition: rm_reader.cpp:247
string matching_repeat
Definition: rm_reader.hpp:407
TScore GetSwScore() const
Definition: rm_reader.cpp:217
TTaxId GetRptSpecificity() const
Returns 0, not known.
Definition: rm_reader.cpp:194
TSeqPos GetRptPosEnd() const
Definition: rm_reader.cpp:242
TSeqPos GetRptLength() const
Gets repeat length, or kInvalidSeqPos if not known.
Definition: rm_reader.cpp:198
string GetRptRepbaseId() const
Returns an empty string, not known.
Definition: rm_reader.cpp:208
TSeqPos GetRptPosBegin() const
Definition: rm_reader.cpp:237
string query_sequence
Definition: rm_reader.hpp:405
CRef< CSeq_loc > query_location
Definition: rm_reader.hpp:399
string GetRptSpecificityName() const
Returns an empty string, not known.
Definition: rm_reader.cpp:204
TSeqPos rpt_left
Definition: rm_reader.hpp:412
TScore sw_score
Definition: rm_reader.hpp:400
CConstRef< CSeq_loc > GetLocation(void) const
Gets the location of this repeat.
Definition: rm_reader.cpp:155
TPercent perc_div
Definition: rm_reader.hpp:402
TPercent perc_ins
Definition: rm_reader.hpp:404
CConstRef< CFeat_id > GetId() const
Gets the more general feature ID for this repeat, which identifies a single repeat,...
Definition: rm_reader.cpp:160
Implementation of IRepeat backed by a simple structure.
Definition: rm_reader.hpp:122
string m_RptName
Definition: rm_reader.hpp:136
string GetRptRepbaseId() const
Gets the RepbaseID, or empty string if not known.
Definition: rm_reader.hpp:132
string m_RptSpecificityName
Definition: rm_reader.hpp:141
string m_RptClass
Definition: rm_reader.hpp:138
TTaxId m_RptSpecificity
Definition: rm_reader.hpp:140
TSeqPos m_RptLength
Definition: rm_reader.hpp:139
string GetRptSpecificityName() const
Gets specificity as a name, or empty string if not known.
Definition: rm_reader.hpp:131
string m_RptFamily
Definition: rm_reader.hpp:137
TSeqPos GetRptLength() const
Gets repeat length, or kInvalidSeqPos if not known.
Definition: rm_reader.hpp:129
string m_RptRepbaseId
Definition: rm_reader.hpp:142
TTaxId GetRptSpecificity() const
Gets specificity as a taxonomy ID, or 0 if not known.
Definition: rm_reader.hpp:130
container_type::const_iterator const_iterator
Definition: map.hpp:53
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
#define T(s)
Definition: common.h:230
static const char location[]
Definition: config.c:97
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
string
Definition: cgiapp.hpp:687
#define DIAG_COMPILE_INFO
Make compile time diagnostic information object to use in CNcbiDiag and CException.
Definition: ncbidiag.hpp:170
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
@ eShallow
Assign/Compare pointers only.
Definition: serialdef.hpp:193
virtual Uint8 GetLineNumber(void) const =0
Returns the current line number (counting from 1, not 0).
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
CConstRef< CSeq_id > GetSeqIdOrNull(void) const
static SIZE_TYPE ParseFastaIds(CBioseq::TId &ids, const CTempString &s, bool allow_partial_failure=false)
Parse an entire set of |-delimited FASTA-style IDs, appending the results to IDS.
Definition: Seq_id.cpp:2603
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
static int Score(const CRef< CSeq_id > &id)
Wrappers for use with FindBestChoice from <corelib/ncbiutil.hpp>
Definition: Seq_id.hpp:772
bool IsReverseStrand(void) const
Return true if all ranges have reverse strand.
Definition: Seq_loc.hpp:995
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define kEmptyStr
Definition: ncbistr.hpp:123
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static bool MatchesMask(CTempString str, CTempString mask, ECase use_case=eCase)
Match "str" against the "mask".
Definition: ncbistr.cpp:389
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
Definition: ncbistr.cpp:1387
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static SIZE_TYPE FindCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case sensitive search.
Definition: ncbistr.hpp:5490
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3554
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:642
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3186
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:250
bool IsSetData(void) const
The object itself. Check if a value has been assigned to Data data member.
void SetType(TType &value)
Assign a value to Type data member.
TXref & SetXref(void)
Assign a value to Xref data member.
Definition: Seq_feat_.hpp:1314
TDbxref & SetDbxref(void)
Assign a value to Dbxref data member.
Definition: Seq_feat_.hpp:1339
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetComment(const TComment &value)
Assign a value to Comment data member.
Definition: Seq_feat_.hpp:1058
TExts & SetExts(void)
Assign a value to Exts data member.
Definition: Seq_feat_.hpp:1483
void ResetExts(void)
Reset Exts data member.
Definition: Seq_feat_.cpp:206
void SetId(TId &value)
Assign a value to Id data member.
Definition: Seq_feat_.cpp:73
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
vector< CRef< CGb_qual > > TQual
Definition: Seq_feat_.hpp:117
TQual & SetQual(void)
Assign a value to Qual data member.
Definition: Seq_feat_.hpp:1153
void ResetQual(void)
Reset Qual data member.
Definition: Seq_feat_.cpp:136
void SetKey(const TKey &value)
Assign a value to Key data member.
Definition: Imp_feat_.hpp:268
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
virtual void Reset(void)
Reset the whole object.
Definition: Seq_annot_.cpp:249
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
Definition of all error codes used in objtools libraries.
int i
Lightweight interface for getting lines of data with minimal memory copying.
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
const struct ncbi::grid::netcache::search::fields::SIZE size
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
const char * tag
Defines NCBI C++ exception handling.
Multi-threading – classes, functions, and features.
Useful/utility classes and methods.
bool eq(T x_, T y_, T round_)
Definition: njn_approx.hpp:79
static bool s_StandardizeNomenclature(const IRepeatRegion &repeat, CSeq_feat::TQual &qual_list)
Translate RepeatMasker output to INSDC standard nomenclature for repeats.
Definition: rm_reader.cpp:401
static void StripParens(string &s)
Definition: rm_reader.cpp:881
static void s_SetQual(CSeq_feat::TQual &qual_list, const string &qual, const T val)
Definition: rm_reader.cpp:383
Definition: inftrees.h:24
else result
Definition: token2.c:20
#define ftable
Definition: utilfeat.h:37
const value_slice::CValueConvert< value_slice::SRunTimeCP, FROM > Convert(const FROM &value)
Modified on Wed Apr 24 14:19:30 2024 by modify_doxy.py rev. 669887