1 /* $Id: rm_reader.cpp 94568 2021-08-17 14:20:37Z stakhovv $
2  * ===========================================================================
3  *
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig, Wratko Hlavina
27  *
28  * File Description:
29  * Repeat Masker file reader
30  *
31  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbithr.hpp>
36 #include <corelib/ncbiutil.hpp>
37 #include <corelib/ncbiexpt.hpp>
39 #include <util/line_reader.hpp>
40 #include <util/value_convert.hpp>
41 #include <util/static_map.hpp>
43 #include <serial/iterator.hpp>
44 #include <serial/objistrasn.hpp>
83 #include <objtools/error_codes.hpp>
85 #include <algorithm>
88 #define NCBI_USE_ERRCODE_X Objtools_Rd_RepMask
91 BEGIN_objects_SCOPE
94 {
96  CBioseq::TId ids;
97  CSeq_id::ParseFastaIds(ids, id);
99  if (best) result = CSeq_id_Handle::GetHandle(*best);
100  return result;
101 }
104 {
105  string family(GetRptFamily());
106  if (family.empty()) {
107  return GetRptClass();
108  } else {
109  return GetRptClass() + '/' + family;
110  }
111 }
113 /*
114 IRepeatRegion::TTaxId IRepeatRegion::GetRptSpecificity() const
115 {
116  return 0;
117 }
119 string IRepeatRegion::GetRptSpecificityName() const
120 {
121  return kEmptyStr;
122 }
123 */
126 {
128  if (location) {
129  result.Assign(*location);
130  } else {
131  result.Reset();
132  }
133 }
136 {
137  return GetLocation()->GetId()->AsFastaString();
138 }
141 {
143 }
146 {
147  return GetLocation()->GetStop(eExtreme_Positional) + 1;
148 }
151 {
152  return GetLocation()->IsReverseStrand();
153 }
156 {
157  return query_location;
158 }
161 {
163  result->SetLocal().SetId(GetRptId());
164  return result;
165 }
167 /// Overridden version returns the original unparsed
168 /// sequence identifier, if it was set (non-empty).
169 ///
171 {
172  if (! query_sequence.empty()) return query_sequence;
173  return TParent::GetSeqIdString();
174 }
176 // Implement IRepeatRegion interface. If it weren't for virtual
177 // methods, all of the following could be inlined.
180 {
181  return matching_repeat;
182 }
185 {
186  return rpt_family;
187 }
190 {
191  return rpt_class;
192 }
195  return 0;
196 }
199  if (GetRptPosEnd() == kInvalidSeqPos ||
201  return GetRptPosEnd() + GetRptLeft();
202 }
205  return kEmptyStr;
206 }
209  return kEmptyStr;
210 }
213 {
214  return rpt_id;
215 }
218 {
219  return sw_score;
220 }
223 {
224  return perc_div;
225 }
228 {
229  return perc_del;
230 }
233 {
234  return perc_ins;
235 }
238 {
239  return rpt_pos_begin;
240 }
243 {
244  return rpt_pos_end;
245 }
248 {
249  return rpt_left;
250 }
253 {
254  return query_left;
255 }
258 {
259  return overlapped;
260 }
262 bool CRepeatLibrary::Get(const string& name, TRepeat& dest) const
263 {
264  TMap::const_iterator it(m_Map.find(name));
265  if (it == m_Map.end()) return false;
266  dest = it->second;
267  return true;
268 }
271 {
272  TRepeat repeat;
273  string line;
274  vector<string> tokens;
276  while (! stream.eof()) {
277  NcbiGetlineEOL(stream, line);
278  if (NStr::StartsWith(line, "//")) {
279  // Perl equivalent from
280  // # Repeats with undefined specificity can be skipped because RepeatMasker
281  // # only uses repeats that have a "Species" set in searches. The
282  // # specificity will be undefined when the "Species:" field is empty.
283  // if ( ($class eq "Simple_repeat" || $class eq "Low_complexity")
284  // && $specificity eq "universal" && $family eq "") {
285  // $length = ""; # length of database sequence is arbitrary
286  // }
287  if ((repeat.m_RptClass == "Simple_repeat" ||
288  repeat.m_RptClass == "Low_complexity") &&
289  repeat.m_RptSpecificityName == "universal" &&
290  repeat.m_RptFamily == "") repeat.m_RptLength = kInvalidSeqPos;
291  m_Map[repeat.m_RptName] = repeat;
292  continue;
293  }
295  // As per EMBL Release 3.4:
296  //
297  // Each line begins with a two-character line type code.
298  // This code is always followed by three blanks, so that the
299  // actual information in each line begins in character position 6.
300  if (line.length() < 6 || line.substr(2, 3) != " ") continue;
301  string code(line.substr(0, 2));
302  string value(line.substr(5));
305  if (code == "ID") {
306  // NOTE: Violates specs as per EMBL Release 3.4.1.
307  // There should be 7 fields.
308  //
309  // Perl equivalent from
310  // if (m/^ID\s/) {
311  // die "Multiple ID lines found in one record.\nLine $.: $_\n" if defined $name or defined $length;
312  // ($name, $length) = m/^ID\s+(\S+).*\s([1-9][0-9]*) BP\.$/;
313  // die "Failed to extract a repeat name and length from line:\nLine $.: $_\n"
314  // unless defined $name and $name and defined $length and $length;
315  // }
316  repeat.m_RptName = value.substr(0, value.find_first_of(" ;"));
317  string bp(value.substr(value.rfind(';') + 1));
319  repeat.m_RptLength = Convert(bp.substr(0, bp.find(' ')));
320  } else if (code == "DE") {
321  // DE RepbaseID: ACROBAT1
322  if (NStr::StartsWith(value, "RepbaseID:")) {
323  repeat.m_RptRepbaseId = NStr::TruncateSpaces(value.substr(10));
324  }
325  } else if (code == "CC") {
326  if (NStr::MatchesMask(value, "RELEASE *;*")) {
327  m_Release = value.substr(8, value.find(';') - 8);
328  } else if (NStr::StartsWith(value, "Type:")) {
329  // Perl equivalent from
330  // if (m/^CC +Type:\s*((.*\S)*)\s*$/) {
331  // die "Multiple Type lines found in one record.\nLine $.: $_\n" if defined $class;
332  // $class = $1;
333  // die "Failed to extract a repeat class from line:\nLine $.: $_\n" unless defined $class and $class;
334  // }
335  repeat.m_RptClass = NStr::TruncateSpaces(value.substr(5));
336  if (repeat.m_RptClass.empty()) {
337  repeat.m_RptClass = "Unknown";
338  }
339  } else if (NStr::StartsWith(value, "SubType:")) {
340  // Perl equivalent from
341  // if (m/^CC +SubType:\s*((.*\S)*)\s*$/) {
342  // die "Multiple SubType lines found in one record.\nLine $.: $_\n" if defined $family;
343  // $family = $1 || ""; # NULL indicates unknown
344  // }
345  repeat.m_RptFamily = NStr::TruncateSpaces(value.substr(8));
346  } else if (NStr::StartsWith(value, "Species:")) {
347  // Perl equivalent from
348  // if (m/^CC +Species:\s*((.*\S)*)\s*$/) {
349  // die "Multiple Species lines found in one record.\nLine $.: $_\n" if defined $specificity;
350  // $specificity = $1;
351  // $specificity =~ s/_/ /g;
352  // $specificity = "universal" if $specificity eq "root";
353  // }
354  repeat.m_RptSpecificityName = NStr::TruncateSpaces(value.substr(8));
355  if (m_Taxonomy && repeat.m_RptSpecificityName.size()) {
356  pair<TSpecificity2Taxid::iterator, bool> i_specificity =
358  if (i_specificity.second) {
359  i_specificity.first->second = m_Taxonomy->GetTaxId(repeat.m_RptSpecificityName);
360  if (! i_specificity.first->second) {
362  << "RepeatMasker library species failed lookup to taxonomy: "
363  << repeat.m_RptSpecificityName);
364  }
365  }
366  repeat.m_RptSpecificity = i_specificity.first->second;
367  }
368  }
369  }
370  }
372  // Don't need specificity to taxonomy lookups anymore.
374  }
377  const string& name) const
378 {
379  return m_Taxonomy && m_Taxonomy->GetName(taxid) == name;
380 }
382 template <typename T>
383 static void s_SetQual(CSeq_feat::TQual& qual_list,
384  const string& qual, const T val)
385 {
387  result->SetQual(qual);
388  string s = Convert(val).operator string();
389  result->SetVal(s);
390  qual_list.push_back(result);
391 }
393 /// Translate RepeatMasker output to INSDC standard
394 /// nomenclature for repeats. This includes remapping repeat
395 /// family to satellite and mobile element qualifiers, as
396 /// appropriate.
397 ///
398 /// Available INSDC qualifiers are:
399 /// rpt_family, rpt_type, rpt_unit_seq, satellite, standard_name
400 ///
401 static bool s_StandardizeNomenclature(const IRepeatRegion& repeat,
402  CSeq_feat::TQual& qual_list)
403 {
404  string val;
406  string klass = repeat.GetRptClass();
407  string family = repeat.GetRptFamily();
409  if (NStr::EqualNocase(klass, "Satellite")) {
410  val = "satellite:";
411  if (! family.empty()) val += family;
412  val += ' ';
413  val += repeat.GetRptName();
414  s_SetQual(qual_list, "satellite", val);
415  if (! family.empty()) s_SetQual(qual_list, "rpt_family", family);
416  return true;
417  }
419  if (NStr::EqualNocase(klass, "Simple_repeat")) {
420  // Simple_repeat is the family in ASN.1, not the class, based on
421  // evidence of prior submissions to GenBank. For example:
422  // GI:45269107, although this is weak evidence (stuffing
423  // RepeatMasker into Genbank qualifiers without much
424  // effort at standardization).
425  //
426  // Do not expect Simple_repeat/xxx.
427  s_SetQual(qual_list, "rpt_family", klass);
428  s_SetQual(qual_list, "rpt_unit", repeat.GetRptName());
429  return true;
430  }
432  if (NStr::EqualNocase(klass, "SINE") ||
433  NStr::EqualNocase(klass, "LINE") ||
434  NStr::EqualNocase(klass, "LTR")) {
435  // Other valid INSDC mobile elements:
436  // "transposon", "retrotransposon", "integron",
437  // "insertion sequence", "non-LTR retrotransposon",
438  // "MITE", "other"
439  val = klass;
440  val += ':';
441  val += repeat.GetRptName();
442  s_SetQual(qual_list, "mobile_element", val);
443  if (! family.empty()) s_SetQual(qual_list, "rpt_family", family);
444  return true;
445  }
447  return false;
448 }
452  TIdGenerator& ids)
453  : m_Flags(flags)
454  , m_Library(lib)
455  , m_Ids(&ids)
456 {
457 }
460 {
461  m_Library.Reset();
462 }
465 {
466  m_Library.Reset(&lib);
467 }
470 {
471  m_Ids.Reset(new COrdinalFeatIdGenerator);
472 }
475 {
476  m_Ids.Reset(&generator);
477 }
480 {
481  // We can forget old IDs once references have been resolved.
482  m_IdMap.clear();
483 }
486 {
487  CRef<CSeq_feat> feat(new CSeq_feat);
489  // data:
490  CSeqFeatData& sfdata = feat->SetData();
491  CImp_feat_Base& imp = sfdata.SetImp();
492  imp.SetKey("repeat_region");
494  CRef<CFeat_id> id(m_Ids->GenerateId());
495  feat->SetId(*id);
496  TIdMap::iterator id_it(m_IdMap.find(repeat.GetRptId()));
497  if (id_it == m_IdMap.end()) {
498  m_IdMap[repeat.GetRptId()] = id;
499  } else {
501  ref->SetId().Assign(*id_it->second);
502  feat->SetXref().push_back(ref);
503  }
505  // location:
506  repeat.GetLocation(feat->SetLocation());
508  // qualifiers & ext's.
509  if (m_Flags) {
510  // Record if attributes were modified to conform with INSDC standards.
511  bool standardized(false);
514  if (m_Library) m_Library->Get(repeat.GetRptName(), extra);
516  CSeq_feat::TQual& qual_list = feat->SetQual();
520  standardized = s_StandardizeNomenclature(repeat, qual_list);
521  }
523  if (! standardized) {
524  // Did not succeed in standardizing nomenclature
525  // from RepeatMasker to INSDC standards. Fall back to
526  // storing the class/family verbatim.
527  s_SetQual(qual_list, "rpt_family", repeat.GetRptClassFamily());
528  }
529  }
531  if (m_Flags & fIncludeRepeatName && ! standardized) {
532  s_SetQual(qual_list, "standard_name", repeat.GetRptName());
533  }
535  if (m_Flags & fIncludeRepeatPos) {
536  s_SetQual(qual_list, "rpt_unit_range",
538  ".." + NStr::IntToString(repeat.GetRptPosEnd()));
539  }
541  // Get specificity and check it for redundancy (taxid vs name).
542  bool include_specificity_name(false);
544  const IRepeat::TTaxId specificity(extra.GetRptSpecificity());
545  const string specificity_name(extra.GetRptSpecificityName());
546  include_specificity_name = ! specificity_name.empty();
547  if (specificity) {
548  CRef<CDbtag> tag(new CDbtag);
549  // eDbtagType_taxon except the enum is almost useless,
550  // being available to only one function in the Dbtag API.
551  tag->SetDb("taxon");
552  tag->SetTag().SetId(specificity);
553  feat->SetDbxref().push_back(tag);
554  if ((m_Flags & fRemoveRedundancy) && m_Library &&
555  m_Library->TestSpecificityMatchesName(
556  specificity,
557  specificity_name)) {
558  // Name matches taxonomy exactly, so don't store both.
559  include_specificity_name=false;
560  }
561  }
562  }
564  // Get repeat length and check it for redundancy with rpt_left.
565  TSeqPos rpt_length(extra.GetRptLength());
566  if (rpt_length == kInvalidSeqPos) {
567  rpt_length = repeat.GetRptPosEnd() +
568  repeat.GetRptLeft();
569  }
570  bool include_rpt_left(m_Flags & fIncludeCoreStatistics);
571  if ((m_Flags & fRemoveRedundancy) &&
573  (rpt_length == repeat.GetRptPosEnd() +
574  repeat.GetRptLeft())) {
575  // Do not store rpt_left if we know the repeat length,
576  // rpt_left matches it (so it's redundant), and we
577  // want to remove redundancy.
578  include_rpt_left = false;
579  }
581  // Store anything beyond what is possible in INSDC-approved
582  // qualifiers using either non-standard qualifiers or user objects.
583  // There are two options.
586  // Option 1: Use Genbank qualifiers beyond the INSDC-approved set.
589  s_SetQual(qual_list, "sw_score", repeat.GetSwScore());
590  s_SetQual(qual_list, "perc_div", repeat.GetPercDiv());
591  s_SetQual(qual_list, "perc_del", repeat.GetPercDel());
592  s_SetQual(qual_list, "perc_ins", repeat.GetPercIns());
593  if (include_rpt_left) {
594  s_SetQual(qual_list, "rpt_left", repeat.GetRptLeft());
595  }
596  }
599  if (! (m_Flags & fRemoveRedundancy)) {
600  // Query length is always redundant, since sequences
601  // have a bioseq length, and we know the location.
602  s_SetQual(qual_list, "query_length",
603  repeat.GetSeqPosEnd() + repeat.GetSeqLeft());
604  }
605  if (repeat.IsOverlapped()) {
606  s_SetQual(qual_list, "overlapped", true);
607  }
608  }
610  if (m_Flags & fIncludeRepeatId) {
611  s_SetQual(qual_list, "rpt_id", repeat.GetRptId());
612  }
615  s_SetQual(qual_list, "rpt_length", rpt_length);
616  }
618  if (include_specificity_name) {
619  s_SetQual(qual_list, "specificity",
620  extra.GetRptSpecificityName());
621  }
623  } else {
624  // Option 2: Use user objects.
627  feat->SetExts().push_back(uo);
628  uo->SetType().SetStr("RepeatMasker");
631  uo->AddField("sw_score", static_cast<double>(repeat.GetSwScore()));
632  uo->AddField("perc_div", repeat.GetPercDiv());
633  uo->AddField("perc_del", repeat.GetPercDel());
634  uo->AddField("perc_ins", repeat.GetPercIns());
635  if (include_rpt_left) {
636  uo->AddField("rpt_left", static_cast<int>(repeat.GetRptLeft()));
637  }
638  }
641  if (! (m_Flags & fRemoveRedundancy)) {
642  // Query length is always redundant, since sequences
643  // have a bioseq length, and we know the location.
644  uo->AddField("query_length", static_cast<int>(
645  repeat.GetSeqPosEnd() + repeat.GetSeqLeft()));
646  }
647  if (repeat.IsOverlapped()) {
648  uo->AddField("overlapped", true);
649  }
650  }
652  if (m_Flags & fIncludeRepeatId) {
653  uo->AddField("rpt_id", static_cast<int>(repeat.GetRptId()));
654  }
657  uo->AddField("rpt_length", static_cast<int>(rpt_length));
658  }
660  if (include_specificity_name) {
661  uo->AddField("specificity", extra.GetRptSpecificityName());
662  }
664  // Clear out storage of empty user objects.
665  if (! uo->IsSetData()) feat->ResetExts();
666  }
668  // Clear out storage of empty Genbank qualifier lists.
669  if (qual_list.empty()) feat->ResetQual();
672  ! extra.GetRptRepbaseId().empty()) {
673  CRef<CDbtag> tag(new CDbtag);
674  tag->SetDb("REPBASE");
675  tag->SetTag().SetStr(extra.GetRptRepbaseId());
676  feat->SetDbxref().push_back(tag);
677  }
679  if (m_Flags & fSetComment) {
680  // Redundantly, store comments with original information.
681  // The comment tries to stay close to RepeatMasker native
682  // nomenclature. For example, query_left is reported,
683  // rather than the normalized query_length as stored
684  // in user objects or Genbank qualifiers. To accommodate
685  // the possibility the annotation is remapped, the original
686  // query identifier is preserved.
688  CNcbiOstrstream comment;
689  const char eq('='), sep(' ');
691  comment << "source=RepeatMasker";
692  if (m_Flags & fIncludeRepeatName) {
693  comment << sep
694  << "rpt_name" << eq << repeat.GetRptName();
695  }
697  comment << sep
698  << "sw_score" << eq << repeat.GetSwScore() << sep
699  << "perc_div" << eq << repeat.GetPercDiv() << sep
700  << "perc_del" << eq << repeat.GetPercDel() << sep
701  << "perc_ins" << eq << repeat.GetPercIns() << sep
702  << "rpt_left" << eq << repeat.GetRptLeft();
703  }
705  comment << sep
706  << "query" << eq << repeat.GetSeqIdString() << sep
707  << "query_range" << eq;
708  bool reverse(repeat.IsReverseStrand());
709  if (reverse) comment << "complement(";
710  comment << repeat.GetSeqPosBegin()
711  << ".." << repeat.GetSeqPosEnd();
712  if (reverse) comment << ")";
713  comment << sep
714  << "query_left" << eq << repeat.GetSeqLeft();
715  }
716  if (m_Flags & fIncludeRepeatId) {
717  comment << sep
718  << "ID" << eq << repeat.GetRptId();
719  }
720  if (m_Flags & fIncludeExtraStatistics && repeat.IsOverlapped()) {
721  comment << " *";
722  }
723  if (! extra.GetRptSpecificityName().empty()) {
724  comment << sep
725  << "specificity" << eq << extra.GetRptSpecificityName();
726  }
727  if (extra.GetRptLength() != kInvalidSeqPos) {
728  comment << sep
729  << "rpt_length" << eq << extra.GetRptLength();
730  }
731  feat->SetComment(CNcbiOstrstreamToString(comment));
732  }
733  }
735  return feat;
736 }
740  const ISeqIdResolver& seqid_resolver,
741  TIdGenerator& ids)
742  : m_SeqIdResolver(&seqid_resolver)
743  , m_ToFeat(flags, lib, ids)
744 {
745 }
748 {
749 }
752 {
754 }
757 {
758  m_SeqIdResolver.Reset(&seqid_resolver);
759 }
762 {
763  return m_ToFeat;
764 }
768 {
769  CRef<CSerialObject> object(
770  ReadSeqAnnot(lr, pMessageListener).ReleaseOrNull());
771  return object;
772 }
776 {
777  CRef<CSeq_annot> annot(new CSeq_annot);
778  // CRef<CAnnot_descr> desc(new CAnnot_descr);
779  // annot->SetDesc(*desc);
780  CSeq_annot::C_Data::TFtable& ftable = annot->SetData().SetFtable();
782  string line;
783  size_t record_counter = 0;
785  while ( ! lr.AtEOF() ) {
786  line = *++lr;
788  if ( IsHeaderLine( line ) || IsIgnoredLine( line ) ) {
789  continue;
790  }
791  ++record_counter;
792  //if (record_counter == 91555) {
793  // cerr << "";
794  //}
796  SRepeatRegion mask_data;
797  if ( ! ParseRecord( line, mask_data ) ) {
800  eDiag_Error,
801  lr.GetLineNumber(),
802  "RepeatMasker Reader: Parse error in record = " + line) );
803  ProcessError(*pErr, pMessageListener);
804  continue;
805  }
807  if ( ! VerifyData( mask_data ) ) {
810  eDiag_Error,
811  lr.GetLineNumber(),
812  "RepeatMasker Reader: Verification error in record = " + line) );
813  ProcessError(*pErr, pMessageListener);
814  continue;
815  }
817  CRef<CSeq_feat> feat(m_ToFeat(mask_data));
818  if ( ! feat ) {
821  eDiag_Error,
822  lr.GetLineNumber(),
823  "RepeatMasker Reader: Aborting file import, "
824  "unable to create feature table for record = " + line) );
825  ProcessError(*pErr, pMessageListener);
826  // we don't tolerate even a few errors here!
827  break;
828  }
830  ftable.push_back(feat);
831  }
832  // if (! record_counter) annot.Reset();
833  if (annot) {
834  xAddConversionInfo(*annot, pMessageListener);
835  }
836  return annot;
837 }
840 bool CRepeatMaskerReader::IsHeaderLine(const string& line)
841 {
842  string labels_1st_line[] = { "perc", "query", "position", "matching", "" };
843  string labels_2nd_line[] = { "score", "div.", "del.", "ins.", "sequence", "" };
845  // try to identify 1st line of column labels:
846  size_t current_offset = 0;
847  size_t i = 0;
848  for ( ; labels_1st_line[i] != ""; ++i ) {
849  current_offset = NStr::FindCase( line, labels_1st_line[i], current_offset );
850  if ( NPOS == current_offset ) {
851  break;
852  }
853  }
854  if ( labels_1st_line[i] == "" ) {
855  return true;
856  }
858  // try to identify 2nd line of column labels:
859  current_offset = 0;
860  i = 0;
861  for ( ; labels_2nd_line[i] != ""; ++i ) {
862  current_offset = NStr::FindCase( line, labels_2nd_line[i], current_offset );
863  if ( NPOS == current_offset ) {
864  return false;
865  }
866  }
867  return true;
868 }
871 bool CRepeatMaskerReader::IsIgnoredLine(const string& line)
872 {
873  if ( NStr::StartsWith(line, "There were no repetitive sequences detected in "))
874  return true;
875  if ( NStr::FindCase(line, "only contains ambiguous bases") != NPOS)
876  return true;
877  return ( NStr::TruncateSpaces( line ).length() == 0 );
878 }
/// Remove a leading '(' and, if present, a trailing ')' from a string.
///
/// Strings that do not begin with '(' (including the empty string) are
/// left untouched. A leading '(' is removed even when there is no
/// matching ')' at the end.
///
/// @param s  String modified in place.
static void StripParens(std::string& s)
{
    if (s.empty() || s[0] != '(') {
        return;
    }

    std::string::size_type keep_end = s.size();
    if (s[keep_end - 1] == ')') {
        --keep_end;
    }
    s.erase(keep_end);  // drops the closing ')' when there is one
    s.erase(0, 1);      // drops the opening '('
}
895 bool CRepeatMaskerReader::ParseRecord(const string& record, SRepeatRegion& mask_data)
896 {
897  const size_t MIN_VALUE_COUNT = 15;
899  string line = NStr::TruncateSpaces( record );
900  list< string > values;
901  if ( NStr::Split( line, " \t", values, NStr::fSplit_Tokenize ).size() < MIN_VALUE_COUNT ) {
902  return false;
903  }
905  try {
906  // 1: "SW score"
907  list<string>::iterator it = values.begin();
908  mask_data.sw_score = NStr::StringToUInt( *it );
910  // 2: "perc div."
911  ++it;
912  mask_data.perc_div = NStr::StringToDouble( *it );
914  // 3: "perc del."
915  ++it;
916  mask_data.perc_del = NStr::StringToDouble( *it );
918  // 4: "perc ins."
919  ++it;
920  mask_data.perc_ins = NStr::StringToDouble( *it );
922  // 5: "query sequence"
923  ++it;
924  mask_data.query_sequence = *it;
925  CSeq_id_Handle idh(m_SeqIdResolver->ResolveSeqId(mask_data.query_sequence));
927  if (! id) return false;
928  mask_data.query_location.Reset(new CSeq_loc);
930  location.SetId().Assign(*id);
932  // 6: "position begin"
933  ++it;
934  TSeqPos pos_begin = NStr::StringToUInt(*it);
935  if (pos_begin == 0) return false;
936  location.SetFrom(pos_begin - 1);
938  // 7: "in end"
939  ++it;
940  TSeqPos pos_end = NStr::StringToUInt(*it);
941  if (pos_end == 0 || pos_end < pos_begin) return false;
942  location.SetTo(pos_end - 1);
944  // 8: "query (left)"
945  ++it;
946  StripParens(*it);
947  mask_data.query_left = NStr::StringToUInt( *it );
949  // 9: "" (meaning "strand")
950  ++it;
951  // Having the strand, we now have all fields to populate the location.
952  location.SetStrand(*it == "C" ? eNa_strand_minus : eNa_strand_plus);
954  // 10: "matching repeat"
955  ++it;
956  mask_data.matching_repeat = *it;
958  // 11: "repeat class/family"
959  ++it;
960  string class_family = *it;
961  NStr::SplitInTwo(class_family, "/",
962  mask_data.rpt_class, mask_data.rpt_family);
964  // 12: "position in"
965  ++it;
966  string field12 = *it;
968  // 13: "in end"
969  ++it;
970  mask_data.rpt_pos_end = NStr::StringToUInt( *it );
972  // 14: "repeat left"
973  ++it;
974  string field14 = *it;
976  // fields position 12 and 14 flip depending on the strand value.
977  string rpt_left;
978  if (mask_data.IsReverseStrand()) {
979  mask_data.rpt_pos_begin = NStr::StringToInt( field14 );
980  rpt_left = field12;
981  } else {
982  mask_data.rpt_pos_begin = NStr::StringToInt( field12 );
983  rpt_left = field14;
984  }
986  StripParens(rpt_left);
987  mask_data.rpt_left = NStr::StringToInt(rpt_left);
989  // 15: "ID"
990  ++it;
991  mask_data.rpt_id = NStr::StringToUInt(*it);
993  // 16: overlapped (higher score repeat overlaps)
994  ++it;
995  mask_data.overlapped = (it != values.end() && (*it) == "*");
996  }
997  catch( ... ) {
998  return false;
999  }
1001  return true;
1002 }
1005 {
1006  //
1007  // This would be the place for any higher level checks of the mask data
1008  // collected from the record ...
1009  //
1010  return true;
1011 }
1014 CRmReader::CRmReader(CNcbiIstream& istr) : m_Istr(istr)
1015 {
1016 }
1019 {
1020  //
1021  // This is the point to make sure we are dealing with the right file type and
1022  // to allocate the specialist reader for any subtype (OUT, HTML) we encounter.
1023  // When this function returns the file pointer should be past the file header
1024  // and at the beginning of the actual mask data.
1025  //
1026  // Note:
1027  // If something goes wrong during header processing then the file pointer will
1028  // still be modified. It's the caller's job to restore the file pointer if this
1029  // is possible for this type of stream.
1030  //
1032  //
1033  // 2006-03-31: Only supported file type at this time: RepeatMasker OUT.
1034  //
1035  return new CRmReader(istr);
1036 }
1039 {
1040  delete reader;
1041 }
1044  TFlags flags, size_t errors)
1045 {
1046  annot->Reset();
1048  CMessageListenerWithLog error_container(DIAG_COMPILE_INFO);
1049  CRef<CSeq_annot> result(impl.ReadSeqAnnot(m_Istr, &error_container));
1050  annot->Assign(*result, eShallow);
1051 }
1054 END_objects_SCOPE
