NCBI C++ ToolKit
discrepancy_context.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: discrepancy_context.cpp 99033 2023-02-06 18:11:31Z foleyjp $
2  * =========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * =========================================================================
25  *
26  * Authors: Sema Kachalo
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 #include "discrepancy_core.hpp"
32 #include "utils.hpp"
33 #include <sstream>
42 #include <objects/seq/Seq_ext.hpp>
48 #include <objmgr/seqdesc_ci.hpp>
49 #include <objmgr/seq_vector.hpp>
50 #include <util/xregexp/regexp.hpp>
52 
56 
57 
58 const CSeqSummary& CDiscrepancyContext::CurrentBioseqSummary() const
59 {
60  return *m_CurrentNode->m_BioseqSummary;
61 }
62 
63 
64 void CDiscrepancyContext::SetSuspectRules(const string& name, bool read)
65 {
66  if (!m_ProductRules || m_SuspectRules != name) {
67  m_SuspectRules = name;
68  if (read) {
70  }
71  }
72 }
73 
74 
76 {
77  if (!m_ProductRules) {
79  }
80  return m_ProductRules;
81 }
82 
83 
85 {
88  }
90 }
91 
92 
94 {
95  if (NStr::Equal(taxname, "uncultured organism") ||
96  NStr::Equal(taxname, "uncultured microorganism") ||
97  NStr::Equal(taxname, "uncultured bacterium") ||
98  NStr::Equal(taxname, "uncultured archaeon")) {
99  return true;
100  } else {
101  return false;
102  }
103 
104 }
105 
106 
107 bool CDiscrepancyContext::HasLineage(const CBioSource& biosrc, const string& def_lineage, const string& type)
108 {
109  return NStr::FindNoCase(def_lineage, type) != NPOS || (def_lineage.empty() && biosrc.IsSetLineage() && NStr::FindNoCase(biosrc.GetLineage(), type) != NPOS);
110 }
111 
112 
113 bool CDiscrepancyContext::HasLineage(const CBioSource* biosrc, const string& lineage) const
114 {
115  return biosrc ? HasLineage(*biosrc, GetLineage(), lineage) : false;
116 }
117 
118 
120  if (biosrc && biosrc->IsSetGenome()) {
121  int genome = biosrc->GetGenome();
122  return genome == CBioSource::eGenome_chloroplast
126  || genome == CBioSource::eGenome_cyanelle
128  || genome == CBioSource::eGenome_apicoplast
129  || genome == CBioSource::eGenome_leucoplast
130  || genome == CBioSource::eGenome_proplastid
132  || genome == CBioSource::eGenome_plastid
134  }
135  return false;
136 }
137 
138 
140 {
141  if (biosrc) {
142  int genome = biosrc->GetGenome();
143  return genome != CBioSource::eGenome_mitochondrion
145  && genome != CBioSource::eGenome_plastid
146  && genome != CBioSource::eGenome_apicoplast
147  && HasLineage(*biosrc, GetLineage(), "Eukaryota");
148  }
149  return false;
150 }
151 
152 
154 {
155  return biosrc ? HasLineage(biosrc, "Bacteria") : false;
156 }
157 
158 
159 bool CDiscrepancyContext::IsViral(const CBioSource* biosrc) const
160 {
161  return biosrc ? HasLineage(biosrc, "Viruses") : false;
162 }
163 
164 
165 string CSeqSummary::GetStats() const
166 {
167  if (Stats.empty()) {
168  Stats = "(length " + NStr::NumericToString(Len);
169  if (N + Other) {
170  Stats += ", " + NStr::NumericToString(N + Other) + " other";
171  }
172  if (Gaps > 0) {
173  Stats += ", " + NStr::NumericToString(Gaps) + " gap";
174  }
175  Stats += ")";
176  }
177  return Stats;
178 }
179 
180 
181 inline static void _notN(CSeqSummary& sum)
182 {
183  if (sum._Ns >= 10) {
184  sum.NRuns.push_back({ sum._Pos + 1 - sum._Ns, sum._Pos });
185  }
186  sum._Ns = 0;
187 }
188 
189 inline static void _QualityScore(CSeqSummary& sum)
190 {
191  sum._QS++;
192  size_t q = sum._QS;
193  if (sum._Pos > CSeqSummary::WINDOW_SIZE) {
194  for (size_t r = sum._CBread; r != sum._CBwrite && sum._CBposition[r] <= sum._Pos; r = r >= CSeqSummary::WINDOW_SIZE - 1 ? 0 : r + 1) {
195  sum._CBread = r;
196  q = sum._QS - sum._CBscore[r];
197  }
198  }
199  if (q > sum.MinQ) { // yes, ">"!
200  sum.MinQ = q;
201  }
202  sum._CBscore[sum._CBwrite] = sum._QS;
204  sum._CBwrite++;
205  if (sum._CBwrite >= CSeqSummary::WINDOW_SIZE) {
206  sum._CBwrite = 0;
207  }
208 }
209 
210 inline static void sA(CSeqSummary& sum) { sum.A++; _notN(sum); }
211 inline static void sG(CSeqSummary& sum) { sum.G++; _notN(sum); }
212 inline static void sC(CSeqSummary& sum) { sum.C++; _notN(sum); }
213 inline static void sT(CSeqSummary& sum) { sum.T++; _notN(sum); }
214 
215 inline static void sZ(CSeqSummary& sum)
216 {
217  sum.Other++;
218  _notN(sum);
219  _QualityScore(sum);
220 }
221 
222 inline static void sN(CSeqSummary& sum)
223 {
224  sum.N++;
225  sum._Ns++;
226  if (sum._Ns > sum.MaxN) {
227  sum.MaxN = sum._Ns;
228  }
229  if (sum.First) {
230  sum.StartsWithGap = true;
231  }
232  sum.EndsWithGap = true;
233  _QualityScore(sum);
234 }
235 
236 
237 static void CountNucleotides(const CSeq_data& seq_data, TSeqPos pos, TSeqPos len, CSeqSummary& sum)
238 {
239 /*
240  enum E_Choice {
241  e_not_set = 0, ///< No variant selected
242  e_Iupacna, ///< IUPAC 1 letter nuc acid code
243  e_Iupacaa, ///< IUPAC 1 letter amino acid code
244  e_Ncbi2na, ///< 2 bit nucleic acid code
245  e_Ncbi4na, ///< 4 bit nucleic acid code
246  e_Ncbi8na, ///< 8 bit extended nucleic acid code
247  e_Ncbipna, ///< nucleic acid probabilities
248  e_Ncbi8aa, ///< 8 bit extended amino acid codes
249  e_Ncbieaa, ///< extended ASCII 1 letter aa codes
250  e_Ncbipaa, ///< amino acid probabilities
251  e_Ncbistdaa, ///< consecutive codes for std aas
252  e_Gap ///< gap types
253  };
254 */
255  sum._Pos = pos;
256  switch (seq_data.Which()) {
258  //cout << "> e_Ncbi2na\n";
259  {
260  vector<char>::const_iterator it = seq_data.GetNcbi2na().Get().begin();
261  unsigned char mask = 0xc0;
262  unsigned char shift = 6;
263  for (size_t n = 0; n < len; n++, sum._Pos++) {
264  unsigned char c = ((*it) & mask) >> shift;
265  mask >>= 2;
266  shift -= 2;
267  if (!mask) {
268  mask = 0xc0;
269  shift = 6;
270  ++it;
271  }
272  switch (c) {
273  case 0:
274  sA(sum);
275  break;
276  case 1:
277  sC(sum);
278  break;
279  case 2:
280  sG(sum);
281  break;
282  case 3:
283  sT(sum);
284  break;
285  }
286  }
287  if (len) {
288  sum.First = false;
289  sum.EndsWithGap = false;
290  }
291  }
292  return;
294  //cout << "> e_Ncbi4na\n";
295  {
296  vector<char>::const_iterator it = seq_data.GetNcbi4na().Get().begin();
297  unsigned char mask = 0xf0;
298  unsigned char shift = 4;
299  for (size_t n = 0; n < len; n++, sum._Pos++) {
300  unsigned char c = ((*it) & mask) >> shift;
301  mask >>= 4;
302  shift -= 4;
303  if (!mask) {
304  mask = 0xf0;
305  shift = 4;
306  ++it;
307  }
308  sum.EndsWithGap = false;
309  switch (c) {
310  case 1:
311  sA(sum);
312  break;
313  case 2:
314  sC(sum);
315  break;
316  case 4:
317  sG(sum);
318  break;
319  case 8:
320  sT(sum);
321  break;
322  case 15:
323  sN(sum);
324  break;
325  default:
326  sZ(sum);
327  break;
328  }
329  sum.First = false;
330  }
331  }
332  return;
334  //cout << "> e_Iupacna\n";
335  {
336  const string& s = seq_data.GetIupacna().Get();
337  for (size_t n = 0; n < len; n++, sum._Pos++) {
338  sum.EndsWithGap = false;
339  switch (s[n]) {
340  case 'A':
341  sA(sum);
342  break;
343  case 'C':
344  sC(sum);
345  break;
346  case 'G':
347  sG(sum);
348  break;
349  case 'T':
350  sT(sum);
351  break;
352  case 'N':
353  sN(sum);
354  break;
355  default:
356  sZ(sum);
357  break;
358  }
359  sum.First = false;
360  }
361  }
362  return;
364  case CSeq_data::e_Ncbipna: // no test data available; resorting to "slow" method.
365  //cout << (seq_data.Which() == CSeq_data::e_Ncbi8na ? "> e_Ncbi8na\n" : "> e_Ncbipna\n");
366  {
367  CSeq_data iupacna;
368  if (!CSeqportUtil::Convert(seq_data, &iupacna, CSeq_data::e_Iupacna)) {
369  return;
370  }
371  const string& s = iupacna.GetIupacna().Get();
372  for (size_t n = 0; n < len; n++, sum._Pos++) {
373  sum.EndsWithGap = false;
374  switch (s[n]) {
375  case 'A':
376  sA(sum);
377  break;
378  case 'C':
379  sC(sum);
380  break;
381  case 'G':
382  sG(sum);
383  break;
384  case 'T':
385  sT(sum);
386  break;
387  case 'N':
388  sN(sum);
389  break;
390  default:
391  sZ(sum);
392  break;
393  }
394  sum.First = false;
395  }
396  }
397  return;
398  default:
399  return;
400  }
401 }
402 
403 
405 {
406  summary.clear();
407 
408  // Length
409  summary.Len = bs.GetInst().GetLength();
410 
411  // Label
412  CConstRef<CSeq_id> best_id;
413  int best_score = CSeq_id::kMaxScore;
414  for (auto id: bs.GetId()) {
415  if (id->Which() == CSeq_id::e_Genbank || id->Which() == CSeq_id::e_Ddbj || id->Which() == CSeq_id::e_Embl || id->Which() == CSeq_id::e_Other) {
416  best_id = id;
417  break;
418  }
419  else {
420  if (best_score > id->BaseBestRankScore()) {
421  best_id = id;
422  best_score = id->BaseBestRankScore();
423  }
424  }
425  }
426  best_id->GetLabel(&summary.Label, CSeq_id::eContent);
427 
428  // Stats
429  const CRef<CSeqMap> seq_map = CSeqMap::CreateSeqMapForBioseq(bs);
430  SSeqMapSelector sel;
432  for (CSeqMap_CI seq_iter(seq_map, &GetScope(), sel); seq_iter; ++seq_iter) {
433  switch (seq_iter.GetType()) {
434  case CSeqMap::eSeqData:
435  CountNucleotides(seq_iter.GetData(), seq_iter.GetPosition(), seq_iter.GetLength(), summary);
436  break;
437  case CSeqMap::eSeqGap:
438  _notN(summary);
439  if (summary.First) {
440  summary.First = false;
441  }
442  summary.Gaps += seq_iter.GetLength();
443  break;
444  case CSeqMap::eSeqRef:
445  _notN(summary);
446  summary.First = false;
447  summary.EndsWithGap = false;
448  summary.HasRef = true;
449  break;
450  default:
451  _notN(summary);
452  break;
453  }
454  }
455  _notN(summary);
456 }
457 
458 
460 {
461  static vector<string> G;
462  if (G.empty()) {
463  G.resize(eSource_location_chromatophore + 1);
465  string str = ENUM_METHOD_NAME(ESource_location)()->FindName(i, true);
466  G[i] = (str == "unknown") ? kEmptyStr : ((str == "extrachrom") ? "extrachromosomal" : str);
467  }
468  }
469  return n < G.size() ? G[n] : kEmptyStr;
470 }
471 
472 
473 string CDiscrepancyContext::GetAminoacidName(const CSeq_feat& obj) // from tRNA
474 {
475  string aa;
477  size_t n = aa.find_last_of('-'); // cut off the "tRNA-" prefix
478  if (n != string::npos) {
479  aa = aa.substr(n + 1); // is there any better way to get the aminoacid name?
480  }
481  return aa;
482 }
483 
484 
485 bool CDiscrepancyContext::IsBadLocusTagFormat(const string& locus_tag) const
486 {
487  // Optimization: compile regexp only once by making it static.
488  static CRegexp regexp("^[A-Za-z][0-9A-Za-z]{2,}_[0-9A-Za-z]+$");
489 
490  // Locus tag format documentation:
491  // https://www.ncbi.nlm.nih.gov/genomes/locustag/Proposal.pdf
492 
493  return !regexp.IsMatch(locus_tag);
494 }
495 
496 
498 {
499  const CBioseq& bioseq = CurrentBioseq();
500  if (bioseq.IsSetId()) {
501  for (auto& id : bioseq.GetId()) {
502  if (id->IsOther()) {
503  return true;
504  }
505  }
506  }
507  return false;
508 }
509 
510 
512 {
513  for (auto& desc : GetAllSeqdesc()) {
514  if (desc.IsUser()) {
515  const CUser_object& user = desc.GetUser();
516  if (FIELD_IS_SET_AND_IS(user, Type, Str) && NStr::EqualNocase(user.GetType().GetStr(), "StructuredComment")) {
517  CConstRef<CUser_field> prefix_field = user.GetFieldRef("StructuredCommentPrefix", ".", NStr::eNocase);
518  if (prefix_field && FIELD_IS_SET_AND_IS(*prefix_field, Data, Str) && NStr::EqualNocase(prefix_field->GetData().GetStr(), "##Genome-Annotation-Data-START##")) {
519  CConstRef<CUser_field> pipeline_field = user.GetFieldRef("Annotation Pipeline", ".", NStr::eNocase);
520  if (pipeline_field && FIELD_IS_SET_AND_IS(*pipeline_field, Data, Str) && NStr::EqualNocase(pipeline_field->GetData().GetStr(), "NCBI Prokaryotic Genome Annotation Pipeline")) {
521  return true;
522  }
523  }
524  }
525  }
526  }
527  return false;
528 #if 0
529  CSeqdesc_CI user_desc_ci(m_Scope->GetBioseqHandle(*GetCurrentBioseq()), CSeqdesc::e_User);
530  for(; !IsBGPipe_is_bgpipe && user_desc_ci; ++user_desc_ci) {
531  const CUser_object& user_desc = user_desc_ci->GetUser();
532  // only look at structured comments
533  if(!FIELD_IS_SET_AND_IS(user_desc, Type, Str) || !NStr::EqualNocase(user_desc.GetType().GetStr(), "StructuredComment")) {
534  continue;
535  }
536  CConstRef<CUser_field> struccmt_prefix_field = user_desc.GetFieldRef("StructuredCommentPrefix", ".", NStr::eNocase);
537  if(!struccmt_prefix_field || !FIELD_IS_SET_AND_IS(*struccmt_prefix_field, Data, Str) || !NStr::EqualNocase(struccmt_prefix_field->GetData().GetStr(), "##Genome-Annotation-Data-START##") ) {
538  continue;
539  }
540  CConstRef<CUser_field> annot_pipeline_field = user_desc.GetFieldRef("Annotation Pipeline", ".", NStr::eNocase);
541  if (!annot_pipeline_field || !FIELD_IS_SET_AND_IS(*annot_pipeline_field, Data, Str) || !NStr::EqualNocase(annot_pipeline_field->GetData().GetStr(), "NCBI Prokaryotic Genome Annotation Pipeline")) {
542  continue;
543  }
544  IsBGPipe_is_bgpipe = true;
545  return IsBGPipe_is_bgpipe;
546  }
547  return IsBGPipe_is_bgpipe;
548 #endif
549 }
550 
551 
553 {
554  auto gene = GeneForFeature(*FindNode(feat));
555  return gene ? dynamic_cast<const CSeq_feat*>(&*gene->m_Obj) : nullptr;
556 }
557 
558 
560 {
561  return ProdForFeature(*FindNode(feat));
562 }
563 
564 
566 {
567  if (node.m_Info & CParseNode::eKnownGene) {
568  return node.m_Gene;
569  }
571  const CSeq_feat& feat = dynamic_cast<const CSeq_feat&>(*node.m_Obj);
572  auto gene = sequence::GetGeneForFeature(feat, *m_Scope);
573  node.m_Gene = gene ? FindNode(*gene) : nullptr;
574  return node.m_Gene;
575 }
576 
577 
579 {
580  if (node.m_Info & CParseNode::eKnownProduct) {
581  return node.m_Product;
582  }
584  const CSeq_feat& feat = dynamic_cast<const CSeq_feat&>(*node.m_Obj);
585  node.m_Product = GetProductName(feat, *m_Scope);
586  return node.m_Product;
587 }
588 
589 
591 {
592  return IsPseudo(*FindNode(feat));
593 }
594 
595 
597 {
598  if (node.m_Info & CParseNode::eKnownPseudo) {
599  return node.m_Info & CParseNode::eIsPseudo;
600  }
602  const CSeq_feat& feat = dynamic_cast<const CSeq_feat&>(*node.m_Obj);
603  if (feat.IsSetPseudo() && feat.GetPseudo()) {
605  return true;
606  }
607  if (feat.IsSetQual()) {
608  for (auto& it : feat.GetQual()) {
609  if (it->IsSetQual() && NStr::EqualNocase(it->GetQual(), "pseudogene")) {
611  return true;
612  }
613  }
614  }
615  if (feat.GetData().IsGene()) {
616  if (feat.GetData().GetGene().IsSetPseudo() && feat.GetData().GetGene().GetPseudo()) {
618  return true;
619  }
620  }
621  else {
622  if (feat.IsSetXref()) {
623  for (auto& it : feat.GetXref()) {
624  if (it->IsSetData() && it->GetData().IsGene() && it->GetData().GetGene().IsSetPseudo() && it->GetData().GetGene().GetPseudo()) {
626  return true;
627  }
628  }
629  }
630  auto gene = GeneForFeature(node);
631  if (gene && IsPseudo(*gene)) {
633  return true;
634  }
635  }
636  return false;
637 }
638 
639 
641 {
642  m_FeatAll.clear();
643  m_FeatGenes.clear();
644  m_FeatPseudo.clear();
645  m_FeatCDS.clear();
646  m_FeatMRNAs.clear();
647  m_FeatRRNAs.clear();
648  m_FeatTRNAs.clear();
649  m_Feat_RNAs.clear();
650  m_FeatExons.clear();
651  m_FeatIntrons.clear();
652  m_FeatMisc.clear();
653 }
654 
655 
657 {
658  m_FeatAll.push_back(CConstRef<CSeq_feat>(&feat));
659  switch (feat.GetData().GetSubtype()) {
661  m_FeatGenes.push_back(&feat);
662  break;
664  m_FeatCDS.push_back(&feat);
665  break;
667  m_FeatMRNAs.push_back(&feat);
668  break;
670  m_FeatTRNAs.push_back(&feat);
671  break;
673  m_FeatRRNAs.push_back(&feat);
674  break;
676  m_FeatExons.push_back(&feat);
677  break;
679  m_FeatIntrons.push_back(&feat);
680  break;
682  m_FeatMisc.push_back(&feat);
683  break;
684  default:
685  break;
686  }
687  if (feat.GetData().IsRna()) {
688  m_Feat_RNAs.push_back(&feat);
689  }
690  if (IsPseudo(feat)) {
691  m_FeatPseudo.push_back(&feat);
692  }
693 }
694 
695 
697 {
698  try { // CSeq_loc::GetTotalRange() throws an exception for multi-sequence locations.
699  CSeq_loc::TRange r1 = loc1.GetTotalRange();
700  CSeq_loc::TRange r2 = loc2.GetTotalRange();
701  if (r1.GetFrom() >= r2.GetToOpen() || r2.GetFrom() >= r1.GetToOpen()) {
702  return sequence::eNoOverlap;
703  }
704  }
705  catch (const CSeqLocException& /* ignored */) {}
707 }
708 
709 
711 {
712  if (m_CurrentNode->m_Type == eSubmit) {
713  const CSeq_submit* sub = static_cast<const CSeq_submit*>(&*m_CurrentNode->m_Obj);
714  if (sub->IsSetSub() && sub->GetSub().IsSetContact() && sub->GetSub().GetContact().IsSetContact() && sub->GetSub().GetContact().GetContact().IsSetName()) {
715  return &sub->GetSub().GetContact().GetContact().GetName();
716  }
717  }
718  return nullptr;
719 }
720 
721 
723 {
724  if (m_CurrentNode->m_Type == eSubmit) {
725  const CSeq_submit* sub = static_cast<const CSeq_submit*>(&*m_CurrentNode->m_Obj);
726  if (sub->IsSetSub()) {
727  return &sub->GetSub();
728  }
729  }
730  return nullptr;
731 }
732 
733 
734 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
735 
737 {
738  if (obj) {
739  const CSeq_feat* feat = dynamic_cast<const CSeq_feat*>(obj);
740  if (feat) {
741  return FindNode(*feat);
742  }
743  const CSeqdesc* desc = dynamic_cast<const CSeqdesc*>(obj);
744  if (desc) {
745  return FindNode(*desc);
746  }
747  for (auto node = m_CurrentNode; node; node = node->m_Parent) {
748  if (&*node->m_Obj == obj) {
749  return &*node;
750  }
751  }
752  }
753  return nullptr;
754 }
755 
756 
758 {
759  auto it = node.m_FeatureMap.find(&feat);
760  return it == node.m_FeatureMap.end() ? nullptr : it->second;
761 }
762 
763 
765 {
766  for (auto node = m_CurrentNode; node; node = node->m_Parent) {
767  auto it = node->m_FeatureMap.find(&feat);
768  if (it != node->m_FeatureMap.end()) {
769  return it->second;
770  }
771  if (node->m_Type == eSeqSet_NucProt || node->m_Type == eSeqSet_GenProd) {
772  for (auto& child : node->m_Children) {
773  CDiscrepancyContext::CParseNode* found = FindLocalNode(*child, feat);
774  if (found) {
775  return found;
776  }
777  }
778  }
779  }
780  return nullptr;
781 }
782 
783 
785 {
786  for (auto node = m_CurrentNode; node; node = node->m_Parent) {
787  auto it = node->m_DescriptorMap.find(&desc);
788  if (it != node->m_DescriptorMap.end()) {
789  return it->second;
790  }
791  }
792  return nullptr;
793 }
794 
795 
797 {
798  CRefNode* ret = nullptr;
799  for (CRefNode* r = ref.m_Parent.GetPointer(); r; r = r->m_Parent.GetPointer()) {
800  if (!ret && IsSeqSet(r->m_Type)) {
801  ret = r;
802  }
803  if (r->m_Type == eSeqSet_NucProt || r->m_Type == eSeqSet_GenProd) {
804  return r;
805  }
806  }
807  return ret ? ret : ref.m_Parent.GetPointer();
808 }
809 
810 
812 {
813  CRefNode* fixref = nullptr;
814  if (fix == eFixSelf) {
815  fixref = &*m_CurrentNode->m_Ref;
816  }
817  else if (fix == eFixParent) {
818  fixref = &*m_CurrentNode->m_Ref->m_Parent;
819  }
820  else if (fix == eFixSet) {
821  fixref = ContainingSet(*m_CurrentNode->m_Ref);
822  }
823  CRef<CDiscrepancyObject> obj(new CDiscrepancyObject(m_CurrentNode->m_Ref, fixref, more));
824  return obj;
825 }
826 
827 
829 {
830  CRef<CDiscrepancyObject> obj(new CDiscrepancyObject(m_CurrentNode->m_Ref, fix ? &*m_CurrentNode->m_Ref : nullptr, more));
831  return obj;
832 }
833 
834 
836 {
837  _ASSERT(m_CurrentNode->m_Type == eSubmit);
838  CRef<CDiscrepancyObject> obj(new CDiscrepancyObject(m_CurrentNode->m_Ref, fix ? &*m_CurrentNode->m_Ref : nullptr, more));
839  return obj;
840 }
841 
842 
844 {
846  auto node = FindNode(feat);
847  if (node) {
848  if (node->m_Ref->m_Text.empty()) {
849  node->m_Ref->m_Text = CDiscrepancyObject::GetTextObjectDescription(feat, *m_Scope);
850  }
851  CRefNode* fixref = nullptr;
852  if (fix == eFixSelf) {
853  fixref = &*node->m_Ref;
854  }
855  else if (fix == eFixParent) {
856  fixref = &*node->m_Ref->m_Parent;
857  }
858  else if (fix == eFixSet) {
859  fixref = ContainingSet(*node->m_Ref);
860  }
861  obj.Reset(new CDiscrepancyObject(node->m_Ref, fixref, more));
862  }
863  return obj;
864 }
865 
866 
868 {
870  auto node = FindNode(feat);
871  if (node) {
872  if (node->m_Ref->m_Text.empty()) {
873  node->m_Ref->m_Text = CDiscrepancyObject::GetTextObjectDescription(feat, *m_Scope);
874  }
875  obj.Reset(new CDiscrepancyObject(node->m_Ref, fix ? FindNode(fix)->m_Ref.GetPointer() : nullptr, more));
876  }
877  return obj;
878 }
879 
880 
882 {
884  auto node = FindNode(desc);
885  if (node) {
886  if (node->m_Ref->m_Text.empty()) {
887  node->m_Ref->m_Text = CDiscrepancyObject::GetTextObjectDescription(desc);
888  }
889  obj.Reset(new CDiscrepancyObject(node->m_Ref, fix ? &*FindNode(fix)->m_Ref : nullptr, more));
890  }
891  return obj;
892 }
893 
894 
896 {
898  for (auto node = m_CurrentNode; node; node = node->m_Parent) {
899  auto it = node->m_BiosourceMap.find(&biosrc);
900  if (it != node->m_BiosourceMap.end()) {
901  if (it->second->m_Ref->m_Text.empty()) {
902  if (it->second->m_Type == eSeqFeat) {
903  it->second->m_Ref->m_Text = CDiscrepancyObject::GetTextObjectDescription(static_cast<const CSeq_feat&>(*it->second->m_Obj), *m_Scope);
904  }
905  else { // eSeqDesc
906  it->second->m_Ref->m_Text = CDiscrepancyObject::GetTextObjectDescription(static_cast<const CSeqdesc&>(*it->second->m_Obj));
907  }
908  }
909  obj.Reset(new CDiscrepancyObject(it->second->m_Ref, fix ? &*it->second->m_Ref : nullptr, more));
910  return obj;
911  }
912  }
913  return obj;
914 }
915 
916 
918 {
920  auto it = m_CurrentNode->m_PubdescMap.find(&pubdesc);
921  _ASSERT(it != m_CurrentNode->m_PubdescMap.end());
922  if (it->second->m_Ref->m_Text.empty()) {
923  if (it->second->m_Type == eSeqFeat) {
924  it->second->m_Ref->m_Text = CDiscrepancyObject::GetTextObjectDescription(static_cast<const CSeq_feat&>(*it->second->m_Obj), *m_Scope);
925  }
926  else if (it->second->m_Type == eSeqDesc) {
927  it->second->m_Ref->m_Text = CDiscrepancyObject::GetTextObjectDescription(static_cast<const CSeqdesc&>(*it->second->m_Obj));
928  }
929  }
930  obj.Reset(new CDiscrepancyObject(it->second->m_Ref, fix ? &*it->second->m_Ref : nullptr, more));
931  return obj;
932 }
933 
934 
936 {
938  auto it = m_CurrentNode->m_AuthorMap.find(&authors);
939  _ASSERT(it != m_CurrentNode->m_AuthorMap.end());
940  if (it->second->m_Ref->m_Text.empty()) {
941  if (it->second->m_Type == eSeqFeat) {
942  it->second->m_Ref->m_Text = CDiscrepancyObject::GetTextObjectDescription(static_cast<const CSeq_feat&>(*it->second->m_Obj), *m_Scope);
943  }
944  else if (it->second->m_Type == eSeqDesc) {
945  it->second->m_Ref->m_Text = CDiscrepancyObject::GetTextObjectDescription(static_cast<const CSeqdesc&>(*it->second->m_Obj));
946  }
947  else {
948  //cout << "===============================================!\n";
949  }
950  }
951  obj.Reset(new CDiscrepancyObject(it->second->m_Ref, fix ? &*it->second->m_Ref : nullptr, more));
952  return obj;
953 }
954 
955 
957 {
959  obj.Reset(new CDiscrepancyObject(m_CurrentNode->m_Ref, fix ? &*FindNode(fix)->m_Ref : nullptr, more));
960  return obj;
961 }
962 
963 
964 CRef<CDiscrepancyProduct> CDiscrepancyContext::RunTests(const TTestNamesSet& testnames, const CSerialObject& object, const string& filename)
965 {
966  for (auto n: testnames)
967  AddTest(n);
968 
969  Parse(object, filename);
970  return GetProduct();
971 }
972 
974 {
975  auto product = Ref(new CDiscrepancyProductImpl);
976  product->m_Tests = m_Tests;
977  return product;
978 }
979 
980 void CDiscrepancyContext::Parse(const CSerialObject& root, const string& fname)
981 {
982  Push(root, fname);
984 }
985 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
ncbi::TMaskedQueryRegions mask
@Auth_list.hpp User-defined methods of the data storage class.
Definition: Auth_list.hpp:57
const string & GetLineage(void) const
Definition: BioSource.cpp:360
bool IsSetLineage(void) const
Definition: BioSource.cpp:355
const CParseNode * GeneForFeature(const CParseNode &node)
string GetProdForFeature(const CSeq_feat &feat)
CSeqdesc_run GetAllSeqdesc()
void AddTest(eTestNames name) override
vector< const CSeq_feat * > m_Feat_RNAs
static bool HasLineage(const CBioSource &biosrc, const string &def_lineage, const string &type)
bool IsViral(const CBioSource *biosrc) const
CRef< CDiscrepancyObject > BiosourceObjRef(const CBioSource &biosrc, bool fix=false, const CObject *more=nullptr)
CRef< objects::CScope > m_Scope
CParseNode * FindNode(const CRefNode &obj)
CRef< CDiscrepancyObject > AuthorsObjRef(const CAuth_list &authors, bool fix=false, const CObject *more=nullptr)
TDiscrepancyCoreMap m_Tests
void Push(const CSerialObject &root, const string &fname) override
CRef< CParseNode > m_RootNode
CRef< CDiscrepancyObject > BioseqObjRef(EFixType fix=eFixNone, const CObject *more=nullptr)
const CPerson_id * GetPerson_id() const
CRef< CDiscrepancyObject > SeqFeatObjRef(const CSeq_feat &feat, EFixType fix=eFixNone, const CObject *more=nullptr)
static string GetGenomeName(unsigned n)
CConstRef< CSuspect_rule_set > GetOrganelleProductRules()
CRef< CParseNode > m_CurrentNode
vector< const CSeq_feat * > m_FeatAll
CRefNode * ContainingSet(CRefNode &ref)
vector< const CSeq_feat * > m_FeatRRNAs
objects::CScope & GetScope() const
vector< const CSeq_feat * > m_FeatCDS
CParseNode * FindLocalNode(const CParseNode &node, const CSeq_feat &feat) const
CRef< CDiscrepancyObject > StringObjRef(const CObject *fix=nullptr, const CObject *more=nullptr)
const CSeq_feat * GetGeneForFeature(const CSeq_feat &feat)
bool IsEukaryotic(const CBioSource *biosrc) const
const CBioseq & CurrentBioseq() const
vector< const CSeq_feat * > m_FeatGenes
const CSubmit_block * GetSubmit_block() const
void BuildSeqSummary(const CBioseq &bs, CSeqSummary &summary)
bool IsBacterial(const CBioSource *biosrc) const
vector< const CSeq_feat * > m_FeatMRNAs
void CollectFeature(const CSeq_feat &feat)
static bool IsOrganelle(const CBioSource *biosrc)
void SetSuspectRules(const string &name, bool read=true) override
CRef< CDiscrepancyObject > SubmitBlockObjRef(bool fix=false, const CObject *more=nullptr)
friend class CDiscrepancyObject
CConstRef< CSuspect_rule_set > m_OrganelleProductRules
CRef< CDiscrepancyObject > SeqdescObjRef(const CSeqdesc &desc, const CObject *fix=nullptr, const CObject *more=nullptr)
CConstRef< CSuspect_rule_set > GetProductRules()
vector< const CSeq_feat * > m_FeatTRNAs
bool IsPseudo(const CSeq_feat &feat)
vector< const CSeq_feat * > m_FeatExons
void ParseAll(CParseNode &node)
CRef< CDiscrepancyObject > BioseqSetObjRef(bool fix=false, const CObject *more=nullptr)
CRef< CDiscrepancyProduct > GetProduct() override
CConstRef< CSuspect_rule_set > m_ProductRules
static bool IsUnculturedNonOrganelleName(const string &taxname)
vector< const CSeq_feat * > m_FeatIntrons
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2) const
vector< const CSeq_feat * > m_FeatPseudo
static string GetAminoacidName(const CSeq_feat &feat)
string ProdForFeature(const CParseNode &node)
static bool IsSeqSet(EObjType n)
bool IsBadLocusTagFormat(const string &locus_tag) const
CRef< CDiscrepancyObject > PubdescObjRef(const CPubdesc &pubdesc, bool fix=false, const CObject *more=nullptr)
vector< const CSeq_feat * > m_FeatMisc
static string GetTextObjectDescription(const CSeq_feat &seq_feat, CScope &scope)
const string & GetLineage() const
CObject –.
Definition: ncbiobj.hpp:180
@Pubdesc.hpp User-defined methods of the data storage class.
Definition: Pubdesc.hpp:54
CRegexp –.
Definition: regexp.hpp:70
ESubtype GetSubtype(void) const
Seq-loc exceptions.
Definition: Seq_loc.hpp:74
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
static TSeqPos Convert(const CSeq_data &in_seq, CSeq_data *out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx=0, TSeqPos uLength=0, bool bAmbig=false, Uint4 seed=17734276)
Base class for all serializable objects.
Definition: serialbase.hpp:150
CSubmit_block –.
CConstRef< CUser_field > GetFieldRef(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Definition: User_object.cpp:84
#define G(x, y, z)
Definition: md4.c:179
CConstRef< objects::CSuspect_rule_set > GetProductRules(const string &name="")
CConstRef< objects::CSuspect_rule_set > GetOrganelleProductRules(const string &name="")
static void _QualityScore(CSeqSummary &sum)
USING_SCOPE(objects)
static void sA(CSeqSummary &sum)
static void sN(CSeqSummary &sum)
static void sG(CSeqSummary &sum)
static void sT(CSeqSummary &sum)
static void CountNucleotides(const CSeq_data &seq_data, TSeqPos pos, TSeqPos len, CSeqSummary &sum)
static void sC(CSeqSummary &sum)
static void sZ(CSeqSummary &sum)
static void _notN(CSeqSummary &sum)
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
const TPrim & Get(void) const
Definition: serialbase.hpp:347
#define ENUM_METHOD_NAME(EnumName)
Definition: serialbase.hpp:994
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2039
int BaseBestRankScore(void) const
Definition: Seq_id.cpp:3076
string GetLabel(const CSeq_id &id)
@ kMaxScore
Definition: Seq_id.hpp:701
@ eContent
Untagged human-readable accession or the like.
Definition: Seq_id.hpp:573
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
@ fFGL_Content
Include its content if there is any.
Definition: feature.hpp:73
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
ECompare
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ eNoOverlap
CSeq_locs do not overlap or abut.
CConstRef< CSeq_feat > GetGeneForFeature(const CSeq_feat &feat, CScope &scope)
Finds gene for feature, but obeys SeqFeatXref directives.
Definition: sequence.cpp:1529
SSeqMapSelector & SetFlags(TFlags flags)
Select segment type(s)
Definition: seq_map_ci.hpp:179
static CRef< CSeqMap > CreateSeqMapForBioseq(const CBioseq &seq)
Definition: seq_map.cpp:1122
@ fFindGap
Definition: seq_map.hpp:130
@ fFindLeafRef
Definition: seq_map.hpp:131
@ fFindData
Definition: seq_map.hpp:129
@ eSeqData
real sequence data
Definition: seq_map.hpp:98
@ eSeqGap
gap
Definition: seq_map.hpp:97
@ eSeqRef
reference to Bioseq
Definition: seq_map.hpp:100
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
position_type GetToOpen(void) const
Definition: range.hpp:138
bool IsMatch(CTempString str, TMatch flags=fMatch_default)
Check for the existence of a substring that matches the specified pattern.
Definition: regexp.cpp:193
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
#define NPOS
Definition: ncbistr.hpp:133
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5352
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5383
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
const TName & GetName(void) const
Get the Name member data.
Definition: Author_.hpp:352
bool IsSetName(void) const
Author, Primary or Secondary. Check if a value has been assigned to Name data member.
Definition: Author_.hpp:340
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
bool IsSetGenome(void) const
Check if a value has been assigned to Genome data member.
Definition: BioSource_.hpp:397
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
bool IsSetPseudo(void) const
pseudogene. Check if a value has been assigned to Pseudo data member.
Definition: Gene_ref_.hpp:681
TPseudo GetPseudo(void) const
Get the Pseudo member data.
Definition: Gene_ref_.hpp:706
const TStr & GetStr(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TType & GetType(void) const
Get the Type member data.
ESource_location
Access to ESource_location's attributes (values, names) as defined in spec.
@ eSource_location_unknown
@ eSource_location_chromatophore
bool IsSetQual(void) const
qualifiers. Check if a value has been assigned to Qual data member.
Definition: Seq_feat_.hpp:1135
const TQual & GetQual(void) const
Get the Qual member data.
Definition: Seq_feat_.hpp:1147
bool IsSetXref(void) const
cite other relevant features. Check if a value has been assigned to Xref data member.
Definition: Seq_feat_.hpp:1296
bool IsGene(void) const
Check if variant Gene is selected.
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
TPseudo GetPseudo(void) const
Get the Pseudo member data.
Definition: Seq_feat_.hpp:1365
bool IsSetPseudo(void) const
annotated on pseudogene? Check if a value has been assigned to Pseudo data member.
Definition: Seq_feat_.hpp:1346
const TGene & GetGene(void) const
Get the variant data.
const TXref & GetXref(void) const
Get the Xref member data.
Definition: Seq_feat_.hpp:1308
bool IsRna(void) const
Check if variant Rna is selected.
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const TIupacna & GetIupacna(void) const
Get the variant data.
Definition: Seq_data_.hpp:510
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
const TNcbi4na & GetNcbi4na(void) const
Get the variant data.
Definition: Seq_data_.hpp:570
const TNcbi2na & GetNcbi2na(void) const
Get the variant data.
Definition: Seq_data_.hpp:550
bool IsSetId(void) const
equivalent identifiers. Check if a value has been assigned to Id data member.
Definition: Bioseq_.hpp:278
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_data_.hpp:475
@ e_Ncbipna
nucleic acid probabilities
Definition: Seq_data_.hpp:109
@ e_Ncbi2na
2 bit nucleic acid code
Definition: Seq_data_.hpp:106
@ e_Iupacna
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
@ e_Ncbi8na
8 bit extended nucleic acid code
Definition: Seq_data_.hpp:108
@ e_Ncbi4na
4 bit nucleic acid code
Definition: Seq_data_.hpp:107
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
const TContact & GetContact(void) const
Get the Contact member data.
const TSub & GetSub(void) const
Get the Sub member data.
bool IsSetSub(void) const
Check if a value has been assigned to Sub data member.
const TContact & GetContact(void) const
Get the Contact member data.
bool IsSetContact(void) const
who to contact. Check if a value has been assigned to Contact data member.
bool IsSetContact(void) const
WARNING: this will replace the above. Check if a value has been assigned to Contact data member.
int i
yy_size_t n
int len
static string GetProductName(const CProt_ref &prot)
Definition: utils.cpp:62
#define nullptr
Definition: ncbimisc.hpp:45
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
Utility macros and typedefs for exploring NCBI objects from seq.asn.
#define FIELD_IS_SET_AND_IS(Var, Fld, Chs)
FIELD_IS_SET_AND_IS base macro.
static const char * str(char *buf, int n)
Definition: stats.c:84
CConstRef< CSerialObject > m_Obj
map< const CSeq_feat *, CParseNode * > m_FeatureMap
map< const CSeqdesc *, CParseNode * > m_DescriptorMap
vector< pair< size_t, size_t > > NRuns
string GetStats() const
size_t _CBposition[WINDOW_SIZE]
static const size_t WINDOW_SIZE
size_t _CBscore[WINDOW_SIZE]
Selector used in CSeqMap methods returning iterators.
Definition: seq_map_ci.hpp:113
Definition: type.c:6
#define _ASSERT
#define Type
C++ wrappers for the Perl-compatible regular expression (PCRE) library.
#define const
Definition: zconf.h:230
Modified on Sat Sep 30 23:18:07 2023 by modify_doxy.py rev. 669887