NCBI C++ ToolKit
validerror_bioseq.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: validerror_bioseq.cpp 101515 2023-12-22 19:02:03Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko, Mati Shomrat ......
27  *
28  * File Description:
29  * validation of bioseq
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/ncbitime.hpp>
37 #include <corelib/ncbimisc.hpp>
38 
42 #include <objtools/error_codes.hpp>
43 
45 
46 #include <objmgr/seqdesc_ci.hpp>
47 #include <objmgr/graph_ci.hpp>
48 #include <objmgr/util/sequence.hpp>
49 
51 
52 #include <optional>
53 
54 
55 #define NCBI_USE_ERRCODE_X Objtools_Validator
56 
59 BEGIN_SCOPE(validator)
60 USING_SCOPE(sequence);
61 USING_SCOPE(feature);
62 
63 class CCdsMatchInfo;
64 
// Bookkeeping record for one mRNA feature; derives from CObject.
// Judging by the interface (Overlaps(cds), SetMatch/HasMatch) it is presumably
// used while pairing mRNA features with CDS features -- confirm against the
// member definitions, which are outside this listing.
65 class CMrnaMatchInfo : public CObject
66 {
67 public:
 // Built from the mRNA feature and a scope pointer.
68  CMrnaMatchInfo(const CSeq_feat& mrna, CScope* scope);
69  const CSeq_feat& GetSeqfeat() const;
70  bool Overlaps(const CSeq_feat& cds) const;
71  void SetMatch();
72  bool HasMatch() const;
 // Stores the pseudo flag; calling with no argument sets it to true.
73  void SetPseudo(bool val = true) { m_IsPseudo = val; }
 // isGenbank defaults to false when omitted.
74  bool OkWithoutCds(bool isGenbank = false) const;
75 
76 private:
 // NOTE(review): the listing appears to have dropped some member
 // declarations here; only the two flags below are visible.
78 
80  bool m_HasMatch;
81  bool m_IsPseudo;
82 };
83 
84 
// Bookkeeping record for one CDS feature; derives from CObject.
// Offers several ways of assigning an mRNA match (by xref, by overlap, or via
// a CFeatTree) and keeps a list of additional mRNA features in m_OtherMrnas.
// NOTE(review): TmRNAList and most private members are not visible in this
// listing.
85 class CCdsMatchInfo : public CObject
86 {
87 public:
 // Built from the CDS feature and a scope pointer.
88  CCdsMatchInfo(const CSeq_feat& cds, CScope* scope);
89  const CSeq_feat& GetSeqfeat() const;
90  bool Overlaps(const CSeq_feat& mrna) const;
91  bool AssignXrefMatch(TmRNAList& unmatched_mrnas, const CTSE_Handle& tse);
92  bool AssignOverlapMatch(TmRNAList& unmatched_mrnas, CScope& scope);
93  void UpdateOtherMrnas(const TmRNAList& unmatched_mrnas);
 // Number of entries currently held in m_OtherMrnas (note: not declared const).
94  size_t CountOtherMrnas() { return m_OtherMrnas.size(); }
97  bool AssignMatch(TmRNAList& mrna_map, CFeatTree& feat_tree, CScope& scope);
98  bool HasMatch() const;
 // Setter/getter overload pair for the "needs match" state.
99  void NeedsMatch(bool needs_match);
100  bool NeedsMatch() const;
101  const CMrnaMatchInfo& GetMatch() const;
102  bool IsPseudo() const;
103  void SetPseudo();
104 
105 private:
 // NOTE(review): other private members appear to be missing from this listing.
108 
113  list<CConstRef<CSeq_feat>> m_OtherMrnas;
115 };
116 
117 
118 // =============================================================================
119 // Public
120 // =============================================================================
121 
123  CValidError_base(imp), m_AnnotValidator(imp), m_DescrValidator(imp), m_FeatValidator(imp), m_GeneIt(nullptr), m_AllFeatIt(nullptr)
124 {
125 }
126 
127 
129 {
130 }
131 
133 {
136 
137  if (bsh.IsSetInst_Repr()) {
138  repr = bsh.GetInst_Repr();
139  }
140 
142  while (m) {
143  const CSeqdesc::TMolinfo& mi = m->GetMolinfo();
144  if (mi.IsSetTech()) {
145  tech = mi.GetTech();
146  }
147 
148  ++m;
149  }
150 
151  for (auto id : bsh.GetId()) {
152  CSeq_id::EAccessionInfo acc_info = id.IdentifyAccession();
153  unsigned int acc_div = acc_info & CSeq_id::eAcc_division_mask;
154  if (acc_div == CSeq_id::eAcc_wgs && tech == CMolInfo::eTech_wgs && repr == CSeq_inst::eRepr_virtual) {
155  bool is_wgs_master = (acc_info & CSeq_id::fAcc_master) != 0;
156  if (is_wgs_master) {
157  m_report_short_seq = false;
158  }
159  }
160  }
161 
163  while (d) {
164  const CSeqdesc::TSource& source = d->GetSource();
165 
166  // look for chromosome, prokaryote, linkage group
168  if ((*it)->IsSetSubtype() && (*it)->IsSetName() && !NStr::IsBlank((*it)->GetName())) {
169  if ((*it)->GetSubtype() == CSubSource::eSubtype_chromosome) {
171  } else if ((*it)->GetSubtype() == CSubSource::eSubtype_linkage_group) {
173  }
174  }
175  }
176  if (source.IsSetLineage()) {
177  string lineage = source.GetLineage();
178  if (NStr::StartsWith(lineage, "Bacteria; ") ||
179  NStr::StartsWith(lineage, "Archaea; ")) {
182  m_is_bact_or_arch = true;
183  }
184  if (NStr::StartsWith(lineage, "Viruses; ")) {
186  }
187  }
188  if (source.IsSetDivision()) {
189  string div = source.GetDivision();
190  if (NStr::Equal(div, "BCT") || NStr::Equal(div, "VRL")) {
193  }
194  }
195  if (source.IsSetGenome()) {
196  CBioSource::TGenome genome = source.GetGenome();
197  // check for organelle
198  if (IsOrganelle(genome)) {
200  }
201  m_is_plasmid = (genome == NCBI_GENOME(plasmid));
202  m_is_chromosome = (genome == NCBI_GENOME(chromosome));
203  m_is_extrachrom = (genome == NCBI_GENOME(extrachrom));
204  }
205 
206  ++d;
207  }
208 }
209 
210 
212 {
213  m_splicing_not_expected = false;
215  m_report_short_seq = true;
216  m_is_bact_or_arch = false;
217  m_is_plasmid = false;
218  m_is_chromosome = false;
219  m_is_extrachrom = false;
220 
221  try {
223 
225 
226  CSeq_entry_Handle appropriate_parent;
227  if (m_Imp.ShouldSubdivide()) {
229  }
230  if (appropriate_parent) {
231  CRef<CScope> tmp_scope(new CScope(*(CObjectManager::GetInstance())));
232  tmp_scope->AddDefaults();
233  CSeq_entry_Handle this_seh = tmp_scope->AddTopLevelSeqEntry(*(appropriate_parent.GetCompleteSeq_entry()));
234  m_FeatValidator.SetScope(*tmp_scope);
235  m_FeatValidator.SetTSE(this_seh);
236  } else {
239  }
240 
241  try {
242  CCacheImpl::SFeatKey gene_key(
244  m_GeneIt = &GetCache().GetFeatFromCache(gene_key);
245 
246  CCacheImpl::SFeatKey all_feat_key(
248  m_AllFeatIt = &GetCache().GetFeatFromCache(all_feat_key);
249  } catch (const exception&) {
250  // sequence might be too broken to validate features
251  m_GeneIt = nullptr;
252  m_AllFeatIt = nullptr;
253  }
254  ValidateSeqIds(seq);
255  ValidateInst(seq);
257  ValidateHistory(seq);
258  FOR_EACH_ANNOT_ON_BIOSEQ (annot, seq) {
261  }
262  if (seq.IsSetDescr()) {
263  if (m_CurrentHandle) {
265  if (ctx) {
266  m_DescrValidator.ValidateSeqDescr(seq.GetDescr(), *(ctx.GetCompleteSeq_entry()));
267  }
268  }
269  }
270  if (IsWGSMaster(seq, m_CurrentHandle.GetScope())) {
272  }
273  if (appropriate_parent) {
276  }
277 
278  } catch (const exception& e) {
280  string("Exception while validating bioseq. EXCEPTION: ") +
281  e.what(), seq);
282  }
284  if (m_GeneIt) {
285  m_GeneIt = nullptr;
286  }
287  if (m_AllFeatIt) {
288  m_AllFeatIt = nullptr;
289  }
290 }
291 
292 
293 static bool s_IsSkippableDbtag(const CDbtag& dbt)
294 {
295  if (! dbt.IsSetDb()) {
296  return false;
297  }
298  const string& db = dbt.GetDb();
299  if (NStr::EqualNocase(db, "TMSMART")
300  || NStr::EqualNocase(db, "BankIt")
301  || NStr::EqualNocase(db, "NCBIFILE")) {
302  return true;
303  } else {
304  return false;
305  }
306 }
307 
// Scans an accession string for characters that are not allowed in it.
//
// @param id  accession text to inspect
// @return    the first '|' or ',' found in `id` (whichever occurs first),
//            or '\0' when `id` contains neither (including when it is empty)
static char CheckForBadSeqIdChars(const string& id)
{
    // The standard search replaces the hand-written character loop.
    const string::size_type pos = id.find_first_of("|,");
    return pos == string::npos ? '\0' : id[pos];
}
316 
317 // VR-748
318 static char CheckForBadLocalIdChars(const string& id)
319 {
320  for (size_t i = 0; i < id.length(); i++) {
321  if (! CSeq_id::IsValidLocalID(id.substr(i, 1))) {
322  return id.c_str()[i];
323  }
324  }
325  return '\0';
326 }
327 
328 
// Scans the tag of a file-style general id for forbidden characters.
// The rule set is currently the same as for accessions: only '|' and ','
// are rejected.
//
// @param id  tag text to inspect
// @return    the first '|' or ',' found in `id` (whichever occurs first),
//            or '\0' when `id` contains neither (including when it is empty)
static char CheckForBadFileIDSeqIdChars(const string& id)
{
    // The standard search replaces the hand-written character loop.
    const string::size_type pos = id.find_first_of("|,");
    return pos == string::npos ? '\0' : id[pos];
}
337 
338 
339 // validation for individual Seq-id
// Checks one Seq-id (`id`) that belongs to the Bioseq `ctx`.
//  1. Resolves `ctx` in m_Scope, then looks `id` up inside the same TSE; the
//     lookup is expected to lead back to `ctx` itself.
//  2. Runs per-type format checks in the switch on id.Which(). The text-id
//     cases deliberately fall through from the most specific group (TPA)
//     down to the most general one (Pir/Prf).
// `longer_general` switches the length limit for string tags of general ids
// from CSeq_id::kMaxGeneralTagLength to a fixed 100 characters.
// NOTE(review): several statement heads (they look like the opening of
// PostErr(...) calls) are missing from this listing, so some message strings
// below appear without the call that emits them.
340 void CValidError_bioseq::ValidateSeqId(const CSeq_id& id, const CBioseq& ctx, bool longer_general)
341 {
342  // see if ID can be used to find ctx
343  CBioseq_Handle ctx_handle = m_Scope->GetBioseqHandle(ctx);
344  if (! ctx_handle) {
345  if (! m_Imp.IsPatent()) {
347  "BioseqFind (" + id.AsFastaString() +
348  ") unable to find itself - possible internal error", ctx);
349  }
350  return;
351  }
352  CTSE_Handle tse = ctx_handle.GetTSE_Handle();
353  CBioseq_Handle bsh = tse.GetBioseqHandle(id);
354 
 // When the id resolves within this TSE, the Bioseq found must be ctx itself
 // (compared by address); anything else means the id is shared.
355  if (bsh) {
356  CConstRef<CBioseq> core = bsh.GetBioseqCore();
357  if (! core) {
358  if (! m_Imp.IsPatent()) {
360  "BioseqFind (" + id.AsFastaString() +
361  ") unable to find itself - possible internal error", ctx);
362  }
363  } else if (core.GetPointer() != &ctx) {
365  "SeqID " + id.AsFastaString() +
366  " is present on multiple Bioseqs in record", ctx);
367  }
368  } else {
370  "BioseqFind (" + id.AsFastaString() +
371  ") unable to find itself - possible internal error", ctx);
372  }
373 
374  //check formatting
 // tsid is tested for null before each use below.
375  const CTextseq_id* tsid = id.GetTextseq_Id();
376 
377  switch (id.Which()) {
378  case CSeq_id::e_Tpg:
379  case CSeq_id::e_Tpe:
380  case CSeq_id::e_Tpd:
381  if (IsHistAssemblyMissing(ctx) && ctx.IsNa()) {
383  "TPA record " + ctx.GetId().front()->AsFastaString() +
384  " should have Seq-hist.assembly for PRIMARY block",
385  ctx);
386  }
387  // Fall thru
389  case CSeq_id::e_Genbank:
390  case CSeq_id::e_Embl:
391  case CSeq_id::e_Ddbj:
392  if (tsid && tsid->IsSetAccession()) {
393  const string& acc = tsid->GetAccession();
394  const char badch = CheckForBadSeqIdChars (acc);
395  if (badch != '\0') {
397  "Bad character '" + string(1, badch) + "' in accession '" + acc + "'", ctx);
398  }
 // NOTE(review): the declaration of `info` is not visible in this listing.
 // The test rejects unknown accessions and protein/nucleotide accessions
 // sitting on the opposite molecule type.
400  if (info == CSeq_id::eAcc_unknown ||
401  (ctx.IsNa() && (info & CSeq_id::fAcc_prot)) ||
402  (ctx.IsAa() && (info & CSeq_id::fAcc_nuc))) {
404  "Bad accession " + acc, ctx);
405  }
406  // Check for secondary conflicts
409  }
410  // Fall thru
412  case CSeq_id::e_Other:
413  if (tsid) {
414  if (tsid->IsSetName()) {
415  const string& name = tsid->GetName();
 // Stops at the first whitespace character found in the name.
416  ITERATE (string, s, name) {
417  if (isspace((unsigned char)(*s))) {
420  "Seq-id.name '" + name + "' should be a single "
421  "word without any spaces", ctx);
422  break;
423  }
424  }
425  }
426 
 // The accession-shape checks below run only for e_Other ids, even though
 // other cases fall through into this block.
427  if (tsid->IsSetAccession() && id.IsOther()) {
428  const string& acc = tsid->GetAccession();
429  const char badch = CheckForBadSeqIdChars (acc);
430  if (badch != '\0') {
432  "Bad character '" + string(1, badch) + "' in accession '" + acc + "'", ctx);
433  }
434  size_t num_letters = 0;
435  size_t num_digits = 0;
436  size_t num_underscores = 0;
437  bool bad_id_chars = false;
438  bool is_NZ = (NStr::CompareNocase(acc, 0, 3, "NZ_") == 0);
439  size_t i = 0;
440  bool letter_after_digit = false;
441 
 // A leading "NZ_" (matched case-insensitively) is excluded from the tallies.
442  if (is_NZ) {
443  i = 3;
444  }
445 
 // Count upper-case letters, digits and underscores. Despite its name,
 // letter_after_digit is set when an underscore follows a digit or when a
 // second underscore appears. Any other character (lower-case letters
 // included) sets bad_id_chars.
446  for (; i < acc.length(); ++i) {
447  if (isupper((unsigned char)acc[i])) {
448  num_letters++;
449  } else if (isdigit((unsigned char)acc[i])) {
450  num_digits++;
451  } else if (acc[i] == '_') {
452  num_underscores++;
453  if (num_digits > 0 || num_underscores > 1) {
454  letter_after_digit = true;
455  }
456  } else {
457  bad_id_chars = true;
458  }
459  }
460 
 // Accepted shapes, in order: NZ_ + 4 or 6 letters + 8..11 digits with no
 // further underscore; NZ_ accessions that ValidateAccessionString accepts;
 // 2 letters + one underscore + 6, 8 or 9 digits; 4 letters + 10 digits on
 // a nucleotide sequence. Everything else is reported as a bad accession.
461  if (letter_after_digit || bad_id_chars) {
463  "Bad accession " + acc, ctx);
464  } else if (is_NZ && (num_letters == 4 || num_letters == 6) &&
465  (num_digits >= 8 && num_digits <= 11) && num_underscores == 0) {
466  // valid accession - do nothing!
467  } else if (is_NZ && ValidateAccessionString(acc, false) == eAccessionFormat_valid) {
468  // valid accession - do nothing!
469  } else if (num_letters == 2 &&
470  (num_digits == 6 || num_digits == 8 || num_digits == 9) &&
471  num_underscores == 1) {
472  // valid accession - do nothing!
473  } else if (num_letters == 4 && num_digits == 10 && ctx.IsNa()) {
474  } else {
476  "Bad accession " + acc, ctx);
477  }
478  }
479  }
480  // Fall thru
482  case CSeq_id::e_Pir:
484  case CSeq_id::e_Prf:
485  if (tsid) {
 // Nucleotide sequence whose text id has no (or an empty) accession.
 // The nested tests skip segmented sequences unless m_Imp.IsGI(), and
 // always skip segmented DDBJ sequences.
486  if (ctx.IsNa() &&
487  (! tsid->IsSetAccession() || tsid->GetAccession().empty())) {
488  if (ctx.GetInst().GetRepr() != CSeq_inst::eRepr_seg ||
489  m_Imp.IsGI()) {
490  if (! id.IsDdbj() ||
491  ctx.GetInst().GetRepr() != CSeq_inst::eRepr_seg) {
492  string msg = "Missing accession for " + id.AsFastaString();
495  msg, ctx);
496  }
497  }
498  }
499  } else {
501  "Seq-id type not handled", ctx);
502  }
503  break;
504  case CSeq_id::e_Gi:
505  if (id.GetGi() <= ZERO_GI) {
507  "Invalid GI number", ctx);
508  }
509  break;
510  case CSeq_id::e_General:
511  if (! id.GetGeneral().IsSetDb() || NStr::IsBlank(id.GetGeneral().GetDb())) {
512  PostErr(eDiag_Error, eErr_SEQ_INST_BadSeqIdFormat, "General identifier missing database field", ctx);
513  }
514  if (id.GetGeneral().IsSetDb()) {
515  const CDbtag& dbt = id.GetGeneral();
516  size_t dblen = dbt.GetDb().length();
 // Only the IsLocalGeneralOnly() branch changes the severity; every other
 // branch assigns the value sev already holds.
517  EDiagSev sev = eDiag_Error;
518  if (m_Imp.IsLocalGeneralOnly()) {
519  sev = eDiag_Critical;
520  } else if (m_Imp.IsRefSeq()) {
521  sev = eDiag_Error;
522  } else if (m_Imp.IsINSDInSep()) {
523  sev = eDiag_Error;
524  } else if (m_Imp.IsIndexerVersion()) {
525  sev = eDiag_Error;
526  }
527  static const auto max_dblen = CSeq_id::kMaxGeneralDBLength;
528  if (dblen > max_dblen) {
529  PostErr(sev, eErr_SEQ_INST_BadSeqIdFormat, "General database longer than " + NStr::NumericToString(max_dblen) + " characters", ctx);
530  }
 // Tag length checks are skipped for the databases s_IsSkippableDbtag accepts,
 // and the over-length report is suppressed when m_Imp.IsGI().
531  if (! s_IsSkippableDbtag(dbt)) {
532  if (dbt.IsSetTag() && dbt.GetTag().IsStr()) {
533  size_t idlen = dbt.GetTag().GetStr().length();
534  static const auto maxlen = CSeq_id::kMaxGeneralTagLength;
535  if (longer_general) {
536  if (idlen > 100 && ! m_Imp.IsGI()) {
537  PostErr(sev, eErr_SEQ_INST_BadSeqIdLength, "General identifier longer than " + NStr::NumericToString(100) + " characters", ctx);
538  }
539  } else {
540  if (idlen > maxlen && ! m_Imp.IsGI()) {
541  PostErr(sev, eErr_SEQ_INST_BadSeqIdLength, "General identifier longer than " + NStr::NumericToString(maxlen) + " characters", ctx);
542  }
543  }
544  if (idlen == 0) {
545  PostErr(eDiag_Error, eErr_SEQ_INST_BadSeqIdFormat, "General identifier must not be an empty string", ctx);
546  }
547  }
548  }
 // Character check on string tags: NCBIFILE/BankIt tags (exact-case match on
 // the db name) use the file-id rule; all other tags use the local-id rule,
 // which is then also applied to the db name if the tag itself is clean.
549  if (dbt.IsSetTag() && dbt.GetTag().IsStr()) {
550  const string& acc = dbt.GetTag().GetStr();
551  char badch;
552  if (dbt.IsSetDb() && (NStr::Equal(dbt.GetDb(), "NCBIFILE") || NStr::Equal(dbt.GetDb(), "BankIt"))) {
553  badch = CheckForBadFileIDSeqIdChars(acc);
554  } else {
555  badch = CheckForBadLocalIdChars(acc);
556  if (badch == '\0' && dbt.IsSetDb()) {
557  badch = CheckForBadLocalIdChars(dbt.GetDb());
558  }
559  }
560  if (badch != '\0') {
562  "Bad character '" + string(1, badch) + "' in sequence ID '" + id.AsFastaString() + "'", ctx);
563  }
564  }
565  }
566  break;
567  case CSeq_id::e_Local:
568  if (id.IsLocal() && id.GetLocal().IsStr() && id.GetLocal().GetStr().length() > CSeq_id::kMaxLocalIDLength) {
 // Severity is Critical unless m_Imp.IsINSDInSep(); the else-if branch
 // re-assigns the initial eDiag_Error.
569  EDiagSev sev = eDiag_Error;
570  if (! m_Imp.IsINSDInSep()) {
571  sev = eDiag_Critical;
572  } else if (! m_Imp.IsIndexerVersion()) {
573  sev = eDiag_Error;
574  }
575  PostErr(sev, eErr_SEQ_INST_BadSeqIdFormat, "Local identifier longer than " + NStr::NumericToString(CSeq_id::kMaxLocalIDLength) + " characters", ctx);
576  }
577  if (id.IsLocal() && id.GetLocal().IsStr()) {
578  const string& acc = id.GetLocal().GetStr();
579  const char badch = CheckForBadLocalIdChars(acc);
580  if (badch != '\0') {
582  "Bad character '" + string(1, badch) + "' in local ID '" + acc + "'", ctx);
583  }
584  }
585  break;
586  case CSeq_id::e_Pdb:
587  if (id.IsPdb()) {
588  const CPDB_seq_id& pdb = id.GetPdb();
 // Consistency is only checked when both the 'chain' and 'chain-id' slots are set.
589  if (pdb.IsSetChain() && pdb.IsSetChain_id()) {
590  int chain = pdb.GetChain();
591  const string& chain_id = pdb.GetChain_id();
592  if (chain_id.size() == 1 && chain_id[0] == chain) {
593  break; // OK (straightforward match)
594  } else if (islower(chain) && chain_id.size() == 2
595  && chain_id[0] == chain_id[1]
596  && chain_id[0] == toupper(chain)) {
597  break; // OK (historic special case)
598  } else if (chain == '|' && chain_id == "VB") {
599  break; // OK (likewise)
600  } else {
602  "PDB Seq-id contains mismatched \'chain\' and"
603  " \'chain-id\' slots", ctx);
604  }
605  }
606  }
607  break;
608  default:
609  break;
610  }
611 
612 #if 0
613  // disabled for now
 // This compiled-out block refers to `i` and `seq`, which are not
 // parameters of this function.
614  if (! IsNCBIFILESeqId(**i)) {
615  string label;
616  (*i)->GetLabel(&label);
617  if (label.length() > 40) {
619  "Sequence ID is unusually long (" +
620  NStr::IntToString(label.length()) + "): " + label,
621  seq);
622  }
623  }
624 #endif
625 
626 }
627 
// Looks through the extra (secondary) accession lists of the Genbank and EMBL
// descriptors of `seq` and returns true as soon as one accession passes a
// test that includes the CSeq_id::fAcc_master flag; returns false otherwise.
// NOTE(review): the header of the enclosing descriptor loop (which defines
// `sd`) and the first half of the accession test (which defines `info`) are
// missing from this listing.
628 static bool x_IsWgsSecondary(const CBioseq& seq)
629 {
 // Reset for every descriptor; stays null for descriptor types other than
 // Genbank/EMBL or when the list is not set.
631  const list< string > *extra_acc = nullptr;
632  const CSeqdesc& desc = **sd;
633  switch (desc.Which()) {
634  case CSeqdesc::e_Genbank:
635  if (desc.GetGenbank().IsSetExtra_accessions()) {
636  extra_acc = &(desc.GetGenbank().GetExtra_accessions());
637  }
638  break;
639  case CSeqdesc::e_Embl:
640  if (desc.GetEmbl().IsSetExtra_acc()) {
641  extra_acc = &(desc.GetEmbl().GetExtra_acc());
642  }
643  break;
644  default:
645  break;
646  }
647  if (extra_acc) {
648  FOR_EACH_STRING_IN_LIST (acc, *extra_acc) {
651  && (info & CSeq_id::fAcc_master) != 0) {
652  return true;
653  }
654  }
655  }
656  }
657  return false;
658 }
659 
660 // VR-728
661 // cannot have only seq-ids that will be stripped when loading to ID
663 {
664  bool found_good = false;
665  ITERATE(CBioseq::TId, id_it, seq.GetId()) {
666  if (! IsTemporary(**id_it)) {
667  found_good = true;
668  }
669  }
670  if (! found_good) {
672  "The only ids on this Bioseq will be stripped during ID load", seq);
673  }
674 }
675 
676 
678 {
679  // Ensure that CBioseq has at least one CSeq_id
680  if (! seq.IsSetId() || seq.GetId().empty()) {
682  "No ids on a Bioseq", seq);
683  return;
684  }
685 
686  CSeq_inst::ERepr repr = seq.GetInst().GetRepr();
687 
688  // Loop thru CSeq_ids for this CBioseq. Determine if seq has
689  // gi, NG, or NC. Check that the same CSeq_id not included more
690  // than once.
691  bool has_gi = false;
692  bool is_lrg = false;
693  bool has_ng = false;
694  bool wgs_tech_needs_wgs_accession = false;
695  bool is_segset_accession = false;
696  bool has_wgs_general = false;
697  bool is_eb_db = false;
698  bool longer_general = false;
699 
700  FOR_EACH_SEQID_ON_BIOSEQ (i, seq) {
701  if ((*i)->IsOther() || (*i)->IsEmbl() || (*i)->IsTpe()) {
702  longer_general = true;
703  }
704  }
705 
706  FOR_EACH_SEQID_ON_BIOSEQ (i, seq) {
707  // first, do standalone validation
708  ValidateSeqId(**i, seq, longer_general);
709 
710  if ((*i)->IsGeneral() && (*i)->GetGeneral().IsSetDb()) {
711  if (NStr::EqualNocase((*i)->GetGeneral().GetDb(), "LRG")) {
712  is_lrg = true;
713  }
714  if (NStr::StartsWith((*i)->GetGeneral().GetDb(), "WGS:")) {
715  has_wgs_general = true;
716  }
717  } else if ((*i)->IsOther() && (*i)->GetOther().IsSetAccession()) {
718  const string& acc = (*i)->GetOther().GetAccession();
719  if (NStr::StartsWith(acc, "NG_")) {
720  has_ng = true;
721  wgs_tech_needs_wgs_accession = true;
722  } else if (NStr::StartsWith(acc, "NM_")
723  || NStr::StartsWith(acc, "NP_")
724  || NStr::StartsWith(acc, "NR_")) {
725  wgs_tech_needs_wgs_accession = true;
726  }
727  } else if ((*i)->IsEmbl() && (*i)->GetEmbl().IsSetAccession()) {
728  is_eb_db = true;
729  } else if ((*i)->IsDdbj() && (*i)->GetDdbj().IsSetAccession()) {
730  is_eb_db = true;
731  }
732 
733  // Check that no two CSeq_ids for same CBioseq are same type
734  CBioseq::TId::const_iterator j;
735  for (j = i, ++j; j != seq.GetId().end(); ++j) {
736  if ((**i).Compare(**j) != CSeq_id::e_DIFF) {
737  CNcbiOstrstream os;
738  os << "Conflicting ids on a Bioseq: (";
739  (**i).WriteAsFasta(os);
740  os << " - ";
741  (**j).WriteAsFasta(os);
742  os << ")";
744  CNcbiOstrstreamToString (os) /* os.str() */, seq);
745  }
746  }
747 
748  if ((*i)->IsGenbank() || (*i)->IsEmbl() || (*i)->IsDdbj()) {
749  wgs_tech_needs_wgs_accession = true;
750  }
751 
752  if ((*i)->IsGi()) {
753  has_gi = true;
754  }
755 
756  if ((*i)->IdentifyAccession() == CSeq_id::eAcc_segset) {
757  is_segset_accession = true;
758  }
759 
760  }
761  if (is_lrg && ! has_ng) {
763  "LRG sequence needs NG_ accession", seq);
764  }
765 
766 
767  // Loop thru CSeq_ids to check formatting
768  bool is_wgs = false;
769  unsigned int gi_count = 0;
770  unsigned int accn_count = 0;
771  unsigned int lcl_count = 0;
772  FOR_EACH_SEQID_ON_BIOSEQ (k, seq) {
773  const CTextseq_id* tsid = (*k)->GetTextseq_Id();
774  switch ((**k).Which()) {
775  case CSeq_id::e_Local:
776  lcl_count++;
777  break;
778  case CSeq_id::e_Tpg:
779  case CSeq_id::e_Tpe:
780  case CSeq_id::e_Tpd:
781  case CSeq_id::e_Genbank:
782  case CSeq_id::e_Embl:
783  case CSeq_id::e_Ddbj:
784  if (tsid && tsid->IsSetAccession()) {
785  if ((*k)->IsGenbank() || (*k)->IsEmbl() || (*k)->IsDdbj()) {
786  is_wgs |= IsWGSAccession(**k);
787  }
788 
789  if (has_gi) {
790  if (tsid->IsSetVersion() && tsid->GetVersion() == 0) {
791  const string& acc = tsid->GetAccession();
793  "Accession " + acc + " has 0 version", seq);
794  }
795  }
796  }
797  // Fall thru
799  case CSeq_id::e_Other:
800  if (tsid) {
801  if (has_gi && ! tsid->IsSetAccession() && tsid->IsSetName()) {
802  if ((*k)->IsDdbj() && repr == CSeq_inst::eRepr_seg) {
803  // Don't report ddbj segmented sequence missing accessions
804  } else {
806  "Missing accession for " + tsid->GetName(), seq);
807  }
808  }
809  accn_count++;
810  }
811  break;
812 
813  case CSeq_id::e_Pir:
815  case CSeq_id::e_Prf:
816  if (tsid) {
817  if ((! tsid->IsSetAccession() || NStr::IsBlank(tsid->GetAccession())) &&
818  (! tsid->IsSetName() || NStr::IsBlank(tsid->GetName())) &&
819  seq.GetInst().IsAa()) {
820  string label = (*k)->AsFastaString();
822  "Missing identifier for " + label, seq);
823  }
824  accn_count++;
825  }
826  break;
827  case CSeq_id::e_Gi:
828  gi_count++;
829  break;
830  default:
831  break;
832  }
833  }
834 
836  if (! SeqIsPatent(seq) && ! seq.IsAa()) {
837  if (is_wgs) {
838  if (! mi || ! mi->IsSetTech() ||
839  (mi->GetTech() != CMolInfo::eTech_wgs &&
840  mi->GetTech() != CMolInfo::eTech_tsa &&
841  mi->GetTech() != CMolInfo::eTech_targeted)) {
843  "WGS accession should have Mol-info.tech of wgs", seq);
844  }
845  } else if (mi && mi->IsSetTech() &&
846  mi->GetTech() == CMolInfo::eTech_wgs &&
847  wgs_tech_needs_wgs_accession &&
848  ! is_segset_accession &&
849  ! has_wgs_general &&
850  ! x_IsWgsSecondary(seq)) {
851  EDiagSev sev = eDiag_Error;
852  if (is_eb_db) {
853  sev = eDiag_Warning;
854  }
855  if (! is_eb_db) {
857  "Mol-info.tech of wgs should have WGS accession", seq);
858  }
859  }
860 
861  if ((IsNTNCNWACAccession(seq) || IsNG(seq)) && mi && seq.IsNa()
862  && (! mi->IsSetBiomol()
863  || (mi->GetBiomol() != CMolInfo::eBiomol_genomic
864  && mi->GetBiomol() != CMolInfo::eBiomol_cRNA))) {
866  "genomic RefSeq accession should use genomic or cRNA moltype",
867  seq);
868  }
869  }
870  if (seq.GetInst().GetMol() == CSeq_inst::eMol_dna) {
871  if (mi && mi->IsSetBiomol()) {
872  switch (mi->GetBiomol()) {
885  "Molecule type (DNA) does not match biomol (RNA)", seq);
886  break;
887  default:
888  break;
889  }
890  }
891  }
892 
893  // Check that a sequence with a gi number has exactly one accession
894  if (gi_count > 0 && accn_count == 0 && ! m_Imp.IsPDB() &&
895  repr != CSeq_inst::eRepr_virtual) {
897  "No accession on sequence with gi number", seq);
898  }
899  if (gi_count > 0 && accn_count > 1) {
901  "Multiple accessions on sequence with gi number", seq);
902  }
903 
904  x_CheckGeneralIDs(seq);
905 
906  if (m_Imp.IsValidateIdSet()) {
908  }
909 
910  // C toolkit ensures that there is exactly one CBioseq for a CSeq_id
911  // Not done here because object manager will not allow
912  // the same Seq-id on multiple Bioseqs
913 
914 }
915 
916 
918 {
919  bool rval = false;
920  const CSeq_inst& inst = seq.GetInst();
921  if (inst.IsSetHist() && inst.GetHist().IsSetAssembly()) {
922  return false;
923  }
924  CSeq_inst::TRepr repr = inst.CanGetRepr() ?
926 
927  if (seq.IsNa() && repr != CSeq_inst::eRepr_seg) {
928  rval = true;
929  // look for keyword
931  CSeqdesc_CI genbank_i(bsh, CSeqdesc::e_Genbank);
932  if (genbank_i && genbank_i->GetGenbank().IsSetKeywords()) {
933  CGB_block::TKeywords::const_iterator keyword = genbank_i->GetGenbank().GetKeywords().begin();
934  while (keyword != genbank_i->GetGenbank().GetKeywords().end() && rval) {
935  if (NStr::EqualNocase(*keyword, "TPA:reassembly")) {
936  rval = false;
937  }
938  ++keyword;
939  }
940  }
941  if (rval) {
942  CSeqdesc_CI embl_i(bsh, CSeqdesc::e_Embl);
943  if (embl_i && embl_i->GetEmbl().IsSetKeywords()) {
944  CEMBL_block::TKeywords::const_iterator keyword = embl_i->GetEmbl().GetKeywords().begin();
945  while (keyword != embl_i->GetEmbl().GetKeywords().end() && rval) {
946  if (NStr::EqualNocase(*keyword, "TPA:reassembly")) {
947  rval = false;
948  }
949  ++keyword;
950  }
951  }
952  }
953  }
954  return rval;
955 }
956 
957 
959  const string& primary_acc,
960  const CBioseq& seq,
961  int choice)
962 {
963  CSeqdesc_CI sd(m_Scope->GetBioseqHandle(seq), static_cast<CSeqdesc::E_Choice>(choice));
964  for (; sd; ++sd) {
965  const list<string>* extra_acc = nullptr;
966  if (choice == CSeqdesc::e_Genbank &&
968  extra_acc = &(sd->GetGenbank().GetExtra_accessions());
969  } else if (choice == CSeqdesc::e_Embl &&
970  sd->GetEmbl().IsSetExtra_acc()) {
971  extra_acc = &(sd->GetEmbl().GetExtra_acc());
972  }
973 
974  if (extra_acc) {
975  FOR_EACH_STRING_IN_LIST (acc, *extra_acc) {
976  if (NStr::CompareNocase(primary_acc, *acc) == 0) {
977  // If the same post error
980  primary_acc + " used for both primary and"
981  " secondary accession", seq);
982  }
983  }
984  }
985  }
986 }
987 
988 
990 {
991  for (CSeqdesc_CI it(bsh, CSeqdesc::e_User); it; ++it) {
992  if (it->GetUser().GetObjectType() == CUser_object::eObjectType_Unverified) {
993  return true;
994  }
995  }
996  return false;
997 }
998 
999 
1001 {
1005 
1006  bool has_barcode_tech = false;
1007 
1009  if (di && di->GetMolinfo().IsSetTech() && di->GetMolinfo().GetTech() == CMolInfo::eTech_barcode) {
1010  has_barcode_tech = true;
1011  }
1012 
1013  bool has_barcode_keyword = false;
1014  for (CSeqdesc_CI it(bsh, CSeqdesc::e_Genbank); it; ++it) {
1015  FOR_EACH_KEYWORD_ON_GENBANKBLOCK (k, it->GetGenbank()) {
1016  if (NStr::EqualNocase(*k, "BARCODE")) {
1017  has_barcode_keyword = true;
1018  break;
1019  }
1020  }
1021  if (has_barcode_keyword && ! has_barcode_tech) {
1023  "BARCODE keyword without Molinfo.tech barcode",
1024  *ctx, *it);
1025  }
1026  }
1027  if (has_barcode_tech && ! has_barcode_keyword && di) {
1029  "Molinfo.tech barcode without BARCODE keyword",
1030  *ctx, *di);
1031  }
1032  if (has_barcode_keyword && HasUnverified(bsh)) {
1034  "Sequence has both BARCODE and UNVERIFIED keywords",
1035  seq);
1036  }
1037 }
1038 
1039 
1041 {
1042  const CSeq_inst& inst = seq.GetInst();
1043 
1044  // Check representation
1045  if (! ValidateRepr(inst, seq)) {
1046  return;
1047  }
1048 
1049  // Check molecule, topology, and strand
1050  if (! inst.IsSetMol()) {
1051  PostErr(eDiag_Error, eErr_SEQ_INST_MolNotSet, "Bioseq.mol is 0",
1052  seq);
1053  } else {
1054  const CSeq_inst::EMol& mol = inst.GetMol();
1055  switch (mol) {
1056 
1057  case CSeq_inst::eMol_na:
1059  "Bioseq.mol is type nucleic acid", seq);
1060  break;
1061 
1062  case CSeq_inst::eMol_aa:
1063  if (inst.IsSetTopology() &&
1067  "Non-linear topology set on protein", seq);
1068  }
1069  if (inst.IsSetStrand() &&
1070  inst.GetStrand() != CSeq_inst::eStrand_ss &&
1073  "Protein not single stranded", seq);
1074  }
1075  break;
1076 
1077  case CSeq_inst::eMol_dna:
1078  if (seq.IsSetInst() && seq.GetInst().IsSetTopology() && seq.GetInst().GetTopology() == CSeq_inst::eTopology_circular) {
1079  if (m_is_bact_or_arch) {
1080  if (! m_is_plasmid && ! m_is_chromosome && ! m_is_extrachrom) {
1081  EDiagSev sev = eDiag_Error;
1082  if (IsRefSeq(seq) || m_Imp.IsRefSeqConventions()) {
1083  sev = eDiag_Error;
1084  } else if (IsEmblOrDdbj(seq)) {
1085  sev = eDiag_Warning;
1086  }
1088  "Circular Bacteria or Archaea should be chromosome, or plasmid, or extrachromosomal", seq);
1089  }
1090  }
1091  }
1092  break;
1093 
1095  PostErr(eDiag_Error, eErr_SEQ_INST_MolNotSet, "Bioseq.mol is 0",
1096  seq);
1097  break;
1098 
1099  case CSeq_inst::eMol_other:
1101  "Bioseq.mol is type other", seq);
1102  break;
1103 
1104  default:
1105  break;
1106  }
1107  }
1108 
1109  CSeq_inst::ERepr rp = seq.GetInst().GetRepr();
1110 
1111  if (rp == CSeq_inst::eRepr_raw || rp == CSeq_inst::eRepr_const) {
1112  // Validate raw and constructed sequences
1113  ValidateRawConst(seq);
1114  }
1115 
1116  // per VR-779
1117 #if 1
1118  if (rp == CSeq_inst::eRepr_seg) {
1119  PostErr(eDiag_Critical, eErr_SEQ_INST_ReprInvalid, "Segmented set format is not supported", seq);
1120  } else if (rp == CSeq_inst::eRepr_ref) {
1121  PostErr(eDiag_Critical, eErr_SEQ_INST_ReprInvalid, "Repr_ref format is not supported", seq);
1122  }
1123 #else
1124  if (rp == CSeq_inst::eRepr_seg || rp == CSeq_inst::eRepr_ref) {
1125  // Validate segmented and reference sequences
1126  ValidateSegRef(seq);
1127  }
1128 #endif
1129 
1130  if (rp == CSeq_inst::eRepr_delta) {
1131  // Validate delta sequences
1132  ValidateDelta(seq);
1133  }
1134 
1135  if (rp == CSeq_inst::eRepr_seg && seq.GetInst().IsSetExt() &&
1136  seq.GetInst().GetExt().IsSeg()) {
1137  // Validate part of segmented sequence
1138  ValidateSeqParts(seq);
1139  }
1140 
1141  if (rp == CSeq_inst::eRepr_raw || rp == CSeq_inst::eRepr_delta) {
1142  x_ValidateBarcode(seq);
1143  }
1144 
1145  x_ValidateTitle(seq);
1146  /*if (seq.IsAa()) {
1147  Validate protein title(amino acids only)
1148  ValidateProteinTitle(seq);
1149  }*/
1150 
1151  if (seq.IsNa()) {
1152  // check for N bases at start or stop of sequence,
1153  // or sequence entirely made of Ns
1154  ValidateNsAndGaps(seq);
1155 
1156  GapByGapInst(seq);
1157  }
1158 
1159  // Validate sequence length
1160  ValidateSeqLen(seq);
1161 
1162  // proteins should not have gaps
1163  if (seq.IsAa() && x_HasGap(seq)) {
1164  PostErr(eDiag_Error, eErr_SEQ_INST_ProteinShouldNotHaveGaps, "Protein sequences should not have gaps", seq);
1165  }
1166 }
1167 
1168 
// Decides (returns bool) whether a missing-BioProject warning applies to `seq`.
// NOTE(review): the signature line (listing line 1169), the declaration of
// `bsh` (listing line 1174) and the head of the `if` inside the user-object
// loop (listing line 1177) are not present in this view.
1170 {
1171  bool is_wgs = false;
1172  bool is_grc = false;
1173 
1175  CSeqdesc_CI user(bsh, CSeqdesc::e_User);
1176  while (user) {
1178  user->GetUser().HasField("BioProject", ".", NStr::eNocase)) {
1179  // bioproject field found
1180  return false;
1181  }
1182  ++user;
1183  }
1184 
1185  CSeqdesc_CI ti(bsh, CSeqdesc::e_Title);
1186  if (ti) {
1187  while (ti) {
1188  if (NStr::StartsWith(ti->GetTitle(), "GRC")) {
1189  is_grc = true;
1190  break;
1191  }
1192  ++ti;
1193  }
1194  } else {
// no title descriptor: fall back to a generated defline and test that instead
1195  sequence::CDeflineGenerator defline_generator;
1196  string title = defline_generator.GenerateDefline(seq, *m_Scope, sequence::CDeflineGenerator::fIgnoreExisting);
1197  if (! NStr::IsBlank(title)) {
1198  if (NStr::StartsWith(title, "GRC")) {
1199  is_grc = true;
1200  }
1201  }
1202  }
1203 
1204  is_wgs = IsWGS(bsh);
1205 
1206  bool is_gb = false, /* is_eb_db = false, */ is_refseq = false, is_ng = false;
1207 
// classify the record by its Seq-ids: Genbank/Embl/Ddbj all set is_gb,
// "other" ids set is_refseq (and is_ng when the accession starts with "NG_")
1208  FOR_EACH_SEQID_ON_BIOSEQ (sid_itr, seq) {
1209  const CSeq_id& sid = **sid_itr;
1210  switch (sid.Which()) {
1211  case CSeq_id::e_Genbank:
1212  case CSeq_id::e_Embl:
1213  // is_eb_db = true;
1214  // fall through
1215  case CSeq_id::e_Ddbj:
1216  is_gb = true;
1217  break;
1218  case CSeq_id::e_Other: {
1219  is_refseq = true;
1220  if (sid.GetOther().IsSetAccession()) {
1221  string acc = sid.GetOther().GetAccession().substr(0, 3);
1222  if (acc == "NG_") {
1223  is_ng = true;
1224  }
1225  }
1226  } break;
1227  default:
1228  break;
1229  }
1230  }
1231 
// RefSeq (or RefSeq conventions): only NG_ records are exempt.
// GenBank/EMBL/DDBJ: only WGS or "GRC"-titled records continue.
// Anything else never gets the warning.
1232  if (is_refseq || m_Imp.IsRefSeqConventions()) {
1233  if (is_ng)
1234  return false;
1235  } else if (is_gb) {
1236  if (! is_wgs && ! is_grc)
1237  return false;
1238  } else {
1239  return false;
1240  }
1241 
1242  const CSeq_inst& inst = seq.GetInst();
1243  CSeq_inst::TRepr repr = inst.GetRepr();
1244 
// only delta sequences that are not literal-only, and map sequences, qualify
1245  if (repr == CSeq_inst::eRepr_delta) {
1246  if (x_IsDeltaLitOnly(inst))
1247  return false;
1248  } else if (repr != CSeq_inst::eRepr_map) {
1249  return false;
1250  }
1251 
1252  return true;
1253 }
1254 
// Context-level validation of a Bioseq: feature-context checks, DBLink
// descriptor counts, pub/source presence, graphs, orphaned proteins, extra
// protein features, missing Cit-sub, and mat_peptide vs. instantiated product.
// NOTE(review): the signature line and a number of statements (most
// PostErr(...) heads and several declarations, e.g. `bsh`, `bssh`, `sel`,
// `mat_vec`, `prd_vec`) are not present in this view; the listing numbers
// show where lines are absent.
1256 {
1258 
1259  // Check that proteins in nuc_prot set have a CdRegion
1260  if (CdError(bsh)) {
1261  EDiagSev sev = eDiag_Error;
1263  if (bssh) {
1264  CBioseq_Handle nbsh = GetNucBioseq (bssh);
1265  if (nbsh) {
1266  CSeqdesc_CI desc( nbsh, CSeqdesc::e_Molinfo );
1267  const CMolInfo* mi = desc ? &(desc->GetMolinfo()) : nullptr;
1268  if (mi) {
1269  CMolInfo::TTech tech = mi->IsSetTech() ?
// severity is raised to critical when the nucleotide's MolInfo tech is WGS
1271  if (tech == CMolInfo::eTech_wgs) {
1272  sev = eDiag_Critical;
1273  }
1274  }
1275  }
1276  }
1278  "No CdRegion in nuc-prot set points to this protein",
1279  seq);
1280  }
1281 
1282  bool is_patent = SeqIsPatent (seq);
1283 
1284  bool is_complete = false;
1285  CSeqdesc_CI desc(bsh, CSeqdesc::e_Molinfo);
1286  if (desc) {
1287  const CMolInfo& mi = desc->GetMolinfo();
1289  is_complete = true;
1290  }
1291  }
1292 
1293  try {
1294 
1295  // if there are no Seq-ids, the following tests can't be run
1296  if (seq.IsSetId()) {
1297 
1298  ValidateSeqFeatContext(seq, is_complete);
1299 
1300  // Check for duplicate features and overlapping peptide features.
1302 
1303  // Check for introns within introns.
1304  ValidateTwintrons(seq);
1305 
1306  // check for tRNA contained in tmRNA features
1308 
1309  // check for equivalent source features
1311 
1312  // check for equivalent pub features
1313  x_ValidatePubFeatures(bsh);
1314 
1315  // Check for colliding genes
1317 
1318  // Detect absence of BioProject DBLink for complete bacterial genomes
1320  }
1321 
// reset the per-Bioseq DBLink counters before the descriptor pass
// (the call that fills them is on a line missing from this view)
1322  m_dblink_count = 0;
1323  m_taa_count = 0;
1324  m_bs_count = 0;
1325  m_as_count = 0;
1326  m_pdb_count = 0;
1327  m_sra_count = 0;
1328  m_bp_count = 0;
1329  m_unknown_count = 0;
1330 
1331  // Validate descriptors that affect this bioseq
1333 
1334 
1335  if (m_dblink_count > 1) {
1337  NStr::IntToString(m_dblink_count) + " DBLink user objects apply to a Bioseq", seq);
1338  }
1339 
1340  if (m_taa_count > 1) {
1342  "Trace Assembly Archive entries appear in " + NStr::IntToString(m_taa_count) + " DBLink user objects", seq);
1343  }
1344 
1345  if (m_bs_count > 1) {
1347  "BioSample entries appear in " + NStr::IntToString(m_bs_count) + " DBLink user objects", seq);
1348  }
1349 
1350  if (m_as_count > 1) {
1352  "Assembly entries appear in " + NStr::IntToString(m_as_count) + " DBLink user objects", seq);
1353  }
1354 
1355  if (m_pdb_count > 1) {
1357  "ProbeDB entries appear in " + NStr::IntToString(m_pdb_count) + " DBLink user objects", seq);
1358  }
1359 
1360  if (m_sra_count > 1) {
1362  "Sequence Read Archive entries appear in " + NStr::IntToString(m_sra_count) + " DBLink user objects", seq);
1363  }
1364 
1365  if (m_bp_count > 1) {
1367  "BioProject entries appear in " + NStr::IntToString(m_bp_count) + " DBLink user objects", seq);
1368  }
1369 
// unrecognized entries are reported for any non-zero count; the two branches
// differ only in the plural/singular wording of the message
1370  if (m_unknown_count > 1) {
1372  "Unrecognized entries appear in " + NStr::IntToString(m_unknown_count) + " DBLink user objects", seq);
1373  } else if (m_unknown_count > 0) {
1375  "Unrecognized entries appear in " + NStr::IntToString(m_unknown_count) + " DBLink user object", seq);
1376  }
1377 
1378  // make sure that there is a pub on this bioseq
1380  CheckForPubOnBioseq(seq);
1381  }
1382  // make sure that there is a source on this bioseq
1384  CheckSourceDescriptor(bsh);
1385  // CheckForBiosourceOnBioseq(seq);
1386  }
1387 
1388  if (x_ShowBioProjectWarning(seq)) {
1390  "BioProject entries not present on CON record", seq);
1391  }
1392 
1393  } catch (const exception& e) {
// exceptions whose text contains "Error: Cannot resolve" are not reported
1394  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1396  string("Exception while validating BioseqContext. EXCEPTION: ") +
1397  e.what(), seq);
1398  }
1399  }
1400 
1401  if (! is_patent) {
1402  // flag missing molinfo even if not in Sequin
1404  }
1405 
1406  CValidError_graph graph_validator(m_Imp);
1407  graph_validator.ValidateGraphsOnBioseq(seq);
1408 
1409  CheckTpaHistory(seq);
1410 
1411  // check for multiple publications with identical identifiers
1413 
1414  // look for orphaned proteins
1415  if (seq.IsAa() && bsh && ! GetNucProtSetParent(bsh) && ! AllowOrphanedProtein(seq, m_Imp.IsRefSeqConventions())) {
1417  "Orphaned stand-alone protein", seq);
1418  }
1419 
1420  // look for extra protein features
1421  if (seq.IsAa()) {
1422  CCacheImpl::SFeatKey prot_key(
1424  const CCacheImpl::TFeatValue& prot_feats =
1425  GetCache().GetFeatFromCache(prot_key);
1426 
// when more than one feature comes back, every one of them is reported
1427  if (prot_feats.size() > 1) {
1428  ITERATE(CCacheImpl::TFeatValue, feat, prot_feats) {
1430  "Protein sequence has multiple unprocessed protein features",
1431  feat->GetOriginalFeature());
1432  }
1433  }
1434  }
1435 
1436  if (! m_Imp.IsNoCitSubPubs() && ! x_HasCitSub(bsh) && ! m_Imp.IsSeqSubmitParent()) {
1438  "Expected submission citation is missing for this Bioseq", seq);
1439  }
1440 
1441  // RW-1053 check sig_peptides and mat_peptides with instantiated products
1442  if (seq.IsAa()) {
1443 
1447  try {
1448  for (CFeat_CI feat_ci(bsh, sel); feat_ci; ++feat_ci) {
1449 
1450  const CSeq_feat& matpeptide = feat_ci->GetOriginalFeature();
1451  if (matpeptide.IsSetProduct()) {
1452  const CSeq_loc& loc = matpeptide.GetLocation();
1453  const CSeq_loc& prd = matpeptide.GetProduct();
1454 
1455  TSeqPos matlen = GetLength(loc, m_Scope);
1456  TSeqPos prdlen = GetLength(prd, m_Scope);
1457  if (matlen != prdlen) {
1459  "Mat_peptide does not match length of instantiated product",
1460  matpeptide);
1461  }
1462 
1465 
// compare residues only over the shorter of the two lengths
1466  TSeqPos len = matlen;
1467  if (len > prdlen) {
1468  len = prdlen;
1469  }
1470 
1471  for (TSeqPos i = 0; i < len; ++i) {
1472  CSeqVectorTypes::TResidue m_res = mat_vec[i];
1473  CSeqVectorTypes::TResidue p_res = prd_vec[i];
1474 
1475  if (m_res != p_res) {
1477  "Mismatch in mat_peptide (" + string(1, (char)m_res) + ") and instantiated product (" + \
1478  string(1, (char)p_res) + ") at position " + NStr::NumericToString(i + 1),
1479  matpeptide);
1480  }
1481  }
1482  }
1483  }
// NOTE(review): any CException raised by the loop above is silently discarded
1484  } catch (CException&) {
1485  }
1486  }
1487 }
1488 
1489 
// Returns true as soon as x_HasCitSub() is true for any element of pub.Get()
// (iterated as CPub_equiv::Tdata); false when none is.
// NOTE(review): the signature line (listing line 1490) is missing from this view.
1491 {
1492  ITERATE(CPub_equiv::Tdata, it, pub.Get()) {
1493  if (x_HasCitSub(**it)) {
1494  return true;
1495  }
1496  }
1497  return false;
1498 }
1499 
1500 
// Returns true when `pub` is itself a Sub, or is an Equiv for which
// x_HasCitSub(pub.GetEquiv()) is true; false otherwise.
// NOTE(review): the signature line (listing line 1501) is missing from this view.
1502 {
1503  if (pub.IsSub()) {
1504  return true;
1505  } else if (pub.IsEquiv() && x_HasCitSub(pub.GetEquiv())) {
1506  return true;
1507  } else {
1508  return false;
1509  }
1510 }
1511 
1512 
// Walks the Pub descriptors reachable from `bsh` and returns true once one of
// them (with its pub set) satisfies x_HasCitSub(); the loop stops at the first hit.
// NOTE(review): the signature line (listing line 1513) is missing from this view.
1514 {
1515  bool has_cit_sub = false;
1516  CSeqdesc_CI p(bsh, CSeqdesc::e_Pub);
1517  while (p && !has_cit_sub) {
1518  if (p->GetPub().IsSetPub()) {
1519  has_cit_sub = x_HasCitSub(p->GetPub().GetPub());
1520  }
1521  ++p;
1522  }
1523 
1524  return has_cit_sub;
1525 }
1526 
1527 
// Pairwise comparison of two iterator ranges.
// Returns true only when both ranges have the same number of elements and
// pred(a, b) holds for every pair of elements at the same position.
// Comparison stops at the first pair for which pred is false.
template <class Iterator, class Predicate>
bool lists_match(Iterator iter1, Iterator iter1_stop, Iterator iter2, Iterator iter2_stop, Predicate pred)
{
    for (; iter1 != iter1_stop && iter2 != iter2_stop; ++iter1, ++iter2) {
        if (! pred(*iter1, *iter2)) {
            return false;
        }
    }
    // a leftover element in either range means the lengths differ
    return iter1 == iter1_stop && iter2 == iter2_stop;
}
1544 
1545 
1546 static bool s_OrgModEqual(
1547  const CRef<COrgMod>& om1,
1548  const CRef<COrgMod>& om2)
1549 {
1550  const COrgMod& omd1 = *(om1);
1551  const COrgMod& omd2 = *(om2);
1552 
1553  const string& str1 = omd1.GetSubname();
1554  const string& str2 = omd2.GetSubname();
1555 
1556  if (NStr::CompareNocase (str1, str2) != 0) return false;
1557 
1558  TORGMOD_SUBTYPE chs1 = omd1.GetSubtype();
1559  TORGMOD_SUBTYPE chs2 = omd2.GetSubtype();
1560 
1561  if (chs1 == chs2) return true;
1562  if (chs2 == NCBI_ORGMOD(other)) return true;
1563 
1564  return false;
1565 }
1566 
1567 
1568 bool s_DbtagEqual(const CRef<CDbtag>& dbt1, const CRef<CDbtag>& dbt2)
1569 {
1570  // is dbt1 == dbt2
1571  return dbt1->Compare(*dbt2) == 0;
1572 }
1573 
1574 
1575 // Two OrgRefs are identical if the taxnames are identical, the dbxrefs are identical,
1576 // and the orgname orgmod lists are identical
1577 static bool s_OrgrefEquivalent(const COrg_ref& org1, const COrg_ref& org2)
1578 {
1579  if ((org1.IsSetTaxname() && ! org2.IsSetTaxname())
1580  || (! org1.IsSetTaxname() && org2.IsSetTaxname())
1581  || (org1.IsSetTaxname() && org2.IsSetTaxname()
1582  && ! NStr::EqualNocase(org1.GetTaxname(), org2.GetTaxname()))) {
1583  return false;
1584  }
1585 
1586  if ((org1.IsSetDb() && ! org2.IsSetDb())
1587  || (! org1.IsSetDb() && org2.IsSetDb())
1588  || (org1.IsSetDb() && org2.IsSetDb()
1589  && !lists_match (org1.GetDb().begin(), org1.GetDb().end(),
1590  org2.GetDb().begin(), org2.GetDb().end(),
1591  s_DbtagEqual))) {
1592  return false;
1593  }
1594 
1595  if ((org1.IsSetOrgname() && ! org2.IsSetOrgname())
1596  || (! org1.IsSetOrgname() && org2.IsSetOrgname())) {
1597  return false;
1598  }
1599  if (org1.IsSetOrgname() && org2.IsSetOrgname()) {
1600  const COrgName& on1 = org1.GetOrgname();
1601  const COrgName& on2 = org2.GetOrgname();
1602  if ((on1.IsSetMod() && ! on2.IsSetMod())
1603  || (! on1.IsSetMod() && on2.IsSetMod())
1604  || (on1.IsSetMod() && on2.IsSetMod()
1605  && !lists_match (on1.GetMod().begin(), on1.GetMod().end(),
1606  on2.GetMod().begin(), on2.GetMod().end(),
1607  s_OrgModEqual))) {
1608  return false;
1609  }
1610  }
1611 
1612  return true;
1613 }
1614 
1615 
1616 // Two SubSources are equal and duplicates if:
1617 // they have the same subtype
1618 // and the same name (or don't require a name).
// Names are compared ignoring case; two entries that both lack a name also
// count as equal.
// NOTE(review): the first line of the signature (listing line 1620) is
// missing from this view.
1619 
1621  const CRef<CSubSource>& st1,
1622  const CRef<CSubSource>& st2)
1623 {
1624  const CSubSource& sbs1 = *(st1);
1625  const CSubSource& sbs2 = *(st2);
1626 
1627  TSUBSOURCE_SUBTYPE chs1 = sbs1.GetSubtype();
1628  TSUBSOURCE_SUBTYPE chs2 = sbs2.GetSubtype();
1629 
1630  if (chs1 != chs2)
1631  return false;
// subtypes for which CSubSource::NeedsNoText() is true match on subtype alone
1632  if (CSubSource::NeedsNoText(chs2))
1633  return true;
1634 
1635  if (sbs1.IsSetName() && sbs2.IsSetName()) {
1636  if (NStr::CompareNocase(sbs1.GetName(), sbs2.GetName()) == 0)
1637  return true;
1638  }
1639  if (! sbs1.IsSetName() && ! sbs2.IsSetName())
1640  return true;
1641 
1642  return false;
1643 }
1644 
1645 
// Returns true when `src` has is_focus set, or when one of the iterated
// entries has subtype CSubSource::eSubtype_transgenic; false otherwise.
// NOTE(review): the loop header (listing line 1651) is missing from this
// view; presumably it iterates the subsources of `src` - confirm.
1646 static bool s_BiosrcFullLengthIsOk(const CBioSource& src)
1647 {
1648  if (src.IsSetIs_focus()) {
1649  return true;
1650  }
1652  if ((*it)->IsSetSubtype() && (*it)->GetSubtype() == CSubSource::eSubtype_transgenic) {
1653  return true;
1654  }
1655  }
1656  return false;
1657 }
1658 
1659 
// Returns true when `src` has a taxname that equals "unidentified phage"
// (ignoring case) or a lineage that starts with "Viruses" (ignoring case).
// Returns false otherwise, including when org or taxname is not set.
// NOTE(review): the signature line (listing line 1660) is missing from this view.
1661 {
1662  if (! src.IsSetOrg() || ! src.GetOrg().IsSetTaxname()) {
1663  return false;
1664  }
1665  if (NStr::EqualNocase(src.GetOrg().GetTaxname(), "unidentified phage")) {
1666  return true;
1667  }
1668  if (src.GetOrg().IsSetOrgname() && src.GetOrg().GetOrgname().IsSetLineage()
1669  && NStr::StartsWith(src.GetOrg().GetOrgname().GetLineage(), "Viruses", NStr::eNocase)) {
1670  return true;
1671  }
// disabled debugging output, compiled out by the #if 0
1672 #if 0
1673  if (! src.GetOrg().IsSetOrgname()) {
1674  printf ("Orgname not set!\n");
1675  } else if (! src.GetOrg().GetOrgname().IsSetLineage()) {
1676  printf ("Lineage not set!\n");
1677  } else {
1678  printf ("Lineage is %s!\n", src.GetOrg().GetOrgname().GetLineage().c_str());
1679  }
1680 #endif
1681  return false;
1682 }
1683 
1684 
1685 bool s_OverlapOrAbut(const CSeq_loc& loc1, const CSeq_loc& loc2, CScope* scope)
1686 {
1687  TSeqPos start1 = loc1.GetStart(eExtreme_Positional);
1688  TSeqPos stop1 = loc1.GetStop(eExtreme_Positional);
1689  TSeqPos start2 = loc2.GetStart(eExtreme_Positional);
1690  TSeqPos stop2 = loc2.GetStop(eExtreme_Positional);
1691 
1692  if (start1 == stop2 + 1 || start2 == stop1 + 1) {
1693  // abut
1694  return true;
1695  } else if (TestForOverlapEx(loc1, loc2, eOverlap_Simple, scope) >= 0) {
1696  return true;
1697  } else {
1698  return false;
1699  }
1700 }
1701 
1702 
1703 bool s_ContainedIn(const CSeq_loc& loc1, const CSeq_loc& loc2, CScope* scope)
1704 {
1705  TSeqPos start1 = loc1.GetStart(eExtreme_Positional);
1706  TSeqPos stop1 = loc1.GetStop(eExtreme_Positional);
1707  TSeqPos start2 = loc2.GetStart(eExtreme_Positional);
1708  TSeqPos stop2 = loc2.GetStop(eExtreme_Positional);
1709 
1710  if (start1 == stop2 + 1 || start2 == stop1 + 1) {
1711  // abut
1712  return false;
1713  } else if (TestForOverlapEx(loc1, loc2, eOverlap_Contained, scope) >= 0) {
1714  return true;
1715  } else {
1716  return false;
1717  }
1718 }
1719 
1720 
1721 bool s_CheckIntervals(const CSeq_loc& loc1, const CSeq_loc& loc2, CScope* scope)
1722 {
1723  TSeqPos start1 = loc1.GetStart(eExtreme_Positional);
1724  TSeqPos stop1 = loc1.GetStop(eExtreme_Positional);
1725  TSeqPos start2 = loc2.GetStart(eExtreme_Positional);
1726  TSeqPos stop2 = loc2.GetStop(eExtreme_Positional);
1727 
1728  if (start1 == stop2 + 1 || start2 == stop1 + 1) {
1729  // abut
1730  return false;
1731  } else if (TestForOverlapEx(loc1, loc2, eOverlap_CheckIntervals, scope) >= 0) {
1732  return true;
1733  } else {
1734  return false;
1735  }
1736 }
1737 
1738 
// Walks consecutive pairs of cached RNA features and reports a tRNA that
// follows an overlapping/abutting tmRNA and passes the s_ContainedIn() test.
// NOTE(review): the signature line, the arguments of `rna_key` and the heads
// of the PostErr(...) calls are missing from this view.
1740 {
1741  // don't bother if can't build all feature iterator
1742  if (! m_AllFeatIt) {
1743  return;
1744  }
1745  try {
1746  CCacheImpl::SFeatKey rna_key(
1748  const CCacheImpl::TFeatValue & rnas = GetCache().GetFeatFromCache(rna_key);
1749  CCacheImpl::TFeatValue::const_iterator feat = rnas.begin();
1750  if (feat != rnas.end()) {
1751 
// feat_prev trails feat by one element so each iteration sees a neighbouring pair
1752  CCacheImpl::TFeatValue::const_iterator feat_prev = feat;
1753  ++feat;
1754  for (; feat != rnas.end(); ++feat_prev, ++feat) {
1755 
1756  if (! s_OverlapOrAbut(feat_prev->GetLocation(),
1757  feat->GetLocation(), m_Scope)) {
1758  continue;
1759  }
1760 
// only the ordering "tmRNA first, tRNA second" is examined
1761  const CRNA_ref& tm = feat_prev->GetData().GetRna();
1762  const CRNA_ref& tr = feat->GetData().GetRna();
1763  if (tm.IsSetType() && tm.GetType() == CRNA_ref::eType_tmRNA) {
1764  if (tr.IsSetType() && tr.GetType() == CRNA_ref::eType_tRNA) {
1765  if (s_ContainedIn(feat_prev->GetLocation(),
1766  feat->GetLocation(), m_Scope)) {
1768  "tRNA contained within tmRNA",
1769  feat->GetOriginalFeature());
1770  }
1771  }
1772  }
1773  }
1774  }
1775  } catch (const exception& e) {
// exceptions whose text contains "Error: Cannot resolve" are not reported
1776  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1778  string("Exception while validating RNA features. EXCEPTION: ") +
1779  e.what(), *(bsh.GetCompleteBioseq()));
1780  }
1781  }
1782 }
1783 
1784 
// Checks cached source (biosrc) features: a full-length first feature,
// additional full-length features, and neighbouring features that overlap or
// abut and carry equivalent sources.
// NOTE(review): the signature line, the arguments of `biosrc_key`, the
// declaration of `di`, the predicate passed to lists_match() and the heads of
// the PostErr(...) calls are missing from this view.
1786 {
1787  // don't bother if can't build all feature iterator
1788  if (! m_AllFeatIt) {
1789  return;
1790  }
1791  try {
1792  CCacheImpl::SFeatKey biosrc_key(
1794  const CCacheImpl::TFeatValue & biosrcs = GetCache().GetFeatFromCache(biosrc_key);
1795  CCacheImpl::TFeatValue::const_iterator feat = biosrcs.begin();
1796  if (feat != biosrcs.end()) {
1797  if (IsLocFullLength(feat->GetLocation(), bsh)) {
1799  if (di) {
1800  if (! s_BiosrcFullLengthIsOk(di->GetSource())) {
1802  "Source feature is full length, should be descriptor",
1803  feat->GetOriginalFeature());
1804  }
1805  }
1806  }
1807 
// feat_prev trails feat by one element so each iteration sees a neighbouring pair
1808  CCacheImpl::TFeatValue::const_iterator feat_prev = feat;
1809  ++feat;
1810  for (; feat != biosrcs.end(); ++feat_prev, ++feat) {
1811  if (IsLocFullLength(feat->GetLocation(), bsh)) {
1813  "Multiple full-length source features, should only be one if descriptor is transgenic",
1814  feat->GetOriginalFeature());
1815  }
1816 
1817  if (! s_OverlapOrAbut(feat_prev->GetLocation(),
1818  feat->GetLocation(), m_Scope)) {
1819  // not close enough
1820  continue;
1821  }
1822 
1823  // compare to see if feature sources are identical
// comments only count as a difference when both features have one and the
// texts differ ignoring case; otherwise is_focus, subtype list and org are compared
1824  bool are_identical = true;
1825  if (feat_prev->IsSetComment() && feat->IsSetComment()
1826  && ! NStr::EqualNocase(feat_prev->GetComment(), feat->GetComment())) {
1827  are_identical = false;
1828  } else {
1829  const CBioSource& src_prev = feat_prev->GetData().GetBiosrc();
1830  const CBioSource& src = feat->GetData().GetBiosrc();
1831  if ((src.IsSetIs_focus() && ! src_prev.IsSetIs_focus())
1832  || (! src.IsSetIs_focus() && src_prev.IsSetIs_focus())) {
1833  are_identical = false;
1834  } else if ((src.IsSetSubtype() && ! src_prev.IsSetSubtype())
1835  || (! src.IsSetSubtype() && src_prev.IsSetSubtype())
1836  || (src.IsSetSubtype() && src_prev.IsSetSubtype()
1837  && ! lists_match(src.GetSubtype().begin(), src.GetSubtype().end(),
1838  src_prev.GetSubtype().begin(), src_prev.GetSubtype().end(),
1840  are_identical = false;
1841  } else if ((src.IsSetOrg() && ! src_prev.IsSetOrg())
1842  || (! src.IsSetOrg() && src_prev.IsSetOrg())
1843  || (src.IsSetOrg() && src_prev.IsSetOrg()
1844  && ! s_OrgrefEquivalent (src.GetOrg(), src_prev.GetOrg()))) {
1845  are_identical = false;
1846  }
1847  }
1848  if (are_identical && ! s_SuppressMultipleEquivBioSources(feat->GetData().GetBiosrc())) {
1850  "Multiple equivalent source features should be combined into one multi-interval feature",
1851  feat->GetOriginalFeature());
1852  }
1853  }
1854  }
1855  } catch (const exception& e) {
// exceptions whose text contains "Error: Cannot resolve" are not reported
1856  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1858  string("Exception while validating source features. EXCEPTION: ") +
1859  e.what(), *(bsh.GetCompleteBioseq()));
1860  }
1861  }
1862 
1863 }
1864 
1865 
1866 static void s_MakePubLabelString(const CPubdesc& pd, string& label)
1867 {
1868  label = "";
1869 
1870  FOR_EACH_PUB_ON_PUBDESC (it, pd) {
1871  if ((*it)->IsGen() && (*it)->GetGen().IsSetCit()
1872  && ! (*it)->GetGen().IsSetCit()
1873  && ! (*it)->GetGen().IsSetJournal()
1874  && ! (*it)->GetGen().IsSetDate()
1875  && (*it)->GetGen().IsSetSerial_number()) {
1876  // skip over just serial number
1877  } else {
1878  (*it)->GetLabel(&label, CPub::eContent, CPub::fLabel_Unique);
1879  break;
1880  }
1881  }
1882 }
1883 
1884 
// Checks cached publication features: full-length pub features, and
// consecutive pub features that look equivalent (comment and label).
// NOTE(review): the signature line, the arguments of `pub_key` and the heads
// of the PostErr(...) calls are missing from this view.
1886 {
1887  // don't bother if can't build feature iterator at all
1888  if (! m_AllFeatIt) {
1889  return;
1890  }
1891  try {
1892  CCacheImpl::SFeatKey pub_key(
1894  const CCacheImpl::TFeatValue& pubs =
1895  GetCache().GetFeatFromCache(pub_key);
1896  CCacheImpl::TFeatValue::const_iterator feat = pubs.begin();
1897  if (feat != pubs.end()) {
1898  if (IsLocFullLength(feat->GetLocation(), bsh)) {
1900  "Publication feature is full length, should be descriptor",
1901  feat->GetOriginalFeature());
1902  }
1903 
1904  CCacheImpl::TFeatValue::const_iterator feat_prev = feat;
1905  string prev_label;
// NOTE(review): feat_prev was just copied from feat, which is already known
// not to be pubs.end(), so this test looks always true here
1906  if (feat_prev != pubs.end()) {
1907  s_MakePubLabelString(feat_prev->GetData().GetPub(), prev_label);
1908  ++feat;
1909  }
1910  for (; feat != pubs.end(); ++feat, ++feat_prev) {
1911  if (IsLocFullLength(feat->GetLocation(), bsh)) {
1913  "Publication feature is full length, should be descriptor",
1914  feat->GetOriginalFeature());
1915  }
1916  // compare to see if feature pubs are identical
1917  bool are_identical = true;
1918  if (feat_prev->IsSetComment() && feat->IsSetComment()
1919  && ! NStr::EqualNocase(feat_prev->GetComment(), feat->GetComment())) {
1920  are_identical = false;
1921  } else {
1922  string label;
1923  s_MakePubLabelString(feat->GetData().GetPub(), label);
// labels only count as a difference when both are non-blank and differ ignoring case
1924  if (! NStr::IsBlank(label) && ! NStr::IsBlank(prev_label)
1925  && ! NStr::EqualNocase(label, prev_label)) {
1926  are_identical = false;
1927  }
1928 
1929  // swap is faster than assignment
// NOTE(review): prev_label is refreshed only in this else branch; after a
// comment mismatch it still holds an earlier feature's label - confirm intended
1930  prev_label.swap(label);
1931 
1932  // TODO: also check authors
1933  }
1934 
1935  if (are_identical) {
1937  "Multiple equivalent publication features should be combined into one multi-interval feature",
1938  feat->GetOriginalFeature());
1939  }
1940  }
1941  }
1942  } catch (const exception& e) {
// exceptions whose text contains "Error: Cannot resolve" are not reported
1943  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1945  string("Exception while validating pub features. EXCEPTION: ") +
1946  e.what(), *(bsh.GetCompleteBioseq()));
1947  }
1948  }
1949 
1950 }
1951 
1952 
// Comparator functor over CTempString: orders by length first, and only for
// equal lengths by case-insensitive comparison. The resulting order is
// therefore not lexicographical.
// NOTE(review): the class header (listing line 1953) is missing from this
// view, so the class name cannot be confirmed here.
1954 {
1955 public:
1956  // faster than lexicographical order
1957  bool operator()(const CTempString& lhs, const CTempString& rhs) const
1958  {
1959  if (lhs.length() != rhs.length()) {
1960  return (lhs.length() < rhs.length());
1961  }
1962  return NStr::CompareNocase(lhs, rhs) < 0;
1963  }
1964 };
1965 
// Comparator functor over CTempString: plain case-insensitive "less than"
// via NStr::CompareNocase.
// NOTE(review): the class header (listing line 1966) is missing from this
// view; presumably this is the SCaseInsensitiveLess used for sorting
// duplicate labels later in the file - confirm.
1967 {
1968 public:
1969  bool operator()(const CTempString& lhs, const CTempString& rhs) const
1970  {
1971  return NStr::CompareNocase(lhs, rhs) < 0;
1972  }
1973 };
1974 
1975 
// Emits one message per publication label that occurs more than once in
// `labels`. Nothing is reported for fewer than two labels, or under RefSeq
// conventions / for RefSeq records.
// NOTE(review): the first line of the signature (listing line 1976), the
// TLabelCount typedef (listing line 1993) and the head of the PostErr(...)
// call (listing line 2025) are missing from this view.
1977  const CBioseq& seq, const vector<CTempString>& labels)
1978 {
1979  if (labels.size() <= 1) {
1980  // optimize fast case
1981  return;
1982  }
1983  if (m_Imp.IsRefSeqConventions() || m_Imp.IsRefSeq()) {
1984  return;
1985  }
1986 
1987  static const string kWarningPrefix =
1988  "Multiple equivalent publications annotated on this sequence [";
1989  static const string::size_type kMaxSummaryLen = 100;
1990 
1991  // TLabelCount maps a CTempString to the number of times it appears
1992  // (Note case-insensitivity and non-lexicographical order)
1994  TLabelCount label_count;
1995 
1996  ITERATE(vector<CTempString>, label_it, labels) {
1997  ++label_count[*label_it];
1998  }
1999 
2000  // put the dups into a vector and sort
2001  vector<CTempString> sorted_dup_labels;
2002  ITERATE(TLabelCount, label_count_it, label_count) {
2003  int num_appearances = label_count_it->second;
2004  _ASSERT(num_appearances > 0);
2005  if (num_appearances > 1) {
2006  const CTempString& dup_label = label_count_it->first;
2007  sorted_dup_labels.push_back(dup_label);
2008  }
2009  }
2010  sort(BEGIN_COMMA_END(sorted_dup_labels), SCaseInsensitiveLess());
2011 
2012  // find all that appear multiple times
2013  string err_msg = kWarningPrefix; // avoid create and destroy on each iter'n
2014  ITERATE(vector<CTempString>, dup_label_it, sorted_dup_labels) {
2015  const CTempString& summary = *dup_label_it;
2016 
// cut the message back to the prefix, then append the label, truncated to
// kMaxSummaryLen characters plus "..." when it is longer
2017  err_msg.resize(kWarningPrefix.length());
2018  if (summary.length() > kMaxSummaryLen) {
2019  err_msg += summary.substr(0, kMaxSummaryLen);
2020  err_msg += "...";
2021  } else {
2022  err_msg += summary;
2023  }
2024  err_msg += "]";
2026  err_msg, seq);
2027  }
2028 }
2029 
2030 
// Scans the Pub descriptors reachable from `bsh`: collects published and
// unpublished labels from the cache, detects repeated muid/pmid values among
// pubdescs that also contain some other pub, and finally reports duplicate labels.
// NOTE(review): the signature line, listing lines 2041-2043 (where `ctx` is
// presumably declared), the head of the second copy(...) call and the head of
// the PostErr(...) call are missing from this view.
2032 {
2033  // used to check for dups. Currently only deals with cases where
2034  // there's an otherpub, but check if this comment is out of date.
2035  set<TEntrezId> muids_seen;
2036  set<TEntrezId> pmids_seen;
2037 
// NOTE(review): `serials` is not used anywhere in the visible part of this body
2038  vector<int> serials;
2039  vector<CTempString> published_labels;
2040  vector<CTempString> unpublished_labels;
2041 
2044 
2045  const CBioseq& seq = *(bsh.GetCompleteBioseq());
2046 
2047  for (CSeqdesc_CI it(bsh, CSeqdesc::e_Pub); it; ++it) {
2048  CConstRef<CPubdesc> pub = ConstRef(&it->GetPub());
2049  // first, try to receive from cache
2050  const CCacheImpl::CPubdescInfo& pubdesc_info =
2051  GetCache().GetPubdescToInfo(pub);
2052  // note that some (e.g. pmids are ignored other than maybe storing
2053  // in the cache above)
2054  copy(BEGIN_COMMA_END(pubdesc_info.m_published_labels),
2055  back_inserter(published_labels));
2057  back_inserter(unpublished_labels));
2058 
2059  TEntrezId muid = ZERO_ENTREZ_ID;
2060  TEntrezId pmid = ZERO_ENTREZ_ID;
2061  bool otherpub = false;
2062  FOR_EACH_PUB_ON_PUBDESC (pub_it, *pub) {
2063  switch ((*pub_it)->Which()) {
2064  case CPub::e_Muid:
2065  muid = (*pub_it)->GetMuid();
2066  break;
2067  case CPub::e_Pmid:
2068  pmid = (*pub_it)->GetPmid();
2069  break;
2070  default:
2071  otherpub = true;
2072  break;
2073  }
2074  }
2075 
// ids are only recorded and checked for pubdescs that contain something
// besides a bare muid/pmid
2076  if (otherpub) {
2077  bool collision = false;
2078  if (muid > ZERO_ENTREZ_ID) {
2079  if (muids_seen.find(muid) != muids_seen.end()) {
2080  collision = true;
2081  } else {
2082  muids_seen.insert(muid);
2083  }
2084  }
2085  if (pmid > ZERO_ENTREZ_ID) {
2086  if (pmids_seen.find(pmid) != pmids_seen.end()) {
2087  collision = true;
2088  } else {
2089  pmids_seen.insert(pmid);
2090  }
2091  }
2092  if (collision && ! m_Imp.IsRefSeqConventions() && ! m_Imp.IsRefSeq()) {
2094  "Multiple publications with identical PubMed ID", *ctx, *it);
2095  }
2096  }
2097  }
2098 
2099  x_ReportDuplicatePubLabels(seq, unpublished_labels);
2100  x_ReportDuplicatePubLabels(seq, published_labels);
2101 
2102 }
2103 
2104 
// Reports a Seq-hist "replaced-by" or "replaces" record (with a date set)
// that lists the Bioseq's own gi. Does nothing when the Bioseq has no history
// or no gi id.
// NOTE(review): the signature line (listing line 2105) and the heads of both
// PostErr(...) calls are missing from this view.
2106 {
2107  if (! seq.GetInst().IsSetHist()) {
2108  return;
2109  }
2110 
// take the first gi found among the Bioseq's ids
2111  TGi gi = ZERO_GI;
2112  FOR_EACH_SEQID_ON_BIOSEQ (id, seq) {
2113  if ((*id)->IsGi()) {
2114  gi = (*id)->GetGi();
2115  break;
2116  }
2117  }
2118  if (gi == ZERO_GI) {
2119  return;
2120  }
2121 
2122  const CSeq_hist& hist = seq.GetInst().GetHist();
2123  if (hist.IsSetReplaced_by() && hist.GetReplaced_by().IsSetDate()) {
2124  const CSeq_hist_rec& rec = hist.GetReplaced_by();
2125  ITERATE(CSeq_hist_rec::TIds, id, rec.GetIds()) {
2126  if ((*id)->IsGi()) {
2127  if (gi == (*id)->GetGi()) {
2129  "Replaced by gi (" +
2130  NStr::NumericToString(gi) + ") is same as current Bioseq",
2131  seq);
2132  break;
2133  }
2134  }
2135  }
2136  }
2137 
2138  if (hist.IsSetReplaces() && hist.GetReplaces().IsSetDate()) {
2139  const CSeq_hist_rec& rec = hist.GetReplaces();
2140  ITERATE(CSeq_hist_rec::TIds, id, rec.GetIds()) {
2141  if ((*id)->IsGi()) {
2142  if (gi == (*id)->GetGi()) {
2144  "Replaces gi (" +
2145  NStr::NumericToString(gi) + ") is same as current Bioseq",
2146  seq);
2147  break;
2148  }
2149  }
2150  }
2151  }
2152 }
2153 
2154 
2155 // =============================================================================
2156 // Private
2157 // =============================================================================
2158 
2159 
2160 
2161 
2162 // Is the id contained in the bioseq?
2163 bool CValidError_bioseq::IsIdIn(const CSeq_id& id, const CBioseq& seq)
2164 {
2165  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2166  if (id.Match(**it)) {
2167  return true;
2168  }
2169  }
2170  return false;
2171 }
2172 
2173 
// Returns the size() of the container held by inst's Seq-data for whichever
// encoding is selected; 0 when no Seq-data is set, when the choice is not
// set, or for any choice not listed in the switch.
// NOTE(review): the signature line (listing line 2174) and one case label
// (listing line 2202, presumably the Ncbistdaa case) are missing from this view.
2175 {
2176  if (! inst.IsSetSeq_data()) {
2177  return 0;
2178  }
2179 
2180  const CSeq_data& seqdata = inst.GetSeq_data();
2181  switch (seqdata.Which()) {
2182  case CSeq_data::e_not_set:
2183  return 0;
2184  case CSeq_data::e_Iupacna:
2185  return seqdata.GetIupacna().Get().size();
2186  case CSeq_data::e_Iupacaa:
2187  return seqdata.GetIupacaa().Get().size();
2188  case CSeq_data::e_Ncbi2na:
2189  return seqdata.GetNcbi2na().Get().size();
2190  case CSeq_data::e_Ncbi4na:
2191  return seqdata.GetNcbi4na().Get().size();
2192  case CSeq_data::e_Ncbi8na:
2193  return seqdata.GetNcbi8na().Get().size();
2194  case CSeq_data::e_Ncbipna:
2195  return seqdata.GetNcbipna().Get().size();
2196  case CSeq_data::e_Ncbi8aa:
2197  return seqdata.GetNcbi8aa().Get().size();
2198  case CSeq_data::e_Ncbieaa:
2199  return seqdata.GetNcbieaa().Get().size();
2200  case CSeq_data::e_Ncbipaa:
2201  return seqdata.GetNcbipaa().Get().size();
2203  return seqdata.GetNcbistdaa().Get().size();
2204  default:
2205  return 0;
2206  }
2207 }
2208 
2209 
2210 // Returns true if seq derived from translation ending in "*" or
2211 // seq is 3' partial (i.e. the right of the sequence is incomplete)
// Any CException or std::exception raised along the way is swallowed and the
// function then returns false.
// NOTE(review): the signature line (listing line 2212), the translation call
// that fills `prot` (listing line 2224) and the declaration of `mi`
// (listing line 2234) are missing from this view.
2213 {
2214 
2215  // Look for the Cdregion feature used to create this aa product
2216  // Use the Cdregion to translate the associated na sequence
2217  // and check if translation has a '*' at the end. If it does.
2218  // message about 'X' at the end of this aa product sequence is suppressed
2219  try {
2220  const CSeq_feat* sfp = m_Imp.GetCDSGivenProduct(seq);
2221  if (sfp) {
2222  // Translate na CSeq_data
2223  string prot;
// NOTE(review): if `prot` can be empty here, prot.size() - 1 wraps around and
// the index is out of range - confirm the translation never yields ""
2225  if (prot[prot.size() - 1] == '*') {
2226  return true;
2227  }
2228  return false;
2229  }
2230 
2231  // Get CMolInfo for seq and determine if completeness is
2232  // eCompleteness_no_right or eCompleteness_no_ends. If so, the
2233  // message about "X" at end of aa sequence is suppressed
2235  if (mi && mi->IsSetCompleteness()) {
2236  if (mi->GetCompleteness() == CMolInfo::eCompleteness_no_right ||
2237  mi->GetCompleteness() == CMolInfo::eCompleteness_no_ends) {
2238  return true;
2239  }
2240  }
2241  } catch (const CException&) {
2242  } catch (const std::exception&) {
2243  }
2244  return false;
2245 }
2246 
2247 
2248 //LCOV_EXCL_START
2249 //part of segset validation, no longer used
// Builds a Seq-loc from the Bioseq's Seq-ext: a mix holding every location of
// a Seg ext, or a location to which the Ref ext is added. Returns an empty
// (null) CRef when there is no ext or the ext is of another kind.
// NOTE(review): the signature line (listing line 2250) is missing from this view.
2251 {
2252  CRef<CSeq_loc> loc;
2253  if (! seq.GetInst().IsSetExt()) {
2254  return loc;
2255  }
2256 
2257  if (seq.GetInst().GetExt().IsSeg()) {
2258  CRef<CSeq_loc> nloc(new CSeq_loc());
2259  loc = nloc;
2260  CSeq_loc_mix& mix = loc->SetMix();
2261  ITERATE (list< CRef<CSeq_loc> >, it,
2262  seq.GetInst().GetExt().GetSeg().Get()) {
2263  mix.Set().push_back(*it);
2264  }
2265  } else if (seq.GetInst().GetExt().IsRef()) {
2266  CRef<CSeq_loc> nloc(new CSeq_loc());
2267  loc = nloc;
2268  loc->Add(seq.GetInst().GetExt().GetRef());
2269  }
2270  return loc;
2271 }
2272 //LCOV_EXCL_STOP
2273 
2274 
2275 // Check if CdRegion required but not found
// Returns true only for a valid amino-acid handle for which `nps` is set and
// neither GetCDSForProduct() nor GetPROTForProduct() finds a feature.
// NOTE(review): the signature line (listing line 2276) and the initializer of
// `nps` (listing line 2280) are missing from this view.
2277 {
2278  if (bsh && CSeq_inst::IsAa(bsh.GetInst_Mol())) {
2279  CSeq_entry_Handle nps =
2281  if (nps) {
2282  const CSeq_feat* cds = GetCDSForProduct(bsh);
2283  if (! cds) {
2284  const CSeq_feat* mat = GetPROTForProduct(bsh);
2285  if (! mat) {
2286  return true;
2287  }
2288  }
2289  }
2290  }
2291 
2292  return false;
2293 }
2294 
2295 
// Returns whether the sequence is mRNA: when `sd` is valid, true only if its
// MolInfo has biomol set to eBiomol_mRNA; when `sd` is not valid, true if the
// Bioseq's molecule type is RNA.
// NOTE(review): the signature line (listing line 2296) and the declaration of
// `sd` (listing line 2298) are missing from this view.
2297 {
2299 
2300  if (sd) {
2301  const CMolInfo& mi = sd->GetMolinfo();
2302  if (mi.IsSetBiomol()) {
2303  return mi.GetBiomol() == CMolInfo::eBiomol_mRNA;
2304  }
2305  } else if (bsh.GetBioseqMolType() == CSeq_inst::eMol_rna) {
2306  // if no molinfo, assume rna is mrna
2307  return true;
2308  }
2309 
2310  return false;
2311 }
2312 
2313 
// Counts the intervals of `loc` (as visited by CSeq_loc_CI) whose embedding
// Seq-loc is not a far location with respect to m_Imp.GetTSEH().
// NOTE(review): the signature line (listing line 2314) is missing from this view.
2315 {
2316  size_t counter = 0;
2317  for (CSeq_loc_CI slit(loc); slit; ++slit) {
2318  if (! IsFarLocation(slit.GetEmbeddingSeq_loc(), m_Imp.GetTSEH())) {
2319  ++counter;
2320  }
2321  }
2322  return counter;
2323 }
2324 
2325 
// Returns false when either feature carries an exception text containing
// "alternative processing" (case-insensitive), or when the first protein
// names of the two features form one of the hard-coded name pairs below
// (checked in both orders, ignoring case). Returns true otherwise.
// Both features are read as protein features (GetData().GetProt()).
// NOTE(review): the first line of the signature (listing line 2326) is
// missing from this view.
2327  const CSeq_feat& curr,
2328  const CSeq_feat& prev)
2329 {
2330  if (curr.IsSetExcept() && curr.GetExcept() && curr.IsSetExcept_text()) {
2331  if (NStr::FindNoCase(curr.GetExcept_text(), "alternative processing") != NPOS) {
2332  return false;
2333  }
2334  }
2335  if (prev.IsSetExcept() && prev.GetExcept() && prev.IsSetExcept_text()) {
2336  if (NStr::FindNoCase(prev.GetExcept_text(), "alternative processing") != NPOS) {
2337  return false;
2338  }
2339  }
2340 
2341  const CProt_ref& currP = curr.GetData().GetProt();
2342  const CProt_ref& prevP = prev.GetData().GetProt();
2343 
2344  if (currP.IsSetName() && prevP.IsSetName()) {
// only the first name of each protein is used (the loops break immediately)
2345  string currN;
2346  for (auto it : currP.GetName()) {
2347  currN = it;
2348  break;
2349  }
2350  string prevN;
2351  for (auto it : prevP.GetName()) {
2352  prevN = it;
2353  break;
2354  }
2355  if (NStr::EqualNocase(currN, "anchored capsid protein ancC") && NStr::EqualNocase(prevN, "capsid protein C")) {
2356  return false;
2357  }
2358  if (NStr::EqualNocase(prevN, "anchored capsid protein ancC") && NStr::EqualNocase(currN, "capsid protein C")) {
2359  return false;
2360  }
2361  if (NStr::EqualNocase(currN, "membrane glycoprotein precursor prM") && NStr::EqualNocase(prevN, "protein pr")) {
2362  return false;
2363  }
2364  if (NStr::EqualNocase(prevN, "membrane glycoprotein precursor prM") && NStr::EqualNocase(currN, "protein pr")) {
2365  return false;
2366  }
2367  if (NStr::EqualNocase(currN, "membrane glycoprotein precursor prM") && NStr::EqualNocase(prevN, "membrane glycoprotein M")) {
2368  return false;
2369  }
2370  if (NStr::EqualNocase(prevN, "membrane glycoprotein precursor prM") && NStr::EqualNocase(currN, "membrane glycoprotein M")) {
2371  return false;
2372  }
2373  if (NStr::EqualNocase(currN, "anchored capsid protein C") && NStr::EqualNocase(prevN, "capsid protein C")) {
2374  return false;
2375  }
2376  if (NStr::EqualNocase(prevN, "anchored capsid protein C") && NStr::EqualNocase(currN, "capsid protein C")) {
2377  return false;
2378  }
2379  if (NStr::EqualNocase(currN, "membrane glycoprotein precursor M") && NStr::EqualNocase(prevN, "protein pr")) {
2380  return false;
2381  }
2382  if (NStr::EqualNocase(prevN, "membrane glycoprotein precursor M") && NStr::EqualNocase(currN, "protein pr")) {
2383  return false;
2384  }
2385  if (NStr::EqualNocase(currN, "membrane glycoprotein precursor M") && NStr::EqualNocase(prevN, "membrane glycoprotein M")) {
2386  return false;
2387  }
2388  if (NStr::EqualNocase(prevN, "membrane glycoprotein precursor M") && NStr::EqualNocase(currN, "membrane glycoprotein M")) {
2389  return false;
2390  }
2391  }
2392 
2393 
2394  return true;
2395 }
2396 
2397 
2398 #define FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(Itr, Var) \
2399 ITERATE (CBioseq_Handle::TId, Itr, Var.GetId())
2400 
2402 {
2403  if (! IsMaster(seq)) {
2404  return false;
2405  }
2406  CBioseq_Handle bsh = scope.GetBioseqHandle(seq);
2407  return IsWGS(bsh);
2408 }
2409 
2410 
2412 {
2413  bool rval = false;
2414  if (entry.IsSeq()) {
2415  if (IsMaster(entry.GetSeq()) && IsWGS(entry.GetSeq())) {
2416  rval = true;
2417  }
2418  } else if (entry.IsSet() && entry.GetSet().IsSetSeq_set()) {
2420  if (IsWGSMaster(**it)) {
2421  rval = true;
2422  break;
2423  }
2424  }
2425  }
2426  return rval;
2427 }
2428 
2429 
2431 {
2432  if (! seq.IsSetDescr()) {
2433  return false;
2434  }
2435  ITERATE(CBioseq::TDescr::Tdata, it, seq.GetDescr().Get()) {
2436  if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech() && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_wgs) {
2437  return true;
2438  }
2439  }
2440  return false;
2441 }
2442 
2443 
2445 {
2446  CSeqdesc_CI molinfo(bsh, CSeqdesc::e_Molinfo);
2447  if (molinfo && molinfo->GetMolinfo().IsSetTech() && molinfo->GetMolinfo().GetTech() == CMolInfo::eTech_wgs) {
2448  return true;
2449  }
2450  return false;
2451 }
2452 
2453 
2455 {
2456  bool rval = false;
2457  if (entry.IsSeq()) {
2458  rval = IsWGS(entry.GetSeq());
2459  } else if (entry.IsSet() && entry.GetSet().IsSetSeq_set()) {
2460  for (auto it : entry.GetSet().GetSeq_set()) {
2461  if (IsWGS(*it)) {
2462  rval = true;
2463  break;
2464  }
2465  }
2466  }
2467  return rval;
2468 }
2469 
2470 
2472 {
2473  const CTextseq_id* txt = id.GetTextseq_Id();
2474  if (! txt || ! txt->IsSetAccession()) {
2475  return false;
2476  }
2479  return true;
2480  } else {
2481  return false;
2482  }
2483 }
2484 
2485 
2487 {
2488  if (! seq.IsSetId()) {
2489  return false;
2490  }
2491  ITERATE(CBioseq::TId, id, seq.GetId()) {
2492  if (IsWGSAccession(**id)) {
2493  return true;
2494  }
2495  }
2496  return false;
2497 }
2498 
2499 
2501 {
2502  const CTextseq_id* txt = id.GetTextseq_Id();
2503  if (! txt || ! txt->IsSetAccession()) {
2504  return false;
2505  }
2508  return true;
2509  } else {
2510  return false;
2511  }
2512 }
2513 
2514 
2516 {
2517  if (! seq.IsSetId()) {
2518  return false;
2519  }
2520  ITERATE(CBioseq::TId, id, seq.GetId()) {
2521  if (IsTSAAccession(**id)) {
2522  return true;
2523  }
2524  }
2525  return false;
2526 }
2527 
2528 
2530 {
2531  CBioseq_Handle bsh = scope.GetBioseqHandle(seq);
2532  CSeqdesc_CI desc(bsh, CSeqdesc::e_Molinfo);
2533  if (desc && desc->GetMolinfo().IsSetCompleteness()) {
2534  CMolInfo::TCompleteness completeness = desc->GetMolinfo().GetCompleteness();
2535  if (completeness == CMolInfo::eCompleteness_partial
2536  || completeness == CMolInfo::eCompleteness_no_left
2537  || completeness == CMolInfo::eCompleteness_no_right
2538  || completeness == CMolInfo::eCompleteness_no_ends) {
2539  return true;
2540  }
2541  }
2542  return false;
2543 }
2544 
2545 
2547 {
2548  FOR_EACH_SEQID_ON_BIOSEQ(id, seq) {
2549  if ((*id)->IsPdb()) {
2550  return true;
2551  }
2552  }
2553  return false;
2554 }
2555 
2556 
2558 {
2559  if (IsPdb(seq) || IsWGSMaster(seq, *m_Scope)) {
2560  return;
2561  }
2562  const CSeq_inst& inst = seq.GetInst();
2563 
2564  TSeqPos len = inst.IsSetLength() ? inst.GetLength() : 0;
2565  if (seq.IsAa()) {
2566  if (len <= 3 && ! IsPartial(seq, *m_Scope)) {
2567  PostErr(eDiag_Warning, eErr_SEQ_INST_ShortSeq, "Sequence only " +
2568  NStr::IntToString(len) + " residues", seq);
2569  }
2570  } else {
2571  if (len <= 10 && m_report_short_seq) {
2572  PostErr(eDiag_Warning, eErr_SEQ_INST_ShortSeq, "Sequence only " +
2573  NStr::IntToString(len) + " residues", seq);
2574  }
2575  }
2576 }
2577 
2578 
2579 // Assumes that seq is segmented and has Seq-ext data
2581 {
2582  // Get parent CSeq_entry of seq and then find the next
2583  // CSeq_entry in the set. This CSeq_entry should be a CBioseq_set
2584  // of class parts.
2585  const CSeq_entry* se = seq.GetParentEntry();
2586  if (! se) {
2587  return;
2588  }
2589  const CSeq_entry* parent = se->GetParentEntry();
2590  if (! parent) {
2591  return;
2592  }
2593  if (! parent->IsSet() || ! parent->GetSet().IsSetClass() || parent->GetSet().GetClass() != CBioseq_set::eClass_segset) {
2594  return;
2595  }
2596 
2597  // Loop through seq_set looking for the parts set.
2598  FOR_EACH_SEQENTRY_ON_SEQSET (it, parent->GetSet()) {
2599  if ((*it)->Which() == CSeq_entry::e_Set
2600  && (*it)->GetSet().IsSetClass()
2601  && (*it)->GetSet().GetClass() == CBioseq_set::eClass_parts) {
2602  const CBioseq_set::TSeq_set& parts = (*it)->GetSet().GetSeq_set();
2603  const CSeg_ext::Tdata& locs = seq.GetInst().GetExt().GetSeg().Get();
2604 
2605  // Make sure the number of locations (excluding null locations)
2606  // match the number of parts
2607  size_t nulls = 0;
2608  ITERATE (CSeg_ext::Tdata, loc, locs) {
2609  if ((*loc)->IsNull()) {
2610  nulls++;
2611  }
2612  }
2613  if (locs.size() - nulls < parts.size()) {
2615  "Parts set contains too many Bioseqs", seq);
2616  return;
2617  } else if (locs.size() - nulls > parts.size()) {
2619  "Parts set does not contain enough Bioseqs", seq);
2620  return;
2621  }
2622 
2623  // Now, simultaneously loop through the entries of the parts set and the
2624  // CSeq_locs of seq's Seq-ext. If a pair does not match, post an error.
2625  size_t size = locs.size(); // == parts.size()
2626  CSeg_ext::Tdata::const_iterator loc_it = locs.begin();
2627  CBioseq_set::TSeq_set::const_iterator part_it = parts.begin();
2628  for (size_t i = 0; i < size; ++i) {
2629  try {
2630  if ((*loc_it)->IsNull()) {
2631  ++loc_it;
2632  continue;
2633  }
2634  if (! (*part_it)->IsSeq()) {
2636  "Parts set component is not Bioseq", seq);
2637  return;
2638  }
2639  const CSeq_id& loc_id = GetId(**loc_it, m_Scope);
2640  if (! IsIdIn(loc_id, (*part_it)->GetSeq())) {
2642  "Segmented bioseq seq_ext does not correspond to parts "
2643  "packaging order", seq);
2644  return;
2645  }
2646 
2647  // advance both iterators
2648  ++part_it;
2649  ++loc_it;
2650  } catch (const CObjmgrUtilException&) {
2651  ERR_POST_X(4, "Seq-loc not for unique sequence");
2652  return;
2653  } catch (CException& x1) {
2654  string err_msg = "Unknown error:";
2655  err_msg += x1.what();
2656  ERR_POST_X(5, err_msg);
2657  return;
2658  } catch (std::exception& x2) {
2659  string err_msg = "Unknown error:";
2660  err_msg += x2.what();
2661  ERR_POST_X(5, err_msg);
2662  return;
2663  }
2664  }
2665  }
2666  }
2667 }
2668 
2669 static bool s_IsConWithGaps(const CBioseq& seq)
2670 {
2671  if (! seq.IsSetInst()) return false;
2672  const CSeq_inst& inst = seq.GetInst();
2673  if (! inst.IsSetExt()) return false;
2674  if (! inst.GetExt().IsDelta()) return false;
2675 
2676  ITERATE(CDelta_ext::Tdata, iter, inst.GetExt().GetDelta().Get()) {
2677  if (! (*iter)->IsLiteral()) continue;
2678  const CSeq_literal& lit = (*iter)->GetLiteral();
2679  if (! lit.IsSetSeq_data()) return true;
2680  if (lit.GetSeq_data().IsGap() && lit.GetLength() > 0) return true;
2681  }
2682 
2683  return false;
2684 }
2685 
2686 
2688 {
2689  bool has_gap = false;
2690  if (seq.GetInst().IsSetExt() && seq.GetInst().GetExt().IsDelta()) {
2691  ITERATE(CDelta_ext::Tdata, iter, seq.GetInst().GetExt().GetDelta().Get()) {
2692  if ((*iter)->IsLiteral() &&
2693  (! (*iter)->GetLiteral().IsSetSeq_data() || (*iter)->GetLiteral().GetSeq_data().IsGap())) {
2694  has_gap = true;
2695  break;
2696  }
2697  }
2698  }
2699  return has_gap;
2700 }
2701 
2703 {
2705  if (! bsh) {
2706  return;
2707  }
2708 
2709  string title = sequence::CDeflineGenerator().GenerateDefline(bsh);
2710 
2711 /*bsv
2712  CMolInfo::TTech tech = CMolInfo::eTech_unknown;
2713 */
2714  CSeqdesc_CI desc(bsh, CSeqdesc::e_Molinfo);
2715  if (desc) {
2716  const CMolInfo& mi = desc->GetMolinfo();
2717 /*bsv
2718  tech = mi.GetTech();
2719 */
2721  if (m_Imp.IsGenbank()) {
2722  if (NStr::Find(title, "complete genome") != NPOS) {
2723  const CSeq_entry& ctx = *seq.GetParentEntry();
2725  "Complete genome in title without complete flag set",
2726  ctx, *desc);
2727  }
2728  }
2730  (! s_IsConWithGaps(seq)) &&
2731  ! m_Imp.IsEmbl() && ! m_Imp.IsDdbj()) {
2732  const CSeq_entry& ctx = *seq.GetParentEntry();
2734  "Circular topology without complete flag set", ctx, *desc);
2735  }
2736  }
2737  }
2738 
2739  // warning if title contains complete genome but sequence contains gap features
2740  if (NStr::FindNoCase(title, "complete genome") != NPOS && x_HasGap(seq)) {
2742  "Title contains 'complete genome' but sequence has gaps", seq);
2743  }
2744 
2745 
2746  // note - test for protein titles was moved to CValidError_bioseqset::ValidateNucProtSet
2747  // because it only applied for protein sequences in nuc-prot sets and it's more efficient
2748  // to create the defline generator once per nuc-prot set
2749 }
2750 
2751 static bool HasAssemblyOrNullGap (const CBioseq& seq)
2752 {
2753  const CSeq_inst& inst = seq.GetInst();
2754  if (inst.CanGetRepr() && inst.GetRepr() == CSeq_inst::eRepr_delta && inst.CanGetExt() && inst.GetExt().IsDelta()) {
2755  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
2756  if (! (*sg)) continue;
2757  if ((**sg).Which() != CDelta_seq::e_Literal) continue;
2758  const CSeq_literal& lit = (*sg)->GetLiteral();
2759  if (! lit.IsSetSeq_data()) return true;
2760  if (lit.GetSeq_data().IsGap()) return true;
2761  }
2762  }
2763 
2764  return false;
2765 }
2766 
2767 
2769 {
2770  const CSeq_inst& inst = seq.GetInst();
2771  if (inst.CanGetRepr() && inst.GetRepr() == CSeq_inst::eRepr_delta && inst.CanGetExt() && inst.GetExt().IsDelta()) {
2772  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
2773  if (! (*sg) ) continue;
2774  if ((**sg).Which() != CDelta_seq::e_Literal) continue;
2775  const CSeq_literal& lit = (*sg)->GetLiteral();
2776  if (! lit.IsSetSeq_data()) {
2777  PostErr(eDiag_Warning, eErr_SEQ_INST_SeqGapProblem, "TSA Seq_data NULL", seq);
2778  } else {
2779  const CSeq_data& data = lit.GetSeq_data();
2780  if (data.Which() == CSeq_data::e_Gap) {
2781  const CSeq_gap& gap = data.GetGap();
2782  if (gap.IsSetType()) {
2783  int gaptype = gap.GetType();
2784  if (gaptype == CSeq_gap::eType_unknown) {
2785  PostErr(eDiag_Warning, eErr_SEQ_INST_TSAseqGapProblem, "TSA Seq_gap.unknown", seq);
2786  } else if (gaptype == CSeq_gap::eType_other) {
2787  PostErr(eDiag_Warning, eErr_SEQ_INST_TSAseqGapProblem, "TSA Seq_gap.other", seq);
2788  }
2789  } else {
2790  PostErr(eDiag_Warning, eErr_SEQ_INST_TSAseqGapProblem, "TSA Seq_gap NULL", seq);
2791  }
2792  }
2793  }
2794  }
2795  }
2796 }
2797 
2798 
2800 {
2801  const CSeq_inst& inst = seq.GetInst();
2802  if (inst.CanGetRepr() && inst.GetRepr() == CSeq_inst::eRepr_delta && inst.CanGetExt() && inst.GetExt().IsDelta()) {
2803  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
2804  if (! (*sg)) continue;
2805  // CON division - far delta - suppresses errors
2806  if ((**sg).Which() != CDelta_seq::e_Literal) /* continue */ return false;
2807  const CSeq_literal& lit = (*sg)->GetLiteral();
2808  if (! lit.IsSetSeq_data()) {
2809  return true;
2810  } else {
2811  const CSeq_data& data = lit.GetSeq_data();
2812  if (data.Which() == CSeq_data::e_Gap) {
2813  const CSeq_gap& gap = data.GetGap();
2814  CSeq_gap::TType gap_type = gap.IsSetType() ? gap.GetType() : CSeq_gap::eType_unknown;
2815 
2816  if (gap_type != CSeq_gap::eType_centromere && gap_type != CSeq_gap::eType_heterochromatin &&
2817  gap_type != CSeq_gap::eType_short_arm && gap_type != CSeq_gap::eType_telomere &&
2818  gap_type != CSeq_gap::eType_contig) {
2819 
2820  if (! gap.IsSetLinkage_evidence() || gap.GetLinkage_evidence().empty()) {
2821  return true;
2822  }
2823  }
2824  }
2825  }
2826  }
2827  }
2828  return false;
2829 }
2830 
2831 
2833 {
2834  if (HasBadWGSGap(seq)) {
2836  "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence.", seq);
2837  }
2838 }
2839 
2840 
2842 {
2843  if (HasBadWGSGap(seq)) {
2845  "TSA submission includes wrong gap type. Gaps for TSA should be Assembly Gaps with linkage evidence.", seq);
2846  }
2847 }
2848 
2849 
2851 {
2852  if (HasBadWGSGap(seq)) {
2854  "Genome submission includes wrong gap type. Gaps for genomes should be Assembly Gaps with linkage evidence.", seq);
2855  }
2856 }
2857 
2858 
2859 bool s_FieldHasLabel(const CUser_field& field, const string& label)
2860 {
2861  if (field.IsSetLabel() && field.GetLabel().IsStr() &&
2862  NStr::EqualNocase(field.GetLabel().GetStr(), label)) {
2863  return true;
2864  } else {
2865  return false;
2866  }
2867 }
2868 
2869 
2871 {
2872  if (! field.IsSetData()) {
2873  return false;
2874  }
2875  bool rval = false;
2876  if (field.GetData().IsStr()) {
2877  if (! NStr::IsBlank(field.GetData().GetStr())) {
2878  rval = true;
2879  }
2880  } else if (field.GetData().IsStrs()) {
2882  if (! NStr::IsBlank(*s)) {
2883  rval = true;
2884  break;
2885  }
2886  }
2887  }
2888  return rval;
2889 }
2890 
2891 
2893 {
2894  bool has_biosample = false;
2895  bool has_bioproject = false;
2896 
2897  CSeqdesc_CI d(bsh, CSeqdesc::e_User);
2898  while (d) {
2900  for (auto it : d->GetUser().GetData()) {
2901  if (s_FieldHasLabel(*it, "BioSample")) {
2902  if (s_FieldHasNonBlankValue(*it)) {
2903  has_biosample = true;
2904  }
2905  } else if (s_FieldHasLabel(*it, "BioProject")) {
2906  if (s_FieldHasNonBlankValue(*it)) {
2907  has_bioproject = true;
2908  }
2909  }
2910  }
2911  } else if (m_Imp.IsGenbank()) {
2912  const CUser_object& uo = d->GetUser();
2913  if (uo.GetType().IsStr()) {
2914  const string& type = uo.GetType().GetStr();
2915  if (NStr::CompareNocase(type, "WGSProjects") == 0) {
2916  int fr = 0;
2917  int to = 0;
2918 
2919  for (auto it : uo.GetData()) {
2920  if (! it->GetLabel().IsStr()) {
2921  continue;
2922  }
2923  const string& label = it->GetLabel().GetStr();
2924  if (NStr::CompareNocase(label, "WGS_accession_first") == 0 ||
2925  NStr::CompareNocase(label, "Accession_first") == 0) {
2926  const string& str = it->GetData().GetStr();
2927  auto fst = str.find_first_of("0123456789");
2928  fr = NStr::StringToInt (str.substr(fst));
2929  } else if (NStr::CompareNocase(label, "WGS_accession_last") == 0 ||
2930  NStr::CompareNocase(label, "Accession_last") == 0) {
2931  const string& str = it->GetData().GetStr();
2932  auto lst = str.find_first_of("0123456789");
2933  to = NStr::StringToInt (str.substr(lst));
2934  }
2935  }
2936  if ((fr != 0) && (to != 0)) {
2937  int df = to - fr + 1;
2938  int blen = bsh.GetBioseqLength();
2939  if (df != blen) {
2941  "Number of accessions (" + NStr::IntToString(df) + ") does not correspond to number of records (" + NStr::IntToString(blen) +")",
2942  *(bsh.GetCompleteBioseq()));
2943  }
2944  }
2945  }
2946  }
2947  }
2948  ++d;
2949  }
2950  if (! has_biosample && ! has_bioproject) {
2952  "WGS master lacks both BioSample and BioProject",
2953  *(bsh.GetCompleteBioseq()));
2954  } else if (! has_biosample) {
2956  "WGS master lacks BioSample",
2957  *(bsh.GetCompleteBioseq()));
2958  } else if (! has_bioproject) {
2960  "WGS master lacks BioProject",
2961  *(bsh.GetCompleteBioseq()));
2962  }
2963  if (! has_biosample || ! has_bioproject) {
2964  }
2965 }
2966 
2967 
2968 static EDiagSev GetBioseqEndWarning (const CBioseq& seq, bool is_circular, EBioseqEndIsType end_is_char)
2969 {
2970  EDiagSev sev;
2971  bool only_local = true;
2972  bool is_NCACNTNW = false;
2973  bool is_patent = false;
2974  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
2975  if (! (*id_it)->IsLocal()) {
2976  only_local = false;
2977  if ((*id_it)->IsPatent()) {
2978  is_patent = true;
2979  } else if (IsNTNCNWACAccession(**id_it)) {
2980  is_NCACNTNW = true;
2981  }
2982  }
2983  }
2984 
2985  if (is_NCACNTNW || is_patent) {
2986  sev = eDiag_Warning;
2987  } else if (is_circular) {
2988  sev = eDiag_Warning;
2989  } else if (only_local) {
2990  sev = eDiag_Warning;
2991  } else if (end_is_char == eBioseqEndIsType_All) {
2992  sev = eDiag_Error;
2993  } else {
2994  sev = eDiag_Warning;
2995  }
2996  return sev;
2997 }
2998 
2999 
3000 void CValidError_bioseq::x_CalculateNsStretchAndTotal(const CSeqVector& vec, TSeqPos& num_ns, TSeqPos& max_stretch, bool& n5, bool& n3)
3001 {
3003 
3004  num_ns = 0;
3005  max_stretch = 0;
3006  n5 = false;
3007  n3 = false;
3008 
3009  TSeqPos this_stretch = 0;
3010  for (TSeqPos i = 0; i < vec.size(); i++) {
3011  if (vec[i] == 'N') {
3012  num_ns++;
3013  if (vec.IsInGap(i)) {
3014  if (max_stretch < this_stretch) {
3015  max_stretch = this_stretch;
3016  }
3017  this_stretch = 0;
3018  } else {
3019  this_stretch++;
3020  if (this_stretch >= 10) {
3021  if (i < 20) {
3022  n5 = true;
3023  }
3024  if (vec.size() > 20 && i > vec.size() - 10) {
3025  n3 = true;
3026  }
3027  }
3028  }
3029  } else {
3030  if (max_stretch < this_stretch) {
3031  max_stretch = this_stretch;
3032  }
3033  this_stretch = 0;
3034  }
3035  }
3036  if (max_stretch < this_stretch) {
3037  max_stretch = this_stretch;
3038  }
3039 }
3040 
3041 
3043 {
3044  bool rval = false;
3045  if (HasAssemblyOrNullGap(seq)) {
3046  return rval;
3047  }
3048 
3050  if (! bsh) {
3051  return rval;
3052  }
3053 
3054  TSeqPos num_ns = 0;
3055  TSeqPos max_stretch = 0;
3056  bool n5 = false;
3057  bool n3 = false;
3058 
3060  x_CalculateNsStretchAndTotal(vec, num_ns, max_stretch, n5, n3);
3061 
3062  if (max_stretch >= 15) {
3064  "Sequence has a stretch of " + NStr::IntToString(max_stretch) + " Ns", seq);
3065  rval = true;
3066  } else {
3067  if (n5) {
3069  "Sequence has a stretch of at least 10 Ns within the first 20 bases", seq);
3070  rval = true;
3071  }
3072  if (n3) {
3074  "Sequence has a stretch of at least 10 Ns within the last 20 bases", seq);
3075  rval = true;
3076  }
3077  }
3078  return rval;
3079 }
3080 
3081 
3082 // check to see if sequence is all Ns
3084 {
3085  bool rval = true;
3086  bool at_least_one = false;
3087  try {
3088  for (CSeqVector_CI sv_iter(vec); (sv_iter) && rval; ++sv_iter) {
3089  if (*sv_iter != 'N') {
3090  rval = false;
3091  }
3092  at_least_one = true;
3093  }
3094  } catch (CException&) {
3095  }
3096  return (rval && at_least_one);
3097 }
3098 
3099 
3100 static int CountNs(const CSeq_data& seq_data, TSeqPos len)
3101 {
3102  int total = 0;
3103  switch (seq_data.Which()) {
3104  case CSeq_data::e_Ncbi4na: {
3105  vector<char>::const_iterator it = seq_data.GetNcbi4na().Get().begin();
3106  unsigned char mask = 0xf0;
3107  unsigned char shift = 4;
3108  for (size_t n = 0; n < len; n++) {
3109  unsigned char c = ((*it) & mask) >> shift;
3110  mask >>= 4;
3111  shift -= 4;
3112  if (! mask) {
3113  mask = 0xf0;
3114  shift = 4;
3115  ++it;
3116  }
3117  if (c == 15) {
3118  total++;
3119  }
3120  }
3121  }
3122  return total;
3123  case CSeq_data::e_Iupacna: {
3124  const string& s = seq_data.GetIupacna().Get();
3125  for (size_t n = 0; n < len && n < s.length(); n++) {
3126  if (s[n] == 'N') {
3127  total++;
3128  }
3129  }
3130  }
3131  return total;
3132  case CSeq_data::e_Ncbi8na:
3133  case CSeq_data::e_Ncbipna: {
3134  CSeq_data iupacna;
3135  if (! CSeqportUtil::Convert(seq_data, &iupacna, CSeq_data::e_Iupacna)) {
3136  return total;
3137  }
3138  const string& s = iupacna.GetIupacna().Get();
3139  for (size_t n = 0; n < len; n++) {
3140  if (s[n] == 'N') {
3141  total++;
3142  }
3143  }
3144  }
3145  return total;
3146  default:
3147  return total;
3148  }
3149 }
3150 
3151 
3153 {
3154  int count = 0;
3155  SSeqMapSelector sel;
3157  for (CSeqMap_CI seq_iter(bsh, sel); seq_iter; ++seq_iter) {
3158  switch (seq_iter.GetType()) {
3159  case CSeqMap::eSeqData:
3160  count += CountNs(seq_iter.GetData(), seq_iter.GetLength());
3161  break;
3162  default:
3163  break;
3164  }
3165  }
3166 /*
3167  int pct_n = 0;
3168  try {
3169  CSeqVector vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
3170  TSeqPos num_ns = 0;
3171  for (size_t i = 0; i < vec.size(); i++) {
3172  try {
3173  if (vec[i] == 'N' && !vec.IsInGap(i)) {
3174  num_ns++;
3175  }
3176  } catch (CException& e2) {
3177  //bad character
3178  }
3179  }
3180  pct_n = (num_ns * 100) / bsh.GetBioseqLength();
3181  } catch (CException& e) {
3182  pct_n = 100;
3183  }
3184 */
3185  return bsh.GetBioseqLength() ? count * 100 / bsh.GetBioseqLength() : 100;
3186 }
3187 
3188 static
3190 {
3193  bool is_first = true;
3194 
3195  if (inst.CanGetExt() && inst.GetExt().IsDelta()) {
3196  ITERATE(CDelta_ext::Tdata, iter, inst.GetExt().GetDelta().Get()) {
3197  if ((*iter)->IsLoc()) {
3198  return false;
3199  }
3200  if ((*iter)->IsLiteral()) {
3201  const CSeq_literal& lit = (*iter)->GetLiteral();
3202  if (lit.IsSetSeq_data() && lit.GetSeq_data().IsGap()) {
3203  const CSeq_gap& gap = lit.GetSeq_data().GetGap();
3205  if (gap.IsSetType()) {
3206  gaptype = gap.GetType();
3207  }
3208  if (is_first) {
3209  first = gaptype;
3210  } else {
3211  last = gaptype;
3212  }
3213  } else {
3215  }
3216  }
3217  is_first = false;
3218  }
3219  }
3220  fst = first;
3221  lst = last;
3222  return true;
3223 }
3224 
3225 static bool s_WillReportTerminalGap(const CBioseq& seq, CBioseq_Handle bsh)
3226 {
3227  if (! seq.IsSetInst() || ! seq.GetInst().IsSetRepr()) {
3228  return false;
3229  }
3230  if (! seq.GetInst().IsSetMol() || seq.GetInst().GetMol() == CSeq_inst::eMol_aa) {
3231  return false;
3232  }
3233  CSeq_inst::TRepr repr = seq.GetInst().GetRepr();
3234 
3235  if (repr != CSeq_inst::eRepr_delta) {
3236  return false;
3237  }
3238 
3239  if (! bsh) {
3240  return false;
3241  }
3242 
3243  if (! seq.GetInst().IsSetLength() || seq.GetInst().GetLength() < 10) {
3244  return false;
3245  }
3246 
3247  if (! ShouldCheckForNsAndGap(bsh)) {
3248  return false;
3249  }
3250 
3251  return true;
3252 }
3253 
3254 
3255 static optional<int> s_MaxSeqStretchIfLessThanThreshold(const CSeqVector& vec, int threshold)
3256 {
3257  int max_stretch = 0;
3258  auto IsN = [](char c) { return c == 'N'; };
3259 
3260  for (auto begin_it = find_if_not(begin(vec), end(vec), IsN);
3261  begin_it != end(vec);) {
3262  auto distanceToEnd = distance(begin_it, end(vec));
3263  // check a sequence interval no longer than the threshold length
3264  auto interval = (distanceToEnd > threshold) ? threshold : distanceToEnd;
3265  auto end_it = find_if(begin_it, next(begin_it, interval), IsN);
3266  const auto current_stretch = distance(begin_it, end_it);
3267  if (current_stretch >= threshold) { // No Ns in the interval
3268  return {};
3269  }
3270 
3271  if (current_stretch > max_stretch) {
3272  max_stretch = current_stretch;
3273  }
3274  begin_it = find_if_not(end_it, end(vec), IsN);
3275  }
3276  return max_stretch;
3277 }
3278 
3279 
3281 {
3282  if (! seq.IsSetInst() || ! seq.GetInst().IsSetRepr()) {
3283  // can't check if no Inst or Repr
3284  return;
3285  }
3286  if (! seq.GetInst().IsSetMol() || seq.GetInst().GetMol() == CSeq_inst::eMol_aa) {
3287  // don't check proteins here
3288  return;
3289  }
3290  CSeq_inst::TRepr repr = seq.GetInst().GetRepr();
3291 
3292  // only check for raw or for delta sequences that are delta lit only
3293  if (repr == CSeq_inst::eRepr_virtual || repr == CSeq_inst::eRepr_map) {
3294  return;
3295  }
3296 
3298  if (! bsh) {
3299  // no check if Bioseq not in scope
3300  return;
3301  }
3302 
3303  try {
3305 
3306  if (IsAllNs(vec)) {
3307  PostErr(m_Imp.IsPDB() ? eDiag_Warning : eDiag_Critical, eErr_SEQ_INST_AllNs, "Sequence is all Ns", seq);
3308  return;
3309  }
3310 
3311  // don't bother checking if length is less than 10
3312  if (! seq.IsSetInst() || ! seq.GetInst().IsSetRepr()
3313  || ! seq.GetInst().IsSetLength() || seq.GetInst().GetLength() < 10) {
3314  return;
3315  }
3316 
3317  if (const auto oMaxLength = s_MaxSeqStretchIfLessThanThreshold(vec, 10); oMaxLength.has_value()) {
3319  "Maximum contig length is " + NStr::IntToString(*oMaxLength) + " bases", seq);
3320  }
3321 
3326  bool begin_ambig = false, end_ambig = false;
3329  if (ShouldCheckForNsAndGap(bsh) && x_IsDeltaLitOnly(seq.GetInst())) {
3330  CheckBioseqEndsForNAndGap(vec, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
3331  s_GetFlankingGapTypes(seq.GetInst(), fst, lst);
3332  }
3333 
3334  bool is_circular = false;
3336  is_circular = true;
3337  }
3338  EDiagSev sev;
3339  if (begin_n != eBioseqEndIsType_None) {
3340  sev = GetBioseqEndWarning(seq, is_circular, begin_n);
3341  PostErr(sev, eErr_SEQ_INST_TerminalNs, "N at beginning of sequence", seq);
3342  } else if (begin_gap != eBioseqEndIsType_None && fst != CSeq_gap::eType_contamination) {
3343  sev = GetBioseqEndWarning(seq, is_circular, begin_gap);
3344  PostErr(sev, eErr_SEQ_INST_TerminalGap, "Gap at beginning of sequence", seq);
3345  }
3346 
3347  if (end_n != eBioseqEndIsType_None) {
3348  sev = GetBioseqEndWarning(seq, is_circular, end_n);
3349  PostErr(sev, eErr_SEQ_INST_TerminalNs, "N at end of sequence", seq);
3350  } else if (end_gap != eBioseqEndIsType_None && lst != CSeq_gap::eType_contamination) {
3351  sev = GetBioseqEndWarning(seq, is_circular, end_gap);
3352  PostErr(sev, eErr_SEQ_INST_TerminalGap, "Gap at end of sequence", seq);
3353  }
3354 
3355  if (begin_ambig && ! s_WillReportTerminalGap(seq, bsh)) {
3357  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases",
3358  seq);
3359  }
3360  if (end_ambig && ! s_WillReportTerminalGap(seq, bsh)) {
3362  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases",
3363  seq);
3364  }
3365 
3366  // don't check N content for patent sequences
3367  if (SeqIsPatent(seq)) {
3368  return;
3369  }
3370 
3371  // if TSA, check for percentage of Ns and max stretch of Ns
3372  if (IsBioseqTSA(seq, m_Scope)) {
3373  ReportBadAssemblyGap(seq);
3374  if (! HasAssemblyOrNullGap(seq)) {
3375  bool n5 = false;
3376  bool n3 = false;
3377  TSeqPos num_ns = 0, max_stretch = 0;
3378  x_CalculateNsStretchAndTotal(vec, num_ns, max_stretch, n5, n3);
3379 
3380  int pct_n = (num_ns * 100) / seq.GetLength();
3381  if (pct_n > 10) {
3383  "Sequence contains " + NStr::IntToString(pct_n) + " percent Ns", seq);
3384  }
3385 
3386  if (max_stretch >= 15) {
3388  "Sequence has a stretch of " + NStr::IntToString(max_stretch) + " Ns", seq);
3389  } else {
3390  if (n5) {
3392  "Sequence has a stretch of at least 10 Ns within the first 20 bases", seq);
3393  }
3394  if (n3) {
3396  "Sequence has a stretch of at least 10 Ns within the last 20 bases", seq);
3397  }
3398  }
3399  }
3400  } else {
3401  // not TSA, just check for really high N percent
3402  int pct_n = PctNs(bsh);
3403  if (pct_n > 50) {
3405  "Sequence contains " + NStr::IntToString(pct_n) + " percent Ns", seq);
3406  }
3407  }
3408 
3409  if (! m_Imp.IsRefSeqConventions() && ! IsRefSeq(seq) && ! IsEmblOrDdbj(seq)) {
3410  if (IsWGS(bsh)) {
3411  ReportBadWGSGap(seq);
3412  } else if (IsBioseqTSA(seq, m_Scope)) {
3413  ReportBadTSAGap(seq);
3414  } else if (m_Imp.IsGenomeSubmission()) {
3415  ReportBadGenomeGap(seq);
3416  }
3417  }
3418  } catch (exception&) {
3419  // just ignore, and continue with the validation process.
3420  }
3421 }
3422 
3424 {
3425  // rough measure of where exception occurs - triggered by certain conditions set up in unit_test_validator
3426  int errPt = 0;
3427 
3428  try {
3429 
3430  errPt++;
3431 
3432  if (! seq.IsSetInst() || ! seq.GetInst().IsSetRepr()) {
3433  // can't check if no Inst or Repr
3434  return;
3435  }
3436  if (! seq.GetInst().IsSetMol() || seq.GetInst().GetMol() == CSeq_inst::eMol_aa) {
3437  // don't check proteins here
3438  return;
3439  }
3440  CSeq_inst::TRepr repr = seq.GetInst().GetRepr();
3441 
3442  // only check for raw or for delta sequences that are delta lit only
3443  if (repr == CSeq_inst::eRepr_virtual || repr == CSeq_inst::eRepr_map) {
3444  return;
3445  }
3446 
3448  if (! bsh) {
3449  // no check if Bioseq not in scope
3450  return;
3451  }
3452 
3453  errPt++;
3454 
3455  vector<TSeqPos> gapPositions;
3456 
3457  SSeqMapSelector sel;
3459 
3460  CSeqMap_CI gap_it(bsh, sel);
3461 
3462  errPt++;
3463 
3464  for (; gap_it; ++gap_it) {
3465 
3466  TSeqPos gp_start = gap_it.GetPosition();
3467  TSeqPos gp_end = gap_it.GetEndPosition() - 1;
3468 
3469  gapPositions.push_back(gp_start);
3470  gapPositions.push_back(gp_end);
3471 
3472  // cout << "gap start: " << gp_start << ", end: " << gp_end << endl;
3473  }
3474 
3475  errPt++;
3476 
3477  vector<TSeqPos> featPositions;
3478 
3479  CFeat_CI feat_it(bsh);
3480 
3481  errPt++;
3482 
3483  for (; feat_it; ++feat_it) {
3484 
3485  CSeq_feat_Handle feat = feat_it->GetSeq_feat_Handle();
3486  CSeqFeatData::ESubtype subtype = feat.GetFeatSubtype();
3487  if (subtype != CSeqFeatData::eSubtype_gap) continue;
3488 
3489  CConstRef<CSeq_loc> feat_loc(&feat_it->GetLocation());
3490 
3491  int ft_start = feat_loc->GetStart(eExtreme_Positional);
3492  int ft_end = feat_loc->GetStop(eExtreme_Positional);
3493 
3494  featPositions.push_back(ft_start);
3495  featPositions.push_back(ft_end);
3496 
3497  // cout << "feat start: " << ft_start << ", end: " << ft_end << endl;
3498  }
3499 
3500  errPt++;
3501 
3502  int remaininig_gaps = (int) gapPositions.size() / 2;
3503  int remaining_feats = (int) featPositions.size() / 2;
3504 
3505  if (remaininig_gaps < 1 || remaining_feats < 1) {
3506  return;
3507  }
3508 
3509  int gap_idx = 0;
3510  int feat_idx = 0;
3511 
3512  TSeqPos gap_start = gapPositions[gap_idx];
3513  gap_idx++;
3514  TSeqPos gap_end = gapPositions[gap_idx];
3515  gap_idx++;
3516  remaininig_gaps--;
3517 
3518  TSeqPos feat_start = featPositions[feat_idx];
3519  feat_idx++;
3520  TSeqPos feat_end = featPositions[feat_idx];
3521  feat_idx++;
3522  remaining_feats--;
3523 
3524  errPt++;
3525 
3526  while (remaininig_gaps >= 0 && remaining_feats >= 0) {
3527  if (gap_end < feat_start) {
3528  if (remaininig_gaps <= 0) {
3529  return;
3530  }
3531  gap_start = gapPositions[gap_idx];
3532  gap_idx++;
3533  gap_end = gapPositions[gap_idx];
3534  gap_idx++;
3535  remaininig_gaps--;
3536  } else if (feat_end < gap_start) {
3537  if (remaining_feats <= 0) {
3538  return;
3539  }
3540  feat_start = featPositions[feat_idx];
3541  feat_idx++;
3542  feat_end = featPositions[feat_idx];
3543  feat_idx++;
3544  remaining_feats--;
3545  } else {
3546  // cout << "overlap gap start: " << gap_start << ", end: " << gap_end << ", feat start: " << feat_start << ", end: " << feat_end << endl;
3547  if (feat_start != gap_start || feat_end != gap_end) {
3548  PostErr(eDiag_Warning, eErr_SEQ_INST_InstantiatedGapMismatch, "Gap feature location does not match delta gap coordinates", seq);
3549  }
3550  if (remaininig_gaps <= 0) {
3551  return;
3552  }
3553  gap_start = gapPositions[gap_idx];
3554  gap_idx++;
3555  gap_end = gapPositions[gap_idx];
3556  gap_idx++;
3557  remaininig_gaps--;
3558  if (remaining_feats <= 0) {
3559  return;
3560  }
3561  feat_start = featPositions[feat_idx];
3562  feat_idx++;
3563  feat_end = featPositions[feat_idx];
3564  feat_idx++;
3565  remaining_feats--;
3566  }
3567  }
3568 
3569  errPt++;
3570 
3571  } catch (const exception&) {
3573  string("Exception " + NStr::IntToString(errPt) + " in GapByGapInst"), seq);
3574  }
3575 }
3576 
3577 // Assumes that seq is eRepr_raw or eRepr_inst
     // Checks the Seq-inst of such a Bioseq: fuzz and declared length, the
     // seq-data alphabet against the molecule type, the stored data length
     // against the declared length, individual residues (for the alphabets
     // that set check_alphabet), leading/trailing X, gap and termination
     // symbols in proteins, and long runs of N in raw nucleotide data.
     // NOTE(review): this listing is missing several source lines (the
     // signature at listing line 3578 and the opening line of most error
     // posts), so some statements below appear without their first line.
3579 {
3580  const CSeq_inst& inst = seq.GetInst();
3582  const string& rpr = tv->FindName(inst.GetRepr(), true);
3583 
      // Fuzz is reported unless the seq-data is a gap.
3584  if (inst.IsSetFuzz() && (!inst.IsSetSeq_data() || !inst.GetSeq_data().IsGap())) {
3586  "Fuzzy length on " + rpr + " Bioseq", seq);
3587  }
3588 
3589  if (! inst.IsSetLength() || inst.GetLength() == 0) {
3590  string len = inst.IsSetLength() ?
3591  NStr::IntToString(inst.GetLength()) : "0";
3593  "Invalid Bioseq length [" + len + "]", seq);
3594  }
3595 
3596  if (inst.GetRepr() == CSeq_inst::eRepr_raw) {
3597  const CMolInfo* mi = nullptr;
3599  if (mi_desc) {
3600  mi = &(mi_desc->GetMolinfo());
3601  }
3602  CMolInfo::TTech tech = mi ? mi->GetTech() : CMolInfo::eTech_unknown;
3603  if (tech == CMolInfo::eTech_htgs_2 &&
3604  ! GraphsOnBioseq() &&
3605  ! x_IsActiveFin()) {
3607  "HTGS 2 raw seq has no gaps and no graphs", seq);
3608  }
3609  }
3610 
3612 
3613  CSeq_data::E_Choice seqtyp = inst.IsSetSeq_data() ?
3615  if (seqtyp != CSeq_data::e_Gap) {
      // First switch: a nucleic-acid alphabet on a protein (or the reverse)
      // is reported and ends validation of this Bioseq.
3616  switch (seqtyp) {
3617  case CSeq_data::e_Iupacna:
3618  case CSeq_data::e_Ncbi2na:
3619  case CSeq_data::e_Ncbi4na:
3620  case CSeq_data::e_Ncbi8na:
3621  case CSeq_data::e_Ncbipna:
3622  if (inst.IsAa()) {
3624  "Using a nucleic acid alphabet on a protein sequence",
3625  seq);
3626  return;
3627  }
3628  break;
3629  case CSeq_data::e_Iupacaa:
3630  case CSeq_data::e_Ncbi8aa:
3631  case CSeq_data::e_Ncbieaa:
3632  case CSeq_data::e_Ncbipaa:
3634  if (inst.IsNa()) {
3636  "Using a protein alphabet on a nucleic acid",
3637  seq);
3638  return;
3639  }
3640  break;
3641  case CSeq_data::e_Gap:
3642  break;
3643  default:
3645  "Sequence alphabet not set",
3646  seq);
3647  return;
3648  }
3649 
      // Second switch: choose the divisor used to turn the declared length
      // into an expected data length, and decide whether residues are
      // inspected one by one (check_alphabet).
3650  bool check_alphabet = false;
3651  unsigned int factor = 1;
3652  switch (seqtyp) {
3653  case CSeq_data::e_Iupacaa:
3654  case CSeq_data::e_Iupacna:
3655  case CSeq_data::e_Ncbieaa:
3657  check_alphabet = true;
3658  break;
3659  case CSeq_data::e_Ncbi8na:
3660  case CSeq_data::e_Ncbi8aa:
3661  break;
3662  case CSeq_data::e_Ncbi4na:
3663  factor = 2;
3664  break;
3665  case CSeq_data::e_Ncbi2na:
3666  factor = 4;
3667  break;
3668  case CSeq_data::e_Ncbipna:
3669  factor = 5;
3670  break;
3671  case CSeq_data::e_Ncbipaa:
3672  factor = 21;
3673  break;
3674  default:
3675  // Logically, should not occur
3677  "Sequence alphabet not set",
3678  seq);
3679  return;
3680  }
      // calc_len = ceil(length / factor).
      // NOTE(review): factor is applied as a divisor for every alphabet,
      // including e_Ncbipna / e_Ncbipaa -- confirm that is intended.
3681  TSeqPos calc_len = inst.IsSetLength() ? inst.GetLength() : 0;
3682  if (calc_len % factor) {
3683  calc_len += factor;
3684  }
3685  calc_len /= factor;
3686 
      // NOTE(review): GetLength() is called here without the IsSetLength()
      // guard used a few lines above.
3687  string s_len = NStr::UIntToString(inst.GetLength());
3688 
3689  size_t data_len = GetDataLen(inst);
3690  string data_len_str = NStr::NumericToString(data_len * factor);
3691  if (calc_len > data_len) {
3693  "Bioseq.seq_data too short [" + data_len_str +
3694  "] for given length [" + s_len + "]", seq);
3695  return;
3696  } else if (calc_len < data_len) {
3698  "Bioseq.seq_data is larger [" + data_len_str +
3699  "] than given length [" + s_len + "]", seq);
3700  }
3701 
3702  if (check_alphabet) {
3703  unsigned int trailingX = 0;
3704  size_t dashes = 0;
3705  bool leading_x = false, found_lower = false, cds_5_prime = false;
3706 
3709 
      // Two iterators walk the sequence in lockstep; pos is the 1-based
      // position used in messages (pos - 1 is used for IsInGap).
3710  size_t bad_cnt = 0;
3711  TSeqPos pos = 1;
3712  for (CSeqVector_CI sv_iter(*sv), sv_res_iter(sv_res); (sv_iter) && (sv_res_iter); ++sv_iter, ++sv_res_iter) {
3713  CSeqVector::TResidue res = *sv_iter;
3714  CSeqVector::TResidue n_res = *sv_res_iter;
3715  if (! IsResidue(n_res)) {
3716  if (res == 'U' && bsh.IsSetInst_Mol() && bsh.GetInst_Mol() == CSeq_inst::eMol_rna) {
3717  // U is ok for RNA
3718  } else if (res == '*' && bsh.IsAa()) {
3719  trailingX = 0;
3720  } else if (res == '-' && bsh.IsAa()) {
3721  dashes++;
3723  "Invalid residue [" + NStr::UIntToString(res)
3724  + "] at position [" + NStr::UIntToString(pos) + "]",
3725  seq);
3726  } else {
3727  if (! IsResidue(res)) {
      // Per-residue reporting stops once more than 10 have been seen.
3728  if (++bad_cnt > 10) {
3730  "More than 10 invalid residues. Checking stopped",
3731  seq);
3732  return;
3733  } else {
3735  "Invalid residue [" + NStr::UIntToString(res)
3736  + "] at position [" + NStr::UIntToString(pos) + "]",
3737  seq);
3738  }
3739  } else if (islower(res)) {
3740  found_lower = true;
3741  } else {
3742  string msg = "Invalid";
3743  if (seq.IsNa() && strchr ("EFIJLOPQXZ", res) != NULL) {
3744  msg += " nucleotide";
3745  } else if (seq.IsNa() && res == 'U') {
3746  msg += " nucleotide";
3747  }
3748  msg += " residue ";
3749  if (seqtyp == CSeq_data::e_Ncbistdaa) {
3750  msg += "[" + NStr::UIntToString(res) + "]";
3751  } else {
3752  msg += "'";
3753  msg += res;
3754  msg += "'";
3755  }
3756  msg += " at position [" + NStr::UIntToString(pos) + "]";
3757 
3759  msg, seq);
3760  }
3761  }
3762  } else if (res == '-' || sv->IsInGap(pos - 1)) {
3763  dashes++;
3764  } else if (res == '*') {
3765  trailingX = 0;
3766  } else if (res == 'X') {
3767  trailingX++;
3768  if (pos == 1) {
3769  leading_x = true;
3770  }
3771  } else if (! isalpha(res)) {
3772  string msg = "Invalid residue [";
3773  msg += res;
3774  msg += "] in position [" + NStr::UIntToString(pos) + "]";
3776  msg, seq);
3777  } else {
3778  trailingX = 0;
3779  }
3780  ++pos;
3781  }
3782 
3783  bool gap_at_start = HasBadProteinStart(*sv);
3784  size_t terminations = CountProteinStops(*sv);
3785 
3786  // only show leading or trailing X if product of NNN in nucleotide
3787  if (seq.IsAa() && (leading_x || trailingX > 0)) {
3788  CBioseq_Handle bsh2 = m_Scope->GetBioseqHandle(seq);
3789  const CSeq_feat* cds = GetCDSForProduct(bsh2);
3790  if (cds && cds->IsSetLocation()) {
3791  const CSeq_loc& cdsloc = cds->GetLocation();
3792  size_t dna_len = GetLength(cdsloc, m_Scope);
3793  if (dna_len > 5) {
3794  string cds_seq = GetSequenceStringFromLoc(cdsloc, *m_Scope);
      // Drop the leading 1 or 2 bases for frame 2 / frame 3 so that
      // cds_seq starts on a codon boundary.
3795  if (cds->GetData().GetCdregion().IsSetFrame()) {
3796  if (cds->GetData().GetCdregion().GetFrame() == 2) {
3797  cds_seq = cds_seq.substr(1);
3798  } else if (cds->GetData().GetCdregion().GetFrame() == 3) {
3799  cds_seq = cds_seq.substr(2);
3800  }
3801  }
3802 
3803  if (! NStr::StartsWith(cds_seq, "NNN")) {
3804  leading_x = false;
3805  }
3806  if (cds_seq.length() >= 3) {
3807  string lastcodon = cds_seq.substr(cds_seq.length() - 3);
3808  if (! NStr::StartsWith(lastcodon, "NNN")) {
3809  trailingX = 0;
3810  }
3811  }
3812  }
3813  // only need to calculate cds_5_prime to set severity for subsequent eErr_SEQ_INST_LeadingX message
3814  if (leading_x) {
3815  if (cdsloc.IsPartialStart(eExtreme_Biological)) {
3816  cds_5_prime = true;
3817  }
3818  }
3819  }
3820  }
3821 
3822  if (leading_x) {
3823  EDiagSev sev = eDiag_Warning;
3824  if (cds_5_prime) {
3825  sev = eDiag_Info;
3826  }
3828  "Sequence starts with leading X", seq);
3829  }
3830 
3831  if (trailingX > 0 && ! SuppressTrailingXMsg(seq)) {
3832  // Suppress if cds ends in "*" or 3' partial
3833  string msg = "Sequence ends in " +
3834  NStr::IntToString(trailingX) + " trailing X";
3835  if (trailingX > 1) {
3836  msg += "s";
3837  }
3839  }
3840 
3841  if (found_lower) {
3843  "Sequence contains lower-case characters", seq);
3844  }
3845 
3846  if (terminations > 0 || dashes > 0) {
3847  // Post error indicating terminations found in protein sequence
3848  // if possible, get gene and protein names
3849  CBioseq_Handle bsh3 = m_Scope->GetBioseqHandle(seq);
3850  // First get gene label
3851  string gene_label;
3852  try {
3853  const CSeq_feat* cds = GetCDSForProduct(bsh3);
3854  if (cds) {
3856  if (gene && gene->IsSetData() && gene->GetData().IsGene()) {
3857  gene->GetData().GetGene().GetLabel(&gene_label);
3858  }
3859  }
3860  } catch (...) {
      // Label lookup is best-effort; on failure the "gene?" placeholder
      // below is used.
3861  }
3862  // get protein label
3863  string protein_label;
3864  try {
3865  CCacheImpl::SFeatKey prot_key(
3867  const CCacheImpl::TFeatValue& prots =
3868  GetCache().GetFeatFromCache(prot_key);
3869  if (! prots.empty()) {
3870  const CSeqFeatData_Base::TProt& first_prot =
3871  prots[0].GetData().GetProt();
3872  if (! RAW_FIELD_IS_EMPTY_OR_UNSET(first_prot, Name)) {
3873  protein_label = first_prot.GetName().front();
3874  }
3875  }
3876  } catch (const CException&) {
3877  } catch (const std::exception&) {
3878  }
3879 
3880  if (NStr::IsBlank(gene_label)) {
3881  gene_label = "gene?";
3882  }
3883  if (NStr::IsBlank(protein_label)) {
3884  protein_label = "prot?";
3885  }
3886 
      // When the protein starts with a gap symbol, that one is reported
      // separately and the remaining (dashes - 1) count as internal.
3887  if (dashes > 0) {
3888  if (gap_at_start && dashes == 1) {
3890  "gap symbol at start of protein sequence (" + gene_label + " - " + protein_label + ")",
3891  seq);
3892  } else if (gap_at_start) {
3894  "gap symbol at start of protein sequence (" + gene_label + " - " + protein_label + ")",
3895  seq);
3897  "[" + NStr::SizetToString (dashes - 1) + "] internal gap symbols in protein sequence (" + gene_label + " - " + protein_label + ")",
3898  seq);
3899  } else {
3901  "[" + NStr::SizetToString (dashes) + "] internal gap symbols in protein sequence (" + gene_label + " - " + protein_label + ")",
3902  seq);
3903  }
3904  }
3905 
3906  if (terminations > 0) {
3907  string msg = "[" + NStr::SizetToString(terminations) + "] termination symbols in protein sequence";
3908  msg += " (" + gene_label + " - " + protein_label + ")";
3909  const CSeq_feat* cds = GetCDSForProduct(bsh3);
3910  if (cds) {
3912  } else {
3914  }
3915  }
3916  }
3917  }
3918 
3919  bool is_wgs = IsWGS(bsh);
3920 
3921  if (seq.IsNa() && seq.GetInst().GetRepr() == CSeq_inst::eRepr_raw) {
3922  // look for runs of Ns and gap characters
3923  bool has_gap_char = false;
3924  size_t run_len = 0;
3925  TSeqPos start_pos = 0;
3926  TSeqPos pos = 1;
3928  const size_t run_len_cutoff = ( is_wgs ? 20 : 100 );
      // A run is evaluated when the first non-N residue after it is seen.
      // Runs starting at base 1 are excluded by start_pos > 1, and no check
      // follows the loop, so a run reaching the end of the sequence is not
      // reported here.
3929  for (CSeqVector_CI sv_iter(sv); (sv_iter); ++sv_iter, ++pos) {
3930  CSeqVector::TResidue res = *sv_iter;
3931  switch (res) {
3932  case 'N':
3933  if (run_len == 0) {
3934  start_pos = pos;
3935  }
3936  run_len++;
3937  break;
3938  case '-':
3939  has_gap_char = true;
3940  ///////////////////////////////////
3941  ////////// FALL-THROUGH! //////////
3942  ///////////////////////////////////
3944  default:
3945  if (run_len >= run_len_cutoff && start_pos > 1) {
3947  "Run of " + NStr::SizetToString (run_len) + " Ns in raw sequence starting at base "
3948  + NStr::IntToString (start_pos),
3949  seq);
3950  }
3951  run_len = 0;
3952  break;
3953  }
3954  }
3955  if (has_gap_char) {
3957  "Raw nucleotide should not contain gap characters", seq);
3958  }
3959  }
3960  }
3961 }
3962 
3963 
3964 //LCOV_EXCL_START
3965 //part of segset validation, no longer used
3966 // Assumes seq is eRepr_seg or eRepr_ref
     // Builds a Seq-loc from the Bioseq's extension data, validates it for
     // segmented Bioseqs, compares its length with the declared length,
     // reports segments that reference the same Bioseq more than once, and
     // (for proteins) checks MolInfo completeness against the partialness of
     // the location.
     // NOTE(review): the signature (listing line 3967), the switch case
     // labels and the first line of most error posts are absent from this
     // listing.
3968 {
3969  string id_test_label;
3970  seq.GetLabel(&id_test_label, CBioseq::eContent);
3971 
3973  const CSeq_inst& inst = seq.GetInst();
3974 
3975  // Validate extension data -- wrap in CSeq_loc_mix for convenience
3976  CRef<CSeq_loc> loc = GetLocFromSeq(seq);
3977  if (loc) {
3978  if (inst.IsSetRepr() && inst.GetRepr() == CSeq_inst::eRepr_seg) {
3979  m_Imp.ValidateSeqLoc(*loc, bsh, true, "Segmented Bioseq", seq);
3980  }
3981 
3982  // Validate Length
3983  try {
3984  TSeqPos loclen = GetLength(*loc, m_Scope);
3985  TSeqPos seqlen = inst.IsSetLength() ? inst.GetLength() : 0;
3986  if (seqlen > loclen) {
3988  "Bioseq.seq_data too short [" + NStr::IntToString(loclen) +
3989  "] for given length [" + NStr::IntToString(seqlen) + "]",
3990  seq);
3991  } else if (seqlen < loclen) {
3993  "Bioseq.seq_data is larger [" + NStr::IntToString(loclen) +
3994  "] than given length [" + NStr::IntToString(seqlen) + "]",
3995  seq);
3996  }
3997  } catch (const CObjmgrUtilException&) {
3998  ERR_POST_X(6, Critical << "Unable to calculate length: ");
3999  }
4000  }
4001 
4002  // Check for multiple references to the same Bioseq
4003  if (inst.IsSetExt() && inst.GetExt().IsSeg()) {
4004  const list<CRef<CSeq_loc>>& locs = inst.GetExt().GetSeg().Get();
      // Pairwise comparison of every segment with each later segment;
      // segments that do not resolve to a single Bioseq are skipped.
4005  ITERATE(list<CRef<CSeq_loc>>, i1, locs) {
4006  if (! IsOneBioseq(**i1, m_Scope)) {
4007  continue;
4008  }
4009  const CSeq_id& id1 = GetId(**i1, m_Scope);
4010  list<CRef<CSeq_loc>>::const_iterator i2 = i1;
4011  for (++i2; i2 != locs.end(); ++i2) {
4012  if (! IsOneBioseq(**i2, m_Scope)) {
4013  continue;
4014  }
4015  const CSeq_id& id2 = GetId(**i2, m_Scope);
4016  if (IsSameBioseq(id1, id2, m_Scope)) {
4017  string sid;
4018  id1.GetLabel(&sid);
4019  if ((**i1).IsWhole() && (**i2).IsWhole()) {
4022  "Segmented sequence has multiple references to " +
4023  sid, seq);
4024  } else {
4027  "Segmented sequence has multiple references to " +
4028  sid + " that are not SEQLOC_WHOLE", seq);
4029  }
4030  }
4031  }
4032  }
4033  }
4034 
4035  // Check that partial sequence info on sequence segments is consistent with
4036  // partial sequence info on sequence -- aa sequences only
      // NOTE(review): loc is dereferenced here outside the "if (loc)" guard
      // used above -- confirm GetLocFromSeq cannot return an empty CRef on
      // this path.
4037  int partial = SeqLocPartialCheck(*loc, m_Scope);
4038  if (seq.IsAa()) {
4039  bool got_partial = false;
4040  FOR_EACH_DESCRIPTOR_ON_BIOSEQ (sd, seq) {
4041  if (! (*sd)->IsMolinfo() || ! (*sd)->GetMolinfo().IsSetCompleteness()) {
4042  continue;
4043  }
4044 
4045  switch ((*sd)->GetMolinfo().GetCompleteness()) {
4047  got_partial = true;
4048  if (! partial) {
4050  "Complete segmented sequence with MolInfo partial", seq);
4051  }
4052  break;
4054  if (! (partial & eSeqlocPartial_Start) || (partial & eSeqlocPartial_Stop)) {
4056  "No-left inconsistent with segmented SeqLoc",
4057  seq);
4058  }
4059  got_partial = true;
4060  break;
4062  if (! (partial & eSeqlocPartial_Stop) || (partial & eSeqlocPartial_Start)) {
4064  "No-right inconsistent with segmented SeqLoc",
4065  seq);
4066  }
4067  got_partial = true;
4068  break;
4070  if (! (partial & eSeqlocPartial_Start) || ! (partial & eSeqlocPartial_Stop)) {
4072  "No-ends inconsistent with segmented SeqLoc",
4073  seq);
4074  }
4075  got_partial = true;
4076  break;
4077  default:
4078  break;
4079  }
4080  }
4081  if (! got_partial) {
4083  "Partial segmented sequence without MolInfo partial", seq);
4084  }
4085  }
4086 }
4087 //LCOV_EXCL_STOP
4088 
4089 
// Maps a MolInfo technique to the longest run of adjacent Ns tolerated in a
// Seq-literal: 19 for eTech_wgs, 99 by default, and 80 for the techniques
// whose case labels (listing lines 4095-4097) are absent from this listing.
// The caller compares the result against x_CountAdjacentNs() and treats a
// negative value as "no limit".
// NOTE(review): the signature (listing line 4090) is also missing here.
4091 {
4092  int max_ns = -1;
4093 
4094  switch (tech) {
4098  max_ns = 80;
4099  break;
4100  case CMolInfo::eTech_wgs:
4101  max_ns = 19;
4102  break;
4103  default:
4104  max_ns = 99;
4105  break;
4106  }
4107  return max_ns;
4108 }
4109 
4110 
// Tells whether at least one Seq-id on the Bioseq is a Swiss-Prot id.
// The scan stops at the first match.
4111 static bool s_IsSwissProt(const CBioseq& seq)
4112 {
4113     bool has_swissprot_id = false;
4114     FOR_EACH_SEQID_ON_BIOSEQ (seqid_iter, seq) {
4115         has_swissprot_id = (*seqid_iter)->IsSwissprot();
4116         if (has_swissprot_id) break;
4117     }
4118     return has_swissprot_id;
4119 }
4120 
// Ordering predicate for two Seq-loc references (q1, q2); it is passed to
// stable_sort over the collected delta locations. Orders by Seq-id
// (CompareOrdered), then by positional start, then by positional stop.
// Returns true when q1 sorts strictly before q2.
// NOTE(review): the signature (listing line 4121) is absent from this
// listing. GetId() results are dereferenced without a null check, while
// other code in this file tests the pointer first -- confirm callers only
// pass locations with a single id.
4122 {
4123  TIntId cmp = q1->GetId()->CompareOrdered(*(q2->GetId()));
4124  if (cmp < 0) {
4125  return true;
4126  } else if (cmp > 0) {
4127  return false;
4128  }
4129 
      // Same id: fall back to the start coordinate.
4130  TSeqPos start1 = q1->GetStart(eExtreme_Positional);
4131  TSeqPos start2 = q2->GetStart(eExtreme_Positional);
4132  if (start1 < start2) {
4133  return true;
4134  } else if (start2 < start1) {
4135  return false;
4136  }
4137 
      // Same id and start: the shorter (earlier-stopping) interval first.
4138  TSeqPos stop1 = q1->GetStop(eExtreme_Positional);
4139  TSeqPos stop2 = q2->GetStop(eExtreme_Positional);
4140 
4141  if (stop1 < stop2) {
4142  return true;
4143  } else {
4144  return false;
4145  }
4146 }
4147 
4148 
// Returns true when a delta Bioseq contains a Seq-loc component whose id
// matches (CSeq_id::e_YES) one of the Bioseq's own ids. Returns false for
// Bioseqs without an instance, without an extension, or whose extension is
// not a delta.
// NOTE(review): the signature (listing line 4149) is absent from this
// listing.
4150 {
4151  bool rval = false;
4152 
4153  if (! seq.IsSetInst() || ! seq.GetInst().IsSetExt() ||
4154  ! seq.GetInst().GetExt().IsDelta()) {
4155  return false;
4156  }
4157 
4158  ITERATE(CDelta_ext::Tdata, sg, seq.GetInst().GetExt().GetDelta().Get()) {
4159  if (! (*sg)) {
4160  // skip NULL element
4161  } else if ((*sg)->IsLoc()) {
      // Literal segments are ignored; only location segments can refer
      // back to this Bioseq.
4162  const CSeq_id* id = (*sg)->GetLoc().GetId();
4163  if (id) {
4164  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
4165  if ((*id_it)->Compare(*id) == CSeq_id::e_YES) {
4166  rval = true;
4167  break;
4168  }
4169  }
4170  }
      // The break above only leaves the inner id loop; this one leaves
      // the segment loop.
4171  if (rval) break;
4172  }
4173  }
4174  return rval;
4175 }
4176 
4177 
// For an interval location on a far Bioseq (far_bsh), reports whether any
// feature is found on the two bases immediately before the interval or the
// two bases immediately after it. Non-interval locations return false.
// NOTE(review): the signature (listing line 4178) is absent from this
// listing.
4179 {
4180  if (! loc.IsInt()) {
4181  return false;
4182  }
4183 
4184  TSeqPos stop = loc.GetStop(eExtreme_Positional);
4185  TSeqPos start = loc.GetStart(eExtreme_Positional);
4186 
      // Upstream probe: [start - 2, start - 1]; requires start >= 2.
4187  if (start > 1) {
4188  CRef<CSeq_loc> far_loc(new CSeq_loc());
4189  far_loc->SetInt().SetFrom(start - 2);
4190  far_loc->SetInt().SetTo(start - 1);
4191  far_loc->SetInt().SetId().Assign(loc.GetInt().GetId());
4192  CFeat_CI f(far_bsh.GetScope(), *far_loc);
4193  if (f) {
4194  return true;
4195  }
4196  }
      // Downstream probe: [stop + 1, stop + 2].
      // NOTE(review): if GetBioseqLength() returns an unsigned TSeqPos (its
      // result is stored in a TSeqPos elsewhere in this file), the "- 2"
      // wraps around for lengths below 2 -- confirm.
4197  if (stop < far_bsh.GetBioseqLength() - 2) {
4198  CRef<CSeq_loc> far_loc(new CSeq_loc());
4199  far_loc->SetInt().SetFrom(stop + 1);
4200  far_loc->SetInt().SetTo(stop + 2);
4201  far_loc->SetInt().SetId().Assign(loc.GetInt().GetId());
4202  CFeat_CI f(far_bsh.GetScope(), *far_loc);
4203  if (f) {
4204  return true;
4205  }
4206  }
4207  return false;
4208 }
4209 
4210 
// Validates one Seq-loc component of a delta Bioseq and adds its length to
// the running total.
//   loc  the delta component being checked
//   seq  the delta Bioseq that owns the component (used for reporting)
//   len  in/out accumulator; incremented by the component length unless
//        that length is numeric_limits<TSeqPos>::max()
// NOTE(review): the first signature line (listing line 4211) and the first
// line of each error post are absent from this listing.
4212  const CSeq_loc& loc,
4213  const CBioseq& seq,
4214  TSeqPos& len)
4215 {
4216  if (loc.IsWhole()) {
4218  "Delta seq component should not be of type whole", seq);
4219  }
4220 
4221  const CSeq_id* id = loc.GetId();
4222  if (id) {
4223  if (id->IsGi() && loc.GetId()->GetGi() == ZERO_GI) {
4225  "Delta component is gi|0", seq);
4226  }
      // The far-sequence checks below only run for non-whole locations
      // whose id is one of the listed id types.
4227  if (! loc.IsWhole()
4228  && (id->IsGi()
4229  || id->IsGenbank()
4230  || id->IsEmbl()
4231  || id->IsDdbj() || id->IsTpg()
4232  || id->IsTpe()
4233  || id->IsTpd()
4234  || id->IsOther())) {
4235  TSeqPos stop = loc.GetStop(eExtreme_Positional);
4236  try {
4238  if (bsh) {
4239  TSeqPos seq_len = bsh.GetBioseqLength();
      // stop is 0-based, so stop == seq_len is already past the end.
4240  if (seq_len <= stop) {
4241  string id_label = id->AsFastaString();
4243  "Seq-loc extent (" + NStr::IntToString (stop + 1)
4244  + ") greater than length of " + id_label
4245  + " (" + NStr::IntToString(seq_len) + ")",
4246  seq);
4247  }
4248  if (! m_Imp.IsRefSeq() && IsWGS(seq) && HasExcludedAnnotation(loc, bsh)) {
4249  string id_label = id->AsFastaString();
4251  "Scaffold points to some but not all of " +
4252  id_label + ", excluded portion contains features", seq);
4253  }
4254  } else {
4256  "Unable to find far delta sequence component", seq);
4257  }
4258  } catch (const CException&) {
      // Exceptions from the far-sequence lookup are deliberately ignored.
4259  } catch (const std::exception&) {
4260  }
4261  }
4262  }
4263 
4264  try {
4265  if (seq.IsSetInst()) {
4266  const CSeq_inst& inst = seq.GetInst();
4267  TSeqPos loc_len = GetLength(loc, m_Scope);
      // numeric_limits<TSeqPos>::max() is reported as a "-1" length and is
      // not added to len.
4268  if (loc_len == numeric_limits<TSeqPos>::max()) {
4270  "-1 length on seq-loc of delta seq_ext", seq);
4271  string loc_str;
4272  loc.GetLabel(&loc_str);
4273  if (loc_str.empty()) {
4274  loc_str = "?";
4275  }
4276  if (x_IsDeltaLitOnly(inst)) {
4278  "Short length (-1) on seq-loc (" + loc_str + ") of delta seq_ext", seq);
4279  }
4280  } else {
4281  len += loc_len;
4282  }
4283  if (loc_len <= 10) {
4284  string loc_str;
4285  loc.GetLabel(&loc_str);
4286  if (loc_str.empty()) {
4287  loc_str = "?";
4288  }
4289  if (x_IsDeltaLitOnly(inst)) {
4291  "Short length (" + NStr::SizetToString(loc_len) +
4292  ") on seq-loc (" + loc_str + ") of delta seq_ext", seq);
4293  }
4294  }
4295  }
4296 
4297  } catch (const CObjmgrUtilException&) {
      // Reached when the length of the location cannot be computed.
4298  string loc_str;
4299  loc.GetLabel(&loc_str);
4300  if (loc_str.empty()) {
4301  loc_str = "?";
4302  }
4304  "No length for Seq-loc (" + loc_str + ") of delta seq-ext",
4305  seq);
4306  }
4307 }
4308 
4309 
// Length contributed by one delta segment: the literal's own length, the
// length of the referenced location as resolved through scope, or 0 when
// the segment is neither a literal nor a location.
4310 static TSeqPos s_GetDeltaLen(const CDelta_seq& seg, CScope* scope)
4311 {
4312     if (seg.IsLiteral()) {
4313         return seg.GetLiteral().GetLength();
4314     }
4315     if (seg.IsLoc()) {
4316         return GetLength(seg.GetLoc(), scope);
4317     }
4318     return 0;
4319 }
4320 
4321 
// Display names for linkage-evidence counters, indexed the same way as the
// 13-slot counter array used when validating a Seq-gap: indices 0-10 are
// used directly as the evidence type value (8 is "unspecified"), index 11
// is used for type value 255 ("other") and index 12 for any other value.
// The table is only read, so it is declared const to rule out accidental
// modification of file-scope state.
4322 static const string linkEvStrings[] = {
4323     "paired-ends",
4324     "align genus",
4325     "align xgenus",
4326     "align trnscpt",
4327     "within clone",
4328     "clone contig",
4329     "map",
4330     "strobe",
4331     "unspecified",
4332     "pcr",
4333     "proximity ligation",
4334     "other",
4335     "UNKNOWN VALUE"
4336 };
4337 
4338 /*bsv
4339 static bool s_IsGapComponent(const CDelta_seq& seg)
4340 {
4341  if (! seg.IsLiteral()) return false;
4342  const CSeq_literal& lit = seg.GetLiteral();
4343  if (! lit.IsSetSeq_data()) return true;
4344  if (lit.GetSeq_data().IsGap() && lit.GetLength() > 0) return true;
4345  return false;
4346 }
4347 */
4348 
// Returns true when at least one linkage-evidence entry of the gap has type
// value 8, the slot labelled "unspecified" in linkEvStrings. Entries whose
// type cannot be read are skipped.
// NOTE(review): the loop header over the gap's linkage evidence (listing
// line 4352, which introduces ev_itr) is absent from this listing.
4349 static bool s_IsUnspecified(const CSeq_gap& gap)
4350 {
4351  bool is_unspec = false;
4353  const CLinkage_evidence& evidence = **ev_itr;
4354  if (! evidence.CanGetType())
4355  continue;
4356  int linktype = evidence.GetType();
4357  if (linktype == 8) {
4358  is_unspec = true;
4359  }
4360  }
4361  return is_unspec;
4362 }
4363 
4364 
// Decides whether a gap at the first or last position of a delta sequence
// should be left unreported. Returns true for the topology case tested
// first (circular, per the comment below), or when the gap type is
// centromere, telomere, heterochromatin, short-arm or contamination AND the
// Bioseq's source descriptor has genome == chromosome. Otherwise false.
// NOTE(review): the signature (listing line 4365) and the second half of
// the topology condition (listing line 4369) are absent from this listing.
4366 {
4367  // always ignore for circular sequences
4368  if (bsh.GetInst().IsSetTopology() &&
4370  return true;
4371  }
4372 
4373  // ignore if location is genomic and gap is of certain type
4374  if (gap_type != CSeq_gap::eType_centromere &&
4375  gap_type != CSeq_gap::eType_telomere &&
4376  gap_type != CSeq_gap::eType_heterochromatin &&
4377  gap_type != CSeq_gap::eType_short_arm &&
4378  gap_type != CSeq_gap::eType_contamination) {
4379  return false;
4380  }
4381 
      // Only the first source descriptor found on the Bioseq is consulted.
4382  CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
4383  if (src && src->GetSource().IsSetGenome() && src->GetSource().GetGenome() == CBioSource::eGenome_chromosome) {
4384  return true;
4385  } else {
4386  return false;
4387  }
4388 }
4389 
4390 
4391 // Assumes seq is a delta sequence
     // Walks the delta extension segment by segment, accumulating the total
     // length and gap statistics, then reports: a technique not accepted for
     // delta sequences, invalid residues and long N runs in literals,
     // zero-length or mis-sized gaps, first/last/adjacent gaps, a length
     // mismatch with the declared Bioseq length, missing gaps in HTGS
     // records, overlapping far components, self reference, and Ns adjacent
     // to gaps.
     // NOTE(review): the signature (listing line 4392), a few declarations
     // (for example gap_type and bsh) and the first line of most error posts
     // are absent from this listing.
4393 {
4394  const CSeq_inst& inst = seq.GetInst();
4395 
4396  // Get CMolInfo and tech used for validating technique and gap positioning
4397  const CMolInfo* mi = nullptr;
4399  if (mi_desc) {
4400  mi = &(mi_desc->GetMolinfo());
4401  }
4402  CMolInfo::TTech tech = mi ? mi->GetTech() : CMolInfo::eTech_unknown;
4403 
      // NOTE(review): this condition only reports; there is no return, and
      // the segment loop below calls inst.GetExt().GetDelta() regardless --
      // confirm callers guarantee a delta extension.
4404  if (! inst.IsSetExt() || ! inst.GetExt().IsDelta() ||
4405  inst.GetExt().GetDelta().Get().empty()) {
4407  "No CDelta_ext data for delta Bioseq", seq);
4408  }
4409 
4410  bool any_tech_ok = false;
4411  bool has_gi = false;
4412  FOR_EACH_SEQID_ON_BIOSEQ (id_it, seq) {
4413  if (IsNTNCNWACAccession(**id_it)) {
4414  any_tech_ok = true;
4415  break;
4416  } else if ((*id_it)->IsGi()) {
4417  has_gi = true;
4418  }
4419  }
4421  if (! any_tech_ok && seq.IsNa()
4422  && tech != CMolInfo::eTech_htgs_0 && tech != CMolInfo::eTech_htgs_1
4423  && tech != CMolInfo::eTech_htgs_2 && tech != CMolInfo::eTech_htgs_3
4426  && tech != CMolInfo::eTech_htc && tech != CMolInfo::eTech_barcode
4427  && tech != CMolInfo::eTech_tsa) {
4429  "Delta seq technique should not be [" + NStr::IntToString(tech) + "]", seq);
4430  }
4431 
4432  // set severity for first / last gap error
      // len accumulates the total length of all segments; seg is the
      // 1-based segment counter used in messages.
4433  TSeqPos len = 0;
4434  TSeqPos seg = 0;
4435  bool last_is_gap = false;
4436  int prev_gap_linkage = -1;
4437  CSeq_gap::TType prev_gap_type = CSeq_gap::eType_unknown;
4438  int gap_linkage = -1;
4440  size_t num_gaps = 0;
4441  size_t num_adjacent_gaps = 0;
4442  bool non_interspersed_gaps = false;
4443  bool first = true;
4444  int num_gap_known_or_spec = 0;
4445  int num_gap_unknown_unspec = 0;
4446 
4447  vector<CConstRef<CSeq_loc> > delta_locs;
4448 
4449  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
4450  ++seg;
4451  if (! (*sg)) {
4453  "NULL pointer in delta seq_ext valnode (segment " +
4454  NStr::IntToString(seg) + ")", seq);
4455  continue;
4456  }
4457  switch ((**sg).Which()) {
4458  case CDelta_seq::e_Loc: {
4459  const CSeq_loc& loc = (**sg).GetLoc();
4460  CConstRef<CSeq_loc> tmp(&loc);
4461  delta_locs.push_back(tmp);
4462 
4463  ValidateDeltaLoc(loc, seq, len);
4464 
      // Two sequence-bearing segments in a row means gaps are not
      // interspersed between all sequence runs.
4465  if (! last_is_gap && ! first) {
4466  non_interspersed_gaps = true;
4467  }
4468  last_is_gap = false;
4469  prev_gap_linkage = -1;
4470  prev_gap_type = CSeq_gap::eType_unknown;
      // NOTE(review): a gap *type* constant is assigned to the linkage
      // variable here -- confirm whether gap_type was intended.
4471  gap_linkage = CSeq_gap::eType_unknown;
4472  first = false;
4473  } break;
4474  case CDelta_seq::e_Literal: {
4475  // The C toolkit code checks for valid alphabet here
4476  // The C++ object serialization will not load if invalid alphabet
4477  // so no check needed here
4478  const CSeq_literal& lit = (*sg)->GetLiteral();
4479  TSeqPos start_len = len;
4480  len += lit.CanGetLength() ? lit.GetLength() : 0;
4481  if (lit.IsSetSeq_data() && ! lit.GetSeq_data().IsGap()
4482  && (! lit.IsSetLength() || lit.GetLength() == 0)) {
4484  "Seq-lit of length 0 in delta chain", seq);
4485  }
4486 
4487  // Check for invalid residues
4488  if (lit.IsSetSeq_data() && !lit.GetSeq_data().IsGap()) {
4489  if (! last_is_gap && ! first) {
4490  non_interspersed_gaps = true;
4491  }
4492  last_is_gap = false;
4493  prev_gap_linkage = -1;
4494  prev_gap_type = CSeq_gap::eType_unknown;
4495  const CSeq_data& data = lit.GetSeq_data();
      // badIdx receives the 0-based indices of invalid residues.
4496  vector<TSeqPos> badIdx;
4497  CSeqportUtil::Validate(data, &badIdx);
4498  const string* ss = nullptr;
4499  switch (data.Which()) {
4500  case CSeq_data::e_Iupacaa:
4501  ss = &data.GetIupacaa().Get();
4502  break;
4503  case CSeq_data::e_Iupacna:
4504  ss = &data.GetIupacna().Get();
4505  break;
4506  case CSeq_data::e_Ncbieaa:
4507  ss = &data.GetNcbieaa().Get();
4508  break;
4509  case CSeq_data::e_Ncbistdaa: {
4510  const vector<char>& c = data.GetNcbistdaa().Get();
4511  ITERATE (vector<TSeqPos>, ci, badIdx) {
4513  "Invalid residue [" +
4514  NStr::IntToString((int)c[*ci]) + "] at position [" +
4515  NStr::IntToString((*ci) + 1) + "]", seq);
4516  }
4517  } break;
4518  default:
4519  break;
4520  }
4521 
      // For the text alphabets the offending character itself is shown.
4522  if (ss) {
4523  ITERATE (vector<TSeqPos>, it, badIdx) {
4525  "Invalid residue [" +
4526  ss->substr(*it, 1) + "] at position [" +
4527  NStr::IntToString((*it) + 1) + "]", seq);
4528  }
4529  }
4530 
4531  if (mi) {
4532  // Count adjacent Ns in Seq-lit
4533  int max_ns = s_MaxNsInSeqLitForTech(tech);
4534  size_t adjacent_ns = x_CountAdjacentNs(lit);
4535  if (max_ns >= 0 && adjacent_ns > unsigned(max_ns)) {
4537  "Run of " + NStr::NumericToString(adjacent_ns) +
4538  " Ns in delta component " + NStr::UIntToString(seg) +
4539  " that starts at base " + NStr::UIntToString(start_len + 1),
4540  seq);
4541  }
4542  }
4543  } else {
      // Literal without sequence data, or with gap data: treated as a gap.
4544  gap_linkage = -1;
4545  gap_type = CSeq_gap::eType_unknown;
4546  if (lit.IsSetSeq_data() && lit.GetSeq_data().IsGap()) {
4547  const CSeq_data& data = lit.GetSeq_data();
4548  if (data.Which() == CSeq_data::e_Gap) {
4549  const CSeq_gap& gap = data.GetGap();
4550 
4551  if (gap.IsSetType()) {
4552  gap_type = gap.GetType();
4553  if (gap_type == CSeq_gap::eType_unknown && s_IsUnspecified(gap)) {
4554  num_gap_unknown_unspec++;
4555  } else {
4556  num_gap_known_or_spec++;
4557  }
4558  }
4559  if (gap.IsSetLinkage())
4560  gap_linkage = gap.GetLinkage();
4561  }
4562  }
4563  if (first && ! x_IgnoreEndGap(bsh, gap_type) && ! s_WillReportTerminalGap(seq, bsh)) {
      // Error for the HTGS techniques, Warning for everything else.
4564  EDiagSev sev = eDiag_Error;
4565  if (tech != CMolInfo::eTech_htgs_0 && tech != CMolInfo::eTech_htgs_1
4566  && tech != CMolInfo::eTech_htgs_2 && tech != CMolInfo::eTech_htgs_3) {
4567  sev = eDiag_Warning;
4568  }
4570  "First delta seq component is a gap", seq);
4571  }
4572 
      // Two consecutive gaps count as adjacent when any of the three
      // conditions holds, unless either gap is a contamination gap.
4573  if (last_is_gap &&
4574  (prev_gap_type == gap_type ||
4575  prev_gap_linkage != gap_linkage ||
4576  gap_linkage != CSeq_gap::eLinkage_unlinked)) {
4577  if (prev_gap_type != CSeq_gap::eType_contamination && gap_type != CSeq_gap::eType_contamination) {
4578  ++num_adjacent_gaps;
4579  }
4580  }
4581 
4582  if (lit.IsSetSeq_data() && lit.GetSeq_data().IsGap()) {
4583  ValidateSeqGap(lit.GetSeq_data().GetGap(), seq);
4584  } else if (! lit.CanGetLength() || lit.GetLength() == 0) {
4585  if (! lit.IsSetFuzz() || ! lit.GetFuzz().IsLim() || lit.GetFuzz().GetLim() != CInt_fuzz::eLim_unk) {
4587  "Gap of length 0 in delta chain", seq);
4588  } else {
4590  "Gap of length 0 with unknown fuzz in delta chain", seq);
4591  }
4592  } else if (lit.CanGetLength() && lit.GetLength() != 100) {
4593  if (lit.IsSetFuzz()) {
4595  "Gap of unknown length should have length 100", seq);
4596  }
4597  }
4598  last_is_gap = true;
4599  prev_gap_type = gap_type;
4600  prev_gap_linkage = gap_linkage;
4601  ++num_gaps;
4602  }
4603  first = false;
4604  } break;
4605  default:
4607  "CDelta_seq::Which() is e_not_set", seq);
4608  }
4609  }
4610 
      // Reported only when every typed gap is unknown/unspecified.
4611  if (num_gap_unknown_unspec > 0 && num_gap_known_or_spec == 0) {
4612  if (num_gap_unknown_unspec > 1) {
4614  "All " + NStr::IntToString(num_gap_unknown_unspec) +
4615  " Seq-gaps have unknown type and unspecified linkage", seq);
4616  } else {
4618  "Single Seq-gap has unknown type and unspecified linkage", seq);
4619  }
4620  }
4621 
      // Compare the declared length with the sum of the segment lengths.
4622  if (inst.GetLength() > len) {
4624  "Bioseq.seq_data too short [" + NStr::IntToString(len) +
4625  "] for given length [" + NStr::IntToString(inst.GetLength()) +
4626  "]", seq);
4627  } else if (inst.GetLength() < len) {
4629  "Bioseq.seq_data is larger [" + NStr::IntToString(len) +
4630  "] than given length [" + NStr::IntToString(inst.GetLength()) +
4631  "]", seq);
4632  }
4633  if (non_interspersed_gaps && ! has_gi && mi &&
4634  (tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
4635  tech == CMolInfo::eTech_htgs_2)) {
      // Severity drops to Info when a RefGeneTracking user object is found.
4636  EDiagSev missing_gaps_sev = eDiag_Error;
4638  while (desc_i) {
4639  if (desc_i->GetUser().IsRefGeneTracking()) {
4640  missing_gaps_sev = eDiag_Info;
4641  break;
4642  }
4643  ++desc_i;
4644  }
4645 
4646  PostErr(missing_gaps_sev, eErr_SEQ_INST_MissingGaps,
4647  "HTGS delta seq should have gaps between all sequence runs", seq);
4648  }
4649  if (num_adjacent_gaps >= 1) {
4650  string msg = (num_adjacent_gaps == 1) ?
4651  "There is 1 adjacent gap in delta seq" :
4652  "There are " + NStr::SizetToString(num_adjacent_gaps) +
4653  " adjacent gaps in delta seq";
4655  }
      // gap_type still holds the type of the last gap literal processed.
4656  if (last_is_gap && ! x_IgnoreEndGap(bsh, gap_type) && ! s_WillReportTerminalGap(seq, bsh)) {
4657  EDiagSev sev = eDiag_Error;
4658  if (tech != CMolInfo::eTech_htgs_0 && tech != CMolInfo::eTech_htgs_1
4659  && tech != CMolInfo::eTech_htgs_2 && tech != CMolInfo::eTech_htgs_3) {
4660  sev = eDiag_Warning;
4661  }
4663  "Last delta seq component is a gap", seq);
4664  }
4665 
4666  // Validate technique
4667  if (num_gaps == 0 && mi) {
4668  if (tech == CMolInfo::eTech_htgs_2 &&
4669  ! GraphsOnBioseq() &&
4670  ! x_IsActiveFin()) {
4672  "HTGS 2 delta seq has no gaps and no graphs", seq);
4673  }
4674  }
4675 
4676  // look for multiple delta locs overlapping
      // After sorting, only neighbouring pairs (it1, it2) are compared.
4677  if (delta_locs.size() > 1) {
4678  stable_sort(delta_locs.begin(), delta_locs.end(), s_LocSortCompare);
4679  vector<CConstRef<CSeq_loc>>::iterator it1 = delta_locs.begin();
4680  vector<CConstRef<CSeq_loc>>::iterator it2 = it1;
4681  ++it2;
4682  while (it2 != delta_locs.end()) {
4683  if ((*it1)->GetId()->Compare(*(*it2)->GetId()) == CSeq_id::e_YES
4684  && Compare (**it1, **it2, m_Scope, fCompareOverlapping) != eNoOverlap) {
4685  string seq_label = (*it1)->GetId()->AsFastaString();
4687  "Overlapping delta range " + NStr::IntToString((*it2)->GetStart(eExtreme_Positional) + 1)
4688  + "-" + NStr::IntToString((*it2)->GetStop(eExtreme_Positional) + 1)
4689  + " and " + NStr::IntToString((*it1)->GetStart(eExtreme_Positional) + 1)
4690  + "-" + NStr::IntToString((*it1)->GetStop(eExtreme_Positional) + 1)
4691  + " on a Bioseq " + seq_label,
4692  seq);
4693  }
4694  ++it1;
4695  ++it2;
4696  }
4697  }
4698 
4699  if (IsSelfReferential(seq)) {
4701  "Self-referential delta sequence", seq);
4702  }
4703 
4704  // look for Ns next to gaps
4705  if (seq.IsNa() && seq.GetLength() > 1 && x_IsDeltaLitOnly(inst)) {
4706  try {
      // pos is the 0-based start of the current segment in the assembled
      // sequence; both segment boundaries are inspected.
4707  TSeqPos pos = 0;
4709  ITERATE (CDelta_ext::Tdata, delta_i, seq.GetInst().GetExt().GetDelta().Get()) {
4710  if (delta_i->Empty()) {
4711  continue; // Ignore NULLs, reported separately above.
4712  }
4713  const CDelta_seq& seg2 = **delta_i;
4714  TSeqPos delta_len = s_GetDeltaLen(seg2, m_Scope);
      // Segment starts in a gap: look at the residue just before it.
4715  if (pos > 0) {
4716  if (sv.IsInGap(pos)) {
4717  CSeqVector::TResidue res = sv[pos - 1];
4718  if (res == 'N' && ! sv.IsInGap(pos - 1)) {
4720  "Ambiguous residue N is adjacent to a gap around position " + NStr::SizetToString (pos + 1),
4721  seq);
4722  }
4723  }
4724  }
      // Segment ends in a gap: look at the residue just after it.
4725  if (delta_len > 0 && pos + delta_len < len) {
4726  if (sv.IsInGap(pos + delta_len - 1)) {
4727  CSeqVector::TResidue res = sv[pos + delta_len];
4728  if (res == 'N' && ! sv.IsInGap(pos + delta_len)) {
4730  "Ambiguous residue N is adjacent to a gap around position " + NStr::SizetToString(pos + delta_len + 1),
4731  seq);
4732  }
4733  }
4734  }
4735  pos += delta_len;
4736  }
4737  } catch (const CException&) {
      // Failures while reading the sequence are deliberately ignored.
4738  } catch (const std::exception&) {
4739  }
4740  }
4741 
4742 }
4743 
4744 
4745 bool s_HasGI(const CBioseq& seq)
4746 {
4747  bool has_gi = false;
4748  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
4749  if ((*id_it)->IsGi()) {
4750  has_gi = true;
4751  break;
4752  }
4753  }
4754  return has_gi;
4755 }
4756 
4757 
// Checks a Seq-gap (`gap`) for consistency between its gap type, its linkage
// field and its list of linkage-evidence entries; problems are reported
// against `seq`.
// NOTE(review): the signature line, the loop header over the evidence list and
// the first line of every error-posting call are absent from this listing;
// only the message strings and the trailing `seq` argument remain visible.
4759 {
4760  if (gap.IsSetLinkage_evidence()) {
// linkevarray is a histogram of evidence types: slots 0..10 are indexed by the
// type value itself, slot 11 counts value 255, slot 12 counts everything else.
4761  int linkcount = 0;
4762  int linkevarray[13];
4763  for (int i = 0; i < 13; i++) {
4764  linkevarray[i] = 0;
4765  }
4766  bool is_unspec = false;
4768  const CLinkage_evidence& evidence = **ev_itr;
4769  if (! evidence.CanGetType())
4770  continue;
4771  int linktype = evidence.GetType();
// value 8 is the one treated as "unspecified" by the checks below
4772  if (linktype == 8) {
4773  is_unspec = true;
4774  }
4775  linkcount++;
4776  if (linktype == 255) {
4777  (linkevarray[11])++;
4778  } else if (linktype < 0 || linktype > 10) {
4779  (linkevarray[12])++;
4780  } else {
4781  (linkevarray[linktype])++;
4782  }
4783  }
// "unspecified" evidence must not be combined with any other evidence
4784  if (linkevarray[8] > 0 && linkcount > linkevarray[8]) {
4786  "Seq-gap type has unspecified and additional linkage evidence", seq);
4787  }
// each evidence kind may appear at most once; linkEvStrings is defined outside this block
4788  for (int i = 0; i < 13; i++) {
4789  if (linkevarray[i] > 1) {
4791  "Linkage evidence '" + linkEvStrings[i] + "' appears " +
4792  NStr::IntToString(linkevarray[i]) + " times", seq);
4793  }
4794  }
4795  if (! gap.IsSetLinkage() || gap.GetLinkage() != CSeq_gap::eLinkage_linked) {
4797  "Seq-gap with linkage evidence must have linkage field set to linked", seq);
4798  }
4799  if (gap.IsSetType()) {
4800  int gaptype = gap.GetType();
// fragment, clone, repeat and scaffold gaps may carry evidence without further checks
4801  if (gaptype != CSeq_gap::eType_fragment &&
4802  gaptype != CSeq_gap::eType_clone &&
4803  gaptype != CSeq_gap::eType_repeat &&
4804  gaptype != CSeq_gap::eType_scaffold) {
4805  if (gaptype == CSeq_gap::eType_unknown && is_unspec) {
4806  /* suppress for legacy records */
4807  } else if (gaptype == CSeq_gap::eType_contamination) {
4808  if (linkevarray[8] > 0 && linkcount == linkevarray[8]) {
4809  /* contamination can only have linked unspecified */
4810  } else {
4812  "Contamination gaps must have linkage evidence 'unspecified'", seq);
4813  }
4814  } else {
4816  "Seq-gap of type " + NStr::IntToString(gaptype) +
4817  " should not have linkage evidence", seq);
4818  }
4819  }
4820  }
4821  } else {
// no linkage evidence at all: some gap types require it
4822  if (gap.IsSetType()) {
4823  int gaptype = gap.GetType();
4824  if (gaptype == CSeq_gap::eType_scaffold) {
4826  "Seq-gap type == scaffold is missing required linkage evidence", seq);
4827  }
4828  if (gaptype == CSeq_gap::eType_repeat && gap.IsSetLinkage() && gap.GetLinkage() == CSeq_gap::eLinkage_linked) {
// The repeat/linked report is suppressed for records that have a GI and whose
// create-date descriptor is earlier than 2012-10-01.
4829  bool suppress_SEQ_INST_SeqGapProblem = false;
4830  if (seq.IsSetDescr() && s_HasGI(seq)) {
4832  {
4833  if ((**it).IsCreate_date())
4834  {
4835  CDate threshold_date(CTime(2012, 10, 1));
4836  if ((**it).GetCreate_date().Compare(threshold_date) == CDate::eCompare_before)
4837  suppress_SEQ_INST_SeqGapProblem = true;
// the break is not governed by the unbraced `if` above: the scan stops at the
// first create-date descriptor whatever the comparison result
4838  break;
4839  }
4840  }
4841  }
4842  if (! suppress_SEQ_INST_SeqGapProblem)
4844  "Seq-gap type == repeat and linkage == linked is missing required linkage evidence", seq);
4845 
4846  }
4847  if (gaptype == CSeq_gap::eType_contamination) {
4849  "Contamination gap-types must be linked and have linkage-evidence of type 'unspecified'", seq);
4850  }
4851  }
4852  }
4853 }
4854 
4855 
// Checks that the presence of Bioseq-ext and Seq-data on `inst` agrees with the
// representation class returned by inst.GetRepr(). Returns true when no
// inconsistency was found and false as soon as any check fails (all checks for
// the matching representation still run).
// NOTE(review): the line with the function name, the declaration of `tv`, several
// `case` labels and every error-posting call are absent from this listing; the
// four message strings built below are presumably what those calls report.
4857  const CSeq_inst& inst,
4858  const CBioseq& seq)
4859 {
4860  bool rtn = true;
// rpr is the textual name of the representation; "ref" and "const" are expanded
// to full words for use in the messages
4862  string rpr = tv->FindName(inst.GetRepr(), true);
4863  if (NStr::Equal(rpr, "ref")) {
4864  rpr = "reference";
4865  } else if (NStr::Equal(rpr, "const")) {
4866  rpr = "constructed";
4867  }
4868  const string err0 = "Bioseq-ext not allowed on " + rpr + " Bioseq";
4869  const string err1 = "Missing or incorrect Bioseq-ext on " + rpr + " Bioseq";
4870  const string err2 = "Missing Seq-data on " + rpr + " Bioseq";
4871  const string err3 = "Seq-data not allowed on " + rpr + " Bioseq";
4872  switch (inst.GetRepr()) {
// (case label not visible) neither an ext nor Seq-data is accepted here
4874  if (inst.IsSetExt()) {
4876  rtn = false;
4877  }
4878  if (inst.IsSetSeq_data()) {
4880  rtn = false;
4881  }
4882  break;
// map / ref / seg: the ext must be present and of the matching kind, Seq-data must be absent
4883  case CSeq_inst::eRepr_map:
4884  if (! inst.IsSetExt() || ! inst.GetExt().IsMap()) {
4886  rtn = false;
4887  }
4888  if (inst.IsSetSeq_data()) {
4890  rtn = false;
4891  }
4892  break;
4893  case CSeq_inst::eRepr_ref:
4894  if (! inst.IsSetExt() || ! inst.GetExt().IsRef() ) {
4896  rtn = false;
4897  }
4898  if (inst.IsSetSeq_data()) {
4900  rtn = false;
4901  }
4902  break;
4903  case CSeq_inst::eRepr_seg:
4904  if (! inst.IsSetExt() || ! inst.GetExt().IsSeg() ) {
4906  rtn = false;
4907  }
4908  if (inst.IsSetSeq_data()) {
4910  rtn = false;
4911  }
4912  break;
// raw (plus a second label on a line that is not visible): no ext allowed, and
// Seq-data must be present and must not be a gap
4913  case CSeq_inst::eRepr_raw:
4915  if (inst.IsSetExt()) {
4917  rtn = false;
4918  }
// NOTE(review): one operand of this condition sits on a line missing from the listing
4919  if (! inst.IsSetSeq_data() ||
4921  || inst.GetSeq_data().Which() == CSeq_data::e_Gap)
4922  {
4924  rtn = false;
4925  }
4926  break;
// (case label not visible) a delta ext is required and Seq-data must be absent
4928  if (! inst.IsSetExt() || ! inst.GetExt().IsDelta()) {
4930  rtn = false;
4931  }
4932  if (inst.IsSetSeq_data()) {
4934  rtn = false;
4935  }
4936  break;
// any other representation value is itself reported as invalid
4937  default:
4938  PostErr(
4940  "Invalid Bioseq->repr = " +
4941  NStr::IntToString(static_cast<int>(inst.GetRepr())), seq);
4942  rtn = false;
4943  }
4944  return rtn;
4945 }
4946 
4947 
// Decides whether a delta Bioseq (`bsh`) and one of its far components disagree
// on genome location. `parent_location` is the location of the parent; the
// component's location (`cgenome`) is taken from its BioSource when set.
// Returns false for anything that is not a delta sequence with a populated
// delta ext.
// NOTE(review): the signature line and the declarations of `ci` and `cgenome`
// are absent from this listing.
4949 {
4950  bool rval = false;
4951 
4952  // not a delta sequence, so no components to check
4953  if (! bsh.IsSetInst()
4954  || !bsh.GetInst().IsSetRepr()
4955  || bsh.GetInst().GetRepr() != CSeq_inst::eRepr_delta
4956  || !bsh.GetInst().IsSetExt()
4957  || !bsh.GetInst().GetExt().IsDelta()
4958  || !bsh.GetInst().GetExt().GetDelta().IsSet()) {
4959  return rval;
4960  }
4961 
// Segments that are not locations, that cannot be resolved in the scope, or for
// which `ci` is empty are skipped. Every path below the `ci` test ends in a
// break, so only the first usable component is ever compared.
4962  for (auto it : bsh.GetInst().GetExt().GetDelta().Get()) {
4963  if (! it->IsLoc()) continue;
4964  CBioseq_Handle hdl = bsh.GetScope().GetBioseqHandle(it->GetLoc());
4965  if (! hdl) continue;
4967  if (! ci) continue;
4968  const CBioSource& crc = ci->GetSource();
4970  if (crc.IsSetGenome()) {
4971  cgenome = crc.GetGenome();
4972  }
// identical locations, or an unknown/genomic location on either side, do not count as a difference
4973  if (parent_location == cgenome) break;
4974  if (parent_location == CBioSource::eGenome_unknown || parent_location == CBioSource::eGenome_genomic) break;
4975  if (cgenome == CBioSource::eGenome_unknown || cgenome == CBioSource::eGenome_genomic) break;
4976  rval = true;
4977  break;
4978  }
4979  return rval;
4980 }
4981 
4982 
4983 // From VR-796:
4984 // If the lineage contains Metazoa, the topology is circular, and the
4985 // location is mitochondrion, the sequence length should not be
4986 // greater than 65000bp.
4987 // This is erring on the side of caution as most Metazoan genomes are less than 17000 bp.
4988 // From RW-1709:
4989 // Raising length from 65000 to 100000.
//
// Returns the length limit (100000) that applies to `src`/`inst` when all
// conditions hold, or 0 when no limit applies. The function does not compare
// the actual length itself; callers do that.
// NOTE(review): the signature line and the first part of the condition are
// absent from this listing; only the lineage and length-set tests are visible.
4991 {
4994  src.IsSetOrg() && src.GetOrg().IsSetLineage() &&
4995  inst.IsSetLength() &&
4996  NStr::Find(src.GetOrg().GetLineage(), "Metazoa") != NPOS) {
4997  return 100000;
4998  }
4999  return 0;
5000 }
5001 
5002 
// Runs BioSource-descriptor checks for the Bioseq behind `bsh`: transgenic
// sources on nucleotide sequences, genome-location disagreement between a delta
// parent and its component, and the Metazoan mitochondrial length limit.
// NOTE(review): the signature line, the declaration of `di`, part of the
// feature-cache lookup and the first line of each error-posting call are absent
// from this listing.
5004 {
5006  if (! di) {
5007  // add to list of sources with no descriptor later to be reported
5009  return;
5010  }
5011  _ASSERT(di);
5012 
5013  const CBioSource& src = di->GetSource();
5014 
5015  if (m_Imp.IsTransgenic(src) &&
5016  CSeq_inst::IsNa(bsh.GetInst_Mol())) {
5017  // "if" means "if no biosrcs on bsh"
5018  if (GetCache().GetFeatFromCache(
5021  {
5023  "Transgenic source descriptor requires presence of source feature",
5024  *(bsh.GetBioseqCore()));
5025  }
5026  }
5027 
5028  if (src.IsSetGenome() && x_ParentAndComponentLocationsDiffer(bsh, src.GetGenome())) {
5030  "Genome difference between parent and component",
5031  *(bsh.GetBioseqCore()));
5032  }
5033 
// max_len == 0 means that no length limit applies to this source
5034  size_t max_len = x_BadMetazoanMitochondrialLength(src, bsh.GetInst());
5035  if (max_len > 0 && bsh.GetInst().IsSetLength() && bsh.GetInst().GetLength() > max_len) {
// NOTE(review): the message still quotes 65000 bp, whereas the limit returned by
// x_BadMetazoanMitochondrialLength in this file is 100000 - confirm which is intended.
5037  "Mitochondrial Metazoan sequences should be less than 65000 bp",
5038  *(bsh.GetBioseqCore()));
5039  }
5040 }
5041 
5042 
// Reports a Bioseq (`seq`) to which no Mol-info descriptor applies.
// NOTE(review): the signature line, the lookup that initialises `sd` and the
// first line of the error-posting call are absent from this listing.
5044 {
5046 
5047  if (! sd) {
5049  "No Mol-info applies to this Bioseq",
5050  seq);
5051  }
5052 }
5053 
5054 
// Looks for publication information on the features of the current Bioseq and
// returns early when a feature either cites a publication or is itself a pub
// feature. Does nothing when there is no current handle.
// NOTE(review): the signature line, the condition that opens the inner block and
// the statement executed when no publication is found are absent from this
// listing. `label` is filled in here but its use is not visible.
5056 {
5057  string label;
5058  seq.GetLabel(&label, CBioseq::eBoth);
5059 
5060  if (! m_CurrentHandle) {
5061  return;
5062  }
5063 
5065  // look for pub or feat with cit
5066  ITERATE(CCacheImpl::TFeatValue, all_feat_it, *m_AllFeatIt) {
5067  if (all_feat_it->IsSetCit() || all_feat_it->GetData().IsPub()) {
5068  return;
5069  }
5070  }
5071 
5073  }
5074 }
5075 
5076 
// Posts the "Suspicious use of complete" message. When `closest_molinfo` is
// available the report is attached to that descriptor within the parent entry
// of `seq`; otherwise it is attached to `seq` itself.
// NOTE(review): the signature line, the lookup that initialises
// `closest_molinfo` and the first line of both posting calls are absent from
// this listing.
5078 {
5080  if (closest_molinfo) {
5081  const CSeq_entry& ctx = *seq.GetParentEntry();
5083  "Suspicious use of complete", ctx, *closest_molinfo);
5084  } else {
5086  "Suspicious use of complete", seq);
5087  }
5088 }
5089 
5090 
// Examines the completeness flag of a MolInfo (`mi`) on a nucleotide Bioseq
// (`seq`). Nothing is done when completeness is unset, when the sequence is not
// nucleic acid, or when the title (existing or generated) already contains
// "complete sequence" or "complete genome".
// NOTE(review): the line with the function name, the declarations of `comp`,
// `desc` and `src_desc`, part of the `biomol` initialiser, the topology test and
// every reporting call are absent from this listing.
5092  const CBioseq& seq,
5093  const CMolInfo& mi)
5094 {
5095  if (! mi.IsSetCompleteness()) {
5096  return;
5097  }
5098  if (! seq.IsNa()) {
5099  return;
5100  }
5101 
5103  CMolInfo::TBiomol biomol = mi.IsSetBiomol() ?
// NOTE(review): both arms of this conditional yield eDiag_Warning, so the tech
// test currently has no effect on `sev`; GetTech() is called without a visible
// IsSetTech() guard.
5105  EDiagSev sev = mi.GetTech() == CMolInfo::eTech_htgs_3 ?
5106  eDiag_Warning : /* eDiag_Error */ eDiag_Warning;
5107 
// use the existing title when `desc` is available, otherwise generate a defline
5108  string title;
5110  if (desc) {
5111  title = desc->GetTitle();
5112  } else {
5113  sequence::CDeflineGenerator defline_generator;
5114  title = defline_generator.GenerateDefline(seq, *m_Scope, sequence::CDeflineGenerator::fIgnoreExisting);
5115  }
5116  if (! NStr::IsBlank(title)) {
5117  if (NStr::FindNoCase(title, "complete sequence") != string::npos
5118  || NStr::FindNoCase(title, "complete genome") != string::npos) {
5119  return;
5120  }
5121  }
5122 
// `reported` ensures that at most one of the checks below produces a report
5123  bool reported = false;
5124 
5125  if (comp == CMolInfo::eCompleteness_complete) {
5126  if (biomol == CMolInfo::eBiomol_genomic || biomol == CMolInfo::eBiomol_cRNA) {
// this first check applies only to Bioseqs that carry a GenBank id
5127  bool is_gb = false;
5128  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
5129  if ((*it)->IsGenbank()) {
5130  is_gb = true;
5131  break;
5132  }
5133  }
5134 
5135  if (is_gb) {
5136  if (seq.IsSetInst() && seq.GetInst().IsSetTopology()
5138  const CSeq_entry& ctx = *seq.GetParentEntry();
5140  "Circular topology has complete flag set, but title should say complete sequence or complete genome",
5141  ctx);
5142  } else {
5144  reported = true;
5145  }
5146  }
5147  }
5148 
5149  if (! reported) {
5150  // for SQD-1484
5151  // warn if completeness = complete, organism not viral and origin not artificial, no location set or location is genomic
5153  if (src_desc) {
5154  const CBioSource& biosrc = src_desc->GetSource();
5155  if ((! biosrc.IsSetLineage()
5156  || (NStr::FindNoCase(biosrc.GetLineage(), "Viruses") == string::npos
5157  && NStr::FindNoCase(biosrc.GetLineage(), "Viroids") == string::npos)) // not viral
5158  && (! biosrc.IsSetOrigin() || biosrc.GetOrigin() != CBioSource::eOrigin_artificial) // not artificial
5159  && (! src_desc->GetSource().IsSetGenome()
5160  || src_desc->GetSource().GetGenome() == CBioSource::eGenome_genomic
5161  || src_desc->GetSource().GetGenome() == CBioSource::eGenome_unknown)) { // location not set or genomic or unknown
5163  reported = true;
5164  }
5165  }
5166  }
5167  if (! reported && HasAssemblyOrNullGap(seq)) {
5168  // for VR-614
5170  }
5171  }
5172 }
5173 
5174 
// Walks up the chain of parent Seq-entries starting from `eh` and returns false
// as soon as an enclosing Bioseq-set has one of the listed classes; returns
// true when the top is reached without a match.
// NOTE(review): the declaration of `eh` and the `case` labels naming the set
// classes are absent from this listing.
5175 static bool s_StandaloneProt(const CBioseq_Handle& bsh)
5176 {
5177  // proteins are never standalone within the context of a Genbank / Refseq
5178  // record.
5179 
5181  while (eh) {
5182  if (eh.IsSet()) {
5183  CBioseq_set_Handle bsh2 = eh.GetSet();
5184  if (bsh2.IsSetClass()) {
5185  CBioseq_set::TClass cls = bsh2.GetClass();
5186  switch (cls) {
5193  return false;
5194  default:
5195  break;
5196  }
5197  }
5198  }
// climb one level; the loop ends when there is no further parent
5199  eh = eh.GetParentEntry();
5200  }
5201 
5202  return true;
5203 }
5204 
5205 
// Finds the segmented Bioseq that belongs to the same set as `part`: the
// entries under `segset` are scanned and the first Bioseq whose representation
// is eRepr_seg is returned. An empty handle is returned when `part` is empty,
// when `segset` is empty, or when no such Bioseq exists.
// NOTE(review): the signature line and the expression that initialises `segset`
// are absent from this listing.
5207 {
5208  CBioseq_Handle parent;
5209 
5210  if (part) {
5211  CSeq_entry_Handle segset =
5213  if (segset) {
5214  for (CSeq_entry_CI it(segset); it; ++it) {
5215  if (it->IsSeq() && it->GetSeq().IsSetInst_Repr() &&
5216  it->GetSeq().GetInst_Repr() == CSeq_inst::eRepr_seg) {
5217  parent = it->GetSeq();
5218  break;
5219  }
5220  }
5221  }
5222  }
5223  return parent;
5224 }
5225 
5226 
5227 static bool s_SeqIdCompare(const CConstRef<CSeq_id>& q1, const CConstRef<CSeq_id>& q2)
5228 {
5229  // is q1 < q2
5230  return (q1->CompareOrdered(*q2) < 0);
5231 }
5232 
5233 
5234 static bool s_SeqIdMatch(const CConstRef<CSeq_id>& q1, const CConstRef<CSeq_id>& q2)
5235 {
5236  // is q1 == q2
5237  return (q1->CompareOrdered(*q2) == 0);
5238 }
5239 
5240 
// Counts, for every gene feature on `bsh`, how many later genes it contains and
// reports genes that contain more than four others. Any std::exception raised
// during the scan is itself reported against the complete Bioseq.
// NOTE(review): the signature line, the header of the loop that defines `fi`
// and the first line of each error-posting call are absent from this listing.
5242 {
5243  if (! m_GeneIt) {
5244  return;
5245  }
5246  /*
5247  bool is_circular = bsh.IsSetInst_Topology() && bsh.GetInst_Topology() == CSeq_inst::eTopology_circular;
5248  */
5249  try {
// containing_genes and num_contained are parallel vectors: num_contained[k] is
// the number of genes found so far inside containing_genes[k]
5250  vector<CConstRef<CSeq_feat>> containing_genes;
5251  vector<int> num_contained;
5253  TSeqPos left = fi->GetLocation().GetStart(eExtreme_Positional);
5254  vector<CConstRef<CSeq_feat>>::iterator cit = containing_genes.begin();
5255  vector<int>::iterator nit = num_contained.begin();
5256  while (cit != containing_genes.end() && nit != num_contained.end()) {
5257  ECompare comp = Compare(fi->GetLocation(), (*cit)->GetLocation(), m_Scope, fCompareOverlapping);
5258  if (comp == eContained || comp == eSame) {
5259  (*nit)++;
5260  }
// a candidate that ends before the current gene starts is finalised and dropped
// (this presumably relies on the genes being visited in positional order - confirm)
5261  TSeqPos n_right = (*cit)->GetLocation().GetStop(eExtreme_Positional);
5262  if (n_right < left) {
5263  // report if necessary
5264  if (*nit > 4) {
5266  "Gene contains " + NStr::IntToString (*nit) + " other genes",
5267  **cit);
5268  }
5269  // remove from list
5270  cit = containing_genes.erase(cit);
5271  nit = num_contained.erase(nit);
5272  } else {
5273  ++cit;
5274  ++nit;
5275  }
5276  }
5277 
// the current gene becomes a candidate container for the genes that follow
5278  const CSeq_feat& ft = fi->GetOriginalFeature();
5279  const CSeq_feat* p = &ft;
5280  CConstRef<CSeq_feat> ref(p);
5281  containing_genes.push_back(ref);
5282  num_contained.push_back(0);
5283  }
5284 
// report the candidates that were still open when the scan ended
5285  vector<CConstRef<CSeq_feat>>::iterator cit = containing_genes.begin();
5286  vector<int>::iterator nit = num_contained.begin();
5287  while (cit != containing_genes.end() && nit != num_contained.end()) {
5288  if (*nit > 4) {
5290  "Gene contains " + NStr::IntToString (*nit) + " other genes",
5291  **cit);
5292  }
5293  ++cit;
5294  ++nit;
5295  }
5296  } catch (const exception& e) {
5298  string("Exception while validating bioseq MultipleGeneOverlap. EXCEPTION: ") +
5299  e.what(), *(bsh.GetCompleteBioseq()));
5300  }
5301 }
5302 
5303 
// Builds the "gene [<label>] overlaps ... but does not completely contain it"
// message for a coding region or an RNA feature. For RNA features nothing is
// reported when an operon overlaps the feature location. Other feature kinds
// produce no message.
// NOTE(review): the error-posting calls that follow each message are absent
// from this listing. Note that the RNA branch words its message as "mRNA"
// although it is entered for any feature whose data IsRna().
5304 void CValidError_bioseq::x_ReportGeneOverlapError(const CSeq_feat& feat, const string& gene_label)
5305 {
5306  string msg("gene [");
5307  msg += gene_label;
5308 
5309  if (feat.GetData().IsCdregion()) {
5310 
5311  msg += "] overlaps CDS but does not completely contain it";
5313 
5314  } else if (feat.GetData().IsRna()) {
5315 
5316  if (GetOverlappingOperon(feat.GetLocation(), *m_Scope)) {
5317  return;
5318  }
5319 
5320  msg += "] overlaps mRNA but does not completely contain it";
5322  }
5323 }
5324 
5325 static void s_GetGeneTextLabel(const CSeq_feat& feat, string& label)
5326 {
5327  _ASSERT(feat.IsSetData() && feat.GetData().IsGene() && "Here should be a gene feature");
5328  _ASSERT(feat.IsSetLocation() && "The feature should have a location");
5329 
5330  if (feat.IsSetData() && feat.GetData().IsGene()) {
5331 
5332  const CGene_ref& gene = feat.GetData().GetGene();
5333  if (gene.IsSetLocus()) {
5334  label += gene.GetLocus();
5335  } else if (gene.IsSetDesc()) {
5336  label += gene.GetDesc();
5337  }
5338 
5339  if (feat.IsSetLocation()) {
5340  string loc_label;
5341  feat.GetLocation().GetLabel(&loc_label);
5342 
5343  if (! label.empty()) {
5344  label += ':';
5345  }
5346  label += loc_label;
5347  }
5348 
5349  if (gene.IsSetLocus_tag()) {
5350 
5351  if (! label.empty()) {
5352  label += ':';
5353  }
5354  label += gene.GetLocus_tag();
5355  }
5356  }
5357 }
5358 
// Reports a feature (`feat`) whose gene overlaps it without fully containing it.
// A suppressed gene xref disables the check. When a gene is cached for the
// feature, only that gene is tested (with the looser eOverlap_Subset rule if the
// exception text mentions trans-splicing); otherwise the gene found by the
// lookup below is tested with eOverlap_Contained.
// NOTE(review): the signature line and the lookup that initialises `gene` are
// absent from this listing. `loc` is declared but its use is not visible.
5360 {
5361  const CGene_ref* grp = feat.GetGeneXref();
5362  if (grp && grp->IsSuppressed()) {
5363  return;
5364  }
5365 
5366  CConstRef<CSeq_feat> connected_gene = m_Imp.GetCachedGene(&feat);
5367  if (connected_gene) {
5368  EOverlapType overlap_type = eOverlap_Contained;
5369  if (feat.IsSetExcept_text() &&
5370  NStr::FindNoCase(feat.GetExcept_text(), "trans-splicing") != string::npos) {
5371  overlap_type = eOverlap_Subset;
5372  }
5373 
// a negative result from TestForOverlapEx is what triggers the report
5374  if (TestForOverlapEx(connected_gene->GetLocation(), feat.GetLocation(),
5375  overlap_type, m_Scope) < 0) {
5376 
5377  string gene_label;
5378  s_GetGeneTextLabel(*connected_gene, gene_label);
5379  x_ReportGeneOverlapError(feat, gene_label);
5380  }
5381  return;
5382  }
5383  /*
5384  const CGene_ref* grp = feat.GetGeneXref();
5385  if (grp && grp->IsSuppressed()) {
5386  return;
5387  }
5388  */
5389 
5390  const CSeq_loc& loc = feat.GetLocation();
5391 
5393  if (! gene)
5394  return;
5395  if (TestForOverlapEx(gene->GetLocation(), feat.GetLocation(), eOverlap_Contained, m_Scope) < 0) {
5396 
5397  string gene_label;
5398  s_GetGeneTextLabel(*gene, gene_label);
5399 
5400  // found an intersecting (but not overlapping) gene
5401  x_ReportGeneOverlapError(feat, gene_label);
5402  }
5403 }
5404 
5405 
5406 bool s_IsCDDFeat(const CMappedFeat& feat)
5407 {
5408  if (feat.GetData().IsRegion()) {
5409  FOR_EACH_DBXREF_ON_FEATURE (db, feat) {
5410  if ((*db)->CanGetDb() &&
5411  NStr::Compare((*db)->GetDb(), "CDD") == 0) {
5412  return true;
5413  }
5414  }
5415  }
5416  return false;
5417 }
5418 
5419 
5420 bool s_CheckPosNOrGap(TSeqPos pos, const CSeqVector& vec)
5421 {
5422  if (vec.IsInGap(pos) || vec[pos] == 'N') {
5423  return true;
5424  } else {
5425  return false;
5426  }
5427 }
5428 
5429 
5430 bool s_AfterIsGapORN(TSeqPos pos, TSeqPos after, TSeqPos len, const CSeqVector& vec)
5431 {
5432  if (pos < len - after && s_CheckPosNOrGap(pos + after, vec)) {
5433  return true;
5434  } else {
5435  return false;
5436  }
5437 }
5438 
5439 
5440 bool s_AfterIsGap(TSeqPos pos, TSeqPos after, TSeqPos len, const CSeqVector& vec)
5441 {
5442  if (pos < len - after && vec.IsInGap(pos + after)) {
5443  return true;
5444  } else {
5445  return false;
5446  }
5447 }
5448 
5449 
5450 bool s_BeforeIsGapOrN(TSeqPos pos, TSeqPos before, const CSeqVector& vec)
5451 {
5452  if (pos >= before && s_CheckPosNOrGap(pos - before, vec)) {
5453  return true;
5454  } else {
5455  return false;
5456  }
5457 }
5458 
5459 
5460 bool s_BeforeIsGap(TSeqPos pos, TSeqPos before, const CSeqVector& vec)
5461 {
5462  if (pos >= before && vec.IsInGap(pos - before)) {
5463  return true;
5464  } else {
5465  return false;
5466  }
5467 }
5468 
5469 
// Decides whether a partial end of `loc` sits next to a gap or at a consensus
// splice site. `tag` selects the end (eSeqlocPartial_Nostart or
// eSeqlocPartial_Nostop). The three flags are outputs and are always reset:
//   is_gap   - a gap lies one or two positions beyond the end (function returns true)
//   abuts_n  - the position just beyond the end is a gap or 'N' (function returns false)
//   bad_seq  - the two neighbouring residues are not alphabetic residues
// Returns true for a gap neighbour or a consensus dinucleotide, false otherwise,
// and false on any exception raised while reading the sequence.
// NOTE(review): the line with the function name, the first guard condition, the
// declarations of `first`, `last`, `temp`, `bsh` and most of the `vec`
// construction are absent from this listing.
5471  const CSeq_loc& loc,
5472  unsigned int tag,
5473  bool& bad_seq,
5474  bool& is_gap,
5475  bool& abuts_n)
5476 {
5477  bad_seq = false;
5478  is_gap = false;
5479  abuts_n = false;
5481  return false;
5482  }
5483 
// remember the first and the last interval of the location
5485  for (CSeq_loc_CI sl_iter(loc); sl_iter; ++sl_iter) { // EQUIV_IS_ONE not supported
5486  if (! first) {
5487  first = sl_iter;
5488  }
5489  last = sl_iter;
5490  }
5491 
// mixed-strand locations are not analysed
5492  if (first.GetStrand() != last.GetStrand()) {
5493  return false;
5494  }
5496 
5497  if (! m_Scope) {
5498  return false;
5499  }
5500 
5501  //CBioseq_Handle bsh = m_Scope->GetBioseqHandleFromTSE(*(temp.GetRangeAsSeq_loc()->GetId()), m_Imp.GetTSE_Handle() );
5503  if (! bsh) {
5504  return false;
5505  }
5506 
5507  TSeqPos acceptor = temp.GetRange().GetFrom();
5508  TSeqPos donor = temp.GetRange().GetTo();
5509  TSeqPos start = acceptor;
5510  TSeqPos stop = donor;
5511 
5513  temp.GetStrand());
5514  TSeqPos len = bsh.GetBioseqLength();
5515  if (start >= len || stop >= len) {
5516  return false;
5517  }
5518 
// on the minus strand the coordinates are mirrored (len - x - 1); presumably
// `vec` was built in the orientation of the interval's strand - confirm against
// the missing constructor line
5519  if (temp.GetStrand() == eNa_strand_minus) {
5520  swap(acceptor, donor);
5521  stop = len - donor - 1;
5522  start = len - acceptor - 1;
5523  }
5524 
5525  bool result = false;
5526 
// first pass: gaps and N runs adjacent to the partial end
5527  try {
5528  if (tag == eSeqlocPartial_Nostop) {
5529  if (s_AfterIsGap(stop, 1, len, vec) || s_AfterIsGap(stop, 2, len, vec)) {
5530  is_gap = true;
5531  return true;
5532  } else if (s_AfterIsGapORN(stop, 1, len, vec)) {
5533  abuts_n = true;
5534  return false;
5535  }
5536  } else if (tag == eSeqlocPartial_Nostart && start > 0) {
5537  if (s_BeforeIsGap(start, 1, vec) || s_BeforeIsGap(start, 2, vec)) {
5538  is_gap = true;
5539  return true;
5540  } else if (s_BeforeIsGapOrN(start, 1, vec)) {
5541  abuts_n = true;
5542  return false;
5543  }
5544  }
5545  } catch (exception&) {
5546  return false;
5547  }
5548 
// second pass: consensus dinucleotides - "GT" or "GC" right after the stop,
// "AG" right before the start
// NOTE(review): `len - 2` is evaluated on TSeqPos; if that type is unsigned the
// subtraction wraps for len < 2 - confirm whether such sequences can reach here
5549  if ((tag == eSeqlocPartial_Nostop) && (stop < len - 2)) {
5550  try {
5551  CSeqVector::TResidue res1 = vec[stop + 1];
5552  CSeqVector::TResidue res2 = vec[stop + 2];
5553 
5554  if (IsResidue(res1) && IsResidue(res2) && isalpha(res1) && isalpha(res2)) {
5555  if ((res1 == 'G' && res2 == 'T') ||
5556  (res1 == 'G' && res2 == 'C')) {
5557  result = true;
5558  }
5559  } else {
5560  bad_seq = true;
5561  }
5562  } catch (exception&) {
5563  return false;
5564  }
5565  } else if ((tag == eSeqlocPartial_Nostart) && (start > 1)) {
5566  try {
5567  CSeqVector::TResidue res1 = vec[start - 2];
5568  CSeqVector::TResidue res2 = vec[start - 1];
5569 
5570  if (IsResidue(res1) && IsResidue(res2) && isalpha(res1) && isalpha(res2)) {
5571  if ((res1 == 'A') && (res2 == 'G')) {
5572  result = true;
5573  }
5574  } else {
5575  bad_seq = true;
5576  }
5577  } catch (exception&) {
5578  return false;
5579  }
5580  }
5581 
5582  return result;
5583 }
5584 
5585 
// Compares two locations with respect to one kind of partialness selected by
// `partial_type` and returns true when they match; any partial type other than
// the two handled cases yields false.
// NOTE(review): the first `case` label and both comparison conditions are absent
// from this listing, so the exact matching rule cannot be read from here.
5586 static bool s_MatchPartialType(const CSeq_loc& loc1, const CSeq_loc& loc2, unsigned int partial_type)
5587 {
5588  bool rval = false;
5589 
5590  switch (partial_type) {
5593  rval = true;
5594  }
5595  break;
5596  case eSeqlocPartial_Nostop:
5598  rval = true;
5599  }
5600  break;
5601  default:
5602  rval = false;
5603  break;
5604  }
5605  return rval;
5606 }
5607 
5608 
5609 // REQUIRES: feature is either Gene or mRNA
// Returns true when a coding region `cds` found for `feat` gives a
// TestForOverlapEx result of exactly 0 against the feature location under
// eOverlap_Simple. Genes are matched with eOverlap_Simple and mRNAs with
// eOverlap_CheckIntervals; any other feature type returns false.
// NOTE(review): the signature line and part of the call that initialises `cds`
// are absent from this listing.
5611 {
5612  EOverlapType overlap_type;
5613  if (feat.GetData().IsGene()) {
5614  overlap_type = eOverlap_Simple;
5615  } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
5616  overlap_type = eOverlap_CheckIntervals;
5617  } else {
5618  return false;
5619  }
5620 
5622  feat.GetLocation(),
5624  overlap_type,
5625  *m_Scope);
5626 
5627  if (cds) {
5628  if (TestForOverlapEx(
5629  cds->GetLocation(),
5630  feat.GetLocation(),
5631  eOverlap_Simple) == 0) {
5632  return true;
5633  }
5634  }
5635  return false;
5636 }
5637 
5638 
// Decides whether the partialness of `feat` (of the kind given by
// `partial_type`) is explained by a related feature, in which case the caller
// can suppress its report. The rule depends on the feature type: gene, mRNA,
// coding region or exon; every other type returns false.
// NOTE(review): the declarations of `tr` and `gene` and the call that fills
// `mRNAs` are absent from this listing.
5639 bool CValidError_bioseq::x_MatchesOverlappingFeaturePartial(const CMappedFeat& feat, unsigned int partial_type)
5640 {
5641  bool rval = false;
5642 
5643 
5644  if (feat.GetData().IsGene()) {
5645  TSeqPos gene_start = feat.GetLocation().GetStart(eExtreme_Biological);
5646  TSeqPos gene_stop = feat.GetLocation().GetStop(eExtreme_Biological);
5647 
5648  // gene is ok if its partialness matches the overlapping coding region or mRNA
// a child qualifies only if it also shares both biological endpoints with the gene
5650  vector<CMappedFeat> children = tr->GetChildren(feat);
5651  ITERATE(vector<CMappedFeat>, it, children) {
5652  if ((it->GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA || it->GetData().IsCdregion()) &&
5653  it->GetLocation().GetStart(eExtreme_Biological) == gene_start &&
5654  it->GetLocation().GetStop(eExtreme_Biological) == gene_stop &&
5655  s_MatchPartialType(feat.GetLocation(), it->GetLocation(), partial_type)) {
5656  rval = true;
5657  break;
5658  }
5659  }
5660  } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
5661  bool look_for_gene = true;
5662  TSeqPos mrna_start = feat.GetLocation().GetStart(eExtreme_Biological);
5663  TSeqPos mrna_stop = feat.GetLocation().GetStop(eExtreme_Biological);
5664 
// When the mRNA has children in the feature tree, only they are consulted and
// the gene is not looked up. rval is overwritten on every iteration, so the
// last child decides the outcome.
5665  vector<CMappedFeat> cds_children = m_Imp.GetGeneCache().GetFeatTreeFromCache(m_CurrentHandle)->GetChildren(feat);
5666  if (cds_children.size() > 0) {
5667  look_for_gene = false;
5668  for (auto it = cds_children.begin(); it != cds_children.end(); it++) {
5669  if (partial_type == sequence::eSeqlocPartial_Nostart) {
5670  if (it->GetLocation().GetStart(eExtreme_Biological) == mrna_start) {
5671  rval = true;
5672  } else {
5673  rval = false;
5674  }
5675  } else if (partial_type == sequence::eSeqlocPartial_Nostop) {
5676  if (it->GetLocation().GetStop(eExtreme_Biological) == mrna_stop) {
5677  rval = true;
5678  } else {
5679  rval = false;
5680  }
5681  }
5682  }
5683  }
5684 
// without children, fall back to a gene with identical endpoints and matching partialness
5685  if (! rval && look_for_gene) {
5687  if (gene) {
5688  const CSeq_loc& gene_loc = gene->GetLocation();
5689  if (gene_loc.GetStart(eExtreme_Biological) == mrna_start
5690  && gene_loc.GetStop(eExtreme_Biological) == mrna_stop
5691  && s_MatchPartialType(feat.GetLocation(), gene_loc, partial_type)) {
5692  rval = true;
5693  }
5694  }
5695  }
5696  } else if (feat.GetData().IsCdregion()) {
5697  // coding region is ok if same as mRNA AND partial at splice site or gap
5699  CMappedFeat mrna = tr->GetParent(feat, CSeqFeatData::eSubtype_mRNA);
5700  if (mrna) {
5701  const CSeq_loc& mrna_loc = mrna.GetLocation();
// the three flags are required by the callee; their values are not used here
5702  bool bad_seq = false;
5703  bool is_gap = false;
5704  bool abuts_n = false;
5705  if (s_MatchPartialType(feat.GetLocation(), mrna_loc, partial_type)
5706  && x_IsPartialAtSpliceSiteOrGap(feat.GetLocation(), partial_type, bad_seq, is_gap, abuts_n)) {
5707  rval = true;
5708  }
5709  }
5710  } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_exon) {
5711  // exon is ok if its partialness and endpoint matches the mRNA endpoint
// two alternative implementations, selected at compile time by USE_FEAT_TREE_FOR_EXON
5712 #ifdef USE_FEAT_TREE_FOR_EXON
5714  CMappedFeat mrna = tr->GetParent(feat, CSeqFeatData::eSubtype_mRNA);
5715  if (mrna) {
5716  const CSeq_loc& mrna_loc = mrna.GetLocation();
5717  if (s_MatchPartialType(feat.GetLocation(), mrna_loc, partial_type)) {
5718  if (partial_type == eSeqlocPartial_Nostart
5719  && mrna_loc.IsPartialStart(eExtreme_Biological)) {
5720  rval = true;
5721  } else if (partial_type == eSeqlocPartial_Nostop
5722  && mrna_loc.IsPartialStop(eExtreme_Biological)) {
5723  rval = true;
5724  }
5725  }
5726  }
5727 #else
5728  TFeatScores mRNAs;
5731  ITERATE(TFeatScores, s, mRNAs) {
5732  const CSeq_loc& mrna_loc = s->second->GetLocation();
5733  if (s_MatchPartialType(feat.GetLocation(), mrna_loc, partial_type)) {
5734  if (partial_type == eSeqlocPartial_Nostart
5735  && mrna_loc.IsPartialStart(eExtreme_Biological)) {
5736  rval = true;
// NOTE(review): only the Nostop arm below leaves the loop early; the Nostart arm
// keeps iterating after a match (harmless, since rval is never reset here)
5737  } else if (partial_type == eSeqlocPartial_Nostop
5738  && mrna_loc.IsPartialStop(eExtreme_Biological)) {
5739  rval = true;
5740  break;
5741  }
5742  }
5743  }
5744 #endif
5745  }
5746 
5747  return rval;
5748 }
5749 
5750 
// Reports improper use of partial markers on the location of `feat`. The
// far-fetch-failure branch has its own handling, and coding regions flagged as
// an exception whose text mentions "rearrangement required for product" are
// not reported.
// NOTE(review): the signature line, the body of the far-fetch-failure branch and
// the first line of the error-posting call are absent from this listing.
// GetExcept_text() is read after checking IsSetExcept() only - confirm that the
// text is always available in that case.
5752 {
5753  if (m_Imp.x_IsFarFetchFailure(feat.GetLocation())) {
5755  } else if (feat.GetData().Which() == CSeqFeatData::e_Cdregion
5756  && feat.IsSetExcept()
5757  && NStr::Find(feat.GetExcept_text(), "rearrangement required for product") != string::npos) {
5758  // suppress
5759  } else {
5761  "PartialLocation: Improper use of partial (greater than or less than)", feat);
5762  }
5763 }
5764 
5765 
// Reports internal partial intervals on the location of `feat`. The report is
// skipped for RefSeq records, for coding regions flagged as an exception whose
// text mentions "rearrangement required for product", and for genomic gpipe
// sequences; the far-fetch-failure branch has its own handling.
// NOTE(review): the signature line, the body of the far-fetch-failure branch and
// the first line of the error-posting call are absent from this listing.
// GetExcept_text() is read after checking IsSetExcept() only - confirm that the
// text is always available in that case.
5767 {
5768  if (m_Imp.x_IsFarFetchFailure(feat.GetLocation())) {
5770  } else if (m_Imp.IsRefSeq()) {
5771  // suppress
5772  } else if (feat.GetData().Which() == CSeqFeatData::e_Cdregion
5773  && feat.IsSetExcept()
5774  && NStr::Find(feat.GetExcept_text(), "rearrangement required for product") != string::npos) {
5775  // suppress
5776  } else if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
5777  // ignore start/stop not at end in genomic gpipe sequence
5778  } else {
5780  "PartialLocation: Internal partial intervals do not include first/last residue of sequence", feat);
5781  }
5782 }
5783 
5784 
// Returns true when `s1` and `s2` are on the same side: either both are
// eNa_strand_minus or neither is. Every strand value other than minus is
// therefore treated as equivalent to every other non-minus value.
// NOTE(review): the signature line is absent from this listing.
5786 {
5787  if (s1 == eNa_strand_minus) {
5788  if (s2 == eNa_strand_minus) {
5789  return true;
5790  } else {
5791  return false;
5792  }
5793  } else {
5794  if (s2 == eNa_strand_minus) {
5795  return false;
5796  } else {
5797  return true;
5798  }
5799  }
5800 }
5801 
5802 
// Returns true when a partial end of `loc` directly abuts an intron on a
// matching strand: the intron starts at stop + 1 for a partial stop, or ends at
// start - 1 for a partial start (positional extremes). Returns false when the
// location is not partial, when its Bioseq cannot be resolved, or when no such
// intron is found.
// NOTE(review): the signature line and the declaration of the `intron` iterator
// are absent from this listing.
5804 {
5805  bool partial_start = loc.IsPartialStart(eExtreme_Positional);
5806  bool partial_stop = loc.IsPartialStop(eExtreme_Positional);
5807  if (! partial_start && ! partial_stop) {
5808  return false;
5809  }
5810  TSeqPos start = loc.GetStart(eExtreme_Positional);
5811  TSeqPos stop = loc.GetStop(eExtreme_Positional);
5812  ENa_strand feat_strand = loc.GetStrand();
5813 
5814  CBioseq_Handle bsh;
5815  try {
5816  bsh = m_Scope->GetBioseqHandle(loc);
5817  } catch (CException&) {
5818  return false;
5819  }
5820  if (! bsh) {
5821  return false;
5822  }
5823 
5825  while (intron) {
5826  ENa_strand intron_strand = intron->GetLocation().GetStrand();
5827  if (StrandsMatch(feat_strand, intron_strand)) {
5828  TSeqPos intron_start = intron->GetLocation().GetStart(eExtreme_Positional);
5829  if (intron_start == stop + 1 && partial_stop) {
5830  return true;
5831  }
// the scan stops at the first matching-strand intron that starts beyond the
// feature (this presumably relies on introns being visited in positional order - confirm)
5832  if (intron_start > stop + 1) {
5833  return false;
5834  }
5835  if (start > 0 && partial_start) {
5836  TSeqPos intron_stop = intron->GetLocation().GetStop(eExtreme_Positional);
5837  if (intron_stop == start - 1) {
5838  return true;
5839  }
5840  }
5841  }
5842  ++intron;
5843  }
5844  return false;
5845 }
5846 
5847 
// Posts the "Start/Stop does not include first/last residue of ... sequence"
// message for `feat`.
//   partial_type     - 0 selects the "Start" wording, any other value "Stop"
//   at_splice_or_gap - enables the mRNA / organelle / splicing-not-expected
//                      variants; in that mode the function returns without
//                      reporting when none of the variants applies
//   abuts_n          - lowers a warning to info severity
// NOTE(review): part of the genome-submission condition, one line at the start
// of the mRNA branch and every assignment to `err_type` are absent from this
// listing.
5848 void CValidError_bioseq::x_ReportStartStopPartialProblem(int partial_type, bool at_splice_or_gap, bool abuts_n, const CSeq_feat& feat)
5849 {
5850  EDiagSev sev = eDiag_Warning;
5851  if (m_Imp.IsGenomeSubmission() &&
5854  sev = eDiag_Error;
5855  }
5856 
5857  string msg = (partial_type == 0 ? "Start" : "Stop");
5858  msg += " does not include first/last residue of ";
5859 
// these flags record which message variant was chosen; the first two also
// select the error code further down
5860  bool mrna = false;
5861  bool organelle = false;
5862  bool not_expected = false;
5863  if (at_splice_or_gap) {
5864  if (feat.GetData().IsCdregion() || feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
5866  msg += "mRNA ";
5867  mrna = true;
5868  } else if (m_CurrentHandle && IsOrganelle(m_CurrentHandle)) {
5869  msg += "organelle ";
5870  organelle = true;
5871  sev = eDiag_Info;
5872  } else if (m_splicing_not_expected) {
5873  not_expected = true;
5874  sev = eDiag_Info;
5875  } else {
5876  // do not report
5877  return;
5878  }
5879  }
5880  }
5881  msg += "sequence";
5882  if (organelle) {
5883  msg += " (organelle does not use standard splice site convention)";
5884  }
5885  if (not_expected) {
5886  msg += " (but is at consensus splice site)";
5887  }
5888 
5889  EErrType err_type;
5890 
// only a warning is downgraded here; error and info severities are left alone
5891  if (sev == eDiag_Warning && abuts_n) {
5892  sev = eDiag_Info;
5893  }
5894 
// choose the error code by end (start vs stop) and by variant (mRNA, organelle, other)
5895  if (partial_type == 0) {
5896  if (mrna) {
5898  } else if (organelle) {
5900  } else {
5902  }
5903  } else {
5904  if (mrna) {
5906  } else if (organelle) {
5908  } else {
5910  }
5911  }
5912 
5913  PostErr(sev, err_type, msg, feat);
5914 }
5915 
5916 
// Checks a feature's partial flag / partial location against its sequence context.
// NOTE(review): this listing dropped the line that opens the signature and the first
// line of most reporting calls (only their message strings remain), so error codes and
// severities are not visible here.
//
// feat        - the mapped feature being examined.
// is_complete - when true and the feature is a coding region, a
//               "Partial CDS on complete sequence" message is emitted.
//
// Visible flow: compute the partial state of the original feature location, return
// early when the location is complete and the feature is not flagged partial, return
// early for CDD features, then inspect two partial flags in turn (loop below) applying
// a chain of suppression rules before reporting.
5918  const CMappedFeat& feat, bool is_complete)
5919 {
5920  unsigned int partial_loc = eSeqlocPartial_Complete;
5921 
5922  bool is_partial = feat.IsSetPartial() && feat.GetPartial();
5923  // NOTE - have to use original seqfeat in order for this to work correctly
5924  // for features on segmented sequences
5925  partial_loc = SeqLocPartialCheck(feat.GetOriginalSeq_feat()->GetLocation(), m_Scope);
5926  if (feat.IsSetProduct() && ! feat.GetProduct().IsWhole()) {
5928  "Feature products should be entire sequences.", *(feat.GetSeq_feat()));
5929  }
5930 
// Nothing further to check when neither the location nor the flag says "partial".
5931  if (partial_loc == eSeqlocPartial_Complete && ! is_partial) {
5932  return;
5933  }
5934 
5935  if (partial_loc & eSeqlocPartial_Nointernal) {
5937  }
5938 
5939  if (partial_loc & eSeqlocPartial_Limwrong) {
5941  }
5942 
5943  if (s_IsCDDFeat(feat)) {
5944  // no additional warnings
5945  return;
5946  }
5947 
// no_nonconsensus_except becomes false only when the exception flag is set AND the
// exception text contains one of the four phrases tested below.
5948  string except_text;
5949  bool no_nonconsensus_except = true;
5950  if (feat.IsSetExcept_text()) {
5951  except_text = feat.GetExcept_text();
5952  if (feat.IsSetExcept()) {
5953  if (NStr::Find (except_text, "nonconsensus splice site") != string::npos ||
5954  NStr::Find (except_text, "heterogeneous population sequenced") != string::npos ||
5955  NStr::Find (except_text, "low-quality sequence region") != string::npos ||
5956  NStr::Find (except_text, "artificial location") != string::npos) {
5957  no_nonconsensus_except = false;
5958  }
5959  }
5960  }
5961 
5962  string comment_text;
5963  if (feat.IsSetComment()) {
5964  comment_text = feat.GetComment();
5965  }
5966 
5967  if (is_complete && feat.GetData().Which() == CSeqFeatData::e_Cdregion) {
5969  "Partial CDS on complete sequence",
5970  *(feat.GetSeq_feat()));
5971  }
5972 
5973  // partial location
// errtype starts at eSeqlocPartial_Nostart and is shifted left by one bit at the end of
// each pass, so j == 0 tests the start flag and j == 1 tests the next flag bit
// (presumably the stop flag - confirm against the enum definition).
5974  unsigned int errtype = eSeqlocPartial_Nostart;
5975  for (int j = 0; j < 2; ++j) {
5976  if (partial_loc & errtype) {
5977  bool bad_seq = false;
5978  bool is_gap = false;
5979  bool abuts_n = false;
5980 
5981  if (m_Scope && x_MatchesOverlappingFeaturePartial(feat, errtype)) {
5982  // error is suppressed
5983  } else if (m_Imp.x_IsFarFetchFailure(feat.GetLocation())) {
5985  } else if (feat.GetData().IsCdregion() &&
5988  // suppress
5989  } else if (x_IsPartialAtSpliceSiteOrGap(feat.GetLocation(), errtype, bad_seq, is_gap, abuts_n)) {
5990  if (is_gap || CGeneCache::IsPseudo(*feat.GetOriginalSeq_feat())) {
5991  // suppress for everything
5992  } else {
5993  x_ReportStartStopPartialProblem(j, true, abuts_n, *(feat.GetSeq_feat()));
5994  }
5995  } else if (bad_seq) {
5997  (errtype == eSeqlocPartial_Nostart ?
5998  "PartialLocation: Start does not include first/last residue of sequence (and is at bad sequence)" :
5999  "PartialLocation: Stop does not include first/last residue of sequence (and is at bad sequence)"),
6000  *(feat.GetSeq_feat()));
6001  } else if (feat.GetData().Which() == CSeqFeatData::e_Cdregion) {
6002  if (feat.IsSetExcept()
6003  && NStr::Find(except_text, "rearrangement required for product") != string::npos) {
6004  // suppress
6005  } else if (feat.IsSetComment() &&
6006  NStr::Find(comment_text, "coding region disrupted by sequencing gap") != string::npos) {
6007  // suppress
6008  } else if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
6009  // suppress
6010  } else if (! no_nonconsensus_except) {
6011  // suppress
6012  } else if (s_PartialAtGapOrNs(m_Scope, feat.GetLocation(), errtype, true)) {
6013  // suppress
6014  } else {
// Severity is lowered from warning to info when abuts_n was set by the check above.
6015  EDiagSev sev = eDiag_Warning;
6016  if (abuts_n) {
6017  sev = eDiag_Info;
6018  }
6019  if (j == 0) {
6021  "5' partial is not at beginning of sequence, gap, or consensus splice site",
6022  *(feat.GetSeq_feat()));
6023  } else {
6025  "3' partial is not at end of sequence, gap, or consensus splice site",
6026  *(feat.GetSeq_feat()));
6027  }
6028  }
6029  } else if ((feat.GetData().Which() == CSeqFeatData::e_Gene ||
6031  x_IsSameAsCDS(feat)) {
6032  if (j == 0) {
6034  "Start does not include first/last residue of sequence", *(feat.GetSeq_feat()));
6035  } else if (j == 1) {
6037  "Stop does not include first/last residue of sequence", *(feat.GetSeq_feat()));
6038  }
6039  } else if (CGeneCache::IsPseudo(*feat.GetSeq_feat())) {
6040  // suppress
6041  } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_tRNA &&
6042  j == 0 && x_PartialAdjacentToIntron(feat.GetLocation())) {
6043  // suppress tRNAs adjacent to introns
6044  } else if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
6045  // ignore start/stop not at end in genomic gpipe sequence
6046  } else {
6047  x_ReportStartStopPartialProblem(j, false, abuts_n, *(feat.GetSeq_feat()));
6048  }
6049  }
6050  errtype <<= 1;
6051  }
6052 }
6053 
6054 
// Body of a boolean predicate; its signature line is missing from this listing
// (the body reads a handle named `bsh`).
// Iterates the user-object descriptors reachable from `bsh` and returns true as soon as
// one has a "StructuredCommentPrefix" field whose string value equals
// "##Genome-Annotation-Data-START##" (compared case-insensitively).
// Returns false when no such descriptor is found.
6056 {
6057  CSeqdesc_CI di(bsh, CSeqdesc::e_User);
6058  while (di) {
6059  if (di->GetUser().HasField("StructuredCommentPrefix")) {
6060  const CUser_field& field = di->GetUser().GetField("StructuredCommentPrefix");
6061  if (field.IsSetData() && field.GetData().IsStr() && NStr::EqualNocase(field.GetData().GetStr(), "##Genome-Annotation-Data-START##")) {
6062  return true;
6063  }
6064  }
6065  ++di;
6066  }
6067  return false;
6068 }
6069 
6070 
// Validates the features of one Bioseq in the context of that sequence.
// NOTE(review): this listing dropped the line that opens the signature, the feature
// loop header, the definitions of several locals used below (e.g. is_aa, len, gene,
// range, parent, src_desc) and the first line of most reporting calls; only the
// message strings of those calls remain.
//
// seq         - the Bioseq whose features are examined.
// is_complete - forwarded to ValidateFeatPartialInContext for each feature.
//
// Visible behavior, in order:
//   * tallies genes, CDSs, mRNAs, C/V/D/J segment features and pseudo/rearranged
//     CDS/mRNA counts, and collects CDS and mRNA product ids;
//   * checks gene locus values against locus_tags of other genes;
//   * tracks whether CDS features carry differing genetic code ids;
//   * applies protein-specific and mRNA-specific feature checks;
//   * checks for 'far' feature locations unless an EMBL id is present;
//   * after the loop: 16S rRNA / lineage check, mixed genetic codes, full-length
//     protein feature count, CDS/mRNA product reference counts and uniqueness.
// Any std::exception thrown inside is caught at the bottom; it is reported unless its
// text contains "Error: Cannot resolve".
6072  const CBioseq& seq, bool is_complete)
6073 {
6074  // test
// NOTE(review): `accession` is filled in here but is not referenced by any line
// visible in this listing.
6075  string accession;
6076  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
6077  if ((*it)->IsGenbank()) {
6078  if ((*it)->GetGenbank().IsSetAccession()) {
6079  accession = (*it)->GetGenbank().GetAccession();
6080  break;
6081  }
6082  } else if ((*it)->IsDdbj()) {
6083  if ((*it)->GetDdbj().IsSetAccession()) {
6084  accession = (*it)->GetDdbj().GetAccession();
6085  break;
6086  }
6087  } else if ((*it)->IsGi()) {
6088  accession = NStr::NumericToString((*it)->GetGi());
6089  }
6090  }
6091 
6092  try {
6093  unsigned int nummrna = 0, numcds = 0, numcrgn = 0, numvseg = 0, numdseg = 0, numjseg = 0;
6094  int numgene = 0, num_pseudomrna = 0, num_pseudocds = 0, num_rearrangedcds = 0;
6095  vector< CConstRef < CSeq_id > > cds_products, mrna_products;
6096 
6097  int num_full_length_prot_ref = 0;
6098 
6099  bool is_mrna = IsMrna(m_CurrentHandle);
6101  bool is_virtual = (m_CurrentHandle.GetInst_Repr() == CSeq_inst::eRepr_virtual);
6103 
// is_refseq starts from the validator's RefSeq-conventions setting and is also turned
// on when the Bioseq carries an id of the "other" type.
6104  bool is_emb = false, non_pseudo_16S_rRNA = false;
6105  bool is_refseq = m_Imp.IsRefSeqConventions();
6106  FOR_EACH_SEQID_ON_BIOSEQ (seq_it, seq) {
6107  if ((*seq_it)->IsEmbl()) {
6108  is_emb = true;
6109  } else if ((*seq_it)->IsOther()) {
6110  is_refseq = true;
6111  }
6112  }
6113 
6114  int firstcdsgencode = 0;
6115  bool mixedcdsgencodes = false;
6116 
6117  if (m_AllFeatIt) {
6119  const CSeq_feat& feat = fi->GetOriginalFeature();
6120 
6121  CSeqFeatData::E_Choice ftype = feat.GetData().Which();
6122  CSeqFeatData::ESubtype subtype = fi->GetFeatSubtype();
6123 
6124  if (ftype == CSeqFeatData::e_Gene) {
6125  numgene++;
6126  const CGene_ref& gene_ref = feat.GetData().GetGene();
6127  if (gene_ref.IsSetLocus()) {
6128  string locus = gene_ref.GetLocus();
6129  if (m_GeneIt) {
6131  const CSeq_feat& gene_feat = gene_it->GetOriginalFeature();
6132  const CGene_ref& other_gene_ref = gene_feat.GetData().GetGene();
// Collision: another gene's locus_tag equals this locus (case-sensitive) while that
// gene's own locus is unset or different.
6133  if (other_gene_ref.IsSetLocus_tag()
6134  && NStr::EqualCase (other_gene_ref.GetLocus_tag(), locus)
6135  && (!other_gene_ref.IsSetLocus()
6136  || !NStr::EqualCase(other_gene_ref.GetLocus(), locus))) {
6138  "locus collides with locus_tag in another gene", feat);
6139  }
6140  }
6141  }
6142  }
6143  } else if (feat.GetData().IsCdregion()) {
6144  numcds++;
6145  if (feat.IsSetProduct()) {
6146  const CSeq_id* p = feat.GetProduct().GetId();
6147  CConstRef<CSeq_id> ref(p);
6148  cds_products.push_back(ref);
6149  } else if (feat.IsSetPseudo() && feat.GetPseudo()) {
6150  num_pseudocds++;
6151  } else {
6153  if (gene && gene->IsSetPseudo() && gene->GetPseudo()) {
6154  num_pseudocds++;
6155  } else if (feat.IsSetExcept_text()
6156  && NStr::Find (feat.GetExcept_text(), "rearrangement required for product") != string::npos) {
6157  num_rearrangedcds++;
6158  }
6159  }
6160  ValidateBadGeneOverlap(feat);
6161 
// The last id-type entry in the CDS genetic code wins; the first non-zero code seen on
// the sequence is remembered and any later different code sets mixedcdsgencodes.
6162  const CCdregion& cdregion = feat.GetData().GetCdregion();
6163  if (cdregion.IsSetCode()) {
6164  int cdsgencode = 0;
6165  ITERATE(CCdregion::TCode::Tdata, it, cdregion.GetCode().Get()) {
6166  if ((*it)->IsId()) {
6167  cdsgencode = (*it)->GetId();
6168  }
6169  }
6170  if (cdsgencode != 0) {
6171  if (firstcdsgencode == 0) {
6172  firstcdsgencode = cdsgencode;
6173  } else if (firstcdsgencode != cdsgencode) {
6174  mixedcdsgencodes = true;
6175  }
6176  }
6177  }
6178  } else if (fi->GetFeatSubtype() == CSeqFeatData::eSubtype_mRNA) {
6179  nummrna++;
6181  if (feat.IsSetProduct()) {
6182  const CSeq_id* p = feat.GetProduct().GetId();
6183  CConstRef<CSeq_id> ref(p);
6184  mrna_products.push_back(ref);
6185  } else if (feat.IsSetPseudo() && feat.GetPseudo()) {
6186  num_pseudomrna++;
6187  } else {
6188  if (gene && gene->IsSetPseudo() && gene->GetPseudo()) {
6189  num_pseudomrna++;
6190  }
6191  }
6192  ValidateBadGeneOverlap(feat);
6193 
6194  } else if (fi->GetFeatSubtype() == CSeqFeatData::eSubtype_rRNA) {
6195  if (! feat.IsSetPseudo() || ! feat.GetPseudo()) {
6196  const CRNA_ref& rref = feat.GetData().GetRna();
6197  if (rref.CanGetExt() && rref.GetExt().IsName()) {
6198  const string& rna_name = rref.GetExt().GetName();
6199  if (NStr::EqualNocase(rna_name, "16S ribosomal RNA")) {
6200  non_pseudo_16S_rRNA = true;
6201  }
6202  }
6203  }
6204  } else if (subtype == CSeqFeatData::eSubtype_C_region) {
6205  numcrgn++;
6206  } else if (subtype == CSeqFeatData::eSubtype_V_segment) {
6207  numvseg++;
6208  } else if (subtype == CSeqFeatData::eSubtype_D_segment) {
6209  numdseg++;
6210  } else if (subtype == CSeqFeatData::eSubtype_J_segment) {
6211  numjseg++;
6212  }
6213 
6215 
// Partial-in-context validation is skipped for segmented Bioseqs.
6216  if (seq.GetInst().GetRepr() != CSeq_inst::eRepr_seg) {
6217  ValidateFeatPartialInContext(*fi, is_complete);
6218  }
6219 
6220  if (is_aa) { // protein
6221  switch (ftype) {
6222  case CSeqFeatData::e_Prot: {
6223  if (IsOneBioseq(feat.GetLocation(), m_Scope)) {
6225  if ((range.IsWhole() ||
6226  (range.GetFrom() == 0 && range.GetTo() == len - 1)) &&
6227  (! feat.GetData().GetProt().IsSetProcessed() ||
6230  num_full_length_prot_ref++;
6231  }
6232  }
6233  } break;
6234 
6235  case CSeqFeatData::e_Gene:
6237  if (feat.GetData().GetGene().IsSetLocus_tag()) {
6239  "Genes on protein sequences with PGAP annotation should not have locus tags.", feat);
6240  }
6241  } else if (! s_StandaloneProt(m_CurrentHandle)) {
6242  // report only if NOT standalone protein
6243  // and NOT PGAP
6245  "Invalid feature for a protein Bioseq.", feat);
6246  }
6247  break;
6248 
6249  default:
6250  break;
6251  }
6252  }
6253 
6254  if (is_mrna) { // mRNA
6255  switch (ftype) {
6256  case CSeqFeatData::e_Cdregion: {
6257  // Test for Multi interval CDS feature
6258  if (NumOfIntervals(feat.GetLocation()) > 1) {
6259  bool excpet = feat.IsSetExcept() && feat.GetExcept();
6260  bool slippage_except = false;
6261  bool circular_rna = false;
6262  if (feat.IsSetExcept_text()) {
6263  const string& text = feat.GetExcept_text();
6264  slippage_except =
6265  NStr::FindNoCase(text, "ribosomal slippage") != NPOS;
6266  circular_rna =
6267  NStr::FindNoCase(text, "circular RNA") != NPOS;
6268  }
// Reported unless the exception text mentions circular RNA, or the exception flag is
// set together with a ribosomal-slippage exception text.
6269  if ((! excpet || ! slippage_except) && ! circular_rna) {
6270  EDiagSev sev = is_refseq ? eDiag_Warning : eDiag_Error;
6272  "Multi-interval CDS feature is invalid on an mRNA "
6273  "(cDNA) Bioseq.",
6274  feat);
6275  }
6276  }
6277  } break;
6278  case CSeqFeatData::e_Rna: {
6279  const CRNA_ref& rref = feat.GetData().GetRna();
6280  if (rref.GetType() == CRNA_ref::eType_mRNA) {
6282  "mRNA feature is invalid on an mRNA (cDNA) Bioseq.",
6283  feat);
6284  }
6285  } break;
6286  case CSeqFeatData::e_Imp: {
6287  const CImp_feat& imp = feat.GetData().GetImp();
6288  if (imp.GetKey() == "intron") {
6290  "Invalid feature for an mRNA Bioseq.", feat);
6291  }
6292  } break;
6293  default:
6294  break;
6295  }
6296  }
6297 
6298  if (! is_emb) {
6299  if (IsFarLocation(feat.GetLocation(), m_Imp.GetTSEH())) {
6301  "Feature has 'far' location - accession not packaged in record",
6302  feat);
6303  }
6304  }
6305 
6306  } // end of for loop
6307 
// 16S rRNA check: requires a Eukaryota lineage that is not Microsporidia, plus the
// genome-location exclusions listed below (part of the condition is missing here).
6308  if (non_pseudo_16S_rRNA && m_CurrentHandle) {
6310  if (src_desc) {
6311  const CBioSource& biosrc = src_desc->GetSource();
6312  int genome = 0;
6313  bool isEukaryote = false;
6314  bool isMicrosporidia = false;
6315  if (biosrc.IsSetGenome()) {
6316  genome = biosrc.GetGenome();
6317  }
6318  if (biosrc.IsSetLineage()) {
6319  string lineage = biosrc.GetLineage();
6320  if (NStr::StartsWith(lineage, "Eukaryota; ", NStr::eNocase)) {
6321  isEukaryote = true;
6322  if (NStr::StartsWith(lineage, "Eukaryota; Fungi; Microsporidia; ", NStr::eNocase)) {
6323  isMicrosporidia = true;
6324  }
6325  }
6326  }
6327  if (isEukaryote && (! isMicrosporidia) &&
6329  genome != CBioSource::eGenome_chloroplast &&
6330  genome != CBioSource::eGenome_chromoplast &&
6331  genome != CBioSource::eGenome_kinetoplast &&
6332  genome != CBioSource::eGenome_plastid &&
6333  genome != CBioSource::eGenome_apicoplast &&
6334  genome != CBioSource::eGenome_leucoplast &&
6335  genome != CBioSource::eGenome_proplastid &&
6338  "Improper 16S ribosomal RNA",
6339  *(seq.GetParentEntry()), *src_desc);
6340  }
6341  }
6342  }
6343  } // end of branch where features are present
6344 
6345  if (mixedcdsgencodes) {
6346  EDiagSev sev = eDiag_Error;
6347  if (IsSynthetic()) {
6348  sev = eDiag_Warning;
6349  }
6351  "Multiple CDS genetic codes on sequence", seq);
6352  }
6353 
6354  // if no full length prot feature on a part of a segmented bioseq
6355  // search for such feature on the master bioseq
6356  if (is_aa && num_full_length_prot_ref == 0) {
6358  if (parent) {
6359  TSeqPos parent_len = 0;
6360  if (parent.IsSetInst() && parent.GetInst().IsSetLength()) {
6361  parent_len = parent.GetInst().GetLength();
6362  }
6363  for (CFeat_CI it(parent, CSeqFeatData::e_Prot); it; ++it) {
// If the first attempt throws, the catch block repeats the same full-length test using
// the iterator's own location and data accessors.
6364  try {
6365  const CSeq_feat& prot_feat = it->GetOriginalFeature();
6367 
6368  if ((range.IsWhole() ||
6369  (range.GetFrom() == 0 && range.GetTo() == parent_len - 1)) &&
6370  (! prot_feat.GetData().GetProt().IsSetProcessed() ||
6373  num_full_length_prot_ref++;
6374  }
6375  } catch (const exception&) {
6376  CSeq_loc::TRange range = it->GetLocation().GetTotalRange();
6377  if ((range.IsWhole() ||
6378  (range.GetFrom() == 0 && range.GetTo() == parent_len - 1)) &&
6379  (! it->GetData().GetProt().IsSetProcessed() ||
6380  (it->GetData().GetProt().GetProcessed() == CProt_ref::eProcessed_not_set ||
6381  it->GetData().GetProt().GetProcessed() == CProt_ref::eProcessed_preprotein))){
6382  num_full_length_prot_ref++;
6383  }
6384  }
6385  }
6386  }
6387  }
6388 
6389  if (is_aa && num_full_length_prot_ref == 0 && ! is_virtual && ! m_Imp.IsPDB()) {
6391  }
6392 
6393  if (is_aa && num_full_length_prot_ref > 1 && ! SeqIsPatent(seq)) {
6395  NStr::IntToString (num_full_length_prot_ref)
6396  + " full-length protein features present on protein", seq);
6397  }
6398 
6399  if (! is_aa) {
6400  // validate abutting UTRs for nucleotides
6402 
6403  // validate coding regions between UTRs
6404  ValidateCDSUTR();
6405  }
6406 
6407  // validate abutting RNA features
6409 
6410  // before validating CDS/mRNA matches, determine whether to suppress duplicate messages
6411  bool cds_products_unique = true;
6412  if (cds_products.size() > 1) {
6413  stable_sort(cds_products.begin(), cds_products.end(), s_SeqIdCompare);
6414  cds_products_unique = seq_mac_is_unique(cds_products.begin(), cds_products.end(), s_SeqIdMatch);
6415  }
6416 
6417  bool mrna_products_unique = true;
6418  if (mrna_products.size() > 1) {
6419  stable_sort(mrna_products.begin(), mrna_products.end(), s_SeqIdCompare);
6420  mrna_products_unique = seq_mac_is_unique(mrna_products.begin(), mrna_products.end(), s_SeqIdMatch);
6421  }
6422 
// Product-reference consistency is only examined when there is at least one CDS and
// more than one mRNA on the sequence.
6423  if (numcds > 0 && nummrna > 1) {
6424  if (cds_products.size() > 0 && cds_products.size() + num_pseudocds + num_rearrangedcds != numcds) {
6426  NStr::SizetToString (numcds) + " CDS features have "
6427  + NStr::SizetToString (cds_products.size()) + " product references",
6428  seq);
6429  }
6430  if (cds_products.size() > 0 && (! cds_products_unique)) {
6432  "CDS products are not unique", seq);
6433  }
6434  if (mrna_products.size() > 0 && mrna_products.size() + num_pseudomrna != nummrna) {
6436  NStr::SizetToString (nummrna) + " mRNA features have "
6437  + NStr::SizetToString (mrna_products.size()) + " product references",
6438  seq);
6439  }
6440  if (mrna_products.size() > 0 && (! mrna_products_unique)) {
6442  "mRNA products are not unique", seq);
6443  }
6444  }
6445 
6447 
6448  if (numcds > 0 && numcrgn + numvseg + numdseg + numjseg > 0 && m_Imp.DoCompareVDJCtoCDS() && m_Imp.IsRefSeq()) {
6449  /*
6450  LOG_POST_XX(Corelib_App, 1, "numcds: " + NStr::IntToString(numcds) + "\n");
6451  LOG_POST_XX(Corelib_App, 1, "nummrna: " + NStr::IntToString(nummrna) + "\n");
6452  LOG_POST_XX(Corelib_App, 1, "numcrgn: " + NStr::IntToString(numcrgn) + "\n");
6453  LOG_POST_XX(Corelib_App, 1, "numvseg: " + NStr::IntToString(numvseg) + "\n");
6454  LOG_POST_XX(Corelib_App, 1, "numdseg: " + NStr::IntToString(numdseg) + "\n");
6455  LOG_POST_XX(Corelib_App, 1, "numjseg: " + NStr::IntToString(numjseg) + "\n");
6456  */
6457  // x_ValidateCDSVDJCmatch(m_CurrentHandle, numcds, numcrgn, numvseg, numdseg, numjseg);
6459  }
6460 
6461  if (! SeqIsPatent(seq)) {
6463  }
6464 
// Exceptions whose text contains "Error: Cannot resolve" are dropped silently; any
// other std::exception is turned into a validator message on `seq`.
6465  } catch (const exception& e) {
6466  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
6468  string("Exception while validating Seqfeat Context. EXCEPTION: ") +
6469  e.what(), seq);
6470  }
6471  }
6472 
6473 }
6474 
6475 
6476 TGi GetGIForSeqId(const CSeq_id& id, CScope& scope)
6477 {
6478  if (id.IsGi()) {
6479  return id.GetGi();
6480  }
6481  CBioseq_Handle bsh = scope.GetBioseqHandle(id);
6482  if (! bsh) {
6483  return ZERO_GI;
6484  }
6485  ITERATE(CBioseq::TId, id_it, bsh.GetBioseqCore()->GetId()) {
6486  if ((*id_it)->IsGi()) {
6487  return (*id_it)->GetGi();
6488  }
6489  }
6490  return ZERO_GI;
6491 }
6492 
6493 
6494 string s_GetMrnaProteinLink(const CUser_field& field)
6495 {
6496  string ml;
6497  if (field.IsSetLabel() && field.GetLabel().IsStr() &&
6498  NStr::Equal(field.GetLabel().GetStr(), "protein seqID") &&
6499  field.IsSetData() && field.GetData().IsStr()) {
6500  ml = field.GetData().GetStr();
6501  }
6502  return ml;
6503 }
6504 
6505 
// Body of a helper whose signature line is missing from this listing; it reads an
// object named `user` and returns a string (presumably the CUser_object overload of
// s_GetMrnaProteinLink, given the call made with a feature's Ext elsewhere - confirm).
// When `user` has a string type equal to "MrnaProteinLink" and has data, each field is
// passed to the field-level s_GetMrnaProteinLink and the first non-blank result is
// returned. Otherwise (or when every field yields a blank string) the result is empty.
6507 {
6508  string ml;
6509  if (user.IsSetType() && user.GetType().IsStr() &&
6510  NStr::Equal(user.GetType().GetStr(), "MrnaProteinLink") &&
6511  user.IsSetData()) {
6512  ITERATE(CUser_object::TData, it, user.GetData()) {
6513  ml = s_GetMrnaProteinLink(**it);
6514  if (! NStr::IsBlank(ml)) {
6515  break;
6516  }
6517  }
6518  }
6519  return ml;
6520 }
6521 
6522 
6523 string s_GetMrnaProteinLink(const CSeq_feat& mrna)
6524 {
6525  string ml;
6526  if (mrna.IsSetExt()) {
6527  ml = s_GetMrnaProteinLink(mrna.GetExt());
6528  }
6529  return ml;
6530 
6531 }
6532 
6533 
// Body of a member function whose signature line is missing from this listing; it
// reads two features named `cds` and `mrna` and returns an integer code
// (presumably x_IdXrefsNotReciprocal, whose result is compared with 2 by a caller in
// this file - confirm).
//
// Return values established by the code below:
//   0 - nothing to report: either feature lacks a local feature id, the cross
//       references agree, or the MrnaProteinLink check could not be made / matched;
//   1 - at least one side has an id cross-reference but the two features do not both
//       point at each other;
//   2 - the mRNA's MrnaProteinLink text fails to parse as a Seq-id, or parses to a GI
//       different from the GI of the CDS product.
6535 {
6536  if (! cds.IsSetId() || ! cds.GetId().IsLocal()
6537  || ! mrna.IsSetId() || ! mrna.GetId().IsLocal()) {
6538  return 0;
6539  }
6540 
// has1/has2: the CDS / mRNA carries any xref with an id;
// match1/match2: one of those xrefs matches the other feature's id.
6541  bool match1 = false, match2 = false;
6542  bool has1 = false, has2 = false;
6543  FOR_EACH_SEQFEATXREF_ON_SEQFEAT (itx, cds) {
6544  if ((*itx)->IsSetId()) {
6545  has1 = true;
6546  if (s_FeatureIdsMatch((*itx)->GetId(), mrna.GetId())) {
6547  match1 = true;
6548  }
6549  }
6550  }
6551 
6552  FOR_EACH_SEQFEATXREF_ON_SEQFEAT (itx, mrna) {
6553  if ((*itx)->IsSetId()) {
6554  has2 = true;
6555  if (s_FeatureIdsMatch((*itx)->GetId(), cds.GetId())) {
6556  match2 = true;
6557  }
6558  }
6559  }
6560 
6561  if ((has1 || has2) && (! match1 || ! match2)) {
6562  return 1;
6563  }
6564 
6565  if (! cds.IsSetProduct() || ! mrna.IsSetExt()) {
6566  return 0;
6567  }
6568 
// NOTE(review): GetProduct().GetId() is dereferenced without a null check here, while
// x_TranscriptIDsMatch in this file tests it for null first - confirm it cannot be
// null on this path.
6569  TGi gi = GetGIForSeqId(*(cds.GetProduct().GetId()), *m_Scope);
6570 
6571  if (gi == ZERO_GI) {
6572  return 0;
6573  }
6574 
6575  string ml = s_GetMrnaProteinLink(mrna);
6576  if (! NStr::IsBlank(ml)) {
6577  try {
6578  CSeq_id id(ml);
6579  if (id.IsGi()) {
6580  if (id.GetGi() == gi) {
6581  return 0;
6582  } else {
6583  return 2;
6584  }
6585  }
// Any exception raised while building the Seq-id from the link text is treated as an
// inconsistent link.
6586  } catch (const CException&) {
6587  return 2;
6588  } catch (const std::exception&) {
6589  return 2;
6590  }
6591  }
6592  return 0;
6593 }
6594 
6595 bool s_IdXrefsAreReciprocal(const CSeq_feat& cds, const CSeq_feat& mrna)
6596 {
6597  if (! cds.IsSetId() || ! cds.GetId().IsLocal()
6598  || ! mrna.IsSetId() || ! mrna.GetId().IsLocal()) {
6599  return false;
6600  }
6601 
6602  bool match = false;
6603 
6604  FOR_EACH_SEQFEATXREF_ON_SEQFEAT (itx, cds) {
6605  if ((*itx)->IsSetId() && s_FeatureIdsMatch((*itx)->GetId(), mrna.GetId())) {
6606  match = true;
6607  break;
6608  }
6609  }
6610  if (! match) {
6611  return false;
6612  }
6613  match = false;
6614 
6615  FOR_EACH_SEQFEATXREF_ON_SEQFEAT (itx, mrna) {
6616  if ((*itx)->IsSetId() && s_FeatureIdsMatch((*itx)->GetId(), cds.GetId())) {
6617  match = true;
6618  break;
6619  }
6620  }
6621 
6622  return match;
6623 }
6624 
// Body of a function whose signature line is missing from this listing; it reads two
// features named `cds` and `mrna`.
// Thin forwarder: returns the result of the file-level s_IdXrefsAreReciprocal.
6626 {
6627  return s_IdXrefsAreReciprocal(cds, mrna);
6628 }
6629 
// Constructor tail; the line that opens the signature is missing from this listing
// (the initializers read a parameter named `mrna`, so presumably this is the
// CMrnaMatchInfo constructor - confirm).
// Stores the address of `mrna` and the scope pointer, and starts with no match
// recorded and the pseudo flag cleared.
6631  CScope* scope) :
6632  m_Mrna(&mrna),
6633  m_Scope(scope),
6634  m_HasMatch(false),
6635  m_IsPseudo(false)
6636 {
6637 }
6638 
// Body of an accessor whose signature line is missing from this listing.
// Returns the object that m_Mrna refers to.
6640 {
6641  return *m_Mrna;
6642 }
6643 
6644 bool CMrnaMatchInfo::Overlaps(const CSeq_feat& cds) const
6645 {
6646  EOverlapType overlap_type = eOverlap_CheckIntRev;
6647 
6648  if (cds.IsSetExcept_text() &&
6649  (NStr::FindNoCase(cds.GetExcept_text(), "ribosomal slippage") != string::npos)) {
6650  overlap_type = eOverlap_SubsetRev;
6651  }
6652  return (TestForOverlapEx(cds.GetLocation(), m_Mrna->GetLocation(), overlap_type, m_Scope) >= 0);
6653 }
6654 
// Body of a mutator whose signature line is missing from this listing.
// Marks this object as matched by setting m_HasMatch to true.
6656 {
6657  m_HasMatch = true;
6658 }
6659 
6660 
// Body of an accessor whose signature line is missing from this listing.
// Returns the current value of m_HasMatch.
6662 {
6663  return m_HasMatch;
6664 }
6665 
6666 
// Decide whether it is acceptable for this mRNA to have no matching CDS.
//
// isGenbank - when true, an mRNA flagged partial is accepted (last test below).
//
// Returns true when any of the following holds: the pseudo flag is set; the mRNA
// location is shorter than 6; an entry in utr5s starts where the mRNA starts and its
// stop is within 6 of the mRNA stop; an entry in utr3s stops where the mRNA stops and
// its start is within 6 of the mRNA start; the mRNA is partial and isGenbank is true.
// Returns false otherwise.
// NOTE(review): the statements that fill utr5s and utr3s are missing from this
// listing, so how those candidate features are selected cannot be seen here.
6667 bool CMrnaMatchInfo::OkWithoutCds(bool isGenbank) const
6668 {
6669  if (m_IsPseudo) return true;
6670  const CSeq_loc& loc = m_Mrna->GetLocation();
6671  TSeqPos len = GetLength(loc, m_Scope);
6672  if (len < 6) {
6673  return true;
6674  }
6675  TSeqPos mrna_start = loc.GetStart(eExtreme_Biological);
6676  TSeqPos mrna_stop = loc.GetStop(eExtreme_Biological);
6677  TFeatScores utr5s;
6680  ITERATE(TFeatScores, s, utr5s) {
6681  const CSeq_loc& utr5_loc = s->second->GetLocation();
6682  TSeqPos utr5_start = utr5_loc.GetStart(eExtreme_Biological);
6683  TSeqPos utr5_stop = utr5_loc.GetStop(eExtreme_Biological);
6684  if (utr5_start == mrna_start) {
// The >= guards keep each unsigned subtraction from wrapping around.
6685  if (mrna_stop >= utr5_stop && mrna_stop - utr5_stop < 6) {
6686  return true;
6687  } else if (utr5_stop >= mrna_stop && utr5_stop - mrna_stop < 6) {
6688  return true;
6689  }
6690  }
6691  }
6692  TFeatScores utr3s;
6695  ITERATE(TFeatScores, s, utr3s) {
6696  const CSeq_loc& utr3_loc = s->second->GetLocation();
6697  TSeqPos utr3_start = utr3_loc.GetStart(eExtreme_Biological);
6698  TSeqPos utr3_stop = utr3_loc.GetStop(eExtreme_Biological);
6699  if (utr3_stop == mrna_stop) {
6700  if (mrna_start >= utr3_start && mrna_start - utr3_start < 6) {
6701  return true;
6702  } else if (utr3_start >= mrna_start && utr3_start - mrna_start < 6) {
6703  return true;
6704  }
6705  }
6706  }
6707  if (m_Mrna->IsSetPartial() && m_Mrna->GetPartial() && isGenbank) {
6708  return true;
6709  }
6710  return false;
6711 }
6712 
6713 
// Constructor tail; the line that opens the signature is missing from this listing
// (the initializers read a parameter named `cds`, so presumably this is the
// CCdsMatchInfo constructor - confirm).
// Stores the address of `cds` and the scope pointer; starts as not pseudo, needing a
// match, and with products considered unique. The list of other mRNAs is emptied.
// NOTE(review): two statements are missing here - one before the `if` and the body
// of the `if` that tests for a "ribosomal slippage" exception text. Presumably they
// set m_OverlapType, which the Overlaps member reads - confirm.
6715  CScope* scope) :
6716  m_Cds(&cds),
6717  m_Scope(scope),
6718  m_IsPseudo(false),
6719  m_NeedsMatch(true),
6720  m_ProductsUnique(true)
6721 {
6723  if (m_Cds->IsSetExcept_text() &&
6724  (NStr::FindNoCase(m_Cds->GetExcept_text(), "ribosomal slippage") != string::npos)) {
6726  }
6727  m_OtherMrnas.clear();
6728 }
6729 
6730 
// Body of an accessor whose signature line is missing from this listing.
// Returns the object that m_Cds refers to.
6732 {
6733  return *m_Cds;
6734 }
6735 
6736 bool CCdsMatchInfo::Overlaps(const CSeq_feat& mrna) const
6737 {
6738  if (m_Cds.IsNull()) {
6739  return false;
6740  }
6741  return (TestForOverlapEx(m_Cds->GetLocation(), mrna.GetLocation(), m_OverlapType, m_Scope) >= 0);
6742 }
6743 
6744 void CCdsMatchInfo::NeedsMatch(const bool needs_match)
6745 {
6746  m_NeedsMatch = needs_match;
6747 }
6748 
// Body of an accessor whose signature line is missing from this listing.
// Returns the current value of m_NeedsMatch.
6750 {
6751  return m_NeedsMatch;
6752 }
6753 
// Body of an accessor whose signature line is missing from this listing.
// Returns true when m_BestMatch is not null, i.e. a best match has been assigned.
6755 {
6756  return (m_BestMatch != nullptr);
6757 }
6758 
// Body of an accessor whose signature line is missing from this listing.
// Dereferences m_BestMatch without checking it; callers visible in this file test
// HasMatch() before calling GetMatch().
6760 {
6761  return *m_BestMatch;
6762 }
6763 
// Body of an accessor whose signature line is missing from this listing.
// Returns the current value of m_IsPseudo.
6765 {
6766  return m_IsPseudo;
6767 }
6768 
// Body of a mutator whose signature line is missing from this listing.
// Sets m_IsPseudo to true; nothing in this body clears it again.
6770 {
6771  m_IsPseudo = true;
6772 }
6773 
// Try to pair this CDS with an mRNA through feature-id cross-references.
//
// unmatched_mrnas - mRNAs not yet paired; the chosen mRNA is erased from it.
// tse             - used to look up mRNA features by local feature id.
//
// Returns true when a best match was assigned by this call, false otherwise
// (including when unmatched_mrnas is empty on entry).
// NOTE(review): the loop header that declares `xref_it` is missing from this listing;
// presumably it iterates the xrefs of the stored CDS - confirm.
6774 bool CCdsMatchInfo::AssignXrefMatch(TmRNAList& unmatched_mrnas, const CTSE_Handle& tse)
6775 {
6776  if (unmatched_mrnas.empty()) {
6777  return false;
6778  }
6779  bool rval = false;
// Only xrefs that carry a local feature id are considered.
6781  if (! (*xref_it)->IsSetId() ||
6782  ! (*xref_it)->GetId().IsLocal()) {
6783  continue;
6784  }
6785  vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::eSubtype_mRNA, (*xref_it)->GetId().GetLocal());
6786  ITERATE(vector<CSeq_feat_Handle>, h, handles) {
// Once a best match exists, every further referenced mRNA is only remembered in
// m_OtherMrnas (without checking whether it is still unmatched).
6787  if (HasMatch()) {
6788  m_OtherMrnas.push_back(h->GetSeq_feat());
6789  } else {
6790  TmRNAList::iterator mrna_it = unmatched_mrnas.find((*h).GetSeq_feat());
6791  if (mrna_it != unmatched_mrnas.end()) {
6792  m_BestMatch = mrna_it->second;
6793  m_BestMatch->SetMatch();
6794  unmatched_mrnas.erase(mrna_it);
6795  rval = true;
6796  }
6797  }
6798  }
6799  }
6800  return rval;
6801 }
6802 
6803 
// Body of a member function whose signature line is missing from this listing; it
// reads `unmatched_mrnas` and `scope` (presumably CCdsMatchInfo::AssignOverlapMatch -
// confirm). Tries to pair the CDS with an mRNA by location overlap.
//
// Returns true when a best match was assigned by this call, false otherwise
// (including when unmatched_mrnas is empty on entry).
// NOTE(review): the first line of the call that fills `feats` is missing; only its
// trailing arguments (m_OverlapType, feats, scope) are visible.
6805 {
6806  if (unmatched_mrnas.empty()) {
6807  return false;
6808  }
6809 
6810  bool rval = false;
6811  TFeatScores feats;
6813  m_OverlapType, feats, scope);
// Fallback when the lookup found nothing: scan the unmatched mRNAs in container order
// and take the first one that Overlaps() accepts.
6814  if (feats.size() == 0) {
6815  TmRNAList::iterator mrna_it = unmatched_mrnas.begin();
6816  while (mrna_it != unmatched_mrnas.end()) {
6817  if (Overlaps(mrna_it->second->GetSeqfeat())) {
6818  m_BestMatch = mrna_it->second;
6819  m_BestMatch->SetMatch();
6820  unmatched_mrnas.erase(mrna_it);
6821  return true;
6822  }
6823  ++mrna_it;
6824  }
6825  } else {
// The first still-unmatched candidate becomes the best match; later still-unmatched
// candidates are remembered in m_OtherMrnas.
6826  NON_CONST_ITERATE(TFeatScores, it, feats) {
6827  TmRNAList::iterator mrna_it = unmatched_mrnas.find(it->second);
6828  if (mrna_it != unmatched_mrnas.end()) {
6829  if (rval) {
6830  m_OtherMrnas.push_back(it->second);
6831  } else {
6832  m_BestMatch = mrna_it->second;
6833  m_BestMatch->SetMatch();
6834  unmatched_mrnas.erase(mrna_it);
6835  rval = true;
6836  }
6837  }
6838  }
6839  }
6840 
6841  return rval;
6842 }
6843 
6844 
// Body of a helper whose signature line is missing from this listing; it reads a
// feature named `mrna` (presumably s_GetMrnaProductString, which is called from
// UpdateOtherMrnas in this file - confirm).
// Returns the label of the feature's product location as produced by GetLabel, or an
// empty string when the feature has no product.
6846 {
6847  string product_string;
6848 
6849  if (! mrna.IsSetProduct()) {
6850  return product_string;
6851  }
6852 
6853  mrna.GetProduct().GetLabel(&product_string);
6854 
6855  return product_string;
6856 }
6857 
6858 
6859 void CCdsMatchInfo::UpdateOtherMrnas(const TmRNAList& unmatched_mrnas)
6860 {
6861  list<CConstRef<CSeq_feat>>::iterator it = m_OtherMrnas.begin();
6862  list<string> product_strings;
6863  while (it != m_OtherMrnas.end()) {
6864  TmRNAList::const_iterator mrna_it = unmatched_mrnas.find(it->GetPointer());
6865  if (mrna_it == unmatched_mrnas.end()) {
6866  it = m_OtherMrnas.erase(it);
6867  } else {
6868  auto product_string = s_GetMrnaProductString(**it);
6869  product_strings.push_back(product_string);
6870  ++it;
6871  }
6872  }
6873  if (m_OtherMrnas.size() == 0) {
6874  return;
6875  }
6876  m_OtherMrnas.sort();
6877  m_OtherMrnas.unique();
6878  product_strings.push_back(s_GetMrnaProductString(m_BestMatch->GetSeqfeat()));
6879  const auto num_products = product_strings.size();
6880  if (product_strings.size() > 1) {
6881  m_ProductsUnique = false;
6882  product_strings.sort();
6883  product_strings.unique();
6884  const auto num_unique_products = product_strings.size();
6885  if (num_unique_products == num_products) {
6886  m_ProductsUnique = true;
6887  }
6888  }
6889 }
6890 
6891 
// Body of a mutator whose signature line is missing from this listing; it reads a
// parameter named `match`.
// Stores `match` as the best match and marks that object as matched via SetMatch().
// NOTE(review): `match` is dereferenced right after the assignment, so a null value
// must not be passed; the caller visible in this file checks it first.
6893 {
6894  m_BestMatch = match;
6895  m_BestMatch->SetMatch();
6896 }
6897 
6898 
// Pair this CDS with its parent mRNA as reported by the feature tree.
//
// mrna_map  - mRNA match records keyed by feature pointer; the parent's entry is
//             erased whether or not it held a usable record.
// feat_tree - queried for the mRNA parent of `cds`.
// scope     - not referenced by any line visible in this listing.
//
// Returns true when the parent mRNA had a non-null record in mrna_map and it was
// stored as the best match; false when there is no mRNA parent or no such record.
// NOTE(review): the statement that defines `cds` is missing from this listing.
6899 bool CCdsMatchInfo::AssignMatch(TmRNAList& mrna_map, CFeatTree& feat_tree, CScope& scope)
6900 {
6902  CMappedFeat mrna = feat_tree.GetParent(cds, CSeqFeatData::eSubtype_mRNA);
6903  if (mrna) {
6904  const CSeq_feat* key = mrna.GetSeq_feat();
// NOTE(review): if TmRNAList is a std::map-like type, operator[] inserts an empty
// entry for an absent key; the erase below removes it again either way.
6905  CRef<CMrnaMatchInfo> match = mrna_map[key];
6906  bool rval = false;
6907  if (match) {
6908  SetMatch(match);
6909  rval = true;
6910  }
6911  mrna_map.erase(key);
6912  return rval;
6913  } else {
6914  return false;
6915  }
6916 }
6917 
6918 
// Reports a CDS that matches more than one mRNA.
// NOTE(review): the line that opens the signature and the first line of both
// reporting calls are missing from this listing, so the function name, error codes
// and severities are not visible here.
//
// cds_match       - match record of the CDS; nothing is done unless it has a match.
// unmatched_mrnas - passed to UpdateOtherMrnas to refresh the record first.
//
// The mRNA count is the number of other mRNAs plus one for the best match. With fewer
// than two nothing is reported; otherwise one of two messages is emitted depending on
// whether the mRNA products are unique.
6920  CCdsMatchInfo& cds_match,
6921  const TmRNAList& unmatched_mrnas)
6922 {
6923  if (! cds_match.HasMatch()) {
6924  return;
6925  }
6926  cds_match.UpdateOtherMrnas(unmatched_mrnas);
6927  size_t num_mrnas = cds_match.CountOtherMrnas() + 1;
6928  if (num_mrnas < 2) {
6929  return;
6930  }
6931  if (cds_match.AreMrnaProductsUnique()) {
6933  "CDS matches " + NStr::NumericToString(num_mrnas)
6934  + " mRNAs, but product locations are unique",
6935  cds_match.GetSeqfeat());
6936  return;
6937  } else {
6939  "CDS matches " + NStr::NumericToString(num_mrnas)
6940  + " mRNAs",
6941  cds_match.GetSeqfeat());
6942  }
6943 }
6944 
6945 
// Body of a member function whose signature line is missing from this listing; it
// reads a match record named `cds_match`.
// For a CDS that has a matched mRNA, calls x_IdXrefsNotReciprocal on the pair and
// reports "MrnaProteinLink inconsistent with feature ID cross-references" on the mRNA
// when the result is 2. Other results are not reported here.
// NOTE(review): the first line of the reporting call is missing from this listing.
6947 {
6948  if (! cds_match.HasMatch()) {
6949  return;
6950  }
6951 
6952  const auto& mrna_feat = cds_match.GetMatch().GetSeqfeat();
6953  const auto& cds_feat = cds_match.GetSeqfeat();
6954 
// NOTE(review): despite its name, xrefs_match holds the integer code returned by
// x_IdXrefsNotReciprocal, not a boolean.
6955  const auto xrefs_match = x_IdXrefsNotReciprocal(cds_feat, mrna_feat);
6956 
6957  // Could also check that xrefs are reciprocal here, but that is checked in CValidError_feat
6958  if (xrefs_match == 2) {
6960  "MrnaProteinLink inconsistent with feature ID cross-references",
6961  mrna_feat);
6962  }
6963 }
6964 
6965 
6966 bool s_GeneralTagsMatch(const string& protein_id, const CDbtag& dbtag)
6967 {
6968  size_t start_pos = NStr::Find(protein_id, "gnl|");
6969  if (start_pos == string::npos) {
6970  return false;
6971  }
6972  start_pos = NStr::Find(protein_id, "|", start_pos + 5);
6973  if (start_pos == string::npos) {
6974  return false;
6975  }
6976  size_t end_pos = NStr::Find(protein_id, "|", start_pos + 1);
6977  string prot_tag;
6978  if (end_pos == string::npos) {
6979  prot_tag = protein_id.substr(start_pos + 1);
6980  } else {
6981  prot_tag = protein_id.substr(start_pos + 1, end_pos - start_pos - 1);
6982  }
6983 
6984  if (dbtag.IsSetTag()) {
6985  if (dbtag.GetTag().IsStr()) {
6986  if (NStr::Equal(dbtag.GetTag().GetStr(), prot_tag)) {
6987  return true;
6988  }
6989  }
6990  else if (dbtag.GetTag().IsId()) {
6991  if (NStr::Equal(NStr::NumericToString(dbtag.GetTag().GetId()), prot_tag)) {
6992  return true;
6993  }
6994  }
6995  }
6996  return false;
6997 }
6998 
// Check that a protein id string recorded on an mRNA agrees with the CDS product.
//
// protein_id - protein id text to compare (the caller in this file passes the
//              mRNA's orig_protein_id qualifier value).
// cds        - the coding region whose product id is compared.
//
// Order of checks visible below:
//   1. CDS without a product id: report a missing protein_id unless the CDS is pseudo.
//   2. Product id is a general id: compare tags, report a mismatch if they differ.
//   3. Otherwise use the first general id on the product Bioseq, if any, the same way.
//   4. With no general id at all: nothing is reported under RefSeq; otherwise a plain
//      string comparison against the FASTA form of the ids is tried, and a missing
//      protein_id is reported when nothing matches.
// NOTE(review): the first line of every reporting call is missing from this listing,
// so error codes and severities are not visible here.
6999 void CValidError_bioseq::x_TranscriptIDsMatch(const string& protein_id, const CSeq_feat& cds)
7000 {
7001  if (! cds.IsSetProduct() || ! cds.GetProduct().GetId()) {
7002  if (! sequence::IsPseudo(cds, *m_Scope)) {
7004  "CDS-mRNA pair has one missing protein_id (" + protein_id + ")", cds);
7005  }
7006  return;
7007  }
7008  const CSeq_id& product_id = *(cds.GetProduct().GetId());
7009  if (product_id.IsGeneral()) {
7010  if (! s_GeneralTagsMatch(protein_id, product_id.GetGeneral())) {
7012  "CDS-mRNA pair has mismatching protein_ids (" +
7013  product_id.AsFastaString() + ", " + protein_id + ")", cds);
7014  }
7015  return;
7016  }
7017  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(product_id);
7018  if (bsh) {
7019  ITERATE(CBioseq::TId, id_it, bsh.GetBioseqCore()->GetId()) {
// Only the first general id found on the product Bioseq is examined.
7020  if ((*id_it)->IsGeneral()) {
7021  if (! s_GeneralTagsMatch(protein_id, (*id_it)->GetGeneral())) {
7023  "CDS-mRNA pair has mismatching protein_ids (" +
7024  (*id_it)->AsFastaString() + ", " + protein_id + ")", cds);
7025  }
7026  return;
7027  }
7028  }
7029  }
7030  if (m_Imp.IsRefSeq()) {
7031  // per VR-760, do not report if RefSeq and no general tag
7032  return;
7033  }
7034 
7035  // no general tags, try plain match
7036  if (bsh) {
7037  ITERATE(CBioseq::TId, id_it, bsh.GetBioseqCore()->GetId()) {
7038  if (NStr::Equal(protein_id, (*id_it)->AsFastaString())) {
7039  return;
7040  }
7041  }
7042  } else if (NStr::Equal(protein_id, product_id.AsFastaString())) {
7043  return;
7044  }
7045 
7047  "CDS-mRNA pair has one missing protein_id (" + protein_id + ")", cds);
7048 }
7049 
7050 
// Body of a member function whose signature line is missing from this listing; it
// reads a match record named `cds_match`.
// For a CDS with a matched mRNA, collects the "orig_transcript_id" and
// "orig_protein_id" qualifiers of the mRNA and the "orig_transcript_id" qualifier of
// the CDS (qualifier names compared case-insensitively; a later qualifier of the same
// name overwrites an earlier one). If any of them is present, the two transcript ids
// must be equal, otherwise a mismatch is reported on the CDS; the mRNA's protein id
// is then checked through x_TranscriptIDsMatch.
// NOTE(review): x_TranscriptIDsMatch is called even when mrna_protein_id is empty,
// i.e. when only a transcript id qualifier was present.
// NOTE(review): the first line of the reporting call is missing from this listing.
7052 {
7053  if (! cds_match.HasMatch()) {
7054  return;
7055  }
7056  const auto& mrna_feat = cds_match.GetMatch().GetSeqfeat();
7057  const auto& cds_feat = cds_match.GetSeqfeat();
7058  string cds_transcript_id;
7059  string mrna_transcript_id;
7060  string mrna_protein_id;
7061  bool must_reconcile = false;
7062  if (mrna_feat.IsSetQual()) {
7063  ITERATE(CSeq_feat::TQual, q, mrna_feat.GetQual()) {
7064  if ((*q)->IsSetQual() && (*q)->IsSetVal()) {
7065  if (NStr::EqualNocase((*q)->GetQual(), "orig_transcript_id")) {
7066  mrna_transcript_id = (*q)->GetVal();
7067  must_reconcile = true;
7068  } else if (NStr::EqualNocase((*q)->GetQual(), "orig_protein_id")) {
7069  mrna_protein_id = (*q)->GetVal();
7070  must_reconcile = true;
7071  }
7072  }
7073  }
7074  }
7075  if (cds_feat.IsSetQual()) {
7076  ITERATE(CSeq_feat::TQual, q, cds_feat.GetQual()) {
7077  if ((*q)->IsSetQual() && (*q)->IsSetVal()) {
7078  if (NStr::EqualNocase((*q)->GetQual(), "orig_transcript_id")) {
7079  cds_transcript_id = (*q)->GetVal();
7080  must_reconcile = true;
7081  }
7082  }
7083  }
7084  }
7085 
7086  if (must_reconcile) {
7087  if (! NStr::Equal(mrna_transcript_id, cds_transcript_id)) {
7089  "CDS-mRNA pair has mismatching transcript_ids ("
7090  + cds_transcript_id + "," + mrna_transcript_id + ")",
7091  cds_feat);
7092  }
7093  x_TranscriptIDsMatch(mrna_protein_id, cds_feat);
7094  }
7095 
7096 }
7097 
7098 
// Returns true when the feature has data and `sbt` equals
// CSeqFeatData::eSubtype_cdregion; false otherwise.
// NOTE(review): the statement that defines `sbt` is missing from this listing;
// presumably it holds the subtype of the feature's data - confirm.
7099 static bool x_FeatIsCDS(const CSeq_feat& ft)
7100 
7101 {
7102  if (ft.IsSetData()) {
7104  if (sbt == CSeqFeatData::eSubtype_cdregion) {
7105  return true;
7106  }
7107  }
7108  return false;
7109 }
7110 
// Returns true when the feature has data and `sbt` equals one of a set of subtypes;
// false otherwise.
// NOTE(review): the statement that defines `sbt` and all but the first alternative of
// the condition (eSubtype_C_region) are missing from this listing; going by the
// function name the others are presumably the V, D and J segment subtypes - confirm.
7111 static bool x_FeatIsVDJC(const CSeq_feat& ft)
7112 
7113 {
7114  if (ft.IsSetData()) {
7116  if (sbt == CSeqFeatData::eSubtype_C_region ||
7120  return true;
7121  }
7122  }
7123  return false;
7124 }
7125 
7126 static bool x_BadCDSinVDJC(const CSeq_loc& cdsloc, const CSeq_loc& vdjcloc, CScope* scope)
7127 
7128 {
7129  if (s_OverlapOrAbut(vdjcloc, cdsloc, scope)) {
7130  if (! s_CheckIntervals(vdjcloc, cdsloc, scope)) {
7131  return true;
7132  }
7133  }
7134  return false;
7135 }
7136 
7137 bool x_IsPseudo(const CGene_ref& ref)
7138 {
7139  if (ref.IsSetPseudo() && ref.GetPseudo()) {
7140  return true;
7141  } else {
7142  return false;
7143  }
7144 }
7145 
7146 bool x_HasNamedQual(const CSeq_feat& feat, const string& qual)
7147 {
7148  bool rval = false;
7149  if (feat.IsSetQual()) {
7150  for (auto it : feat.GetQual()) {
7151  if (it->IsSetQual() && NStr::EqualNocase(it->GetQual(), qual)) {
7152  rval = true;
7153  break;
7154  }
7155  }
7156  }
7157  return rval;
7158 }
7159 
7160 bool x_IsPseudo(const CSeq_feat& feat, CValidError_imp& imp)
7161 {
7162  if (feat.IsSetPseudo() && feat.GetPseudo()) {
7163  return true;
7164  } else if (x_HasNamedQual(feat, "pseudogene")) {
7165  return true;
7166  } else if (feat.IsSetData() && feat.GetData().IsGene() &&
7167  x_IsPseudo(feat.GetData().GetGene())) {
7168  return true;
7169  } else {
7170  try {
7171  CConstRef<CSeq_feat> gene = imp.GetCachedGene(&feat);
7172  if (gene) {
7173  if (gene->IsSetPseudo() && gene->GetPseudo()) {
7174  return true;
7175  }
7176  if (gene->IsSetData()) {
7177  const CSeqFeatData& data = gene->GetData();
7178  if (data.IsGene()) {
7179  const CGene_ref& ref = data.GetGene();
7180  if (x_IsPseudo(ref)) {
7181  return true;
7182  }
7183  }
7184  }
7185  }
7186  } catch (...) {
7187  }
7188  }
7189  return false;
7190 }
7191 
7193 {
7194  SAnnotSelector sel;
7195 
7196  CFeat_CI it1(seq, sel);
7197 
7198  if (! it1) {
7199  return;
7200  }
7201 
7202  while (it1) {
7203  const CSeq_feat& ft1 = it1->GetOriginalFeature();
7204  if (x_FeatIsCDS(ft1) || x_FeatIsVDJC(ft1)) {
7205  break;
7206  }
7207  ++it1;
7208  }
7209 
7210  if (! it1) {
7211  return;
7212  }
7213 
7214  CFeat_CI it2 = it1;
7215  ++it2;
7216 
7217  while (it2) {
7218  const CSeq_feat& ft2 = it2->GetOriginalFeature();
7219  if (x_FeatIsCDS(ft2) || x_FeatIsVDJC(ft2)) {
7220 
7221  const CSeq_feat& ft1 = it1->GetOriginalFeature();
7222 
7223  const CSeq_loc& loc1 = ft1.GetLocation();
7224  const CSeq_loc& loc2 = ft2.GetLocation();
7225 
7226  ENa_strand strand1 = eNa_strand_plus;
7227  if (loc1.IsSetStrand() && loc1.GetStrand() == eNa_strand_minus) {
7228  strand1 = eNa_strand_minus;
7229  }
7230 
7231  ENa_strand strand2 = eNa_strand_plus;
7232  if (loc2.IsSetStrand() && loc2.GetStrand() == eNa_strand_minus) {
7233  strand2 = eNa_strand_minus;
7234  }
7235 
7236  if (strand1 == strand2) {
7237  bool bad = false;
7238  if (x_FeatIsCDS(ft1) && x_FeatIsVDJC(ft2)) {
7239  bad = x_BadCDSinVDJC(loc1, loc2, m_Scope);
7240  if (bad) {
7241  if (x_IsPseudo(ft1, m_Imp) || x_IsPseudo(ft2, m_Imp)) {
7243  "No parent for (pseudo) CdRegion", ft1);
7244  } else {
7246  "No parent for CdRegion", ft1);
7247  }
7248  }
7249  } else if (x_FeatIsVDJC(ft1) && x_FeatIsCDS(ft2)) {
7250  bad = x_BadCDSinVDJC(loc2, loc1, m_Scope);
7251  if (bad) {
7252  if (x_IsPseudo(ft1, m_Imp) || x_IsPseudo(ft2, m_Imp)) {
7254  "No parent for (pseudo) CdRegion", ft2);
7255  } else {
7257  "No parent for CdRegion", ft2);
7258  }
7259  }
7260  }
7261  }
7262  it1 = it2;
7263  }
7264  ++it2;
7265  }
7266 }
7267 
7268 
7270 {
7271  unsigned int lclcds = 0, lclcrgn = 0, lclvseg = 0, lcldseg = 0, lcljseg = 0, lclnone = 0, lclothr = 0;
7272 
7273  CSeq_entry_Handle topseh = seq.GetTopLevelEntry();
7274  CSeqEntryIndex idx(topseh);
7275  CRef<CBioseqIndex> bsx = idx.GetBioseqIndex(seq);
7276  bsx->IterateFeatures([this, &lclcds, &lclcrgn, &lclvseg, &lcldseg, &lcljseg, &lclnone, &lclothr](CFeatureIndex& sfx) {
7277  CSeqFeatData::ESubtype sbt = sfx.GetSubtype();
7278  if (sbt == CSeqFeatData::ESubtype::eSubtype_cdregion) {
7279  lclcds++;
7281  CRef<CFeatureIndex> prnt = sfx.GetBestParent();
7282  if (prnt) {
7283  CSeqFeatData::ESubtype ptyp = prnt->GetSubtype();
7284  // CConstRef<CSeq_loc> ploc = prnt->GetMappedLocation();
7285  if (ptyp == CSeqFeatData::ESubtype::eSubtype_C_region) {
7286  lclcrgn++;
7287  } else if (ptyp == CSeqFeatData::ESubtype::eSubtype_V_segment) {
7288  lclvseg++;
7289  } else if (ptyp == CSeqFeatData::ESubtype::eSubtype_D_segment) {
7290  lcldseg++;
7291  } else if (ptyp == CSeqFeatData::ESubtype::eSubtype_J_segment) {
7292  lcljseg++;
7293  } else {
7294  lclothr++;
7295  }
7296  } else {
7297  lclnone++;
7298  string sloc_str;
7299  cloc->GetLabel(&sloc_str);
7300  // LOG_POST_XX(Corelib_App, 1, "No parent for CdRegion at: " + sloc_str + "\n");
7301  CSeq_feat_Handle sfh = sfx.GetSeqFeatHandle();
7303  if (sf) {
7304  string locus;
7305  CRef<CFeatureIndex> gne = sfx.GetBestGene();
7306  if (gne) {
7307  const CGene_ref& gene = gne->GetMappedFeat().GetData().GetGene();
7308  if (gene.IsSetLocus()) {
7309  locus = gene.GetLocus();
7310  } else if (gene.IsSetLocus_tag()) {
7311  locus = gene.GetLocus_tag();
7312  } else {
7313  CConstRef<CSeq_loc> gloc = gne->GetMappedLocation();
7314  if (gloc) {
7315  // string gloc_str;
7316  gloc->GetLabel(&locus);
7317  // LOG_POST_XX(Corelib_App, 1, " but GetBestGene is: " + gloc_str + "\n");
7318  }
7319  }
7320  }
7321  if (locus.length() > 0) {
7322  PostErr(eDiag_Warning, eErr_SEQ_FEAT_CDSdoesNotMatchVDJC,
7323  "No parent for CdRegion (gene is " + locus + ")", *sf);
7324  } else {
7325  PostErr(eDiag_Warning, eErr_SEQ_FEAT_CDSdoesNotMatchVDJC,
7326  "No parent for CdRegion", *sf);
7327  }
7328  }
7329  }
7330  }
7331  });
7332  /*
7333  LOG_POST_XX(Corelib_App, 1, "lclcds: " + NStr::IntToString(lclcds) + "\n");
7334  LOG_POST_XX(Corelib_App, 1, "lclcrgn: " + NStr::IntToString(lclcrgn) + "\n");
7335  LOG_POST_XX(Corelib_App, 1, "lclvseg: " + NStr::IntToString(lclvseg) + "\n");
7336  LOG_POST_XX(Corelib_App, 1, "lcldseg: " + NStr::IntToString(lcldseg) + "\n");
7337  LOG_POST_XX(Corelib_App, 1, "lcljseg: " + NStr::IntToString(lcljseg) + "\n");
7338  LOG_POST_XX(Corelib_App, 1, "lclothr: " + NStr::IntToString(lclothr) + "\n");
7339  LOG_POST_XX(Corelib_App, 1, "lclnone: " + NStr::IntToString(lclnone) + "\n");
7340  */
7341 }
7342 
7344 {
7345  if (! m_AllFeatIt) {
7346  return;
7347  }
7348 
7349  list<CRef<CCdsMatchInfo>> cds_list;
7350  TmRNAList mrna_map;
7351 
7352  // Loop over all features
7353  // Populate the cds and mrna lists
7354  for (const auto& mapped_feat : *m_AllFeatIt) {
7355  if (! mapped_feat.IsSetData()) {
7356  continue;
7357  }
7358 
7359  if (mapped_feat.GetData().IsCdregion()) {
7360  const auto& cds_feat = *mapped_feat.GetSeq_feat();
7361 
7362  auto cds_match = Ref(new CCdsMatchInfo(cds_feat, m_Scope));
7363 
7364  // If pseudo, no need to match with mRNA
7365  if (cds_feat.IsSetPseudo() && cds_feat.GetPseudo()) {
7366  cds_match->SetPseudo();
7367  } else {
7368  CConstRef<CSeq_feat> gene_feat = m_Imp.GetCachedGene(&cds_feat);
7369  if (gene_feat &&
7370  gene_feat->IsSetPseudo() && gene_feat->GetPseudo()) {
7371  cds_match->SetPseudo();
7372  }
7373  }
7374  cds_list.push_back(cds_match);
7375  } else if (mapped_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
7376  const auto& feat = *mapped_feat.GetSeq_feat();
7377  mrna_map[mapped_feat.GetSeq_feat()] = Ref(new CMrnaMatchInfo(feat, m_Scope));
7378  }
7379  }
7380 
7381  if (! mrna_map.empty()) {
7383  }
7384 
7385  const size_t num_mrna = mrna_map.size();
7386  const CTSE_Handle& tse = seq.GetTSE_Handle();
7387  // First attempt to match by xref
7388  for (auto&& cds : cds_list) {
7389  cds->AssignXrefMatch(mrna_map, tse);
7390  }
7391  // Now attempt to match by overlap
7392  if (! mrna_map.empty()) {
7393  for (auto&& cds : cds_list) {
7394  if (! cds->HasMatch()) {
7395  cds->AssignOverlapMatch(mrna_map, *m_Scope);
7396  }
7397  }
7398  }
7399  // Now loop over cds to find number of matched cds and number of matched mrna
7400  int num_matched_cds = 0;
7401  int num_unmatched_cds = 0;
7402  for (auto&& cds : cds_list) {
7403  // Check to see if a CDS feat references or overlaps multiple mRNAs
7404  // mrna_list now contains only unmatched mrnas
7405  x_CheckForMultiplemRNAs(*cds, mrna_map);
7406  x_CheckMrnaProteinLink(*cds);
7407  // check for mismatching qualifiers
7409 
7410  if (cds->IsPseudo() ||
7411  (cds->GetSeqfeat().IsSetExcept() &&
7412  cds->GetSeqfeat().IsSetExcept_text() &&
7413  NStr::Find(cds->GetSeqfeat().GetExcept_text(), "rearrangement required for product") != string::npos)) {
7414  cds->NeedsMatch(false); // In this case, we don't require a matching mRNA
7415  continue;
7416  }
7417 
7418  if (cds->HasMatch()) {
7419  ++num_matched_cds;
7420  } else {
7421  ++num_unmatched_cds;
7422  }
7423  }
7424 
7425  // Code now returns a eErr_SEQ_FEAT_CDSwithNoMRNA warning,
7426  // even when no CDS features are matched
7427  if (num_unmatched_cds > 0 &&
7428  num_mrna > 0) {
7429  if (num_unmatched_cds >= 10) {
7430  const auto nmcds = num_matched_cds + num_unmatched_cds;
7432  NStr::NumericToString (num_unmatched_cds)
7433  + " out of " + NStr::IntToString (nmcds)
7434  + " CDSs unmatched",
7435  *(seq.GetCompleteBioseq()));
7436  } else {
7437  for (const auto& cds : cds_list) {
7438  if (! cds->HasMatch() && cds->NeedsMatch()) {
7440  "Unmatched CDS", cds->GetSeqfeat());
7441  }
7442  }
7443  }
7444  }
7445 
7446  // check to see if remaining mRNAs are pseudo
7447  size_t num_unmatched_mrna = 0;
7448 
7449  NON_CONST_ITERATE(TmRNAList, it, mrna_map) {
7450  CConstRef<CSeq_feat> gene_feat = m_Imp.GetCachedGene(CConstRef<CSeq_feat>(&(it->second->GetSeqfeat())));
7451  if (gene_feat &&
7452  gene_feat->IsSetPseudo() && gene_feat->GetPseudo()) {
7453  it->second->SetPseudo();
7454  }
7455  if (! it->second->OkWithoutCds(m_Imp.IsGenbank())) {
7456  num_unmatched_mrna++;
7457  }
7458  }
7459 
7460  // if (numcds > 0) {
7461  if (num_unmatched_mrna > 10) {
7462  string msg = "No matches for " + NStr::NumericToString(num_unmatched_mrna) + " mRNAs";
7464  msg, *(seq.GetCompleteBioseq()));
7465  } else {
7466  ITERATE(TmRNAList, it, mrna_map) {
7467  if (! it->second->OkWithoutCds(m_Imp.IsGenbank())) {
7469  "No CDS location match for 1 mRNA", it->second->GetSeqfeat());
7470  }
7471  }
7472  }
7473  // }
7474 }
7475 
7477 {
7478  if (m_GeneIt && m_AllFeatIt) {
7479  if (! m_GeneIt->empty()) {
7480  // nothing to validate if there aren't any genes
7481  // count mRNAs and CDSs for each gene.
7482  typedef map<CConstRef<CSeq_feat>, SIZE_TYPE> TFeatCount;
7483  TFeatCount cds_count, mrna_count;
7484 
7485  // create indices for gene labels and locus tags if needed
7486  typedef map<string, CConstRef<CSeq_feat> > TGeneList;
7487  TGeneList gene_labels, gene_locus_tags;
7488 
7489  CConstRef<CSeq_feat> gene;
7490 
7492  CSeqFeatData::ESubtype subtype = it->GetData().GetSubtype();
7493  if (subtype != CSeqFeatData::eSubtype_cdregion && subtype != CSeqFeatData::eSubtype_mRNA) {
7494  continue;
7495  }
7496  const CSeq_feat& feat = it->GetOriginalFeature();
7497 
7498  gene = m_Imp.GetCachedGene(&feat);
7499 
7500  if (gene) {
7501  if (cds_count.find(gene) == cds_count.end()) {
7502  cds_count[gene] = mrna_count[gene] = 0;
7503  }
7504 
7505  switch (subtype) {
7507  cds_count[gene]++;
7508  break;
7510  mrna_count[gene]++;
7511  break;
7512  default:
7513  break;
7514  }
7515  }
7516  }
7517 
7518  ITERATE (TFeatCount, it, cds_count) {
7519  SIZE_TYPE cds_num = it->second,
7520  mrna_num = mrna_count[it->first];
7521  if (cds_num > 0 && mrna_num > 1 && cds_num != mrna_num) {
7523  "mRNA count (" + NStr::SizetToString(mrna_num) +
7524  ") does not match CDS (" + NStr::SizetToString(cds_num) +
7525  ") count for gene", *it->first);
7526  }
7527  }
7528  }
7529  }
7530 }
7531 
7532 
7534 {
7535  // note - if we couldn't build the feature iterator, no point
7536  // in trying this
7537  if (! m_AllFeatIt) {
7538  return;
7539  }
7540 
7541  // count features of interest, find strand for coding region
7542  ENa_strand strand = eNa_strand_unknown;
7543 
7544  // we want a few different
7545 
7546  // In feat_key, feat_subtype will be overwritten a few times
7547  CCacheImpl::SFeatKey feat_key(
7549  // cdregion is special because it can set strand
7550  const CCacheImpl::TFeatValue& cd_region_feats =
7551  GetCache().GetFeatFromCache(feat_key);
7552  const size_t num_cds = cd_region_feats.size();
7554  const size_t num_3utr = GetCache().GetFeatFromCache(feat_key).size();
7556  const size_t num_5utr = GetCache().GetFeatFromCache(feat_key).size();
7558  const size_t num_gene = GetCache().GetFeatFromCache(feat_key).size();
7559 
7560  // cdregion is special because it can set strand
7561  if (num_cds > 0) {
7562  strand = cd_region_feats.back().GetLocation().GetStrand();
7563  }
7564 
7565  bool is_mrna = false;
7566  if (seq.CanGetInst_Mol() && seq.GetInst_Mol() == CSeq_inst::eMol_rna) {
7568  if (! sd) {
7569  // assume RNA sequence is mRNA
7570  is_mrna = true;
7571  } else if (sd->GetMolinfo().IsSetBiomol() && sd->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA) {
7572  is_mrna = true;
7573  }
7574  }
7575 
7576  if (is_mrna) {
7577 
7578  ITERATE(CCacheImpl::TFeatValue, cdregion_it, cd_region_feats) {
7579  if (cdregion_it->GetLocation().GetStrand() == eNa_strand_minus) {
7581  "CDS should not be on minus strand of mRNA molecule", cdregion_it->GetOriginalFeature());
7582  }
7583  }
7584  }
7585 
7586  if (is_mrna || (num_cds == 1 && num_gene < 2)) {
7587 
7588  if (is_mrna) {
7589  // if this is an mRNA sequence, features should be on the plus strand
7590  strand = eNa_strand_plus;
7591  }
7592 
7593  int utr5_right = 0;
7594  int utr3_right = 0;
7595  int cds_right = 0;
7596  bool first_cds = true;
7597 
7598  // get multiple kinds of features
7599  vector<CCacheImpl::SFeatKey> featKeys;
7600  CCacheImpl::SFeatKey multi_feat_key_template(
7602  CCacheImpl::kAnyFeatSubtype, // will be overwritten a few times
7603  seq);
7604  multi_feat_key_template.feat_subtype =
7606  featKeys.push_back(multi_feat_key_template);
7607  multi_feat_key_template.feat_subtype = CSeqFeatData::eSubtype_3UTR;
7608  featKeys.push_back(multi_feat_key_template);
7609  multi_feat_key_template.feat_subtype = CSeqFeatData::eSubtype_5UTR;
7610  featKeys.push_back(multi_feat_key_template);
7611  multi_feat_key_template.feat_subtype = CSeqFeatData::eSubtype_gene;
7612  featKeys.push_back(multi_feat_key_template);
7613 
7615  GetCache().GetFeatFromCacheMulti(featKeys);
7616 
7617  if (strand == eNa_strand_minus) {
7618  // minus strand - expect 3'UTR, CDS, 5'UTR
7619  ITERATE(CCacheImpl::TFeatValue, cug_it, *cug_feats) {
7620  CSeqFeatData::ESubtype subtype = cug_it->GetData().GetSubtype();
7621  int this_left = cug_it->GetLocation().GetStart (eExtreme_Positional);
7622  int this_right = cug_it->GetLocation().GetStop (eExtreme_Positional);
7623  if (subtype == CSeqFeatData::eSubtype_3UTR) {
7624  if (cug_it->GetLocation().GetStrand() != eNa_strand_minus) {
7626  "3'UTR is not on minus strand", cug_it->GetOriginalFeature());
7627  } else if (utr5_right > 0 && utr5_right + 1 != this_left) {
7629  "Previous 5'UTR does not abut next 3'UTR", cug_it->GetOriginalFeature());
7630  }
7631  utr3_right = this_right;
7632  } else if (subtype == CSeqFeatData::eSubtype_cdregion) {
7633  if (utr3_right > 0 && utr3_right + 1 != this_left) {
7635  "CDS does not abut 3'UTR", cug_it->GetOriginalFeature());
7636  }
7637  first_cds = false;
7638  cds_right = this_right;
7639  } else if (subtype == CSeqFeatData::eSubtype_5UTR && num_5utr < 2) {
7640  if (cug_it->GetLocation().GetStrand() != eNa_strand_minus) {
7642  "5'UTR is not on minus strand", cug_it->GetOriginalFeature());
7643  } else if (cds_right > 0 && cds_right + 1 != this_left) {
7645  "5'UTR does not abut CDS", cug_it->GetOriginalFeature());
7646  }
7647  utr5_right = this_right;
7648  }
7649  }
7650  } else {
7651  // plus strand - expect 5'UTR, CDS, 3'UTR
7652  ITERATE(CCacheImpl::TFeatValue, cug_it, *cug_feats) {
7653  CSeqFeatData::ESubtype subtype = cug_it->GetData().GetSubtype();
7654  int this_left = cug_it->GetLocation().GetStart (eExtreme_Positional);
7655  int this_right = cug_it->GetLocation().GetStop (eExtreme_Positional);
7656  if (subtype == CSeqFeatData::eSubtype_5UTR && num_5utr < 2) {
7657  if (cug_it->GetLocation().GetStrand() == eNa_strand_minus) {
7659  "5'UTR is not on plus strand", cug_it->GetOriginalFeature());
7660  } else if (utr3_right > 0 && utr3_right + 1 != this_left) {
7662  "Previous 3'UTR does not abut next 5'UTR", cug_it->GetOriginalFeature());
7663  }
7664  utr5_right = this_right;
7665  } else if (subtype == CSeqFeatData::eSubtype_cdregion) {
7666  if (utr5_right > 0 && utr5_right + 1 != this_left && first_cds ) {
7667 
7669  "5'UTR does not abut CDS", cug_it->GetOriginalFeature());
7670  }
7671  first_cds = false;
7672  cds_right = this_right;
7673  } else if (subtype == CSeqFeatData::eSubtype_3UTR) {
7674  if (cug_it->GetLocation().GetStrand() == eNa_strand_minus) {
7676  "3'UTR is not on plus strand", cug_it->GetOriginalFeature());
7677  } else if (cds_right > 0 && cds_right + 1 != this_left && num_3utr == 1) {
7679  "CDS does not abut 3'UTR", cug_it->GetOriginalFeature());
7680  }
7681  if (is_mrna && num_cds == 1 && num_3utr == 1 && this_right != (int) seq.GetBioseqLength() - 1) {
7683  "3'UTR does not extend to end of mRNA", cug_it->GetOriginalFeature());
7684  }
7685  }
7686  }
7687  }
7688  }
7689 }
7690 
7699 };
7700 
7702 {
7704 
7705  if (! feat.IsSetData() || ! feat.GetData().IsRna()) {
7706  return e_RnaPosition_Ignore;
7707  }
7708  const CRNA_ref& rna = feat.GetData().GetRna();
7709  if (! rna.IsSetType()) {
7710  rval = e_RnaPosition_Ignore;
7711  } else if (! rna.IsSetExt()) {
7712  rval = e_RnaPosition_Ignore;
7713  } else if (rna.GetType() == CRNA_ref::eType_rRNA) {
7714  const string& product = rna.GetExt().GetName();
7715  if (NStr::StartsWith(product, "small ", NStr::eNocase)
7716  || NStr::StartsWith(product, "18S ", NStr::eNocase)
7717  || NStr::StartsWith(product, "16S ", NStr::eNocase)
7718  // variant spellings
7719  || NStr::StartsWith(product, "18 ", NStr::eNocase)
7720  || NStr::StartsWith(product, "16 ", NStr::eNocase)) {
7722  } else if (NStr::StartsWith(product, "5.8S ", NStr::eNocase)
7723  // variant spellings
7724  || NStr::StartsWith(product, "5.8 ", NStr::eNocase)) {
7726  } else if (NStr::StartsWith(product, "large ", NStr::eNocase)
7727  || NStr::StartsWith(product, "26S ", NStr::eNocase)
7728  || NStr::StartsWith(product, "28S ", NStr::eNocase)
7729  || NStr::StartsWith(product, "23S ", NStr::eNocase)
7730  // variant spellings
7731  || NStr::StartsWith(product, "26 ", NStr::eNocase)
7732  || NStr::StartsWith(product, "28 ", NStr::eNocase)
7733  || NStr::StartsWith(product, "23 ", NStr::eNocase)) {
7735  }
7736  } else if (rna.GetType() == CRNA_ref::eType_other || rna.GetType() == CRNA_ref::eType_miscRNA) {
7737  string product;
7738  if (rna.GetExt().IsName()) {
7739  product = rna.GetExt().GetName();
7740  if (NStr::EqualNocase(product, "misc_RNA")) {
7741  FOR_EACH_GBQUAL_ON_SEQFEAT (it, feat) {
7742  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "product")
7743  && (*it)->IsSetVal() && !NStr::IsBlank ((*it)->GetVal())) {
7744  product = (*it)->GetVal();
7745  break;
7746  }
7747  }
7748  }
7749  } else if (rna.GetExt().IsGen()) {
7750  if (rna.GetExt().GetGen().IsSetProduct()) {
7751  product = rna.GetExt().GetGen().GetProduct();
7752  }
7753  }
7754  if (NStr::EqualNocase(product, "internal transcribed spacer 1")
7755  || NStr::EqualNocase(product, "internal transcribed spacer1")) {
7757  } else if (NStr::EqualNocase(product, "internal transcribed spacer 2")
7758  || NStr::EqualNocase(product, "internal transcribed spacer2")) {
7760  } else if (NStr::EqualNocase(product, "internal transcribed spacer")
7761  || NStr::EqualNocase(product, "ITS")
7762  || NStr::EqualNocase(product, "16S-23S ribosomal RNA intergenic spacer")
7763  || NStr::EqualNocase(product, "16S-23S intergenic spacer")
7764  || NStr::EqualNocase(product, "intergenic spacer")) {
7766  }
7767  }
7768  return rval;
7769 }
7770 
7771 
7772 bool CValidError_bioseq::x_IsRangeGap(const CBioseq_Handle& seq, int start, int stop)
7773 {
7774  if (! seq.IsSetInst() || ! seq.GetInst().IsSetRepr()
7775  || seq.GetInst().GetRepr() != CSeq_inst::eRepr_delta
7776  || ! seq.GetInst().IsSetExt()
7777  || ! seq.GetInst().GetExt().IsDelta()
7778  || ! seq.GetInst().GetExt().GetDelta().IsSet()) {
7779  return false;
7780  }
7781  if (start < 0 || (unsigned int) stop >= seq.GetInst_Length() || start > stop) {
7782  return false;
7783  }
7784 
7785  int offset = 0;
7786  ITERATE (CDelta_ext::Tdata, it, seq.GetInst().GetExt().GetDelta().Get()) {
7787  int this_len = 0;
7788  if ((*it)->IsLiteral()) {
7789  this_len = (*it)->GetLiteral().GetLength();
7790  } else if ((*it)->IsLoc()) {
7791  this_len = GetLength((*it)->GetLoc(), m_Scope);
7792  }
7793  if ((*it)->IsLiteral() &&
7794  (! (*it)->GetLiteral().IsSetSeq_data() || (*it)->GetLiteral().GetSeq_data().IsGap())) {
7795  if (start >= offset && stop < offset + this_len) {
7796  return true;
7797  }
7798  }
7799  offset += this_len;
7800  if (offset > start) {
7801  return false;
7802  }
7803  }
7804  return false;
7805 }
7806 
7807 
7809 {
7815  return true;
7816  } else {
7817  return false;
7818  }
7819 }
7820 
7821 
7823 {
7824  // if unable to build feature iterator for this sequence, this will
7825  // also fail
7826  if (! m_AllFeatIt) {
7827  return;
7828  }
7829  if (seq.IsSetInst() && seq.GetInst().IsSetMol() && seq.GetInst().GetMol() == CSeq_inst::eMol_rna) {
7831  if (sd && sd->GetMolinfo().IsSetBiomol() && sd->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_mRNA) {
7832  // do not check for mRNA sequences
7833  return;
7834  }
7835  }
7836  // VR-802 do not report certain errors for organelle sequences
7837  bool is_organelle = IsOrganelle(seq);
7838 
7840 
7841  CFeat_CI it(seq, sel);
7842 
7843 
7844  if (it) {
7845 
7846  ENa_strand strand1 = eNa_strand_plus;
7847  if (it->GetLocation().IsSetStrand() && it->GetLocation().GetStrand() == eNa_strand_minus) {
7848  strand1 = eNa_strand_minus;
7849  }
7851  int right1 = it->GetLocation().GetStop(eExtreme_Positional);
7852 
7853  CFeat_CI it2 = it;
7854  ++it2;
7855  while (it2) {
7857  if (pos2 != e_RnaPosition_Ignore) {
7858  ENa_strand strand2 = eNa_strand_plus;
7859  if (it2->GetLocation().IsSetStrand() && it2->GetLocation().GetStrand() == eNa_strand_minus) {
7860  strand2 = eNa_strand_minus;
7861  }
7862  int left2 = it2->GetLocation().GetStart(eExtreme_Positional);
7863  int right2 = it2->GetLocation().GetStop(eExtreme_Positional);
7864 
7865  if ((strand1 == eNa_strand_minus && strand2 != eNa_strand_minus)
7866  || (strand1 != eNa_strand_minus && strand2 == eNa_strand_minus)) {
7867  // different strands
7868  if (pos1 != e_RnaPosition_Ignore && pos2 != e_RnaPosition_Ignore) {
7873  } else {
7875  "Inconsistent strands for rRNA components",
7876  it2->GetOriginalFeature());
7877  }
7878  }
7879  } else if (pos1 == e_RnaPosition_Ignore || pos2 == e_RnaPosition_Ignore) {
7880  // ignore
7881  } else if (right1 + 1 < left2) {
7882  // gap between features
7883  if (x_IsRangeGap(seq, right1 + 1, left2 - 1)) {
7884  // ignore, gap between features is gap in sequence
7885  } else if (strand1 == eNa_strand_minus) {
7886  if (s_AreAdjacent(pos2, pos1)) {
7888  "ITS does not abut adjacent rRNA component",
7889  it2->GetOriginalFeature());
7890  }
7891  } else {
7892  if (s_AreAdjacent(pos1, pos2)) {
7894  "ITS does not abut adjacent rRNA component",
7895  it2->GetOriginalFeature());
7896  }
7897  }
7898  } else if (right1 + 1 > left2) {
7899  // features overlap
7900  if (strand1 == eNa_strand_minus) {
7901  // on minus strand
7902  if (s_AreAdjacent(pos2, pos1)) {
7904  "ITS overlaps adjacent rRNA component",
7905  it2->GetOriginalFeature());
7906  } else {
7908  "rRNA components overlap and out of order", it2->GetOriginalFeature());
7909  }
7910  } else {
7911  // on plus strand
7912  if (s_AreAdjacent(pos1, pos2)) {
7914  "ITS overlaps adjacent rRNA component",
7915  it2->GetOriginalFeature());
7916  } else {
7918  "rRNA components overlap and out of order", it2->GetOriginalFeature());
7919  }
7920  }
7921 
7922  } else if (! is_organelle) {
7923  // features abut
7924  if (strand1 == eNa_strand_minus) {
7925  // on minus strand
7926  if (pos1 == pos2
7929  && seq.IsSetInst_Repr() && seq.GetInst_Repr() == CSeq_inst::eRepr_seg) {
7930  /* okay in segmented set */
7931  } else if (! s_AreAdjacent(pos2, pos1)) {
7933  "Problem with order of abutting rRNA components",
7934  it2->GetOriginalFeature());
7935  }
7936  } else {
7937  // on plus strand
7938  if (pos1 == pos2
7941  && seq.IsSetInst_Repr() && seq.GetInst_Repr() == CSeq_inst::eRepr_seg) {
7942  /* okay in segmented set */
7943  } else if (! s_AreAdjacent(pos1, pos2)) {
7945  "Problem with order of abutting rRNA components",
7946  it2->GetOriginalFeature());
7947  }
7948  }
7949  }
7950  it = it2;
7951  pos1 = pos2;
7952  strand1 = strand2;
7953  right1 = right2;
7954  }
7955  ++it2;
7956  }
7957  }
7958 
7959 }
7960 
7961 
// Chooses the severity used when reporting a pair of duplicate features.
//
// NOTE(review): the header line (listing line 7962) is missing from this
// extract. The six parameters match the calls to x_DupFeatSeverity made from
// x_ReportDupOverlapFeaturePair, so this is presumably that member - confirm
// against the repository copy.
//
// curr        the feature being reported
// prev        the earlier feature it duplicates
// is_viral    caller-supplied flag; lowers severity for gene and partial CDS pairs
// is_htgs     caller-supplied flag; lowers severity for CDS pairs
// same_annot  flag passed by the caller; the "same annot" branch runs when it
//             is true and same_label is false
// same_label  flag passed by the caller; selects the gene-comparison branch
//
// Returns eDiag_Warning unless a FlyBase dbxref raised the severity to
// eDiag_Error and no later rule lowered it again.
7963  const CSeq_feat& curr,
7964  const CSeq_feat& prev,
7965  bool is_viral,
7966  bool is_htgs,
7967  bool same_annot,
7968  bool same_label)
7969 {
// Neither flag set: always a warning, no further rules apply.
7970  if (! same_annot && ! same_label) {
7971  return eDiag_Warning;
7972  }
7973 
7974  EDiagSev severity = eDiag_Warning;
7975  CSeqFeatData::ESubtype curr_subtype = curr.GetData().GetSubtype();
7976 
// The only place severity is raised: either feature carries FlyBase dbxrefs.
7977  if ((prev.IsSetDbxref() && IsFlybaseDbxrefs(prev.GetDbxref())) ||
7978  (curr.IsSetDbxref() && IsFlybaseDbxrefs(curr.GetDbxref()))) {
7979  severity = eDiag_Error;
7980  }
7981 
// Everything below can only lower the severity back to a warning.
7982  if (curr_subtype == CSeqFeatData::eSubtype_repeat_region
7983  || curr_subtype == CSeqFeatData::eSubtype_site
7984  || curr_subtype == CSeqFeatData::eSubtype_bond) {
7985  severity = eDiag_Warning;
7986  }
7987 
7988  if (same_label) {
// NOTE(review): g1 and g2 are declared on listing lines 7989-7990, which are
// missing here; presumably the genes associated with the two features.
7991  if (g1 && g2 && g1 != g2) {
7992  // different genes
7993  severity = eDiag_Warning;
7994  }
7995  } else {
// Reached only with same_label false; the early return above then guarantees
// same_annot is true.
7996  //same annot
7997  // lower severity for some pairs of partial features or pseudo features
7998  if (curr.IsSetPartial() && curr.GetPartial()
7999  && prev.IsSetPartial() && prev.GetPartial()) {
8000  if (curr_subtype == CSeqFeatData::eSubtype_gene
8001  || curr_subtype == CSeqFeatData::eSubtype_mRNA
8002  || (curr_subtype == CSeqFeatData::eSubtype_cdregion && is_viral)) {
8003  severity = eDiag_Warning;
8004  }
8005  }
8006  if (curr_subtype == CSeqFeatData::eSubtype_gene
8007  && curr.IsSetPseudo() && curr.GetPseudo()
8008  && prev.IsSetPseudo() && prev.GetPseudo()) {
8009  severity = eDiag_Warning;
8010  } else if (curr_subtype == CSeqFeatData::eSubtype_gene && is_viral) {
8011  severity = eDiag_Warning;
8012  } else if (curr_subtype == CSeqFeatData::eSubtype_cdregion && is_htgs) {
8013  severity = eDiag_Warning;
8014  }
8015  }
8016 
8017  return severity;
8018 }
8019 
8020 
// Decides whether a duplicate/overlap report should be suppressed for a pair
// of features: both must satisfy IsDicistronicGene, and either the caller's
// fruit_fly flag is set or m_Imp.IsRefSeq() is true.
//
// NOTE(review): the header line (listing line 8022) is missing from this
// extract. The body uses f1, f2 and fruit_fly, matching the call
// x_SuppressDicistronic(f1, f2, fruit_fly) in x_ReportDupOverlapFeaturePair,
// so this is presumably that member - confirm against the repository copy.
8021 // assumption: this would only be called if the feature subtypes are already known to match
8023 {
// Both features must be dicistronic genes for suppression to be considered.
8024  if (! IsDicistronicGene(f1) || ! IsDicistronicGene(f2)) {
8025  return false;
8026  }
8027  if (fruit_fly) {
8028  return true;
8029  }
8030  if (m_Imp.IsRefSeq()) {
8031  return true;
8032  }
8033  return false;
8034 }
8035 
8036 
// Classifies a pair of features with IsDuplicate and handles each kind of
// duplication that is found.
//
// f1, f2     the two feature handles to compare; messages are attached to f2's feature
// fruit_fly  forwarded to x_SuppressDicistronic
// viral      forwarded to x_DupFeatSeverity
// htgs       forwarded to x_DupFeatSeverity
//
// Returns true when the pair was handled as a duplicate (rval set in a case
// branch), false when it was suppressed or is not a duplicate.
//
// NOTE(review): several lines of this listing are missing (8053-8054, 8058,
// 8062, 8068, 8074, 8076, 8081, 8083). They presumably hold the g1/g2
// declarations, three `case` labels and the opening of each PostErr call -
// confirm against the repository copy before changing any logic.
8037 bool CValidError_bioseq::x_ReportDupOverlapFeaturePair(const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2, bool fruit_fly, bool viral, bool htgs)
8038 {
// Dicistronic gene pairs are exempt under the conditions checked by the helper.
8039  if (x_SuppressDicistronic(f1, f2, fruit_fly)) {
8040  return false;
8041  }
8042 
8043  bool rval = false;
8044 
8045  // Get type of duplication, if any
8046  EDuplicateFeatureType dup_type = IsDuplicate(f1, f2);
8047  const CSeq_feat& feat1 = *(f1.GetSeq_feat());
8048  const CSeq_feat& feat2 = *(f2.GetSeq_feat());
8049 
// The last two arguments of x_DupFeatSeverity are its same_annot/same_label
// flags; each case passes a different combination.
8050  switch (dup_type) {
8051  case eDuplicate_Duplicate: {
8052  EDiagSev severity = x_DupFeatSeverity(feat1, feat2, viral, htgs, true, true);
// g1/g2 are declared on the missing lines; two distinct objects lower the
// severity to a warning.
8055  if (g1 && g2 && g1.GetPointer() != g2.GetPointer()) {
8056  severity = eDiag_Warning;
8057  }
8059  "Duplicate feature", feat2);
8060  rval = true;
8061  } break;
// Handled only when PartialsSame accepts the two locations.
8063  if (PartialsSame(feat1.GetLocation(), feat2.GetLocation())) {
8064  EDiagSev severity = x_DupFeatSeverity(feat1, feat2, viral, htgs, true, false);
// Import features are always lowered to a warning.
8065  if (feat1.GetData().IsImp()) {
8066  severity = eDiag_Warning;
8067  }
8069  "Features have identical intervals, but labels differ",
8070  feat2);
8071  rval = true;
8072  }
8073  break;
8075  EDiagSev severity = x_DupFeatSeverity(feat1, feat2, viral, htgs, false, true);
8077  "Duplicate feature (packaged in different feature table)",
8078  feat2);
8079  rval = true;
8080  } break;
8082  EDiagSev severity = x_DupFeatSeverity(feat1, feat2, viral, htgs, false, false);
8084  "Features have identical intervals, but labels "
8085  "differ (packaged in different feature table)",
8086  feat2);
8087  rval = true;
8088  } break;
8089  case eDuplicate_Not:
8090  // no error
8091  break;
8092  }
8093  return rval;
8094 }
8095 
8096 
// Reports a pair of peptide features (mature, pro-, signal or transit peptide)
// as overlapping, posting the message on both features.
//
// NOTE(review): the header line (listing line 8097) is missing from this
// extract. The body uses f1, f2, bioseq and reported_last_peptide, matching
// the four-argument call to x_ReportOverlappingPeptidePair elsewhere in this
// file, so this is presumably that member - confirm against the repository
// copy. Lines 8116, 8119, 8128, 8148 and 8153 are also missing.
//
// reported_last_peptide is read and written here: it suppresses a second
// report on feat1 when the previous pair already reported it.
8098 {
8099  const CSeq_feat& feat1 = *(f1.GetSeq_feat());
8100  const CSeq_feat& feat2 = *(f2.GetSeq_feat());
8101  // subtypes
8102  CSeqFeatData::ESubtype feat1_subtype = feat1.GetData().GetSubtype();
8103  CSeqFeatData::ESubtype feat2_subtype = feat2.GetData().GetSubtype();
8104  // locations
// NOTE(review): the two locations are not used on any visible line;
// presumably the missing line 8116 compares them.
8105  const CSeq_loc& feat1_loc = feat1.GetLocation();
8106  const CSeq_loc& feat2_loc = feat2.GetLocation();
8107 
// When feat1 is not a peptide subtype nothing happens, and
// reported_last_peptide keeps whatever value it had on entry.
8108  if ((feat1_subtype == CSeqFeatData::eSubtype_mat_peptide_aa ||
8109  feat1_subtype == CSeqFeatData::eSubtype_propeptide_aa ||
8110  feat1_subtype == CSeqFeatData::eSubtype_sig_peptide_aa ||
8111  feat1_subtype == CSeqFeatData::eSubtype_transit_peptide_aa)) {
8112  if ((feat2_subtype == CSeqFeatData::eSubtype_mat_peptide_aa ||
8113  feat2_subtype == CSeqFeatData::eSubtype_propeptide_aa ||
8114  feat2_subtype == CSeqFeatData::eSubtype_sig_peptide_aa ||
8115  feat2_subtype == CSeqFeatData::eSubtype_transit_peptide_aa) &&
8117  s_NotPeptideException(feat1, feat2)) {
// The initializer of overlapPepSev is on the missing line 8119.
8118  EDiagSev overlapPepSev =
8120  string msg = "Signal, Transit, or Mature peptide features overlap";
8121 
// Best effort: name the sequence holding the parent CDS in the message.
// Failures here are swallowed and leave the message without the suffix.
8122  try {
8123  const CSeq_feat* cds = m_Imp.GetCDSGivenProduct(bioseq);
8124  if (cds) {
8125  string cds_loc;
8126  const CSeq_id* id = cds->GetLocation().GetId();
8127  if (id) {
// bsh is declared on the missing line 8128; presumably the handle for `id`.
8129  if (bsh && bsh.GetCompleteBioseq()) {
8130  AppendBioseqLabel(cds_loc, *(bsh.GetCompleteBioseq()), true);
// Drop the "BIOSEQ: " prefix (8 characters) from the label.
8131  if (NStr::StartsWith(cds_loc, "BIOSEQ: ")) {
8132  cds_loc = cds_loc.substr(8);
8133  }
8134  } else {
8135  id->GetLabel(&cds_loc, CSeq_id::eContent);
8136  }
8137  }
8138  if (! NStr::IsBlank(cds_loc)) {
8139  cds_loc = " (parent CDS is on " + cds_loc + ")";
8140  msg += cds_loc;
8141  }
8142  }
8143  } catch (const exception&) {
8144  }
8145 
// feat1 is skipped when the previous pair already reported it.
8146  if (! reported_last_peptide) {
8147  PostErr(overlapPepSev,
8149  msg,
8150  feat1);
8151  }
8152  PostErr(overlapPepSev,
8154  msg,
8155  feat2);
8156  reported_last_peptide = true;
8157  } else {
8158  reported_last_peptide = false;
8159  }
8160  }
8161 }
8162 
8163 
8165 {
      // Walks the features held in m_AllFeatIt twice:
      //   1. candidate pairs are handed to x_ReportDupOverlapFeaturePair;
      //   2. each adjacent pair is handed to x_ReportOverlappingPeptidePair.
      // NOTE(review): this listing is missing the signature line and listing
      // lines 8176, 8189, 8198 and 8227 (the declarations of di, mi and
      // prev_it / the outer loop, and the first line of the call in the
      // catch block) -- restore them before changing any code here.
8166  if (! m_AllFeatIt) {
8167  return;
8168  }
8169 
8170  try {
8171 
      // Context flags forwarded unchanged to x_ReportDupOverlapFeaturePair.
8172  bool fruit_fly = false;
8173  bool viral = false;
8174  bool htgs = false;
8175 
      // fruit_fly: taxname starts with "Drosophila " (case-insensitive).
      // viral:     lineage starts with "Viruses; " (case-sensitive).
8177  if (di && di->GetSource().IsSetOrg()) {
8178  if (di->GetSource().GetOrg().IsSetTaxname()
8179  && NStr::StartsWith(di->GetSource().GetOrg().GetTaxname(), "Drosophila ", NStr::eNocase)) {
8180  fruit_fly = true;
8181  }
8182  if (di->GetSource().GetOrg().IsSetOrgname()
8183  && di->GetSource().GetOrg().GetOrgname().IsSetLineage()
8184  && NStr::StartsWith(di->GetSource().GetOrg().GetOrgname().GetLineage(), "Viruses; ")) {
8185  viral = true;
8186  }
8187  }
8188 
      // htgs: MolInfo tech is one of htgs_1, htgs_2 or htgs_3.
8190  if (mi && mi->GetMolinfo().IsSetTech()) {
8191  CMolInfo::TTech tech = mi->GetMolinfo().GetTech();
8192  htgs = (tech == CMolInfo::eTech_htgs_1
8193  || tech == CMolInfo::eTech_htgs_2
8194  || tech == CMolInfo::eTech_htgs_3);
8195  }
8196 
8197  // TODO: fix: quadratic in size of m_AllFeatIt.
8199  CCacheImpl::TFeatValue::const_iterator curr_it = prev_it;
8200  ++curr_it;
8201  CConstRef<CSeq_feat> prev_feat = prev_it->GetSeq_feat();
8202  CSeq_feat_Handle f1 = prev_it->GetSeq_feat_Handle();
8203  TSeqPos prev_end = prev_feat->GetLocation().GetStop (eExtreme_Positional);
      // Compare the feature at prev_it with each following feature. The scan
      // ends at the first follower that starts beyond prev_end, or as soon as
      // x_ReportDupOverlapFeaturePair returns true.
8204  for(; curr_it != m_AllFeatIt->end(); ++curr_it) {
8205  CConstRef<CSeq_feat> curr_feat = curr_it->GetSeq_feat();
8206  TSeqPos curr_start = curr_feat->GetLocation().GetStart(eExtreme_Positional);
8207  if (curr_start > prev_end) {
8208  break;
8209  }
8210  if (x_ReportDupOverlapFeaturePair(f1, curr_it->GetSeq_feat_Handle(), fruit_fly, viral, htgs)) {
8211  break;
8212  }
8213  }
8214  }
8215 
      // Second pass: slide a (previous, current) window over the features.
      // reported_last_peptide is passed to every call so the callee can carry
      // state from one pair to the next.
8216  CCacheImpl::TFeatValue::const_iterator prev_prot = m_AllFeatIt->begin();
8217  if (prev_prot != m_AllFeatIt->end()) {
8218  CCacheImpl::TFeatValue::const_iterator curr_prot = prev_prot;
8219  ++curr_prot;
8220  bool reported_last_peptide = false;
8221  for (; curr_prot != m_AllFeatIt->end(); ++prev_prot, ++curr_prot) {
8222  x_ReportOverlappingPeptidePair(prev_prot->GetSeq_feat_Handle(), curr_prot->GetSeq_feat_Handle(), bioseq, reported_last_peptide);
8223  }
8224  }
8225  } catch (const exception& e) {
      // Exceptions whose text contains "Error: Cannot resolve" are dropped
      // silently; any other exception text is appended to the message below.
8226  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
8228  string("Exception while validating duplicate/overlapping features. EXCEPTION: ") +
8229  e.what(), bioseq);
8230  }
8231  }
8232 }
8233 
// Flattens a location into a list of coordinates, two entries per piece:
// (from, to) for every interval and (point, point) for every point.
// Pieces of any other kind contribute nothing.
// NOTE(review): listing line 8246 (the loop header over the packed-int
// intervals that declares `it`) is missing from this listing.
8234 static vector<int> s_LocationToStartStopPairs(const CSeq_loc& loc)
8235 {
8236  vector<int> intervalpoints;
8237 
8238  CSeq_loc_CI curr(loc);
8239  while (curr) {
8240  const CSeq_loc& part = curr.GetEmbeddingSeq_loc();
8241  if (part.IsInt()) {
8242  const CSeq_interval& ivl = part.GetInt();
8243  intervalpoints.push_back(ivl.GetFrom());
8244  intervalpoints.push_back(ivl.GetTo());
8245  } else if (part.IsPacked_int()) {
      // Every interval of the packed-int is recorded here and the outer
      // iterator is advanced once per interval, so the shared `++curr`
      // at the bottom of the while loop is skipped via `continue`.
8247  const CSeq_interval& ivl = **it;
8248  intervalpoints.push_back(ivl.GetFrom());
8249  intervalpoints.push_back(ivl.GetTo());
8250  ++curr;
8251  }
8252  continue;
8253  } else if (part.IsPnt()) {
      // A point is stored as a zero-length interval (same value twice).
8254  const CSeq_point& pnt = part.GetPnt();
8255  intervalpoints.push_back(pnt.GetPoint());
8256  intervalpoints.push_back(pnt.GetPoint());
8257  }
8258  ++curr;
8259  }
8260 
8261  return intervalpoints;
8262 }
8263 
8264 
8265 static bool s_SubsequentIntron(CFeat_CI feat_ci_dup, Int4 start, Int4 stop, Int4 max)
8266 {
8267  ++feat_ci_dup;
8268 
8269  while (feat_ci_dup) {
8270 
8271  const CSeq_feat& const_feat_dup = feat_ci_dup->GetOriginalFeature();
8272  const CSeq_loc& loc_dup = const_feat_dup.GetLocation();
8273  for (CSeq_loc_CI curr(loc_dup); curr; ++curr) {
8274  Int4 fr = 0;
8275  Int4 to = 0;
8276  const CSeq_loc& part = curr.GetEmbeddingSeq_loc();
8277  if (part.IsInt()) {
8278  const CSeq_interval& ivl = part.GetInt();
8279  fr = ivl.GetFrom();
8280  to = ivl.GetTo();
8281  } else if (part.IsPnt()) {
8282  const CSeq_point& pnt = part.GetPnt();
8283  fr = pnt.GetPoint();
8284  to = pnt.GetPoint();
8285  } else {
8286  continue;
8287  }
8288  if (start + 1 == fr && stop - 1 == to) {
8289  return true;
8290  }
8291  if (start + 1 == fr && to == max) {
8292  return true;
8293  }
8294  if (to > stop) {
8295  return false;
8296  }
8297  }
8298 
8299  ++feat_ci_dup;
8300  }
8301 
8302  return false;
8303 }
8304 
8306 {
      // For every feature selected by `sel` on this bioseq, flattens its
      // location into start/stop pairs and, when there is more than one
      // piece, decides between the two messages near the end of the loop.
      // NOTE(review): this listing is missing the signature line, the
      // declaration of `sel` (listing line 8310) and the first line of both
      // reporting calls (8343, 8347).
8307  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(bioseq);
8308  if (! bsh)
8309  return;
8311  try {
8312  for (CFeat_CI feat_ci(bsh, sel); feat_ci; ++feat_ci) {
8313 
8314  const CSeq_feat& const_feat = feat_ci->GetOriginalFeature();
8315  const CSeq_loc& loc = const_feat.GetLocation();
8316 
8317  vector<int> intervalpoints = s_LocationToStartStopPairs(loc);
8318 
      // Two entries per piece, so fewer than four entries means fewer than
      // two pieces: nothing to report for this feature.
8319  unsigned len = (unsigned)intervalpoints.size();
8320  if (len < 4) {
8321  continue;
8322  }
8323  unsigned max = len - 1;
8324 
8325  bool twintron = true;
8326 
      // pos walks the odd indices, so (intL, intR) is the end of one piece
      // and the start of the next, i.e. one gap. twintron stays true only
      // if s_SubsequentIntron succeeds for every gap.
8327  for (unsigned pos = 1; pos < max; pos += 2) {
8328  Int4 intL = intervalpoints[pos];
8329  Int4 intR = intervalpoints[pos + 1];
8330 
8331  CFeat_CI feat_ci_dup = feat_ci;
8332  if (! s_SubsequentIntron(feat_ci_dup, intL, intR, intervalpoints[max])) {
8333  twintron = false;
8334  break;
8335  }
8336  }
8337 
      // Error by default, downgraded to Warning when m_Imp reports EMBL or DDBJ.
8338  EDiagSev sev = eDiag_Error;
8339  if (m_Imp.IsEmbl() || m_Imp.IsDdbj()) {
8340  sev = eDiag_Warning;
8341  }
8342  if (twintron) {
8344  "Multi-interval intron contains possible twintron",
8345  const_feat);
8346  } else {
8348  "An intron should not have multiple intervals",
8349  const_feat);
8350  }
8351  }
8352  } catch (CException& e) {
      // Logged (not posted as a validation error) unless the text contains
      // "Error: Cannot resolve" or IsSelfReferential(bioseq) is true.
8353  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
8354  if (! IsSelfReferential(bioseq)) {
8355  ERR_POST(Error << "ValidateTwintrons error: " << e.what());
8356  }
8357  }
8358  }
8359 }
8360 
8361 
8363 {
      // Returns true when any entry of dbxrefs has a db name that is exactly
      // "FLYBASE" or exactly "FlyBase" (both comparisons are case-sensitive).
      // NOTE(review): the signature line is missing from this listing.
8364  ITERATE (TDbtags, db, dbxrefs) {
8365  if ((*db)->CanGetDb()) {
8366  if (NStr::EqualCase((*db)->GetDb(), "FLYBASE") ||
8367  NStr::EqualCase((*db)->GetDb(), "FlyBase")) {
8368  return true;
8369  }
8370  }
8371  }
8372  return false;
8373 }
8374 
8375 
8376 static bool s_IsTPAAssemblyOkForBioseq(const CBioseq& seq, bool has_refseq)
8377 {
8378  bool has_local = false, has_genbank = false;
8379  bool has_gi = false, has_tpa = false, has_bankit = false, has_smart = false;
8380 
8381  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
8382  switch ((*it)->Which()) {
8383  case CSeq_id::e_Local:
8384  has_local = true;
8385  break;
8386  case CSeq_id::e_Genbank:
8387  case CSeq_id::e_Embl:
8388  case CSeq_id::e_Ddbj:
8389  has_genbank = true;
8390  break;
8391  case CSeq_id::e_Other:
8392  has_refseq = true;
8393  break;
8394  case CSeq_id::e_Gi:
8395  has_gi = true;
8396  break;
8397  case CSeq_id::e_Tpg:
8398  case CSeq_id::e_Tpe:
8399  case CSeq_id::e_Tpd:
8400  has_tpa = true;
8401  break;
8402  case CSeq_id::e_General:
8403  if ((*it)->GetGeneral().IsSetDb()) {
8404  if (NStr::Equal((*it)->GetGeneral().GetDb(), "BankIt", NStr::eNocase)) {
8405  has_bankit = true;
8406  } else if (NStr::Equal((*it)->GetGeneral().GetDb(), "TMSMART", NStr::eNocase)) {
8407  has_smart = true;
8408  }
8409  }
8410  break;
8411  default:
8412  break;
8413  }
8414  }
8415 
8416  if (has_genbank) return false;
8417  if (has_tpa) return true;
8418  if (has_refseq) return false;
8419  if (has_bankit) return true;
8420  if (has_smart) return true;
8421  if (has_gi) return false;
8422  if (has_local) return true;
8423 
8424  return false;
8425 }
8426 
8427 
8428 static void GetDateString(string& out_date_str, const CDate& date)
8429 {
8430  if (date.IsStr()) {
8431  out_date_str = date.GetStr();
8432  } else if (date.IsStd()) {
8433  if (date.GetStd().IsSetYear()) {
8434  date.GetDate(&out_date_str, "%{%3N %{%D, %}%}%Y");
8435  }
8436  }
8437 }
8438 
8439 
8441 {
      // Returns the keyword that CComment_rule::KeywordForPrefix associates
      // with `prefix`, or an empty string when `prefix` is blank.
      // NOTE(review): the signature line and the statement that declares
      // `prefix` (listing line 8442) are missing from this listing.
8443  if (NStr::IsBlank(prefix)) {
8444  return "";
8445  }
8446  string keyword = CComment_rule::KeywordForPrefix(prefix);
8447  return keyword;
8448 }
8449 
8450 
8452 {
      // Collects the non-blank prefixes found on user descriptors, sorts
      // them, and reports each prefix that occurs more than once.
      // NOTE(review): this listing is missing the signature line, the
      // declaration of `di` (listing line 8456), the lines that open the
      // inner block and declare `prefix` (8459-8460) and the first line of
      // both reporting calls (8476, 8485).
8453  // list of structured comment prefixes
8454  vector<string> sc_prefixes;
8455 
8457  while (di) {
8458  const CUser_object& obj = di->GetUser();
8461  if (! NStr::IsBlank(prefix)) {
8462  sc_prefixes.push_back(prefix);
8463  }
8464  }
8465  ++di;
8466  }
8467 
      // After sorting, equal prefixes are adjacent, so one pass that counts
      // run lengths is enough. The sort is case-sensitive while the run
      // comparison below is case-insensitive.
8468  sort(sc_prefixes.begin(), sc_prefixes.end());
8469  int num_seen = 0;
8470  string previous;
8471  ITERATE(vector<string>, it, sc_prefixes) {
8472  if (NStr::EqualNocase(previous, *it)) {
8473  num_seen++;
8474  } else {
      // A run has just ended: report it if it held more than one entry.
8475  if (num_seen > 1) {
8477  "Multiple structured comments with prefix " + previous,
8478  seq);
8479  }
8480  previous = *it;
8481  num_seen = 1;
8482  }
8483  }
      // The last run is not closed inside the loop, so check it here.
8484  if (num_seen > 1) {
8486  "Multiple structured comments with prefix " + previous,
8487  seq);
8488  }
8489 
8490 }
8491 
8492 /*
8493 bool s_IsGenbankMasterAccession(const string& acc)
8494 {
8495  bool rval = false;
8496  switch (acc.length()) {
8497  case 12:
8498  if (NStr::EndsWith(acc, "000000")) {
8499  rval = true;
8500  }
8501  break;
8502  case 13:
8503  if (NStr::EndsWith(acc, "0000000")) {
8504  rval = true;
8505  }
8506  break;
8507  case 14:
8508  if (NStr::EndsWith(acc, "00000000")) {
8509  rval = true;
8510  }
8511  break;
8512  default:
8513  break;
8514  }
8515  return rval;
8516 }
8517 
8518 bool s_IsMasterAccession(const CSeq_id& id)
8519 {
8520  bool rval = false;
8521  switch (id.Which()) {
8522  case CSeq_id::e_Other:
8523  if (id.GetOther().IsSetAccession()) {
8524  const string& acc = id.GetOther().GetAccession();
8525  switch (acc.length()) {
8526  case 15:
8527  if (NStr::EndsWith(acc, "000000")) {
8528  rval = true;
8529  }
8530  break;
8531  case 16:
8532  case 17:
8533  if (NStr::EndsWith(acc, "0000000")) {
8534  rval = true;
8535  }
8536  break;
8537  default:
8538  break;
8539  }
8540  }
8541  break;
8542  case CSeq_id::e_Genbank:
8543  if (id.GetGenbank().IsSetAccession()) {
8544  rval = s_IsGenbankMasterAccession(id.GetGenbank().GetAccession());
8545  }
8546  break;
8547  case CSeq_id::e_Ddbj:
8548  if (id.GetDdbj().IsSetAccession()) {
8549  rval = s_IsGenbankMasterAccession(id.GetDdbj().GetAccession());
8550  }
8551  break;
8552  case CSeq_id::e_Embl:
8553  if (id.GetEmbl().IsSetAccession()) {
8554  rval = s_IsGenbankMasterAccession(id.GetEmbl().GetAccession());
8555  }
8556  break;
8557  case CSeq_id::e_Tpg:
8558  if (id.GetTpg().IsSetAccession()) {
8559  rval = s_IsGenbankMasterAccession(id.GetTpg().GetAccession());
8560  }
8561  break;
8562  default:
8563  break;
8564  }
8565 
8566  return rval;
8567 }
8568 */
8569 
8570 
8572 {
      // Returns true when at least one Seq-id of `seq` satisfies
      // g_IsMasterAccession. Returns false early when the ids, the inst or
      // its repr are not set.
      // NOTE(review): the signature line and the last clause of the guard
      // condition (listing line 8574) are missing from this listing.
8573  if (! seq.IsSetId() || ! seq.IsSetInst() || ! seq.GetInst().IsSetRepr() ||
8575  return false;
8576  }
8577  bool is_master = false;
      // No early exit: every id is tested and the results are OR-ed together.
8578  ITERATE(CBioseq::TId, id, seq.GetId()) {
8579  is_master |= g_IsMasterAccession(**id);
8580  }
8581 
8582  return is_master;
8583 }
8584 
8585 
8587 {
      // Returns true when an EMBL, DDBJ, Other or GenBank Seq-id on `bsh`
      // has an accession whose computed `type` equals
      // NCBI_ACCN(refseq_unique_prot).
      // NOTE(review): the signature line and the statement that computes
      // `type` from `acc` (listing line 8601) are missing from this listing.
8588  bool is_WP = false;
8589 
8590  FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(sid_itr, bsh) {
8591  CSeq_id_Handle sid = *sid_itr;
8592  switch (sid.Which()) {
8593  case NCBI_SEQID(Embl):
8594  case NCBI_SEQID(Ddbj):
8595  case NCBI_SEQID(Other):
8596  case NCBI_SEQID(Genbank): {
8597  CConstRef<CSeq_id> id = sid.GetSeqId();
8598  const CTextseq_id& tsid = *id->GetTextseq_Id();
8599  if (tsid.IsSetAccession()) {
8600  const string& acc = tsid.GetAccession();
8602 
8603  if (type == NCBI_ACCN(refseq_unique_prot)) {
8604  is_WP = true;
      // NOTE(review): this `break` sits inside the switch, so it looks like
      // it only leaves the switch and the id loop carries on -- confirm
      // against the FOR_EACH_SEQID_ON_BIOSEQ_HANDLE expansion.
8605  break;
8606  }
8607  }
8608  } break;
8609  default:
8610  break;
8611  }
8612  }
8613  return is_WP;
8614 }
8615 
8616 
8618 {
      // Returns true when any Seq-id of `seq` is an EMBL or a DDBJ id.
      // NOTE(review): the signature line is missing from this listing.
8619  bool embl_or_ddbj = false;
8620  ITERATE(CBioseq::TId, id, seq.GetId()) {
8621  if ((*id)->IsEmbl() || (*id)->IsDdbj()) {
8622  embl_or_ddbj = true;
8623  break;
8624  }
8625  }
8626 
8627  return embl_or_ddbj;
8628 }
8629 
8630 
8632 {
      // Returns true when any Seq-id of `seq` is a GenBank id.
      // NOTE(review): the signature line is missing from this listing.
8633  ITERATE(CBioseq::TId, id, seq.GetId()) {
8634  if ((*id)->IsGenbank()) {
8635  return true;
8636  }
8637  }
8638  return false;
8639 }
8640 
8641 
8643 {
      // Returns true when any Seq-id of `seq` is of the "other" kind (the
      // kind that the surrounding code treats as RefSeq).
      // NOTE(review): the signature line is missing from this listing.
8644  ITERATE(CBioseq::TId, id, seq.GetId()) {
8645  if ((*id)->IsOther()) {
8646  return true;
8647  }
8648  }
8649  return false;
8650 }
8651 
8652 
8654 {
      // Compares every comment descriptor with every later one and reports
      // the later descriptor of each pair whose texts are equal ignoring
      // case. The report is made once per matching pair, so more than two
      // identical comments produce more than one report per descriptor.
      // NOTE(review): this listing is missing the signature line, the
      // declaration of `di` (listing line 8655) and the first line of the
      // reporting call (8661).
8656  while (di) {
8657  CSeqdesc_CI di2 = di;
8658  ++di2;
8659  while (di2) {
8660  if (NStr::EqualNocase(di->GetComment(), di2->GetComment())) {
8662  "Undesired multiple comment descriptors, identical text",
8663  *(bsh.GetParentEntry().GetCompleteSeq_entry()), *di2);
8664  }
8665  ++di2;
8666  }
8667  ++di;
8668  }
8669 }
8670 
8671 
8673 {
      // Posts eErr_SEQ_DESCR_MissingChromosome for records whose "other"
      // Seq-id accession starts with "NC_" or "AC_", subject to a further
      // condition that is not visible here. Patent sequences are skipped.
      // NOTE(review): this listing is missing the signature line, the
      // declaration of `b` (listing line 8677) and the condition that
      // guards the PostErr call (8702).
8674  if (! bsh) {
8675  return;
8676  }
8678  if (! b) {
8679  return;
8680  }
8681  if (SeqIsPatent(*b)) {
8682  return;
8683  }
8684  bool is_nc = false;
8685  bool is_ac = false;
      // Stops at the first id whose accession has either prefix.
8686  FOR_EACH_SEQID_ON_BIOSEQ(id_it, *b) {
8687  if ((*id_it)->IsOther() && (*id_it)->GetOther().IsSetAccession()) {
8688  string accession = (*id_it)->GetOther().GetAccession();
8689  if (NStr::StartsWith(accession, "NC_")) {
8690  is_nc = true;
8691  break;
8692  } else if (NStr::StartsWith(accession, "AC_")) {
8693  is_ac = true;
8694  break;
8695  }
8696  }
8697  }
8698  if (! is_nc && ! is_ac) {
8699  return;
8700  }
8701 
8703  PostErr(eDiag_Error, eErr_SEQ_DESCR_MissingChromosome, "Missing chromosome qualifier on NC or AC RefSeq record",
8704  *b);
8705  }
8706 
8707 }
8708 
8709 
8710 // Validate CSeqdesc within the context of a bioseq.
8711 // See: CValidError_desc for validation of standalone CSeqdesc,
8712 // and CValidError_descr for validation of descriptors in the context
8713 // of descriptor list.
8715 {
      // Structure of this function:
      //   1. descriptors directly on `seq`: title check for unparsed
      //      "[...=...]" text;
      //   2. GenBank-block keywords are gathered for the structured-comment
      //      check;
      //   3. all descriptors reached through m_CurrentHandle are dispatched
      //      by type, while counters and "first seen" values are collected;
      //   4. aggregate checks run on what was collected.
      // NOTE(review): the signature line and many statements (mostly the
      // first line of PostErr calls and a few declarations / case labels)
      // are missing from this listing; gaps in the listing numbers mark them.
8716  const CSeq_entry& ctx = *seq.GetParentEntry();
8717 
      // Per-type descriptor counts and the last descriptor seen of each
      // type; used after the main loop for the "Multiple ... blocks" reports.
8718  size_t num_gb = 0,
8719  num_embl = 0,
8720  num_pir = 0,
8721  num_pdb = 0,
8722  num_prf = 0,
8723  num_sp = 0;
8724  CConstRef<CSeqdesc> last_gb,
8725  last_embl,
8726  last_pir,
8727  last_pdb,
8728  last_prf,
8729  last_sp;
8730  CConstRef<CSeqdesc> create_desc, update_desc;
8731  string create_str;
      // -1 means "not seen yet"; these are passed by reference to
      // ValidateMolInfoContext, which fills them in.
8732  int biomol = -1;
8733  int tech = -1, completeness = -1;
8734  CConstRef<COrg_ref> org;
8735 
8736  string name_str;
8737  string comment_str;
8738 
8739  bool is_genome_assembly = false;
8740  bool is_assembly = false;
8741  bool is_finished_status = false;
8742 
8744 
8745  // some validation is for descriptors that affect a bioseq,
8746  // other validation is only for descriptors _on_ a bioseq
8747  FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
8748  const CSeqdesc& desc = **it;
8749 
8750  switch (desc.Which()) {
8751  case CSeqdesc::e_Title: {
      // pos survives only if the title contains "[", then a later "=",
      // then a later "]".
8752  string title = desc.GetTitle();
8753  size_t pos = NStr::Find(title, "[");
8754  if (pos != string::npos) {
8755  pos = NStr::Find(title, "=", pos + 1);
8756  }
8757  if (pos != string::npos) {
8758  pos = NStr::Find(title, "]", pos + 1);
8759  }
8760  if (pos != string::npos) {
8761  bool report_fasta_brackets = true;
      // A general id from the TMSMART or BankIt database suppresses the report.
8762  FOR_EACH_SEQID_ON_BIOSEQ (id_it, seq) {
8763  if ((*id_it)->IsGeneral()) {
8764  const CDbtag& dbtag = (*id_it)->GetGeneral();
8765  if (dbtag.IsSetDb()) {
8766  if (NStr::EqualNocase(dbtag.GetDb(), "TMSMART")
8767  || NStr::EqualNocase(dbtag.GetDb(), "BankIt")) {
8768  report_fasta_brackets = false;
8769  break;
8770  }
8771  }
8772  }
8773  }
      // The report is also suppressed when the taxname itself contains "="
      // and that taxname occurs in the title.
8774  if (report_fasta_brackets) {
8775  IF_EXISTS_CLOSEST_BIOSOURCE (bs_ref, seq, nullptr) {
8776  const CBioSource& bsrc = (*bs_ref).GetSource();
8777  if (bsrc.IsSetOrg()) {
8778  const COrg_ref& orgref = bsrc.GetOrg();
8779  if (orgref.IsSetTaxname()) {
8780  string taxname = orgref.GetTaxname();
8781  size_t pos2 = NStr::Find(taxname, "=");
8782  if (pos2 != string::npos) {
8783  pos2 = NStr::Find(title, taxname);
8784  if (pos2 != string::npos) {
8785  report_fasta_brackets = false;
8786  }
8787  }
8788  }
8789  }
8790  }
8791  }
8792  if (report_fasta_brackets) {
8794  "Title may have unparsed [...=...] construct",
8795  ctx, desc);
8796  }
8797  }
8798  } break;
8799  default:
8800  break;
8801  }
8802  }
8803 
8804  // collect keywords - needed for validating structured comments
8805  vector<string> keywords;
8806  for (CSeqdesc_CI di(m_CurrentHandle, CSeqdesc::e_Genbank); di; ++di) {
8807  FOR_EACH_KEYWORD_ON_GENBANKBLOCK (key, di->GetGenbank()) {
8808  keywords.push_back(*key);
8809  }
8810  }
8811 
8812  for (CSeqdesc_CI di(m_CurrentHandle); di; ++di) {
8813  const CSeqdesc& desc = *di;
8814 
8815  switch (desc.Which()) {
8816 
8817  case CSeqdesc::e_Org:
      // The first org seen (here or inside a source descriptor) becomes the
      // reference that every org is compared against.
8818  if (! org) {
8819  org = &(desc.GetOrg());
8820  }
8821  ValidateOrgContext(desc.GetOrg(), *org, seq, desc);
8822  break;
8823 
8824  case CSeqdesc::e_Pir:
8825  num_pir++;
8826  last_pir = &desc;
8827  break;
8828 
8829  case CSeqdesc::e_Genbank:
8830  num_gb++;
8831  last_gb = &desc;
8832  ValidateGBBlock(desc.GetGenbank(), seq, desc);
8833  break;
8834 
8835  case CSeqdesc::e_Sp:
8836  num_sp++;
8837  last_sp = &desc;
8838  break;
8839 
8840  case CSeqdesc::e_Embl:
8841  num_embl++;
8842  last_embl = &desc;
8843  break;
8844 
8845  case CSeqdesc::e_Create_date: {
8846  const CDate& current = desc.GetCreate_date();
      // The first create-date is remembered; a later one that differs is
      // reported, but only when m_Imp.HasGiOrAccnVer() is true.
8847  if (create_desc) {
8848  if (create_desc->GetCreate_date().Compare(current) != CDate::eCompare_same && m_Imp.HasGiOrAccnVer()) {
8849  string current_str;
8850  GetDateString(current_str, current);
      // Report against the parent entry only when it is a nuc-prot set;
      // otherwise fall back to this bioseq's own entry.
8851  const CSeq_entry* use_ctx = ctx.GetParentEntry();
8852  if (! use_ctx || ! use_ctx->IsSet()
8853  || ! use_ctx->GetSet().IsSetClass()
8854  || use_ctx->GetSet().GetClass() != CBioseq_set::eClass_nuc_prot) {
8855  use_ctx = &ctx;
8856  }
8858  "Inconsistent create_dates [" + current_str +
8859  "] and [" + create_str + "]", *use_ctx, desc);
8860  }
8861  } else {
8862  create_desc = &desc;
8863  GetDateString(create_str, create_desc->GetCreate_date());
8864  }
8865 
      // An update-date was seen before this create-date: validate the pair now.
8866  if (update_desc) {
8867  ValidateUpdateDateContext(update_desc->GetUpdate_date(), current, seq, *create_desc);
8868  }
8869  } break;
8870 
8872  if (create_desc) {
8874  seq, *create_desc);
8875  } else {
8876  update_desc = &desc;
8877  }
8878  break;
8879 
8880  case CSeqdesc::e_Prf:
8881  num_prf++;
8882  last_prf = &desc;
8883  break;
8884 
8885  case CSeqdesc::e_Pdb:
8886  num_pdb++;
8887  last_pdb = &desc;
8888  break;
8889 
8890  case CSeqdesc::e_Source: {
8891  const CSeqdesc::TSource& source = desc.GetSource();
8892 
8894 
8895  // look at orgref in comparison to other descs
8896  if (source.IsSetOrg()) {
8897  const COrg_ref& orgref = source.GetOrg();
8898  if (! org) {
8899  org = &orgref;
8900  }
8901  ValidateOrgContext(orgref, *org, seq, desc);
8902  }
8903  } break;
8904 
8905  case CSeqdesc::e_Molinfo:
8906  ValidateMolInfoContext(desc.GetMolinfo(), biomol, tech, completeness, seq, desc);
8907  break;
8908 
8909  case CSeqdesc::e_User:
8910  if (desc.GetUser().IsSetType()) {
8911  const CUser_object& usr = desc.GetUser();
8912  const CObject_id& oi = usr.GetType();
8913  if (oi.IsStr() && NStr::CompareNocase(oi.GetStr(), "TpaAssembly") == 0
8915  string id_str;
8916  seq.GetLabel(&id_str, CBioseq::eContent, false);
8918  "Non-TPA record " + id_str + " should not have TpaAssembly object", seq);
8919  }
8920  if (desc.GetUser().IsRefGeneTracking()
8922  && !m_Imp.IsRefSeq()) {
8924  "RefGeneTracking object should only be in RefSeq record",
8925  ctx, desc);
8926  } else if (desc.GetUser().IsStructuredComment()) {
8927  const CUser_object& obj = desc.GetUser();
8928  string keyword = s_GetKeywordForStructuredComment(obj);
8929  if (! NStr::IsBlank(keyword)) {
8930  // does sequence have keyword?
8931  bool found = false;
8932  ITERATE (vector<string>, key, keywords) {
8933  if (NStr::EqualNocase(keyword, *key)) {
8934  found = true;
8935  break;
8936  }
8937  }
8938  // is structured comment valid for this keyword?
8939  if (! m_DescrValidator.ValidateStructuredComment(desc, false)) {
8940  // error if keyword is present
8941  if (found) {
8943  "Structured Comment is non-compliant, keyword should be removed", ctx, desc);
8944  }
8945  }
8946  } else {
      // No keyword for this comment: scan its fields for the prefix and
      // finishing-status values that the master / WGS checks after the
      // loop depend on. The field values are compared case-sensitively.
8947  ITERATE (CUser_object::TData, field, obj.GetData()) {
8948  if ((*field)->IsSetLabel() && (*field)->GetLabel().IsStr()) {
8949  if (NStr::EqualNocase((*field)->GetLabel().GetStr(), "StructuredCommentPrefix")) {
8950  const string& prefix = (*field)->GetData().GetStr();
8951  if (NStr::EqualCase(prefix, "##Genome-Assembly-Data-START##")) {
8952  is_genome_assembly = true;
8953  } else if (NStr::EqualCase(prefix, "##Assembly-Data-START##")) {
8954  is_assembly = true;
8955  }
8956  } else if (NStr::EqualNocase((*field)->GetLabel().GetStr(), "Current Finishing Status")) {
8957  const string& prefix = (*field)->GetData().GetStr();
8958  if (NStr::EqualCase(prefix, "Finished")) {
8959  is_finished_status = true;
8960  }
8961  }
8962  }
8963  }
8964  }
8965  } else if (oi.IsStr() && NStr::EqualCase(oi.GetStr(), "DBLink")) {
      // Tally DBLink objects and their fields by label into the m_*_count
      // members; labels not listed here go to m_unknown_count.
8966  m_dblink_count++;
8967  FOR_EACH_USERFIELD_ON_USEROBJECT (ufd_it, usr) {
8968  const CUser_field& fld = **ufd_it;
8969  if (FIELD_IS_SET_AND_IS(fld, Label, Str)) {
8970  const string& label_str = GET_FIELD(fld.GetLabel(), Str);
8971  if (NStr::EqualNocase(label_str, "Trace Assembly Archive")) {
8972  m_taa_count++;
8973  } else if (NStr::EqualNocase(label_str, "BioSample")) {
8974  m_bs_count++;
8975  } else if (NStr::EqualNocase(label_str, "Assembly")) {
8976  m_as_count++;
8977  } else if (NStr::EqualNocase(label_str, "ProbeDB")) {
8978  m_pdb_count++;
8979  } else if (NStr::EqualNocase(label_str, "Sequence Read Archive")) {
8980  m_sra_count++;
8981  } else if (NStr::EqualNocase(label_str, "BioProject")) {
8982  m_bp_count++;
8983  } else {
8984  m_unknown_count++;
8985  }
8986  }
8987  }
8988  }
8989  }
8990  break;
8991  case CSeqdesc::e_Title: {
8992  string title = desc.GetTitle();
8993 
8994  // nucleotide refseq sequences should start with the organism name,
8995  // protein refseq sequences should end with the organism name.
8996  bool is_refseq = m_Imp.IsRefSeqConventions();
8997  FOR_EACH_SEQID_ON_BIOSEQ (id_it, seq) {
8998  if ((*id_it)->IsOther()) {
8999  is_refseq = true;
9000  break;
9001  }
9002  }
9003  if (is_refseq) {
9004  string taxname;
9006  if (src_i && src_i->GetSource().IsSetOrg() && src_i->GetSource().GetOrg().IsSetTaxname()) {
9007  taxname = src_i->GetSource().GetOrg().GetTaxname();
9008  }
      // For proteins, a taxname taken from src_f (obtained via the CDS
      // location) replaces the descriptor's taxname.
9009  if (seq.IsAa()) {
9011  if (cds) {
9013  cds->GetLocation(),
9016  *m_Scope);
9017  if (src_f && src_f->IsSetData() && src_f->GetData().IsBiosrc()
9018  && src_f->GetData().GetBiosrc().IsSetTaxname()) {
9019  taxname = src_f->GetData().GetBiosrc().GetTaxname();
9020  }
9021  }
9022  }
9023  if (! NStr::IsBlank(taxname) && ! SeqIsPatent(seq)) {
      // 11 is the length of "PREDICTED: ".
9024  if (NStr::StartsWith(title, "PREDICTED: ")) {
9025  title = title.substr (11);
9026  }
9027  if (seq.IsNa()) {
9028  if (! NStr::StartsWith(title, taxname, NStr::eNocase)) {
9030  "RefSeq nucleotide title does not start with organism name",
9031  ctx, desc);
9032  }
9033  } else if (seq.IsAa()) {
9034  taxname = "[" + taxname + "]";
9035  if (! NStr::EndsWith(title, taxname, NStr::eNocase)) {
      // When IsWp(bsh) is true, "[taxname]" anywhere in the title is
      // accepted; otherwise it has to be at the very end.
9036  if (! IsWp(bsh) || NStr::FindNoCase(title, taxname) == NPOS) {
9038  "RefSeq protein title does not end with organism name",
9039  ctx, desc);
9040  }
9041  }
9042  }
9043  }
9044  }
9045  } break;
9046 
9047  case CSeqdesc::e_Name:
9048  name_str = desc.GetName();
9049  if (! NStr::IsBlank(name_str)) {
      // Compare this name with every later name descriptor. Both outcomes
      // are reported against the current descriptor (desc), not against
      // the later one.
9050  CSeqdesc_CI di2 = di;
9051  ++di2;
9052  while (di2) {
9053  if (di2->IsName()) {
9054  if (NStr::EqualNocase(name_str, di2->GetName())) {
9056  "Undesired multiple name descriptors, identical text",
9057  ctx, desc);
9058  } else {
9060  "Undesired multiple name descriptors, different text",
9061  ctx, desc);
9062  }
9063  }
9064  ++di2;
9065  }
9066  }
9067  break;
9068 
9069  case CSeqdesc::e_Method:
9070  if (! seq.IsAa()) {
9072  "Nucleic acid with protein sequence method",
9073  ctx, desc);
9074  }
9075  break;
9076 
9077  default:
9078  break;
9079  }
9080  }
9081 
9083 
      // Aggregate checks using the values collected in the loop above.
9084  if (is_genome_assembly && is_finished_status && tech == CMolInfo::eTech_wgs) {
9085  const string& buf = seq.GetId().front()->AsFastaString();
9086  PostErr(eDiag_Warning, eErr_SEQ_DESCR_FinishedStatusForWGS, "WGS record " + buf + " should not have Finished status", seq);
9087  }
9088 
      // Master records: severity is Warning for EMBL/DDBJ ids, Error otherwise.
9089  if (IsMaster(seq)) {
9090  if (tech == CMolInfo::eTech_wgs && ! is_genome_assembly) {
9091  PostErr(IsEmblOrDdbj(seq) ? eDiag_Warning : eDiag_Error, eErr_SEQ_INST_WGSMasterLacksStrucComm, "WGS master without Genome Assembly Data user object", seq);
9092  }
9093  if (tech == CMolInfo::eTech_tsa && ! is_assembly) {
9094  PostErr(IsEmblOrDdbj(seq) ? eDiag_Warning : eDiag_Error, eErr_SEQ_INST_TSAMasterLacksStrucComm, "TSA master without Assembly Data user object", seq);
9095  }
9096  }
9097 
      // A count above 1 guarantees that the matching last_* reference was
      // assigned, so dereferencing it below is safe.
9098  if (num_gb > 1) {
9100  "Multiple GenBank blocks", ctx, *last_gb);
9101  }
9102 
9103  if (num_embl > 1) {
9105  "Multiple EMBL blocks", ctx, *last_embl);
9106  }
9107 
9108  if (num_pir > 1) {
9110  "Multiple PIR blocks", ctx, *last_pir);
9111  }
9112 
9113  if (num_pdb > 1) {
9115  "Multiple PDB blocks", ctx, *last_pdb);
9116  }
9117 
9118  if (num_prf > 1) {
9120  "Multiple PRF blocks", ctx, *last_prf);
9121  }
9122 
9123  if (num_sp > 1) {
9125  "Multiple SWISS-PROT blocks", ctx, *last_sp);
9126  }
9127 
9130 
9133 }
9134 
9135 
9137  const CGB_block& gbblock,
9138  const CBioseq& seq,
9139  const CSeqdesc& desc)
9140 {
      // Checks the keywords of a GenBank block: "TPA:experimental" and
      // "TPA:inferential" (matched case-insensitively) must not both occur.
      // NOTE(review): the line carrying the function name and the first
      // line of the reporting call (listing line 9152) are missing from
      // this listing.
9141  const CSeq_entry& ctx = *seq.GetParentEntry();
9142 
9143  bool has_tpa_inf = false, has_tpa_exp = false;
9144  FOR_EACH_KEYWORD_ON_GENBANKBLOCK (it, gbblock) {
9145  if (NStr::EqualNocase(*it, "TPA:experimental")) {
9146  has_tpa_exp = true;
9147  } else if (NStr::EqualNocase(*it, "TPA:inferential")) {
9148  has_tpa_inf = true;
9149  }
9150  }
9151  if (has_tpa_inf && has_tpa_exp) {
9153  "TPA:experimental and TPA:inferential should not both be in the same set of keywords",
9154  ctx, desc);
9155  }
9156 }
9157 
9158 
9160 {
      // Reports "TSA sequence should not be DNA" when the molecule type of
      // `seq` is DNA. Returns true when the report was made, false otherwise.
      // NOTE(review): the signature line and the first line of the
      // reporting call (listing line 9163) are missing from this listing.
9161  bool rval = false;
9162  if (seq.GetInst().GetMol() == CSeq_inst::eMol_dna) {
9164  "TSA sequence should not be DNA", seq);
9165  rval = true;
9166  }
9167  return rval;
9168 }
9169 
9170 
9172  const CMolInfo& minfo,
9173  int& seq_biomol,
9174  int& last_tech,
9175  int& last_completeness,
9176  const CBioseq& seq,
9177  const CSeqdesc& desc)
9178 {
      // Validates one MolInfo descriptor against the bioseq it applies to.
      // seq_biomol, last_tech and last_completeness are in/out values shared
      // across calls: a negative value means "nothing recorded yet"; once
      // recorded, later descriptors are compared against them.
      // NOTE(review): the line carrying the function name and many
      // statements (case labels, first lines of PostErr calls, a few
      // declarations) are missing from this listing; gaps in the listing
      // numbers mark them.
9179  const CSeq_entry& ctx = *seq.GetParentEntry();
9180 
9181  bool is_synthetic_construct = false;
9182  bool is_artificial = false;
9183 
9185 
      // Each flag becomes true if any of the source descriptors visited
      // here satisfies it; every source is also cross-checked against minfo.
9186  while (src_di) {
9187  if (! is_synthetic_construct) {
9188  is_synthetic_construct = m_Imp.IsSyntheticConstruct(src_di->GetSource());
9189  }
9190  if (! is_artificial) {
9191  is_artificial = m_Imp.IsArtificial(src_di->GetSource());
9192  }
9193  x_ValidateMolInfoForBioSource(src_di->GetSource(), minfo, desc);
9194 
9195  ++src_di;
9196  }
9197 
9198  if (minfo.IsSetBiomol()) {
9199  int biomol = minfo.GetBiomol();
      // The first biomol seen becomes the reference value.
9200  if (seq_biomol < 0) {
9201  seq_biomol = biomol;
9202  }
9203 
9204  switch (biomol) {
9206  if (seq.IsNa()) {
9208  "Nucleic acid with Molinfo = peptide", ctx, desc);
9209  }
9210  break;
9211 
9213  if (! is_artificial) {
9215  "Molinfo-biomol = other genetic", ctx, desc);
9216  }
9217  break;
9218 
9220  if (! m_Imp.IsXR()) {
9221  if (! IsSynthetic()) {
9222  if (! x_IsMicroRNA()) {
9224  "Molinfo-biomol other used", ctx, desc);
9225  }
9226  }
9227  }
9228  break;
9229 
9230  default: // the rest are nucleic acid
9231  if (seq.IsAa()) {
9233  "Molinfo-biomol [" + NStr::IntToString(biomol) +
9234  "] used on protein", ctx, desc);
9235  } else {
9236  if (biomol != seq_biomol) {
9238  "Inconsistent Molinfo-biomol [" +
9239  NStr::IntToString(seq_biomol) + "] and [" +
9240  NStr::IntToString(biomol) + "]", ctx, desc);
9241  }
9242  }
9243  }
9244  // look for double-stranded mRNA
9245  if (biomol == CMolInfo::eBiomol_mRNA
9246  && seq.IsNa()
9247  && seq.IsSetInst() && seq.GetInst().IsSetStrand()
9249  && seq.GetInst().GetStrand() != CSeq_inst::eStrand_ss) {
9251  "mRNA should be single stranded not double stranded", ctx, desc);
9252  }
9253  } else {
      // No biomol at all: only non-protein synthetic constructs are flagged.
9254  if (is_synthetic_construct && ! seq.IsAa()) {
9255  PostErr(eDiag_Warning, eErr_SEQ_DESCR_SyntheticConstructWrongMolType, "synthetic construct should have other-genetic", ctx, desc);
9256  }
9257  }
9258 
9259 
9260  if (minfo.IsSetTech()) {
9261  int tech = minfo.GetTech();
9262 
      // Some techs are only valid on proteins, others only on nucleic acids.
9263  if (seq.IsNa()) {
9264  switch (tech) {
9267  case CMolInfo::eTech_both:
9272  "Nucleic acid with protein sequence method", ctx, desc);
9273  break;
9274  default:
9275  break;
9276  }
9277  } else {
9278  switch (tech) {
9279  case CMolInfo::eTech_est:
9280  case CMolInfo::eTech_sts:
9288  case CMolInfo::eTech_htc:
9289  case CMolInfo::eTech_wgs:
9293  "Protein with nucleic acid sequence method", ctx, desc);
9294  break;
9295  default:
9296  break;
9297  }
9298  }
9299 
      // Tech-specific consistency between tech, biomol and molecule type.
9300  switch (tech) {
9301  case CMolInfo::eTech_sts:
9303  case CMolInfo::eTech_wgs:
9309  if (tech == CMolInfo::eTech_sts &&
9310  seq.GetInst().GetMol() == CSeq_inst::eMol_rna &&
9311  minfo.IsSetBiomol() &&
9312  minfo.GetBiomol() == CMolInfo::eBiomol_mRNA) {
9313  // !!!
9314  // Ok, there are some STS sequences derived from
9315  // cDNAs, so do not report these
9316  } else if (! minfo.IsSetBiomol()
9317  || minfo.GetBiomol() != CMolInfo::eBiomol_genomic) {
9319  "HTGS/STS/GSS/WGS sequence should be genomic", seq);
9320  } else if (seq.GetInst().GetMol() != CSeq_inst::eMol_dna &&
9321  seq.GetInst().GetMol() != CSeq_inst::eMol_na &&
9322  ! seq.IsAa()) {
9324  "HTGS/STS/GSS/WGS sequence should not be RNA", seq);
9325  }
9326  break;
9327  case CMolInfo::eTech_est:
      // The tech == eTech_est test below is always true inside this case.
9328  if (tech == CMolInfo::eTech_est &&
9329  ((! minfo.IsSetBiomol()) ||
9330  minfo.GetBiomol() != CMolInfo::eBiomol_mRNA)) {
9332  "EST sequence should be mRNA", seq);
9333  }
9334  break;
9335  case CMolInfo::eTech_tsa:
9337  break;
9338  default:
9339  break;
9340  }
9341 
      // HTGS phase 3: none of the HTGS_DRAFT / HTGS_PREFIN / HTGS_ACTIVEFIN /
      // HTGS_FULLTOP keywords may be present in the GenBank blocks visited.
9342  if (tech == CMolInfo::eTech_htgs_3) {
9344  bool has_draft = false;
9345  bool has_prefin = false;
9346  bool has_activefin = false;
9347  bool has_fulltop = false;
9348  while (gb_i) {
9349  if (gb_i->GetGenbank().IsSetKeywords()) {
9350  CGB_block::TKeywords::const_iterator key_it = gb_i->GetGenbank().GetKeywords().begin();
9351  while (key_it != gb_i->GetGenbank().GetKeywords().end()) {
9352  if (NStr::EqualNocase(*key_it, "HTGS_DRAFT")) {
9353  has_draft = true;
9354  } else if (NStr::EqualNocase(*key_it, "HTGS_PREFIN")) {
9355  has_prefin = true;
9356  } else if (NStr::EqualNocase(*key_it, "HTGS_ACTIVEFIN")) {
9357  has_activefin = true;
9358  } else if (NStr::EqualNocase(*key_it, "HTGS_FULLTOP")) {
9359  has_fulltop = true;
9360  }
9361  ++key_it;
9362  }
9363  }
9364  ++gb_i;
9365  }
9366  if (has_draft) {
9368  "HTGS 3 sequence should not have HTGS_DRAFT keyword", seq);
9369  }
9370  if (has_prefin) {
9372  "HTGS 3 sequence should not have HTGS_PREFIN keyword", seq);
9373  }
9374  if (has_activefin) {
9376  "HTGS 3 sequence should not have HTGS_ACTIVEFIN keyword", seq);
9377  }
9378  if (has_fulltop) {
9380  "HTGS 3 sequence should not have HTGS_FULLTOP keyword", seq);
9381  }
9382  }
9383 
      // Compare with the tech recorded by an earlier descriptor.
      // NOTE(review): this branch tests `> 0` while the "tech not set"
      // branch below tests `> -1`, so a recorded 0 is overwritten here
      // rather than reported as inconsistent -- confirm that is intended.
9384  if (last_tech > 0) {
9385  if (last_tech != tech) {
9387  "Inconsistent Molinfo-tech [" + NStr::IntToString (last_tech)
9388  + "] and [" + NStr::IntToString(tech) + "]", ctx, desc);
9389  }
9390  } else {
9391  last_tech = tech;
9392  }
9393  } else {
      // Tech not set on this descriptor: treated as the value 0.
9394  if (last_tech > -1) {
9395  if (last_tech != 0) {
9397  "Inconsistent Molinfo-tech [" + NStr::IntToString (last_tech)
9398  + "] and [0]", ctx, desc);
9399  }
9400  } else {
9401  last_tech = 0;
9402  }
9403  }
9404 
      // Completeness follows the same record-then-compare scheme as tech
      // (including the `> 0` versus `> -1` asymmetry).
9405  if (minfo.IsSetCompleteness()) {
9406  if (last_completeness > 0) {
9407  if (last_completeness != minfo.GetCompleteness()) {
9409  "Inconsistent Molinfo-completeness [" + NStr::IntToString (last_completeness)
9410  + "] and [" + NStr::IntToString(minfo.GetCompleteness()) + "]", ctx, desc);
9411  }
9412  } else {
9413  last_completeness = minfo.GetCompleteness();
9414  }
9415  } else {
9416  if (last_completeness > -1) {
9417  if (last_completeness != 0) {
9419  "Inconsistent Molinfo-completeness [" + NStr::IntToString (last_completeness)
9420  + "] and [0]", ctx, desc);
9421  }
9422  } else {
9423  last_completeness = 0;
9424  }
9425  }
9426  // need to look at closest molinfo descriptor, not all of them
9428  if (closest_molinfo) {
9429  const CMolInfo& molinfo = closest_molinfo->GetMolinfo();
9430  if (molinfo.IsSetCompleteness()) {
9431  x_ValidateCompletness(seq, molinfo);
9432  }
9433  }
9434 
9435 }
9436 
9437 
9438 // refactored in VR-918
9440  const CBioSource& src,
9441  const CMolInfo& minfo,
9442  const CSeqdesc& desc
9443 )
9444 {
      // Cross-checks a BioSource against a MolInfo for viral nucleotide
      // sequences. Returns without checking for protein sequences, for
      // sources without a lineage, for lineages that do not start with
      // "Viruses; ", and when the stranded-molecule string derived from the
      // lineage contains "unknown".
      // NOTE(review): this listing is missing the line carrying the
      // function name, the declarations of `ctx` (listing line 9463),
      // `biomol` (9465) and `mol` plus the condition that guards its
      // assignment (9472-9473).
9445  // common reality checks before calling x_CheckSingleStrandedRNAViruses and x_ReportLineageConflictWithMol
9446  if (m_CurrentHandle.IsAa()) {
9447  return;
9448  }
9449  if (! src.IsSetOrg() || ! src.GetOrg().IsSetLineage()) {
9450  return;
9451  }
9452 
9453  const string& lineage = src.GetOrg().GetLineage();
9454  if (! NStr::StartsWith(lineage, "Viruses; ")) {
9455  return;
9456  }
9457 
9458  string stranded_mol = s_GetStrandedMolStringFromLineage(lineage);
9459  if (NStr::FindNoCase(stranded_mol, "unknown") != NPOS) {
9460  return;
9461  }
9462 
9464 
9466  if (minfo.IsSetBiomol()) {
9467  biomol = minfo.GetBiomol();
9468  }
9469 
9470  x_CheckSingleStrandedRNAViruses(src, lineage, stranded_mol, biomol, m_CurrentHandle, desc, ctx);
9471 
9474  mol = m_CurrentHandle.GetInst_Mol();
9475  }
9476 
9477  x_ReportLineageConflictWithMol(lineage, stranded_mol, biomol, mol, desc, ctx);
9478 }
9479 
9480 
// Compares the molecule class implied by a viral lineage (stranded_mol) with the
// molecule of the sequence (mol) and builds a conflict message when they disagree.
//   lineage      - organism lineage string
//   stranded_mol - strandedness/molecule string derived from the lineage
//   biomol       - MolInfo biomol, used for the dsRNA check only
//   mol          - Seq-inst molecule; anything other than DNA or RNA is ignored
//   obj, ctx     - passed as the trailing arguments of the reporting calls
// NOTE(review): the signature line and the first line of each reporting call are
// not visible in this listing, so error codes and severities cannot be stated here.
9482  const string& lineage,
9483  const string& stranded_mol,
9484  const CMolInfo::TBiomol biomol,
9485  CSeq_inst::EMol mol,
9486  const CSerialObject& obj,
9487  const CSeq_entry *ctx
9488 )
9489 {
9490  if (mol != CSeq_inst::eMol_rna && mol != CSeq_inst::eMol_dna) {
9491  return;
9492  }
9493 
9494  // special cases
9495  if (NStr::FindNoCase(lineage, "Retroviridae") != NPOS && NStr::EqualNocase(stranded_mol, "ssRNA-RT")) {
9496  // retrovirus can be rna or dna
9497  return;
9498  }
9499 
      // a dsRNA virus with a non-genomic biomol gets its own message and no further checks;
      // a genomic dsRNA virus falls through to the generic molecule match below
9500  if (NStr::EqualNocase(stranded_mol, "dsRNA")) {
9501  if (biomol != CMolInfo::eBiomol_genomic) {
9503  "dsRNA virus should be genomic RNA",
9504  obj, ctx);
9505  return;
9506  }
9507  }
9508 
9509  // otherwise look for molecule match regardless of strandedness
9510  if (mol == CSeq_inst::eMol_dna) {
9511  if (NStr::FindNoCase(stranded_mol, "DNA") != NPOS) {
9512  return;
9513  }
9514  } else if (mol == CSeq_inst::eMol_rna) {
9515  if (NStr::FindNoCase(stranded_mol, "RNA") != NPOS) {
9516  return;
9517  }
9518  }
9519 
      // describe what the taxonomy implies; the more specific ss/ds forms are tested
      // before the bare "RNA"/"DNA" forms.  mssg stays empty when stranded_mol
      // contains neither "RNA" nor "DNA".
9520  string mssg;
9521  if (NStr::FindNoCase(stranded_mol, "ssRNA") != NPOS) {
9522  mssg = "single-stranded RNA";
9523  } else if (NStr::FindNoCase(stranded_mol, "dsRNA") != NPOS) {
9524  mssg = "double-stranded RNA";
9525  } else if (NStr::FindNoCase(stranded_mol, "ssDNA") != NPOS) {
9526  mssg = "single-stranded DNA";
9527  } else if (NStr::FindNoCase(stranded_mol, "dsDNA") != NPOS) {
9528  mssg = "double-stranded DNA";
9529  } else if (NStr::FindNoCase(stranded_mol, "RNA") != NPOS) {
9530  mssg = "unknown-stranded RNA";
9531  } else if (NStr::FindNoCase(stranded_mol, "DNA") != NPOS) {
9532  mssg = "unknown-stranded DNA";
9533  }
9534 
9536  "Taxonomy indicates " + mssg +
9537  ", molecule type (" + CSeq_inst::GetMoleculeClass(mol) +
9538  ") is conflicting.",
9539  obj, ctx);
9540 }
9541 
9542 
// Checks that the MolInfo biomol of a single-stranded RNA virus agrees with the
// strand sense implied by its lineage and with the strands of its annotated features.
//   source       - BioSource; its organism division and origin decide is_synthetic
//   lineage      - organism lineage string
//   stranded_mol - strandedness/molecule string derived from the lineage
//   biomol       - MolInfo biomol being validated
//   bsh          - Bioseq handle (its use falls on lines not visible in this listing)
//   obj, ctx     - passed as the trailing arguments of the reporting calls
// Only stranded_mol values that contain "ssRNA" and do not contain "DNA" are examined.
// NOTE(review): the signature line, the first line of every reporting call, the feature
// iterator declarations and the origin test are not visible in this listing.
9544  const CBioSource& source,
9545  const string& lineage,
9546  const string& stranded_mol,
9547  const CMolInfo::TBiomol biomol,
9548  const CBioseq_Handle& bsh,
9549  const CSerialObject& obj,
9550  const CSeq_entry *ctx
9551 )
9552 {
9553  if (NStr::FindNoCase(stranded_mol, "ssRNA") == NPOS) {
9554  return;
9555  }
9556  if (NStr::FindNoCase(stranded_mol, "DNA") != NPOS) {
9557  return;
9558  }
9559 
9560  const bool is_ambisense = NStr::EqualNocase(stranded_mol, "ssRNA(+/-)");
9561 
9562  // special cases
      // ambisense: only the biomol is checked (genomic or cRNA), then the function returns
9563  if (is_ambisense) {
9564  if (biomol != CMolInfo::eBiomol_genomic && biomol != CMolInfo::eBiomol_cRNA) {
9566  "Ambisense virus should be genomic RNA or cRNA",
9567  obj, ctx);
9568  }
9569  return;
9570  }
9571 
      // retrovirus tagged ssRNA-RT: biomol must be genomic, no strand checks follow
9572  if (NStr::FindNoCase(lineage, "Retroviridae") != NPOS && NStr::EqualNocase(stranded_mol, "ssRNA-RT")) {
9573  if (biomol != CMolInfo::eBiomol_genomic) {
9575  "Retrovirus should be genomic RNA or genomic DNA",
9576  obj, ctx);
9577  }
9578  return;
9579  }
9580 
      // strand sense is read from the "(+)" / "(-)" suffix of stranded_mol;
      // a string with neither marker is not checked further
9581  bool negative_strand_virus = false;
9582  bool plus_strand_virus = false;
9583  if (NStr::FindNoCase(stranded_mol, "-)") != NPOS) {
9584  negative_strand_virus = true;
9585  }
9586  if (NStr::FindNoCase(stranded_mol, "(+") != NPOS) {
9587  plus_strand_virus = true;
9588  }
9589  if (! negative_strand_virus && ! plus_strand_virus) {
9590  return;
9591  }
9592 
      // synthetic when the organism division is "SYN"; otherwise decided from the
      // source origin (the exact origin values tested are not visible in this listing)
9593  bool is_synthetic = false;
9594  if (source.GetOrg().IsSetDivision() && NStr::EqualNocase(source.GetOrg().GetDivision(), "SYN")) {
9595  is_synthetic = true;
9596  } else if (source.IsSetOrigin()) {
9597  CBioSource_Base::TOrigin orig = source.GetOrigin();
9601  is_synthetic = true;
9602  }
9603  }
9604 
9605  bool has_cds = false;
9606  bool has_plus_cds = false;
9607  bool has_minus_cds = false;
9608 
      // record which strands carry a CDS; any strand other than minus counts as plus.
      // The scan stops early once both strands have been seen.
9610  while (cds_ci) {
9611  has_cds = true;
9612  if (cds_ci->GetLocation().GetStrand() == eNa_strand_minus) {
9613  has_minus_cds = true;
9614  } else {
9615  has_plus_cds = true;
9616  }
9617  if (has_minus_cds && has_plus_cds) {
9618  break;
9619  }
9620 
9621  ++cds_ci;
9622  }
9623 
9624  bool has_minus_misc_feat = false;
9625  bool has_plus_misc_feat = false;
9626 
      // only when there is no CDS at all: look at features (from misc_ci) whose comment
      // contains "nonfunctional" and record their strands the same way
9627  if (! has_cds) {
9629  while (misc_ci) {
9630  if (misc_ci->IsSetComment()
9631  && NStr::FindNoCase(misc_ci->GetComment(), "nonfunctional") != NPOS) {
9632  if (misc_ci->GetLocation().GetStrand() == eNa_strand_minus) {
9633  has_minus_misc_feat = true;
9634  } else {
9635  has_plus_misc_feat = true;
9636  }
9637  }
9638  if (has_minus_misc_feat && has_plus_misc_feat) {
9639  break;
9640  }
9641  ++misc_ci;
9642  }
9643  }
9644 
      // NOTE: is_ambisense is always false from here on (the ambisense case returned
      // above), so the "! is_ambisense" tests below never change the outcome.
9645  if (negative_strand_virus) {
9646 
9647  if (has_minus_cds) {
9648  if (biomol != CMolInfo::eBiomol_genomic) {
9650  "Negative-sense single-stranded RNA virus with minus strand CDS should be genomic RNA",
9651  obj, ctx);
9652  }
9653  }
9654 
9655  if (has_plus_cds && ! is_synthetic && ! is_ambisense) {
9656  if (biomol != CMolInfo::eBiomol_cRNA) {
9658  "Negative-sense single-stranded RNA virus with plus strand CDS should be cRNA",
9659  obj, ctx);
9660  }
9661  }
9662 
9663  if (has_minus_misc_feat) {
9664  if (biomol != CMolInfo::eBiomol_genomic) {
9666  "Negative-sense single-stranded RNA virus with nonfunctional minus strand misc_feature should be genomic RNA",
9667  obj, ctx);
9668  }
9669  }
9670 
9671  if (has_plus_misc_feat && ! is_synthetic && ! is_ambisense) {
9672  if (biomol != CMolInfo::eBiomol_cRNA) {
9674  "Negative-sense single-stranded RNA virus with nonfunctional plus strand misc_feature should be cRNA",
9675  obj, ctx);
9676  }
9677  }
9678  }
9679 
9680  if (plus_strand_virus) {
9681 
      // a minus-strand CDS is reported regardless of biomol
9682  if (has_minus_cds) {
9684  "CDS should not be on minus strand of a positive-sense single-stranded RNA virus",
9685  obj, ctx);
9686  }
9687 
9688  if (! is_synthetic && ! is_ambisense) {
9689  if (biomol != CMolInfo::eBiomol_genomic) {
9691  "Positive-sense single-stranded RNA virus should be genomic RNA",
9692  obj, ctx);
9693  }
9694  }
9695  }
9696 }
9697 
9698 
// Built-in table of taxonomy names and the genomic molecule / strandedness string
// associated with each.  s_GetStrandedMolStringFromLineage scans this table only when
// the map produced by s_InitializeViralMap is empty.  Keys are matched there as
// case-insensitive substrings of the lineage; the "unknown" values cause the
// virus lineage checks to be skipped.
9699 MAKE_CONST_MAP(kViralStrandMap, string, string,
9700 {
9701  {"root", "dsDNA"},
9702  {"Alphasatellitidae", "ssDNA"},
9703  {"Anelloviridae", "ssDNA(-)"},
9704  {"Bacilladnaviridae", "ssDNA"},
9705  {"Bidnaviridae", "ssDNA"},
9706  {"Circoviridae", "ssDNA(+/-)"},
9707  {"Geminiviridae", "ssDNA(+/-)"},
9708  {"Genomoviridae", "ssDNA"},
9709  {"Hepadnaviridae", "dsDNA-RT"},
9710  {"Inoviridae", "ssDNA(+)"},
9711  {"Microviridae", "ssDNA(+)"},
9712  {"Nanoviridae", "ssDNA(+)"},
9713  {"Ortervirales", "ssRNA-RT"},
9714  {"Caulimoviridae", "dsDNA-RT"},
9715  {"Parvoviridae", "ssDNA(+/-)"},
9716  {"Alphapleolipovirus", "dsDNA; ssDNA"},
9717  {"Riboviria", "RNA"},
9718  {"Albetovirus", "ssRNA(+)"},
9719  {"Alphatetraviridae", "ssRNA(+)"},
9720  {"Alvernaviridae", "ssRNA(+)"},
9721  {"Amalgaviridae", "dsRNA"},
9722  {"Astroviridae", "ssRNA(+)"},
9723  {"Aumaivirus", "ssRNA(+)"},
9724  {"Avsunviroidae", "ssRNA"},
9725  {"Barnaviridae", "ssRNA(+)"},
9726  {"Benyviridae", "ssRNA(+)"},
9727  {"Birnaviridae", "dsRNA"},
9728  {"Botourmiaviridae", "ssRNA(+)"},
9729  {"Botybirnavirus", "dsRNA"},
9730  {"Bromoviridae", "ssRNA(+)"},
9731  {"Caliciviridae", "ssRNA(+)"},
9732  {"Carmotetraviridae", "ssRNA(+)"},
9733  {"Chrysoviridae", "dsRNA"},
9734  {"Closteroviridae", "ssRNA(+)"},
9735  {"Cystoviridae", "dsRNA"},
9736  {"Deltavirus", "ssRNA(-)"},
9737  {"dsRNA viruses", "dsRNA"},
9738  {"Endornaviridae", "dsRNA"},
9739  {"Flaviviridae", "ssRNA(+)"},
9740  {"Hepeviridae", "ssRNA(+)"},
9741  {"Hypoviridae", "ssRNA(+)"},
9742  {"Idaeovirus", "ssRNA(+)"},
9743  {"Kitaviridae", "ssRNA(+)"},
9744  {"Leviviridae", "ssRNA(+)"},
9745  {"Luteoviridae", "ssRNA(+)"},
9746  {"Matonaviridae", "ssRNA(+)"},
9747  {"Megabirnaviridae", "dsRNA"},
9748  {"Narnaviridae", "ssRNA(+)"},
9749  {"Haploviricotina", "ssRNA(-)"},
9750  {"Arenaviridae", "ssRNA(+/-)"},
9751  {"Coguvirus", "ssRNA(-)"},
9752  {"Cruliviridae", "ssRNA(-)"},
9753  {"Fimoviridae", "ssRNA(-)"},
9754  {"Hantaviridae", "ssRNA(-)"},
9755  {"Leishbuviridae", "ssRNA(-)"},
9756  {"Mypoviridae", "ssRNA(-)"},
9757  {"Nairoviridae", "ssRNA(-)"},
9758  {"Peribunyaviridae", "ssRNA(-)"},
9759  {"Phasmaviridae", "ssRNA(-)"},
9760  {"Banyangvirus", "ssRNA(+/-)"},
9761  {"Beidivirus", "ssRNA(-)"},
9762  {"Goukovirus", "ssRNA(-)"},
9763  {"Horwuvirus", "ssRNA(-)"},
9764  {"Hudivirus", "ssRNA(-)"},
9765  {"Hudovirus", "ssRNA(-)"},
9766  {"Kabutovirus", "ssRNA(-)"},
9767  {"Laulavirus", "ssRNA(-)"},
9768  {"Mobuvirus", "ssRNA(-)"},
9769  {"Phasivirus", "ssRNA(-)"},
9770  {"Phlebovirus", "ssRNA(+/-)"},
9771  {"Pidchovirus", "ssRNA(-)"},
9772  {"Tenuivirus", "ssRNA(-)"},
9773  {"Wenrivirus", "ssRNA(-)"},
9774  {"Wubeivirus", "ssRNA(-)"},
9775  {"Tospoviridae", "ssRNA(+/-)"},
9776  {"Wupedeviridae", "ssRNA(-)"},
9777  {"Insthoviricetes", "ssRNA(-)"},
9778  {"Nidovirales", "ssRNA(+)"},
9779  {"Nodaviridae", "ssRNA(+)"},
9780  {"Papanivirus", "ssRNA(+)"},
9781  {"Partitiviridae", "dsRNA"},
9782  {"Permutotetraviridae", "ssRNA(+)"},
9783  {"Picobirnaviridae", "dsRNA"},
9784  {"Picornavirales", "ssRNA(+)"},
9785  {"Pospiviroidae", "ssRNA"},
9786  {"Potyviridae", "ssRNA(+)"},
9787  {"Quadriviridae", "dsRNA"},
9788  {"Reoviridae", "dsRNA"},
9789  {"Sarthroviridae", "ssRNA(+)"},
9790  {"Sinaivirus", "ssRNA(+)"},
9791  {"Solemoviridae", "ssRNA(+)"},
9792  {"Solinviviridae", "ssRNA(+)"},
9793  {"Togaviridae", "ssRNA(+)"},
9794  {"Tombusviridae", "ssRNA(+)"},
9795  {"Totiviridae", "dsRNA"},
9796  {"Tymovirales", "ssRNA(+)"},
9797  {"Virgaviridae", "ssRNA(+)"},
9798  {"Virtovirus", "ssRNA(+)"},
9799  {"ssRNA viruses", "ssRNA"},
9800  {"unclassified ssRNA viruses", "ssRNA"},
9801  {"unclassified ssRNA negative-strand viruses", "ssRNA(-)"},
9802  {"unclassified ssRNA positive-strand viruses", "ssRNA(+)"},
9803  {"unclassified viroids", "ssRNA"},
9804  {"DNA satellites", "DNA"},
9805  {"RNA satellites", "RNA"},
9806  {"Smacoviridae", "ssDNA"},
9807  {"Spiraviridae", "ssDNA(+)"},
9808  {"Tolecusatellitidae", "ssDNA"},
9809  {"unclassified viruses", "unknown"},
9810  {"unclassified DNA viruses", "DNA"},
9811  {"unclassified archaeal dsDNA viruses", "dsDNA"},
9812  {"unclassified dsDNA phages", "dsDNA"},
9813  {"unclassified dsDNA viruses", "dsDNA"},
9814  {"unclassified ssDNA bacterial viruses", "ssDNA"},
9815  {"unclassified ssDNA viruses", "ssDNA"},
9816  {"environmental samples", "unknown"},
9817 });
9818 
9820 
// Builds a name -> moltype map from the taxonomy service (CTaxon1).
// Requests the inherited "genomic_moltype" property definitions; for every entry
// whose Ival2 equals 1 and whose Ival1 resolves to a scientific name, the name is
// mapped to the entry's string value.  Returns an empty map when the property
// request fails or when a CException is thrown (the exception message is logged).
// NOTE(review): the signature line is not visible in this listing.
9822 {
9823  //auto tax_update = m_Imp.SetContext().m_taxon_update;
9824  try {
9825  CTaxon1 tax;
9826  CTaxon1::TInfoList moltypes;
9827  TViralMap viral_map;
9828  if (tax.GetInheritedPropertyDefines("genomic_moltype", moltypes)) {
9829  string sName;
9830  for (auto it : moltypes) {
      // Ival1 is converted to a tax id and used to look up the scientific name
9831  if (tax.GetScientificName(TAX_ID_FROM(TIntId, it->GetIval1()), sName )) {
      // only entries with Ival2 == 1 contribute to the map
9832  if (it->GetIval2() == 1) {
9833  viral_map [sName] = it->GetSval();
9834  }
9835  }
9836  }
9837  }
9838  return viral_map;
9839  } catch (const CException& e) {
9840  // report if desired (at severity info or warning, probably)
9841  LOG_POST_XX(Corelib_App, 1, e.GetMsg());
9842  }
9843 
      // reached only after an exception: the caller sees an empty map
9844  return {};
9845 }
9846 
9847 
// Derives the genomic molecule / strandedness string (e.g. "ssRNA(+)", "dsDNA")
// from a lineage string.  All matching is by case-insensitive substring search.
// Order of evaluation:
//   1. hard-coded special cases (Retroviridae, Tospovirus, Tenuivirus,
//      Arenaviridae, Phlebovirus, old-style negative/positive-strand lineages);
//   2. the map from s_InitializeViralMap, or kViralStrandMap when that map is
//      empty -- the first key found in the lineage during iteration wins;
//   3. "dsDNA" as the default.
// NOTE(review): the signature line is not visible in this listing.
9849 {
9850  // Retroviridae no longer in list
9851  if (NStr::FindNoCase(lineage, "Retroviridae") != NPOS) {
9852  return "ssRNA-RT";
9853  }
9854 
9855  // Tospovirus (ambisense) not in list
9856  if (NStr::FindNoCase(lineage, "Tospovirus") != NPOS) {
9857  return "ssRNA(+/-)";
9858  }
9859 
9860  // Tenuivirus has several segments, most of which are ambisense
9861  if (NStr::FindNoCase(lineage, "Tenuivirus") != NPOS) {
9862  return "ssRNA(+/-)";
9863  }
9864 
9865  // Arenaviridae is ambisense, has priority over old-style checks
9866  if (NStr::FindNoCase(lineage, "Arenaviridae") != NPOS) {
9867  return "ssRNA(+/-)";
9868  }
9869 
9870  // Phlebovirus is ambisense, has priority over old-style checks
9871  if (NStr::FindNoCase(lineage, "Phlebovirus") != NPOS) {
9872  return "ssRNA(+/-)";
9873  }
9874 
9875  // unclassified viruses have old-style lineage
9876  if (NStr::FindNoCase(lineage, "negative-strand viruses") != NPOS) {
9877  return "ssRNA(-)";
9878  }
9879  if (NStr::FindNoCase(lineage, "positive-strand viruses") != NPOS) {
9880  return "ssRNA(+)";
9881  }
9882 
      // function-local static: the map is initialized on the first call and reused afterwards
9883  static const auto s_ViralMap = s_InitializeViralMap();
9884 
      // fall back to the built-in table when no map could be obtained
9885  if (s_ViralMap.empty()) {
9886  for (const auto& x : kViralStrandMap) {
9887  if (NStr::FindNoCase(lineage, x.first) != NPOS) {
9888  return x.second;
9889  }
9890  }
9891  } else {
9892  for (const auto& x : s_ViralMap) {
9893  if (NStr::FindNoCase(lineage, x.first) != NPOS) {
9894  return x.second;
9895  }
9896  }
9897  }
9898 
9899  // use root value for default
9900  return "dsDNA";
9901 }
9902 
9903 
// Tracks the first GIBB-mod value seen in one category and flags later values that differ.
//   new_mod - value just encountered
//   old_mod - in/out: a negative value means "nothing seen yet" and is replaced by
//             new_mod; otherwise it is compared with new_mod and left unchanged
//   desc    - descriptor carrying new_mod
//   ctx     - Seq-entry used as context for the report
// NOTE(review): the first line of the reporting call is not visible in this listing.
9904 void CValidError_bioseq::ReportModifInconsistentError(int new_mod, int& old_mod, const CSeqdesc& desc, const CSeq_entry& ctx)
9905 {
9906  if (old_mod >= 0) {
9907  if (new_mod != old_mod) {
9909  "Inconsistent GIBB-mod [" + NStr::IntToString (old_mod) + "] and ["
9910  + NStr::IntToString (new_mod) + "]", ctx, desc);
9911  }
9912  } else {
9913  old_mod = new_mod;
9914  }
9915 }
9916 
9917 
// Walks the Modif descriptors reached through desc_ci and checks the GIBB-mod values
// for internal consistency.  Four categories are tracked separately, each starting
// at -1 ("not seen"): nucleic-acid type, organelle, partial/complete, no-left/no-right.
// NOTE(review): the signature line, the desc_ci declaration, some case labels and the
// first line of each reporting call are not visible in this listing.
9919 {
9920  const CSeq_entry& ctx = *seq.GetParentEntry();
9921 
9922  int last_na_mod = -1;
9923  int last_organelle = -1;
9924  int last_partialness = -1;
9925  int last_left_right = -1;
9926 
9928  while (desc_ci) {
9929  CSeqdesc::TModif modif = desc_ci->GetModif();
9930  CSeqdesc::TModif::const_iterator it = modif.begin();
9931  while (it != modif.end()) {
9932  int modval = *it;
9933  switch (modval) {
      // nucleic-acid modifiers: reported as misplaced in one branch ("... on protein"),
      // otherwise checked for consistency with the first one seen
9934  case eGIBB_mod_dna:
9935  case eGIBB_mod_rna:
9938  "Nucleic acid GIBB-mod [" + NStr::IntToString(modval) + "] on protein",
9939  ctx, *desc_ci);
9940  } else {
9941  ReportModifInconsistentError(modval, last_na_mod, *desc_ci, ctx);
9942  }
9943  break;
      // organelle modifiers must all agree
9945  case eGIBB_mod_chloroplast:
9946  case eGIBB_mod_kinetoplast:
9947  case eGIBB_mod_cyanelle:
9949  ReportModifInconsistentError(modval, last_organelle, *desc_ci, ctx);
9950  break;
      // partial/complete must agree with each other, and "complete" conflicts with
      // an earlier no-left / no-right
9951  case eGIBB_mod_partial:
9952  case eGIBB_mod_complete:
9953  ReportModifInconsistentError(modval, last_partialness, *desc_ci, ctx);
9954  if (last_left_right >= 0 && modval == eGIBB_mod_complete) {
9956  "Inconsistent GIBB-mod [" + NStr::IntToString (last_left_right) + "] and ["
9957  + NStr::IntToString (modval) + "]",
9958  ctx, *desc_ci);
9959  }
9960  break;
      // no-left / no-right conflicts with an earlier "complete"; the latest value is
      // always remembered (no-left and no-right are not compared with each other)
9961  case eGIBB_mod_no_left:
9962  case eGIBB_mod_no_right:
9963  if (last_partialness == eGIBB_mod_complete) {
9965  "Inconsistent GIBB-mod [" + NStr::IntToString (last_partialness) + "] and ["
9966  + NStr::IntToString (modval) + "]",
9967  ctx, *desc_ci);
9968  }
9969  last_left_right = modval;
9970  break;
9971  default:
9972  break;
9973  }
9974  ++it;
9975  }
9976  ++desc_ci;
9977  }
9978 }
9979 
9980 
// Walks the Mol-type descriptors reached through desc_ci and checks each GIBB-mol value:
//   - peptide on a non-protein sequence is reported,
//   - unknown / other is always reported,
//   - any other value is treated as nucleic acid: reported on a protein, and on a
//     nucleotide it must equal the first nucleic-acid value seen.
// last_na_mol == 0 means that no nucleic-acid value has been recorded yet.
// NOTE(review): the signature line, the desc_ci declaration and the first line of each
// reporting call are not visible in this listing.
9982 {
9983  const CSeq_entry& ctx = *seq.GetParentEntry();
9984 
9985  int last_na_mol = 0;
9987  while (desc_ci) {
9988  int modval = desc_ci->GetMol_type();
9989  switch (modval) {
9990  case eGIBB_mol_peptide:
9991  if (! seq.IsAa()) {
9993  "Nucleic acid with GIBB-mol = peptide",
9994  ctx, *desc_ci);
9995  }
9996  break;
9997  case eGIBB_mol_unknown:
9998  case eGIBB_mol_other:
10000  "GIBB-mol unknown or other used",
10001  ctx, *desc_ci);
10002  break;
10003  default: // the rest are nucleic acid
10004  if (seq.IsAa()) {
10006  "GIBB-mol [" + NStr::IntToString (modval) + "] used on protein",
10007  ctx, *desc_ci);
10008  } else {
10009  if (last_na_mol) {
10010  if (last_na_mol != modval) {
10012  "Inconsistent GIBB-mol [" + NStr::IntToString (last_na_mol)
10013  + "] and [" + NStr::IntToString (modval) + "]",
10014  ctx, *desc_ci);
10015  }
10016  } else {
10017  last_na_mol = modval;
10018  }
10019  }
10020  break;
10021  }
10022  ++desc_ci;
10023  }
10024 }
10025 
10026 
// Returns true when the source descriptor held in sd marks the current Bioseq as
// synthetic: either its origin is CBioSource::eOrigin_synthetic or the division of its
// org-name equals "SYN" (case-insensitive).  Returns false when m_CurrentHandle is not
// set, when sd is empty, or when neither condition holds.
// NOTE(review): the signature line and the declaration of sd are not visible in this listing.
10028 {
10029  if (m_CurrentHandle) {
10031  if (sd) {
10032  const CSeqdesc::TSource& source = sd->GetSource();
10033  if (source.CanGetOrigin() &&
10034  source.GetOrigin() == CBioSource::eOrigin_synthetic) {
10035  return true;
10036  }
10037  if (source.CanGetOrg() && source.GetOrg().CanGetOrgname()) {
10038  const COrgName& org_name = source.GetOrg().GetOrgname();
10039  if (org_name.CanGetDiv()) {
10040  if (NStr::CompareNocase(org_name.GetDiv(), "SYN") == 0) {
10041  return true;
10042  }
10043  }
10044  }
10045  }
10046  }
10047  return false;
10048 }
10049 
10050 
// Returns true when any feature selected on the current Bioseq is an RNA whose
// extension is a name containing "microRNA" (case-sensitive search).
// NOTE(review): the signature line and the construction of selector are not visible in
// this listing; GetRna() is called on every feature, so selector presumably restricts
// the iteration to RNA features -- confirm.
10052 {
10055  CFeat_CI fi(m_CurrentHandle, selector);
10056 
10057  for (; fi; ++fi) {
10058  const CRNA_ref& rna_ref = fi->GetData().GetRna();
10059  if (rna_ref.IsSetExt() && rna_ref.GetExt().IsName()) {
10060  if (NStr::Find(rna_ref.GetExt().GetName(), "microRNA") != NPOS) {
10061  return true;
10062  }
10063  }
10064  }
10065  return false;
10066 }
10067 
10068 
// Reports an update date that precedes the create date.
//   update - update date being validated
//   create - create date it is compared with
//   seq    - Bioseq whose parent entry is used as the report context
//   desc   - descriptor the report is attached to
// The message is produced only when m_Imp.HasGiOrAccnVer() is true.
// NOTE(review): the signature line and the first line of the reporting call are not
// visible in this listing.  ctx is dereferenced without a null check.
10070  const CDate& update,
10071  const CDate& create,
10072  const CBioseq& seq,
10073  const CSeqdesc& desc)
10074 {
10075  if (update.Compare(create) == CDate::eCompare_before && m_Imp.HasGiOrAccnVer()) {
10076 
10077  string create_str;
10078  GetDateString(create_str, create);
10079  string update_str;
10080  GetDateString(update_str, update);
10081 
10082  string err_msg = "Inconsistent create_date [";
10083  err_msg += create_str;
10084  err_msg += "] and update_date [";
10085  err_msg += update_str;
10086  err_msg += "]";
10087 
10088  CSeq_entry* ctx = seq.GetParentEntry();
10090  err_msg, *ctx, desc);
10091  }
10092 }
10093 
10094 
// Reports two organism references whose taxnames differ (exact string comparison).
//   this_org, org - the two organism references being compared
//   seq           - Bioseq whose ids are inspected and whose parent entry is the context
//   desc          - descriptor the report is attached to
// Nothing is reported unless both taxnames are set, and the report is suppressed when
// the Bioseq has an "other" Seq-id whose accession starts with "WP_".
// NOTE(review): the signature line and the first line of the reporting call are not
// visible in this listing.
10096  const COrg_ref& this_org,
10097  const COrg_ref& org,
10098  const CBioseq& seq,
10099  const CSeqdesc& desc)
10100 {
10101  if (this_org.IsSetTaxname() && org.IsSetTaxname()) {
10102  if (this_org.GetTaxname() != org.GetTaxname()) {
10103  bool is_wp = false;
      // look for a WP_ accession among the ids of type e_Other
10104  FOR_EACH_SEQID_ON_BIOSEQ (sid_itr, seq) {
10105  const CSeq_id& sid = **sid_itr;
10106  CSeq_id::E_Choice typ = sid.Which();
10107  if (typ == CSeq_id::e_Other) {
10108  if (sid.GetOther().IsSetAccession()) {
10109  string acc = sid.GetOther().GetAccession().substr(0, 3);
10110  if (acc == "WP_") {
10111  is_wp = true;
10112  }
10113  }
10114  }
10115  }
10116  if (! is_wp) {
10118  "Inconsistent organism names [" + this_org.GetTaxname() +
10119  "] and [" + org.GetTaxname() + "]",
10120  *seq.GetParentEntry(), desc);
10121  }
10122  }
10123  }
10124 }
10125 
10126 
10127 static bool s_ReportableCollision(const CGene_ref& g1, const CGene_ref& g2)
10128 {
10129  if (g1.IsSetLocus() && ! NStr::IsBlank(g1.GetLocus())
10130  && g2.IsSetLocus() && ! NStr::IsBlank(g2.GetLocus())
10131  && NStr::EqualNocase(g1.GetLocus(), g2.GetLocus())) {
10132  return true;
10133  } else if (g1.IsSetLocus_tag() && ! NStr::IsBlank(g1.GetLocus_tag())
10134  && g2.IsSetLocus_tag() && ! NStr::IsBlank(g2.GetLocus_tag())
10135  && NStr::EqualNocase(g1.GetLocus_tag(), g2.GetLocus_tag())) {
10136  return true;
10137  } else if (g1.IsSetDesc() && ! NStr::IsBlank(g1.GetDesc())
10138  && g2.IsSetDesc() && ! NStr::IsBlank(g2.GetDesc())
10139  && NStr::EqualNocase(g1.GetDesc(), g2.GetDesc())) {
10140  return false;
10141  } else {
10142  return true;
10143  }
10144 }
10145 
10146 
// Reports gene features whose keys in str_feat_map collide.
//   str_feat_map - map from a string to the gene feature carrying it; each entry is
//                  compared with the entry visited immediately before it
//   type         - word used in the message; "names" (case-insensitive) selects the
//                  gene-locus handling, any other value the locus_tag-style handling
// A collision is an exact match or a match that differs only in capitalization, and it
// is considered only when s_ReportableCollision accepts the two gene references.
// NOTE(review): the signature line and part of the sequence::Compare condition with
// the first line of its reporting call are not visible in this listing.
10148  const TStrFeatMap& str_feat_map,
10149  const string& type)
10150 {
10151  bool is_gene_locus = NStr::EqualNocase(type, "names");
10152 
10153  // iterate through multimap and compare strings
10154  bool first = true;
10155  bool reported_first = false;
10156  bool lastIsSplit = false;
10157  const string* strp = nullptr;
10158  const CSeq_feat* feat = nullptr;
10159  ITERATE (TStrFeatMap, it, str_feat_map) {
      // the first entry only primes strp / feat / lastIsSplit
10160  if (first) {
10161  first = false;
10162  strp = &(it->first);
10163  feat = it->second;
10164  lastIsSplit = (bool) (feat->IsSetExcept() &&
10165  feat->IsSetExcept_text() &&
10166  NStr::FindNoCase(feat->GetExcept_text(), "gene split at ") != string::npos);
10167  continue;
10168  }
10169 
      // message stays empty when the two keys do not collide
10170  string message;
10171  if (NStr::Equal(*strp, it->first)) {
10172  message = "Colliding " + type + " in gene features";
10173  } else if (NStr::EqualNocase(*strp, it->first)) {
10174  message = "Colliding " + type + " (with different capitalization) in gene features";
10175  }
10176 
10177  if (! NStr::IsBlank(message)
10178  && s_ReportableCollision(feat->GetData().GetGene(), it->second->GetData().GetGene())) {
10179 
10180  bool suppress_message = false;
10181  if (m_Imp.IsSmallGenomeSet()) {
10182  if (feat->IsSetExcept() && feat->IsSetExcept_text()
10183  && NStr::FindNoCase(feat->GetExcept_text(), "trans-splicing") != string::npos &&
10184  it->second->IsSetExcept() && it->second->IsSetExcept_text()
10185  && NStr::FindNoCase(it->second->GetExcept_text(), "trans-splicing") != string::npos) {
10186  // suppress for trans-spliced genes on small genome set
10187  suppress_message = true;
10188  }
10189  }
10190 
10191  if (suppress_message) {
10192  // suppress for trans-spliced genes on small genome set
      // gene-locus mode: the report depends on comparing the two feature locations
      // (remainder of the condition is not visible in this listing)
10193  } else if (is_gene_locus && sequence::Compare(feat->GetLocation(),
10194  (*it->second).GetLocation(),
10195  m_Scope,
10198  message + ", but feature locations are identical", *it->second);
10199  } else if (! is_gene_locus) {
10200  const CSeq_feat* nfeat = it->second;
10201  bool isSplit = (bool) (nfeat->IsSetExcept() &&
10202  nfeat->IsSetExcept_text() &&
10203  NStr::FindNoCase(nfeat->GetExcept_text(), "gene split at ") != string::npos);
10204 
      // skipped only when both this feature and the previous one are marked
      // "gene split at "; suppress_message is already known to be false here
10205  if (! suppress_message && ((! isSplit) || (! lastIsSplit))) {
      // reported_first is only set here; it does not influence any report
10206  if (! reported_first) {
10207  // for now, don't report first colliding gene - C Toolkit doesn't
10208  // PostErr(sev, err, message, *feat);
10209  reported_first = true;
10210  }
10211 
10212 
10213  PostErr(eDiag_Error, eErr_SEQ_FEAT_CollidingLocusTags, message, *it->second);
10214  }
10215  }
10216  }
      // the current entry becomes the "previous" one for the next iteration
10217  strp = &(it->first);
10218  feat = it->second;
10219  lastIsSplit = (bool) (feat->IsSetExcept() &&
10220  feat->IsSetExcept_text() &&
10221  NStr::FindNoCase(feat->GetExcept_text(), "gene split at ") != string::npos);
10222  }
10223 }
10224 
10225 
// Collects the gene features of the Bioseq into four string-keyed maps (label,
// locus_tag, locus, synonym) and then
//   - runs x_CompareStrings on the labels ("names") and on the locus_tags,
//   - unless m_Imp.IsGpipe(), reports a gene synonym that equals the locus of a gene
//     which does not itself list that string as a synonym.
// Nothing is done when m_GeneIt is not set.  Exceptions are caught and reported unless
// their text contains "Error: Cannot resolve".
// NOTE(review): the signature line, the loop header over the genes, the label
// extraction call and the first line of each reporting call are not visible in this listing.
10227 {
10228  try {
10229  TStrFeatMap label_map;
10230  TStrFeatMap locus_tag_map;
10231  TStrFeatMap locus_map;
10232  TStrFeatMap syn_map;
10233 
10234  // Loop through genes and insert into multimap sorted by
10235  // gene label / locus_tag -- case insensitive
10236  if (m_GeneIt) {
10238  const CSeq_feat& feat = fi->GetOriginalFeature();
10239  // record label
10240  string label;
10242  label_map.insert(TStrFeatMap::value_type(label, &feat));
10243  // record locus_tag
10244  const CGene_ref& gene = feat.GetData().GetGene();
10245  if (gene.IsSetLocus_tag() && ! NStr::IsBlank(gene.GetLocus_tag())) {
10246  locus_tag_map.insert(TStrFeatMap::value_type(gene.GetLocus_tag(), &feat));
10247  }
10248  // record locus
10249  if (gene.IsSetLocus() && ! NStr::IsBlank(gene.GetLocus())) {
10250  locus_map.insert(TStrFeatMap::value_type(gene.GetLocus(), &feat));
10251  }
10252  // record synonyms
10253  FOR_EACH_SYNONYM_ON_GENEREF (syn_it, gene) {
10254  syn_map.insert(TStrFeatMap::value_type((*syn_it), &feat));
10255  }
10256  }
10257  x_CompareStrings(label_map, "names");
10258  x_CompareStrings(locus_tag_map, "locus_tags");
10259  // look for synonyms on genes that match locus of different genes,
10260  // but not if gpipe
10261  if (! m_Imp.IsGpipe()) {
10262  ITERATE (TStrFeatMap, syngene_it, syn_map) {
      // find() yields one gene whose locus equals this synonym, if any
10263  TStrFeatMap::iterator gene_it = locus_map.find(syngene_it->first);
10264  if (gene_it != locus_map.end()) {
      // the match is tolerated when that gene lists the same string as its own synonym
10265  bool found = false;
10266  FOR_EACH_SYNONYM_ON_GENEREF (syn_it, gene_it->second->GetData().GetGene()) {
10267  if (NStr::Equal(*syn_it, syngene_it->first)) {
10268  found = true;
10269  break;
10270  }
10271  }
10272  if (! found) {
10274  "gene synonym has same value (" + syngene_it->first + ") as locus of another gene feature",
10275  *syngene_it->second);
10276  }
10277  }
10278  }
10279  }
10280  }
10281 
10282  } catch (const exception& e) {
      // unresolvable-reference errors are deliberately not reported
10283  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
10285  string("Exception while validating colliding genes. EXCEPTION: ") +
10286  e.what(), seq);
10287  }
10288  }
10289 }
10290 
// Reports a nucleotide sequence that looks like a complete genome but has no
// BioProject link.  The report is produced only when all of the following hold:
//   - the sequence is a nucleotide with an EMBL or DDBJ Seq-id,
//   - its title contains "complete genome", or its EMBL block has that keyword,
//   - the source division is "BCT",
//   - the first "DBLink" user object found has no data (or none is found),
//   - the sequence has no gaps (delta literals without data or with gap data,
//     plus a feature-based test that is not fully visible here).
// NOTE(review): the signature line, the declarations of ti, ei, si and feat, and the
// first line of the reporting call are not visible in this listing.
10292 {
10293  // This check is applicable to nucleotide sequences only
10294  if (! seq.IsNa()) {
10295  return;
10296  }
10297 
10298  // EMBL or DDBJ check
10299  bool embl_ddbj = false;
10300  ITERATE(CBioseq::TId, id, seq.GetId()) {
10301  if ((*id)->IsDdbj() || (*id)->IsEmbl()) {
10302  embl_ddbj = true;
10303  break;
10304  }
10305  }
10306 
10307  if (! embl_ddbj) {
10308  return;
10309  }
10310 
10311  // Completeness check
10312  bool complete_genome = false;
10313  string title;
      // use the title held by ti when there is one, otherwise generate a defline
10315  if (ti) {
10316  title = ti->GetTitle();
10317  } else {
10318  sequence::CDeflineGenerator defline_generator;
10319  title = defline_generator.GenerateDefline(seq, *m_Scope, sequence::CDeflineGenerator::fIgnoreExisting);
10320  }
      // the title search is case-sensitive
10321  complete_genome = (! NStr::IsBlank(title) && NStr::Find(title, "complete genome") != string::npos);
10322 
10323  if (! complete_genome) {
10324 
      // second chance: an EMBL keyword equal to "complete genome" (case-insensitive)
10326  if (ei && ei->GetEmbl().IsSetKeywords()) {
10327 
10328  ITERATE(CEMBL_block::TKeywords, keyword, ei->GetEmbl().GetKeywords()) {
10329  if (NStr::EqualNocase(*keyword, "complete genome")) {
10330  complete_genome = true;
10331  break;
10332  }
10333  }
10334  }
10335  }
10336 
10337  if (! complete_genome) {
10338  return;
10339  }
10340 
10341  // Division check
10343  if (! si || ! si->GetSource().IsSetDivision() || si->GetSource().GetDivision() != "BCT") {
10344  return;
10345  }
10346 
10347  // Check for BioProject accession
10348  bool bioproject_accession_set = false;
10349  for (CSeqdesc_CI ui(m_CurrentHandle, CSeqdesc::e_User); ui; ++ui) {
10350 
      // only the first user object of type "DBLink" (case-sensitive) that has data set
      // is examined
10351  if (ui->GetUser().IsSetData() && ui->GetUser().IsSetType() && ui->GetUser().GetType().IsStr() && NStr::EqualCase(ui->GetUser().GetType().GetStr(), "DBLink")) {
10352  bioproject_accession_set = ! ui->GetUser().GetData().empty();
10353  break;
10354  }
10355  }
10356 
10357  if (bioproject_accession_set)
10358  return;
10359 
10360  // Delta-seq check
10361  bool no_gaps = true;
10362  if (seq.IsSetInst() && seq.GetInst().IsSetRepr() && seq.GetInst().GetRepr() == CSeq_inst::eRepr_delta && seq.GetInst().IsSetExt() && seq.GetInst().GetExt().IsDelta()) {
10363 
10364  const CDelta_ext& delta = seq.GetInst().GetExt().GetDelta();
10365  if (delta.IsSet()) {
10366 
10367  ITERATE(CDelta_ext::Tdata, part, delta.Get()) {
10368 
10369  if ((*part)->IsLiteral()) {
10370  const CSeq_literal& literal = (*part)->GetLiteral();
      // a literal with a length but no data counts as a gap
10371  if (literal.IsSetLength() && ! literal.IsSetSeq_data()) {
10372  no_gaps = false;
10373  break;
10374  }
10375 
      // so does a literal whose data is an explicit gap
10376  if (literal.IsSetSeq_data() && literal.GetSeq_data().IsGap()) {
10377  no_gaps = false;
10378  break;
10379  }
10380  }
10381  }
10382  }
10383  }
10384 
10385  // Check for instantiated gaps
10386  if (no_gaps) {
10388  if (feat) {
10389  no_gaps = false;
10390  }
10391  }
10392 
10393 
10394  if (no_gaps) {
10396  "No BioProject Accession exists for what appears to be a complete genome",
10397  seq);
10398  }
10399 }
10401 
// Compares the GenBank, GI and General Seq-ids on the Bioseq with the ids obtained
// for the same GI through GetSeqIdsForGI, and reports mismatches, gains and losses.
// When the Bioseq has no GI, one is looked up from its GenBank id; without a positive
// GI, or when no ids come back for it, nothing is reported.
// NOTE(review): the signature line and the first line of each reporting call are not
// visible in this listing.
10403 {
10404  const CSeq_id* gb_id = nullptr;
10405  CRef<CSeq_id> db_gb_id;
10406  TGi gi = ZERO_GI,
10407  db_gi = ZERO_GI;
10408  CRef<CSeq_id> db_general_id;
10409  const CDbtag* general_id = nullptr;
10410 
      // collect the ids present on the Bioseq; a later id of the same kind replaces an earlier one
10411  FOR_EACH_SEQID_ON_BIOSEQ (id, seq) {
10412  switch ((*id)->Which()) {
10413  case CSeq_id::e_Genbank:
10414  gb_id = id->GetPointer();
10415  break;
10416 
10417  case CSeq_id::e_Gi:
10418  gi = (*id)->GetGi();
10419  break;
10420 
10421  case CSeq_id::e_General:
10422  general_id = &((*id)->GetGeneral());
10423  break;
10424 
10425  default:
10426  break;
10427  }
10428  }
10429 
10430  if (gi == ZERO_GI && gb_id) {
10431  gi = GetGIForSeqId(*gb_id);
10432  }
10433 
10434  if (gi <= ZERO_GI) {
10435  return;
10436  }
10437 
10438  CScope::TIds id_set = GetSeqIdsForGI(gi);
10439  if (! id_set.empty()) {
      // collect the corresponding ids returned for this GI (copies are kept in CRefs)
10440  ITERATE( CScope::TIds, id, id_set ) {
10441  switch ((*id).Which()) {
10442  case CSeq_id::e_Genbank:
10443  db_gb_id = CRef<CSeq_id> (new CSeq_id);
10444  db_gb_id->Assign(*(id->GetSeqId()));
10445  break;
10446  case CSeq_id::e_Gi:
10447  db_gi = (*id).GetGi();
10448  break;
10449  case CSeq_id::e_General:
10450  db_general_id.Reset(new CSeq_id());
10451  db_general_id->Assign(*((*id).GetSeqId()));
10452  break;
10453  default:
10454  break;
10455  }
10456  }
10457 
10458  string gi_str = NStr::NumericToString(gi);
10459 
10460  if (db_gi != gi) {
10462  "New gi number (" + gi_str + ")" +
10463  " does not match one in NCBI sequence repository (" + NStr::NumericToString(db_gi) + ")",
10464  seq);
10465  }
      // GenBank accession: mismatch, gain (only on the Bioseq) or loss (only in the repository)
10466  if (gb_id && db_gb_id) {
10467  if (! gb_id->Match(*db_gb_id)) {
10469  "New accession (" + gb_id->AsFastaString() +
10470  ") does not match one in NCBI sequence repository (" + db_gb_id->AsFastaString() +
10471  ") on gi (" + gi_str + ")", seq);
10472  }
10473  } else if (gb_id) {
10475  "Gain of accession (" + gb_id->AsFastaString() + ") on gi (" +
10476  gi_str + ") compared to the NCBI sequence repository", seq);
10477  } else if (db_gb_id) {
10479  "Loss of accession (" + db_gb_id->AsFastaString() +
10480  ") on gi (" + gi_str + ") compared to the NCBI sequence repository", seq);
10481  }
10482 
      // General id: same three-way comparison
10483  string new_gen_label, old_gen_label;
10484  if (general_id && db_general_id) {
10485  if (! general_id->Match(db_general_id->GetGeneral())) {
10486  db_general_id->GetGeneral().GetLabel(&old_gen_label);
10487  general_id->GetLabel(&new_gen_label);
10489  "New general ID (" + new_gen_label +
10490  ") does not match one in NCBI sequence repository (" + old_gen_label +
10491  ") on gi (" + gi_str + ")", seq);
10492  }
10493  } else if (general_id) {
10494  general_id->GetLabel(&new_gen_label);
10496  "Gain of general ID (" + new_gen_label + ") on gi (" +
10497  gi_str + ") compared to the NCBI sequence repository", seq);
10498  } else if (db_general_id) {
10499  db_general_id->GetGeneral().GetLabel(&old_gen_label);
10501  "Loss of general ID (" + old_gen_label + ") on gi (" +
10502  gi_str + ") compared to the NCBI sequence repository", seq);
10503  }
10504  }
10505 }
10506 
10507 
// Returns a CGraph_CI constructed from m_CurrentHandle.
// NOTE(review): the signature line is not visible in this listing.
10509 {
10510  return CGraph_CI(m_CurrentHandle);
10511 }
10512 
10513 
// Returns true when the GenBank block held in gb_desc has a keyword equal to
// "HTGS_ACTIVEFIN" (case-insensitive); false when there is no such descriptor,
// no keywords, or no matching keyword.
// NOTE(review): the signature line, the declaration of gb_desc and the keyword loop
// header are not visible in this listing.
10515 {
10517  if (gb_desc) {
10518  const CGB_block& gb = gb_desc->GetGenbank();
10519  if (gb.IsSetKeywords()) {
10521  if (NStr::CompareNocase(*iter, "HTGS_ACTIVEFIN") == 0) {
10522  return true;
10523  }
10524  }
10525  }
10526  }
10527  return false;
10528 }
10529 
10530 
// Returns the length of the longest run of consecutive 'N' residues in the data of
// the literal lit, or 0 when the literal has no seq-data.
// Iupacna / Ncbi2na / Ncbi4na / Ncbi8na data is converted to Iupacna, any other
// encoding to Iupacaa; only the upper-case character 'N' is counted, and at most
// lit.GetLength() residues are examined.
// NOTE(review): the signature line is not visible in this listing.
10532 {
10533  if (! lit.CanGetSeq_data()) {
10534  return 0;
10535  }
10536 
10537  const CSeq_data& lit_data = lit.GetSeq_data();
10538  CSeq_data data;
      // count = length of the current run, max = longest run so far, pos = residues examined
10539  size_t count = 0;
10540  size_t max = 0;
10541  size_t pos = 0;
10542 
10543  if (lit_data.IsIupacna()
10544  || lit_data.IsNcbi2na()
10545  || lit_data.IsNcbi4na()
10546  || lit_data.IsNcbi8na()) {
10547  CSeqportUtil::Convert(lit_data, &data, CSeq_data::e_Iupacna);
10548  ITERATE(string, res, data.GetIupacna().Get() ) {
10549  if (*res == 'N') {
10550  ++count;
10551  if (count > max) {
10552  max = count;
10553  }
10554  } else {
10555  count = 0;
10556  }
10557  // CSeqportUtil::Convert will convert more data
10558  //than the specified length, if the data is present
10559  ++pos;
10560  if (pos >= lit.GetLength()) {
10561  break;
10562  }
10563  }
10564  } else {
      // same scan over the amino-acid alphabet
10565  CSeqportUtil::Convert(lit_data, &data, CSeq_data::e_Iupacaa);
10566  ITERATE(string, res, data.GetIupacaa().Get() ) {
10567  if (*res == 'N') {
10568  ++count;
10569  if (count > max) {
10570  max = count;
10571  }
10572  } else {
10573  count = 0;
10574  }
10575  ++pos;
10576  // CSeqportUtil::Convert will convert more data
10577  //than the specified length, if the data is present
10578  if (pos >= lit.GetLength()) {
10579  break;
10580  }
10581  }
10582  }
10583 
10584  return max;
10585 }
10586 
10587 
// Returns false when inst has a delta extension containing at least one Seq-loc
// segment; returns true otherwise, including when inst has no delta extension at all.
// NOTE(review): the signature line is not visible in this listing.
10589 {
10590  if (inst.CanGetExt() && inst.GetExt().IsDelta()) {
10591  ITERATE(CDelta_ext::Tdata, iter, inst.GetExt().GetDelta().Get()) {
10592  if ((*iter)->IsLoc()) {
10593  return false;
10594  }
10595  }
10596  }
10597  return true;
10598 }
10599 
10600 
// Returns true when any User descriptor reached from bsh has a string type equal to
// "TpaAssembly" (case-insensitive); false otherwise.
// NOTE(review): the signature line is not visible in this listing.
10602 {
10603  for (CSeqdesc_CI it(bsh, CSeqdesc::e_User); it; ++it) {
10604  const CUser_object& uo = it->GetUser();
10605 
10606  if (uo.CanGetType() && uo.GetType().IsStr() &&
10607  NStr::CompareNocase(uo.GetType().GetStr(), "TpaAssembly") == 0) {
10608  return true;
10609  }
10610  }
10611 
10612  return false;
10613 }
10614 
10615 
// NOTE(review): this function is incomplete in this copy of the file. Its
// signature (source line 10616), the condition guarding the early return
// (10618) and the statements inside both branches below (10625, 10627) are
// missing. It is presumably
//     void CValidError_bioseq::CheckTpaHistory(const CBioseq& seq)
// -- restore the missing lines from the original source before editing.
//
// What remains visible: an early return under a condition that is not shown,
// followed by a two-way branch on whether seq has an instance whose history
// carries a non-empty assembly list.
10617 {
10619  return;
10620  }
10621 
// True branch: the Bioseq has an instance with a history and a non-empty
// assembly. The action taken in each branch is not visible here.
10622  if (seq.CanGetInst() &&
10623  seq.GetInst().CanGetHist() &&
10624  ! seq.GetInst().GetHist().GetAssembly().empty()) {
10626  } else {
10628  }
10629 }
10630 
10631 
// Decides whether a 5'UTR / 3'UTR pair belongs to the same gene.
//
// NOTE(review): the signature line of this function is missing from this copy
// of the file. It is presumably
//     bool CValidError_bioseq::x_ReportUTRPair(const CSeq_feat& utr5,
//                                              const CSeq_feat& utr3)
// -- confirm against the original source.
//
// Returns true only when m_Imp.GetCachedGene() yields a gene for both UTRs and
// the two results are the same object (pointer comparison). Returns false if
// either lookup is empty or the genes differ.
10633 {
10634  CConstRef<CSeq_feat> gene3 = m_Imp.GetCachedGene(&utr3);
10635  if (gene3) {
// The 5'UTR gene is only looked up once the 3'UTR is known to have one.
10636  CConstRef<CSeq_feat> gene5 = m_Imp.GetCachedGene(&utr5);
10637  if (gene5 && gene3.GetPointer() == gene5.GetPointer()) {
10638  return true;
10639  }
10640  }
10641  return false;
10642 }
10643 
10644 
// Looks for a 5'UTR / 3'UTR pair with no CDS seen between them, tracked
// separately for the minus strand and for every other strand value.
//
// NOTE(review): this function is incomplete in this copy of the file. Its
// signature (source line 10645), the header of the loop that introduces the
// iteration variable f (10657) and the first line of each of the two reporting
// calls (10671, 10682) are missing, so the function name, the iterated
// container, and the severity/error code of the reports are not visible here
// -- restore them from the original source before editing.
10646 {
// Nothing to do when the cached feature list is not available.
10647  if (! m_AllFeatIt) {
10648  return;
10649  }
// Most recently seen CDS / 3'UTR / 5'UTR on each strand since the last reset.
10650  CConstRef<CSeq_feat> cds_minus;
10651  CConstRef<CSeq_feat> utr3_minus;
10652  CConstRef<CSeq_feat> utr5_minus;
10653  CConstRef<CSeq_feat> cds_plus;
10654  CConstRef<CSeq_feat> utr3_plus;
10655  CConstRef<CSeq_feat> utr5_plus;
10656 
// Any strand value other than eNa_strand_minus is handled by the "plus"
// branches below.
10658  ENa_strand strand = f->GetLocation().GetStrand();
10659  if (f->GetData().IsCdregion()) {
10660  if (strand == eNa_strand_minus) {
10661  cds_minus = f->GetSeq_feat();
10662  } else {
10663  cds_plus = f->GetSeq_feat();
10664  }
10665  } else if (f->GetData().GetSubtype() == CSeqFeatData::eSubtype_3UTR) {
10666  if (strand == eNa_strand_minus) {
10667  utr3_minus = f->GetSeq_feat();
10668  } else {
// On the plus side a 3'UTR closes the group: report if a 5'UTR was recorded,
// no CDS was recorded, and x_ReportUTRPair() accepts the pair.
10669  utr3_plus = f->GetSeq_feat();
10670  if (! cds_plus && utr5_plus && x_ReportUTRPair(*utr5_plus, *utr3_plus)) {
10672  "CDS not between 5'UTR and 3'UTR on plus strand", *utr3_plus);
10673  }
// Start a fresh group regardless of whether anything was reported.
10674  utr5_plus.Reset();
10675  cds_plus.Reset();
10676  utr3_plus.Reset();
10677  }
10678  } else if (f->GetData().GetSubtype() == CSeqFeatData::eSubtype_5UTR) {
10679  if (strand == eNa_strand_minus) {
// On the minus strand the roles are mirrored: a 5'UTR closes the group.
10680  utr5_minus = f->GetSeq_feat();
10681  if (! cds_minus && utr3_minus && x_ReportUTRPair(*utr5_minus, *utr3_minus)) {
10683  "CDS not between 5'UTR and 3'UTR on minus strand", *utr5_minus);
10684  }
10685  utr5_minus.Reset();
10686  cds_minus.Reset();
10687  utr3_minus.Reset();
10688  } else {
10689  utr5_plus = f->GetSeq_feat();
10690  }
10691  }
10692  }
10693 }
10694 
10695 
// Compiled out by the enclosing "#if 0": empty constructor and destructor
// definitions for CValidError_bioseq::CmRNACDSIndex. Nothing between "#if 0"
// and "#endif" is part of the build.
10696 #if 0
10697 CValidError_bioseq::CmRNACDSIndex::CmRNACDSIndex()
10698 {
10699 }
10700 
10701 
10702 CValidError_bioseq::CmRNACDSIndex::~CmRNACDSIndex()
10703 {
10704 }
10705 #endif
10706 
10707 bool s_IdXrefsAreReciprocal(const CMappedFeat& cds, const CMappedFeat& mrna)
10708 {
10709  if (! cds.IsSetId() || ! cds.GetId().IsLocal()
10710  || ! mrna.IsSetId() || ! mrna.GetId().IsLocal()) {
10711  return false;
10712  }
10713 
10714  bool match = false;
10715 
10716  FOR_EACH_SEQFEATXREF_ON_SEQFEAT (itx, cds) {
10717  if ((*itx)->IsSetId() && s_FeatureIdsMatch((*itx)->GetId(), mrna.GetId())) {
10718  match = true;
10719  break;
10720  }
10721  }
10722  if (! match) {
10723  return false;
10724  }
10725  match = false;
10726 
10727  FOR_EACH_SEQFEATXREF_ON_SEQFEAT (itx, mrna) {
10728  if ((*itx)->IsSetId() && s_FeatureIdsMatch((*itx)->GetId(), cds.GetId())) {
10729  match = true;
10730  break;
10731  }
10732  }
10733 
10734  return match;
10735 }
10736 
10737 
10738 unsigned int s_IdXrefsNotReciprocal(const CSeq_feat& cds, const CSeq_feat& mrna)
10739 {
10740  if (! cds.IsSetId() || ! cds.GetId().IsLocal()
10741  || ! mrna.IsSetId() || ! mrna.GetId().IsLocal()) {
10742  return 0;
10743  }
10744 
10745  FOR_EACH_SEQFEATXREF_ON_SEQFEAT (itx, cds) {
10746  if ((*itx)->IsSetId() && ! s_FeatureIdsMatch((*itx)->GetId(), mrna.GetId())) {
10747  return 1;
10748  }
10749  }
10750 
10751  FOR_EACH_SEQFEATXREF_ON_SEQFEAT (itx, mrna) {
10752  if ((*itx)->IsSetId() && ! s_FeatureIdsMatch((*itx)->GetId(), cds.GetId())) {
10753  return 1;
10754  }
10755  }
10756 
10757  if (! cds.IsSetProduct() || ! mrna.IsSetExt()) {
10758  return 0;
10759  }
10760 
10761  TGi gi = ZERO_GI;
10762  if (cds.GetProduct().GetId()->IsGi()) {
10763  gi = cds.GetProduct().GetId()->GetGi();
10764  } else {
10765  // TODO: get gi for other kinds of SeqIds
10766  }
10767 
10768  if (gi == ZERO_GI) {
10769  return 0;
10770  }
10771 
10772  if (mrna.IsSetExt() && mrna.GetExt().IsSetType() && mrna.GetExt().GetType().IsStr()
10773  && NStr::EqualNocase(mrna.GetExt().GetType().GetStr(), "MrnaProteinLink")
10774  && mrna.GetExt().IsSetData()
10775  && mrna.GetExt().GetData().front()->IsSetLabel()
10776  && mrna.GetExt().GetData().front()->GetLabel().IsStr()
10777  && NStr::EqualNocase(mrna.GetExt().GetData().front()->GetLabel().GetStr(), "protein seqID")
10778  && mrna.GetExt().GetData().front()->IsSetData()
10779  && mrna.GetExt().GetData().front()->GetData().IsStr()) {
10780  string str = mrna.GetExt().GetData().front()->GetData().GetStr();
10781  try {
10782  CSeq_id id(str);
10783  if (id.IsGi()) {
10784  if (id.GetGi() == gi) {
10785  return 0;
10786  } else {
10787  return 2;
10788  }
10789  }
10790  } catch (const CException&) {
10791  } catch (const std::exception&) {
10792  }
10793  }
10794  return 0;
10795 }
10796 
10797 
10798 END_SCOPE(validator)
static CRef< CScope > m_Scope
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
EErrType
@ eErr_SEQ_INST_HTGS_STS_GSS_WGSshouldNotBeRNA
@ eErr_SEQ_INST_BadDeltaSeq
@ eErr_SEQ_DESCR_InconsistentBioSources_ConLocation
@ eErr_SEQ_FEAT_mRNAgeneRange
@ eErr_SEQ_INST_ShortSeq
@ eErr_SEQ_DESCR_FinishedStatusForWGS
@ eErr_SEQ_DESCR_InconsistentTaxName
@ eErr_GENERIC_MissingPubRequirement
@ eErr_SEQ_FEAT_TRNAinsideTMRNA
@ eErr_SEQ_INST_CompleteGenomeHasGaps
@ eErr_SEQ_INST_BadSeqIdCharacter
@ eErr_SEQ_INST_CompleteTitleProblem
@ eErr_SEQ_INST_HistoryGiCollision
@ eErr_SEQ_DESCR_UnwantedCompleteFlag
@ eErr_SEQ_INST_mRNAshouldBeSingleStranded
@ eErr_SEQ_FEAT_MultipleGenCodes
@ eErr_SEQ_DESCR_DBLinkBadAssembly
@ eErr_SEQ_DESCR_WGSmasterLacksBioProject
@ eErr_SEQ_INST_HighNContentStretch
@ eErr_SEQ_INST_HighNcontent3Prime
@ eErr_SEQ_INST_TerminalGap
@ eErr_SEQ_INST_FuzzyLen
@ eErr_SEQ_INST_MultipleAccessions
@ eErr_SEQ_DESCR_MultipleDBLinkObjects
@ eErr_SEQ_INST_BadProteinStart
@ eErr_SEQ_FEAT_PartialProblem3Prime
@ eErr_SEQ_FEAT_ProductShouldBeWhole
@ eErr_SEQ_INST_ProteinShouldNotHaveGaps
@ eErr_SEQ_INST_ESTshouldBemRNA
@ eErr_SEQ_DESCR_BadKeywordUnverified
@ eErr_SEQ_FEAT_ITSdoesNotAbutRRNA
@ eErr_SEQ_DESCR_InvalidMolInfo
@ eErr_SEQ_DESCR_InconsistentMolInfoTechnique
@ eErr_SEQ_DESCR_NoOrganismInTitle
@ eErr_SEQ_DESCR_InconsistentMolInfo
@ eErr_SEQ_INST_TSAMasterLacksStrucComm
@ eErr_SEQ_INST_WholeComponent
@ eErr_SEQ_FEAT_BadRRNAcomponentOrder
@ eErr_SEQ_INST_ReprInvalid
@ eErr_SEQ_INST_TSAseqGapProblem
@ eErr_SEQ_INST_HTGS_STS_GSS_WGSshouldBeGenomic
@ eErr_SEQ_INST_SeqLitDataLength0
@ eErr_SEQ_INST_CircBactGenomeProblem
@ eErr_SEQ_INST_WGSMasterLacksStrucComm
@ eErr_SEQ_INST_ContigsTooShort
@ eErr_SEQ_DESCR_NoMolInfoFound
@ eErr_SEQ_PKG_OrphanedProtein
@ eErr_SEQ_INST_SeqGapBadLinkage
@ eErr_SEQ_INST_SelfReferentialSequence
@ eErr_SEQ_DESCR_TransgenicProblem
@ eErr_SEQ_INST_DeltaComponentIsGi0
@ eErr_SEQ_FEAT_CDSmRNANotMatched
@ eErr_SEQ_FEAT_FeatContentDup
@ eErr_SEQ_INST_MolNotSet
@ eErr_SEQ_DESCR_WGSMasterLacksBothBioSampleBioProject
@ eErr_SEQ_INST_GiWithoutAccession
@ eErr_SEQ_INST_MissingGaps
@ eErr_SEQ_DESCR_InvalidForType
@ eErr_SEQ_FEAT_BadRRNAcomponentOverlapRRNA
@ eErr_SEQ_DESCR_FastaBracketTitle
@ eErr_SEQ_FEAT_MisMatchAA
@ eErr_SEQ_INST_StopInProtein
@ eErr_SEQ_INST_UnknownLengthGapNot100
@ eErr_SEQ_FEAT_MultipleProtRefs
@ eErr_SEQ_FEAT_MultipleEquivPublications
@ eErr_SEQ_DESCR_DBLinkProblem
@ eErr_SEQ_INST_InvalidLen
@ eErr_SEQ_DESCR_TPAassemblyWithoutTPAKeyword
@ eErr_SEQ_DESCR_InvalidForTypeGIBB
@ eErr_SEQ_FEAT_InvalidFeatureForProtein
@ eErr_SEQ_INST_HighNContentPercent
@ eErr_SEQ_DESCR_RefGeneTrackingOnNonRefSeq
@ eErr_SEQ_FEAT_IdenticalGeneSymbolAndSynonym
@ eErr_SEQ_FEAT_MultipleEquivBioSources
@ eErr_SEQ_INST_HighNcontent5Prime
@ eErr_SEQ_INST_TSAshouldBNotBeDNA
@ eErr_SEQ_DESCR_MissingChromosome
@ eErr_SEQ_INST_BadProteinMoltype
@ eErr_SEQ_INST_AllNs
@ eErr_SEQ_DESCR_NucleotideTechniqueOnProtein
@ eErr_SEQ_INST_CompleteCircleProblem
@ eErr_SEQ_FEAT_CDSwithMultipleMRNAs
@ eErr_SEQ_FEAT_CDSmRNAMismatchProteinIDs
@ eErr_SEQ_FEAT_CDSmRNAMismatchTranscriptIDs
@ eErr_SEQ_FEAT_PartialProblemOrganelle3Prime
@ eErr_SEQ_INST_OverlappingDeltaRange
@ eErr_SEQ_FEAT_OverlappingPeptideFeat
@ eErr_SEQ_DESCR_BadKeywordNoTechnique
@ eErr_SEQ_FEAT_ExtraProteinFeature
@ eErr_SEQ_INST_SeqLocLength
@ eErr_SEQ_INST_FarLocationExcludesFeatures
@ eErr_SEQ_DESCR_InconsistentVirusMoltype
@ eErr_SEQ_INST_IdOnMultipleBioseqs
@ eErr_SEQ_DESCR_MoltypeOtherGenetic
@ eErr_SEQ_INST_HighNpercent3Prime
@ eErr_SEQ_INST_BadSecondaryAccn
@ eErr_SEQ_INST_InvalidAlphabet
@ eErr_SEQ_FEAT_CDSonMinusStrandMRNA
@ eErr_SEQ_INST_MolNuclAcid
@ eErr_SEQ_DESCR_MoltypeOther
@ eErr_SEQ_DESCR_Inconsistent
@ eErr_SEQ_INST_ExtNotAllowed
@ eErr_SEQ_DESCR_InconsistentRefSeqMoltype
@ eErr_SEQ_FEAT_PartialProblem5Prime
@ eErr_SEQ_FEAT_CDSmRNAMismatchLocation
@ eErr_SEQ_INST_TrailingX
@ eErr_SEQ_DESCR_InconsistentDates
@ eErr_SEQ_INST_CircularProtein
@ eErr_SEQ_INST_NoIdOnBioseq
@ eErr_SEQ_INST_LeadingX
@ eErr_SEQ_INST_PartsOutOfOrder
@ eErr_SEQ_FEAT_BadFullLengthFeature
@ eErr_SEQ_DESCR_InconsistentGenBankblocks
@ eErr_SEQ_FEAT_FarLocation
@ eErr_SEQ_INST_MolinfoOther
@ eErr_SEQ_INST_BadSeqIdLength
@ eErr_SEQ_INST_SeqDataNotAllowed
@ eErr_SEQ_INST_BadHTGSeq
@ eErr_SEQ_FEAT_PartialProblemOrganelle5Prime
@ eErr_SEQ_DESCR_NoKeywordHasTechnique
@ eErr_SEQ_INST_UnexpectedIdentifierChange
@ eErr_SEQ_INST_WGSseqGapProblem
@ eErr_SEQ_DESCR_MultipleStrucComms
@ eErr_SEQ_FEAT_InconsistentRRNAstrands
@ eErr_SEQ_FEAT_PartialProblemNotSpliceConsensus5Prime
@ eErr_SEQ_FEAT_BadRRNAcomponentOverlapAndOrder
@ eErr_SEQ_DESCR_DBLinkBadFormat
@ eErr_SEQ_FEAT_InvalidForType
@ eErr_SEQ_FEAT_GeneLocusCollidesWithLocusTag
@ eErr_SEQ_FEAT_CDSgeneRange
@ eErr_SEQ_INST_MitoMetazoanTooLong
@ eErr_SEQ_DESCR_CompleteGenomeLacksBioProject
@ eErr_SEQ_DESCR_CollidingPubMedID
@ eErr_SEQ_FEAT_DuplicateFeat
@ eErr_SEQ_INST_ExtBadOrMissing
@ eErr_SEQ_FEAT_FeatureProductInconsistency
@ eErr_SEQ_DESCR_SyntheticConstructWrongMolType
@ eErr_SEQ_FEAT_DuplicateGeneConflictingLocusTag
@ eErr_SEQ_DESCR_MolInfoConflictsWithBioSource
@ eErr_SEQ_INST_InstantiatedGapMismatch
@ eErr_SEQ_FEAT_UTRdoesNotAbutCDS
@ eErr_SEQ_INST_PartialInconsistent
@ eErr_SEQ_FEAT_CollidingLocusTags
@ eErr_SEQ_DESCR_MultipleNames
@ eErr_SEQ_FEAT_PartialProblemNotSpliceConsensus3Prime
@ eErr_SEQ_INST_BadSeqIdFormat
@ eErr_SEQ_FEAT_NoCDSbetweenUTRs
@ eErr_SEQ_INST_ZeroGiNumber
@ eErr_INTERNAL_Exception
@ eErr_SEQ_INST_ConflictingIdsOnBioseq
@ eErr_SEQ_DESCR_WrongOrganismFor16SrRNA
@ eErr_SEQ_INST_HistAssemblyMissing
@ eErr_SEQ_PKG_NoCdRegionPtr
@ eErr_SEQ_INST_InternalNsInSeqRaw
@ eErr_SEQ_INST_TerminalNs
@ eErr_SEQ_FEAT_SeqFeatXrefProblem
@ eErr_SEQ_DESCR_BadKeywordForStrucComm
@ eErr_SEQ_FEAT_CDSdoesNotMatchVDJC
@ eErr_SEQ_DESCR_InconsistentMolType
@ eErr_SEQ_FEAT_CDSmRNAMissingProteinIDs
@ eErr_SEQ_DESCR_WGSmasterLacksBioSample
@ eErr_SEQ_FEAT_MultiIntervalIntron
@ eErr_SEQ_DESCR_InconsistentTPA
@ eErr_SEQ_FEAT_LocusTagProblem
@ eErr_SEQ_INST_HighNpercent5Prime
@ eErr_SEQ_DESCR_ScaffoldLacksBioProject
@ eErr_SEQ_INST_InternalNsAdjacentToGap
@ eErr_SEQ_FEAT_PartialProblem
@ eErr_SEQ_DESCR_MultipleComments
@ eErr_SEQ_INST_SeqDataNotFound
@ eErr_SEQ_INST_InternalGapsInSeqRaw
@ eErr_SEQ_FEAT_MultipleGeneOverlap
@ eErr_SEQ_INST_DuplicateSegmentReferences
@ eErr_SEQ_DESCR_InconsistentWGSFlags
@ eErr_SEQ_FEAT_CDSmRNAmismatchCount
@ eErr_SEQ_FEAT_UTRdoesNotExtendToEnd
@ eErr_SEQ_INST_SeqLitGapLength0
@ eErr_SEQ_INST_SeqIdNameHasSpace
@ eErr_SEQ_DESCR_ProteinTechniqueOnNucleotide
@ eErr_SEQ_DESCR_CollidingPublications
@ eErr_SEQ_FEAT_PartialProblemmRNASequence3Prime
@ eErr_SEQ_INST_InternalNsInSeqLit
@ eErr_SEQ_INST_SeqDataLenWrong
@ eErr_SEQ_INST_GapInProtein
@ eErr_SEQ_INST_SeqGapProblem
@ eErr_SEQ_INST_InvalidResidue
@ eErr_SEQ_FEAT_PartialProblemmRNASequence5Prime
@ eErr_SEQ_FEAT_InvalidFeatureForMRNA
@ eErr_SEQ_FEAT_CDSwithNoMRNA
@ eErr_GENERIC_DeltaSeqError
ncbi::TMaskedQueryRegions mask
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
#define bool
Definition: bool.h:34
AutoPtr –.
Definition: ncbimisc.hpp:401
const string & GetLineage(void) const
Definition: BioSource.cpp:360
const string & GetTaxname(void) const
Definition: BioSource.cpp:340
bool IsSetLineage(void) const
Definition: BioSource.cpp:355
bool IsSetTaxname(void) const
Definition: BioSource.cpp:335
size_t IterateFeatures(Fnc m)
Definition: indexer.hpp:1082
CBioseq_Handle –.
CBioseq_set_Handle –.
CSeq_entry * GetParentEntry(void) const
Definition: Bioseq.hpp:174
@ eContent
Definition: Bioseq.hpp:103
@ eBoth
Definition: Bioseq.hpp:104
CConstRef< CSeqdesc > GetClosestDescriptor(CSeqdesc::E_Choice choice, int *level=NULL) const
Definition: Seq_entry.cpp:212
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
void GetLabel(string *label, ELabelType type, bool worst=false) const
Definition: Bioseq.cpp:202
bool IsNa(void) const
Definition: Bioseq.cpp:345
bool IsAa(void) const
Definition: Bioseq.cpp:350
CCdregion –.
Definition: Cdregion.hpp:66
bool AssignMatch(TmRNAList &mrna_map, CFeatTree &feat_tree, CScope &scope)
bool HasMatch() const
bool Overlaps(const CSeq_feat &mrna) const
bool NeedsMatch() const
sequence::EOverlapType m_OverlapType
const CSeq_feat & GetSeqfeat() const
bool AssignXrefMatch(TmRNAList &unmatched_mrnas, const CTSE_Handle &tse)
CConstRef< CSeq_feat > m_Cds
bool AssignOverlapMatch(TmRNAList &unmatched_mrnas, CScope &scope)
size_t CountOtherMrnas()
CCdsMatchInfo(const CSeq_feat &cds, CScope *scope)
bool IsPseudo() const
bool AreMrnaProductsUnique()
CRef< CMrnaMatchInfo > m_BestMatch
const CMrnaMatchInfo & GetMatch() const
void SetMatch(CRef< CMrnaMatchInfo > match)
list< CConstRef< CSeq_feat > > m_OtherMrnas
void UpdateOtherMrnas(const TmRNAList &unmatched_mrnas)
static string KeywordForPrefix(const string &prefix)
static string GetStructuredCommentPrefix(const CUser_object &user, bool normalize=true)
static bool IsStructuredComment(const CUser_object &user)
Definition: Date.hpp:53
ECompare Compare(const CDate &date) const
Definition: Date.cpp:83
void GetDate(string *label, bool year_only=false) const
Append a standardized string representation of the date to the label.
Definition: Date.hpp:149
@ eCompare_before
*this comes first.
Definition: Date.hpp:74
@ eCompare_same
They're equivalent.
Definition: Date.hpp:75
Definition: Dbtag.hpp:53
void GetLabel(string *label) const
Definition: Dbtag.cpp:187
bool Match(const CDbtag &dbt2) const
Definition: Dbtag.cpp:158
int Compare(const CDbtag &dbt2) const
Definition: Dbtag.cpp:176
CDelta_seq –.
Definition: Delta_seq.hpp:66
CFeatTree.
Definition: feature.hpp:173
CFeat_CI –.
Definition: feat_ci.hpp:64
CSeqFeatData::ESubtype GetSubtype(void) const
Definition: indexer.hpp:909
CRef< CFeatureIndex > GetBestParent(void)
Definition: indexer.cpp:3230
CRef< CFeatureIndex > GetBestGene(void)
Definition: indexer.cpp:3203
CSeq_feat_Handle GetSeqFeatHandle(void) const
Definition: indexer.hpp:896
const CMappedFeat GetMappedFeat(void) const
Definition: indexer.hpp:897
CConstRef< CSeq_loc > GetMappedLocation(void) const
Definition: indexer.hpp:900
CRef< feature::CFeatTree > GetFeatTreeFromCache(const CSeq_loc &loc, CScope &scope)
Definition: gene_cache.cpp:79
static bool IsPseudo(const CSeq_feat &feat)
Definition: gene_cache.cpp:156
CConstRef< CSeq_feat > GetGeneFromCache(const CSeq_feat *feat, CScope &scope)
Definition: gene_cache.cpp:106
void GetLabel(string *label) const
Definition: Gene_ref.cpp:57
bool IsSuppressed(void) const
Definition: Gene_ref.cpp:75
CGraph_CI –.
Definition: graph_ci.hpp:234
@Imp_feat.hpp User-defined methods of the data storage class.
Definition: Imp_feat.hpp:54
CMappedFeat –.
Definition: mapped_feat.hpp:59
CConstRef< CSeq_feat > m_Mrna
bool Overlaps(const CSeq_feat &cds) const
CMrnaMatchInfo(const CSeq_feat &mrna, CScope *scope)
void SetPseudo(bool val=true)
const CSeq_feat & GetSeqfeat() const
bool OkWithoutCds(bool isGenbank=false) const
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CObject –.
Definition: ncbiobj.hpp:180
Exceptions for objmgr/util library.
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
const string & GetLineage(void) const
Definition: Org_ref.cpp:124
bool IsSetLineage(void) const
Definition: Org_ref.cpp:119
Definition: Pub.hpp:56
@ eContent
Definition: Pub.hpp:66
@Pubdesc.hpp User-defined methods of the data storage class.
Definition: Pubdesc.hpp:54
@RNA_ref.hpp User-defined methods of the data storage class.
Definition: RNA_ref.hpp:54
CScope –.
Definition: scope.hpp:92
CRef< CBioseqIndex > GetBioseqIndex(void)
Definition: indexer.cpp:114
ESubtype GetSubtype(void) const
@ eSubtype_transit_peptide_aa
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_entry_CI –.
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
CSeq_entry * GetParentEntry(void) const
Definition: Seq_entry.hpp:131
CSeq_feat_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
const CGene_ref * GetGeneXref(void) const
See related function in util/feature.hpp.
Definition: Seq_feat.cpp:181
CSeq_hist_rec –.
CSeq_hist –.
Definition: Seq_hist.hpp:66
static bool IsAa(EMol mol)
Definition: Seq_inst.hpp:99
bool IsAa(void) const
Definition: Seq_inst.hpp:113
static string GetMoleculeClass(EMol mol)
Definition: Seq_inst.cpp:72
bool IsNa(void) const
Definition: Seq_inst.hpp:106
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
static TSeqPos Convert(const CSeq_data &in_seq, CSeq_data *out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx=0, TSeqPos uLength=0, bool bAmbig=false, Uint4 seed=17734276)
static void Validate(const CSeq_data &in_seq, vector< TSeqPos > *badIdx, TSeqPos uBeginIdx=0, TSeqPos uLength=0)
Base class for all serializable objects.
Definition: serialbase.hpp:150
static bool NeedsNoText(const TSubtype &subtype)
Definition: SubSource.cpp:231
CBioseq_Handle GetBioseqHandle(const CSeq_id &id) const
Get Bioseq handle from this TSE.
Definition: tse_handle.cpp:217
TSeq_feat_Handles GetFeaturesWithId(CSeqFeatData::E_Choice type, TFeatureIdInt id) const
Definition: tse_handle.cpp:604
bool GetInheritedPropertyDefines(const string &prop_name, TInfoList &results_out, TTaxId subtree_root=TAX_ID_CONST(1))
Definition: taxon1.cpp:1927
bool GetScientificName(TTaxId tax_id, string &name_out)
Definition: taxon1.cpp:2390
list< CRef< CTaxon1_info > > TInfoList
Definition: taxon1.hpp:71
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
CTime –.
Definition: ncbitime.hpp:296
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
bool IsRefGeneTracking() const
bool HasField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Verify that a named field exists.
const CUser_field & GetField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Access a named field in this user object.
Definition: User_object.cpp:71
bool IsStructuredComment() const
EObjectType GetObjectType() const
void ValidateSeqAnnot(const CSeq_annot_Handle &annot)
void ValidateSeqAnnotContext(const CSeq_annot &annot, const CBioseq &seq)
CCacheImpl & GetCache()
static CSeq_entry_Handle GetAppropriateXrefParent(CSeq_entry_Handle seh)
CValidError_imp & m_Imp
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
static bool IsPdb(const CBioseq &seq)
void ValidateUpdateDateContext(const CDate &update, const CDate &create, const CBioseq &seq, const CSeqdesc &desc)
void ValidateOrgContext(const COrg_ref &this_org, const COrg_ref &org, const CBioseq &seq, const CSeqdesc &desc)
CBioseq_Handle m_CurrentHandle
void ValidateInst(const CBioseq &seq)
static bool IsRefSeq(const CBioseq &seq)
void x_ValidateMultiplePubs(const CBioseq_Handle &bsh)
bool IsHistAssemblyMissing(const CBioseq &seq)
void ReportBadAssemblyGap(const CBioseq &seq)
static bool IsSelfReferential(const CBioseq &seq)
EDiagSev x_DupFeatSeverity(const CSeq_feat &curr, const CSeq_feat &prev, bool viral, bool htgs, bool same_annot, bool same_label)
CRef< CSeq_loc > GetLocFromSeq(const CBioseq &seq)
void x_ValidateBarcode(const CBioseq &seq)
void x_CompareStrings(const TStrFeatMap &str_feat_map, const string &type)
void x_CheckGeneralIDs(const CBioseq &seq)
void x_TranscriptIDsMatch(const string &protein_id, const CSeq_feat &cds)
static bool IsTSAAccession(const CSeq_id &id)
static bool IsEmblOrDdbj(const CBioseq &seq)
void x_CheckMrnaProteinLink(const CCdsMatchInfo &cds_match)
bool x_IsRangeGap(const CBioseq_Handle &seq, int start, int stop)
void ValidateBioseq(const CBioseq &seq)
void ValidateWGSMaster(CBioseq_Handle bsh)
CValidError_descr m_DescrValidator
void ValidateDeltaLoc(const CSeq_loc &loc, const CBioseq &seq, TSeqPos &len)
bool x_IsSameAsCDS(const CMappedFeat &feat)
void x_ValidateMolInfoForBioSource(const CBioSource &src, const CMolInfo &minfo, const CSeqdesc &desc)
void x_CheckForMultiplemRNAs(CCdsMatchInfo &cds_match, const TmRNAList &unmatched_mrnas)
void ValidateHistory(const CBioseq &seq)
void x_ValidateCompletness(const CBioseq &seq, const CMolInfo &mi)
bool SuppressTrailingXMsg(const CBioseq &seq)
void ValidateMolInfoContext(const CMolInfo &minfo, int &seq_biomol, int &tech, int &completeness, const CBioseq &seq, const CSeqdesc &desc)
bool x_HasCitSub(CBioseq_Handle bsh) const
static bool x_HasGap(const CBioseq &seq)
void ValidateSeqParts(const CBioseq &seq)
void x_ReportOverlappingPeptidePair(CSeq_feat_Handle f1, CSeq_feat_Handle f2, const CBioseq &bioseq, bool &reported_last_peptide)
void ValidateSegRef(const CBioseq &seq)
void x_CheckSingleStrandedRNAViruses(const CBioSource &source, const string &lineage, const string &stranded_mol, const CMolInfo::TBiomol biomol, const CBioseq_Handle &bsh, const CSerialObject &obj, const CSeq_entry *ctx)
void ValidateSecondaryAccConflict(const string &primary_acc, const CBioseq &seq, int choice)
static bool IsWGSMaster(const CBioseq &seq, CScope &scope)
void x_ValidateTitle(const CBioseq &seq)
void ValidateMultipleGeneOverlap(const CBioseq_Handle &bsh)
void ValidateSeqFeatContext(const CBioseq &seq, bool is_complete)
void ValidateDelta(const CBioseq &seq)
static bool HasBadWGSGap(const CBioseq &seq)
static bool x_HasPGAPStructuredComment(CBioseq_Handle bsh)
CValidError_annot m_AnnotValidator
void ValidateTwintrons(const CBioseq &seq)
unsigned int x_IdXrefsNotReciprocal(const CSeq_feat &cds, const CSeq_feat &mrna)
void x_ReportStartStopPartialProblem(int partial_type, bool at_splice_or_gap, bool abuts_n, const CSeq_feat &feat)
static size_t x_BadMetazoanMitochondrialLength(const CBioSource &src, const CSeq_inst &inst)
void ReportBadTSAGap(const CBioseq &seq)
void ValidateSeqGap(const CSeq_gap &gap, const CBioseq &seq)
void ValidateBadGeneOverlap(const CSeq_feat &feat)
bool x_IsPartialAtSpliceSiteOrGap(const CSeq_loc &loc, unsigned int tag, bool &bad_seq, bool &is_gap, bool &abuts_n)
void x_SetupCommonFlags(CBioseq_Handle bsh)
bool x_IsDeltaLitOnly(const CSeq_inst &inst) const
void ValidateNsAndGaps(const CBioseq &seq)
void ValidateCompleteGenome(const CBioseq &seq)
CValidError_bioseq(CValidError_imp &imp)
void ValidateRawConst(const CBioseq &seq)
void ValidateBioseqContext(const CBioseq &seq)
bool CdError(const CBioseq_Handle &bsh)
void x_ReportLineageConflictWithMol(const string &lineage, const string &stranded_mol, const CMolInfo::TBiomol biomol, CSeq_inst::EMol mol, const CSerialObject &obj, const CSeq_entry *ctx)
bool ValidateRepr(const CSeq_inst &inst, const CBioseq &seq)
void ValidateFeatPartialInContext(const CMappedFeat &feat, bool is_complete)
void ValidateGBBlock(const CGB_block &gbblock, const CBioseq &seq, const CSeqdesc &desc)
bool IsMrna(const CBioseq_Handle &bsh)
void ReportBadWGSGap(const CBioseq &seq)
bool x_SuppressDicistronic(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2, bool fruit_fly)
static bool IsWGSAccession(const CSeq_id &id)
void ValidateSeqLen(const CBioseq &seq)
bool x_PartialAdjacentToIntron(const CSeq_loc &loc)
void x_CheckOrigProteinAndTranscriptIds(const CCdsMatchInfo &cds_match)
size_t GetDataLen(const CSeq_inst &inst)
void CheckForPubOnBioseq(const CBioseq &seq)
void x_CalculateNsStretchAndTotal(const CSeqVector &seqvec, TSeqPos &num_ns, TSeqPos &max_stretch, bool &n5, bool &n3)
void CheckForMolinfoOnBioseq(const CBioseq &seq)
static bool IsAllNs(const CSeqVector &vec)
static string s_GetStrandedMolStringFromLineage(const string &lineage)
bool GetTSAConflictingBiomolTechErrors(const CBioseq &seq)
void CheckTpaHistory(const CBioseq &seq)
static bool IsPartial(const CBioseq &seq, CScope &scope)
const CCacheImpl::TFeatValue * m_AllFeatIt
void x_ValidateCDSmRNAmatch(const CBioseq_Handle &seq)
void ReportModifInconsistentError(int new_mod, int &old_mod, const CSeqdesc &desc, const CSeq_entry &ctx)
static bool x_IgnoreEndGap(CBioseq_Handle bsh, CSeq_gap::TType gap_type)
static bool x_ParentAndComponentLocationsDiffer(CBioseq_Handle bsh, CBioSource::TGenome parent_location)
void x_ValidateCDSVDJCmatch(const CBioseq_Handle &seq)
bool x_ShowBioProjectWarning(const CBioseq &seq)
void CheckForMultipleStructuredComments(const CBioseq &seq)
void ValidateCollidingGenes(const CBioseq &seq)
bool x_IdXrefsAreReciprocal(const CSeq_feat &cds, const CSeq_feat &mrna)
static bool IsGenbank(const CBioseq &seq)
void x_ReportDuplicatePubLabels(const CBioseq &seq, const vector< CTempString > &labels)
void ValidateSeqIds(const CBioseq &seq)
void x_ReportInternalPartial(const CSeq_feat &feat)
void ValidateModifDescriptors(const CBioseq &seq)
void x_ReportSuspiciousUseOfComplete(const CBioseq &seq, EDiagSev sev)
CValidError_feat m_FeatValidator
static int PctNs(CBioseq_Handle bsh)
void ReportBadGenomeGap(const CBioseq &seq)
static bool IsWp(CBioseq_Handle bsh)
void ValidateDupOrOverlapFeats(const CBioseq &seq)
bool x_MatchesOverlappingFeaturePartial(const CMappedFeat &feat, unsigned int partial_type)
void CheckForMissingChromosome(CBioseq_Handle bsh)
bool IsIdIn(const CSeq_id &id, const CBioseq &seq)
void ValidateMoltypeDescriptors(const CBioseq &seq)
size_t NumOfIntervals(const CSeq_loc &loc)
void x_ReportImproperPartial(const CSeq_feat &feat)
bool IsFlybaseDbxrefs(const TDbtags &dbxrefs)
void CheckSourceDescriptor(const CBioseq_Handle &bsh)
void x_ReportGeneOverlapError(const CSeq_feat &feat, const string &gene_label)
void x_CheckForMultipleComments(CBioseq_Handle bsh)
void ValidateIDSetAgainstDb(const CBioseq &seq)
static bool IsMaster(const CBioseq &seq)
bool x_ReportUTRPair(const CSeq_feat &utr5, const CSeq_feat &utr3)
void x_ValidateAbuttingRNA(const CBioseq_Handle &seq)
void x_ValidateSourceFeatures(const CBioseq_Handle &bsh)
void ValidateSeqId(const CSeq_id &id, const CBioseq &ctx, bool longer_general=false)
void x_ValidateAbuttingUTR(const CBioseq_Handle &seq)
bool x_ReportDupOverlapFeaturePair(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2, bool fruit_fly, bool viral, bool htgs)
void x_ValidateCDSagainstVDJC(const CBioseq_Handle &seq)
static bool IsWGS(const CBioseq &seq)
size_t x_CountAdjacentNs(const CSeq_literal &lit)
void ValidateSeqDescContext(const CBioseq &seq)
void x_ValidateOverlappingRNAFeatures(const CBioseq_Handle &bsh)
bool GetTSANStretchErrors(const CBioseq &seq)
const CCacheImpl::TFeatValue * m_GeneIt
void GapByGapInst(const CBioseq &seq)
void x_ValidatePubFeatures(const CBioseq_Handle &bsh)
void ValidateSeqDescr(const CSeq_descr &descr, const CSeq_entry &ctx)
bool ValidateStructuredComment(const CSeqdesc &desc, bool report)
void ValidateSeqFeatContext(const CSeq_feat &feat, const CBioseq &seq)
void SetScope(CScope &scope)
void SetTSE(CSeq_entry_Handle seh)
void ValidateGraphsOnBioseq(const CBioseq &seq)
const CSeq_entry_Handle & GetTSEH()
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
Definition: validatorp.cpp:358
bool IsSyntheticConstruct(const CBioSource &src)
bool HasGiOrAccnVer() const
const SValidatorContext & GetContext() const
Definition: validatorp.cpp:204
bool IsPDB() const
void AddBioseqWithNoBiosource(const CBioseq &seq)
CConstRef< CSeq_feat > GetCachedGene(const CSeq_feat *f)
bool IsValidateIdSet() const
bool IsGenbank() const
void PostObjErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
static bool IsWGSIntermediate(const CBioseq &seq)
bool IsGenomic() const
bool IsNoCitSubPubs() const
CConstRef< CSeq_feat > GetCDSGivenProduct(const CBioseq &seq)
CBioseq_Handle GetLocalBioseqHandle(const CSeq_id &id)
Definition: validatorp.cpp:257
bool IsRefSeq() const
bool IsSeqSubmitParent() const
bool IsINSDInSep() const
bool x_IsFarFetchFailure(const CSeq_loc &loc)
bool IsNoPubs() const
bool IsOvlPepErr() const
void AddBioseqWithNoPub(const CBioseq &seq)
Definition: valid_pub.cpp:1190
bool IsGenomeSubmission() const
void AddProtWithoutFullRef(const CBioseq_Handle &seq)
bool IsArtificial(const CBioSource &src)
void ValidateBioSourceForSeq(const CBioSource &bsrc, const CSerialObject &obj, const CSeq_entry *ctx, const CBioseq_Handle &bsh)
bool IsGI() const
bool IsGpipe() const
bool IsPatent() const
void IncrementTpaWithHistoryCount()
bool IsNoBioSource() const
bool IsLocalGeneralOnly() const
void IncrementTpaWithoutHistoryCount()
bool IsRefSeqConventions() const
bool IsIndexerVersion() const
CGeneCache & GetGeneCache()
bool IsSmallGenomeSet() const
bool IsEmbl() const
void ValidateSeqLoc(const CSeq_loc &loc, const CBioseq_Handle &seq, bool report_abutting, const string &prefix, const CSerialObject &obj, bool lowerSev=false)
bool IsXR() const
bool DoCompareVDJCtoCDS() const
bool IsDdbj() const
bool ShouldSubdivide() const
bool IsTransgenic(const CBioSource &bsrc)
const TFeatValue & GetFeatFromCache(const SFeatKey &featKey)
AutoPtr< TFeatValue > GetFeatFromCacheMulti(const vector< SFeatKey > &featKeys)
const CPubdescInfo & GetPubdescToInfo(CConstRef< CPubdesc > pub)
static const CSeqFeatData::ESubtype kAnyFeatSubtype
Definition: cache_impl.hpp:168
static const CSeqFeatData::E_Choice kAnyFeatType
Definition: cache_impl.hpp:167
std::vector< CMappedFeat > TFeatValue
Definition: cache_impl.hpp:164
@ fLabel_Unique
Append a unique tag [V1].
void erase(iterator pos)
Definition: map.hpp:167
size_type size() const
Definition: map.hpp:148
container_type::const_iterator const_iterator
Definition: map.hpp:53
container_type::iterator iterator
Definition: map.hpp:54
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
bool empty() const
Definition: map.hpp:149
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
const_iterator find(const key_type &key) const
Definition: map.hpp:293
const_iterator end() const
Definition: map.hpp:292
iterator insert(const value_type &val)
Definition: map.hpp:305
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
CS_CONTEXT * ctx
Definition: t0006.c:12
static const char si[8][64]
Definition: des.c:146
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
bool AllowOrphanedProtein(const CBioseq &seq, bool force_refseq=false)
Definition: dup_feats.cpp:82
#define FOR_EACH_USERFIELD_ON_USEROBJECT(Itr, Var)
FOR_EACH_USERFIELD_ON_USEROBJECT EDIT_EACH_USERFIELD_ON_USEROBJECT.
SStrictId_Entrez::TId TEntrezId
TEntrezId type for entrez ids which require the same strictness as TGi.
Definition: ncbimisc.hpp:1041
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
Int8 TIntId
Definition: ncbimisc.hpp:999
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define ZERO_ENTREZ_ID
Definition: ncbimisc.hpp:1102
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
#define LOG_POST_XX(error_name, err_subcode, message)
Definition: ncbidiag.hpp:569
@ eDiag_Info
Informational message.
Definition: ncbidiag.hpp:651
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
@ eDiag_Fatal
Fatal error – guarantees exit (or abort)
Definition: ncbidiag.hpp:655
@ eDiag_Critical
Critical error message.
Definition: ncbidiag.hpp:654
void Critical(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1203
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
const string & FindName(TEnumValueType value, bool allowBadValue) const
Find name of the enum by its numeric value.
Definition: enumerated.cpp:146
const TPrim & Get(void) const
Definition: serialbase.hpp:347
#define ENUM_METHOD_NAME(EnumName)
Definition: serialbase.hpp:994
const string AsFastaString(void) const
Definition: Seq_id.cpp:2265
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
static const size_t kMaxLocalIDLength
ID length restrictions.
Definition: Seq_id.hpp:809
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2039
CConstRef< CSeq_id > GetSeqId(void) const
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
int CompareOrdered(const CSeq_id &sid2) const
Definition: Seq_id.cpp:486
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1033
static bool IsValidLocalID(const CTempString &s)
Perform rudimentary validation on potential local IDs, whose contents should be pure ASCII and limite...
Definition: Seq_id.cpp:2575
static const size_t kMaxGeneralTagLength
Definition: Seq_id.hpp:811
CSeq_id::E_Choice Which(void) const
string GetLabel(const CSeq_id &id)
static const size_t kMaxGeneralDBLength
Definition: Seq_id.hpp:810
@ eAcc_wgs
Definition: Seq_id.hpp:264
@ fAcc_prot
Definition: Seq_id.hpp:227
@ eAcc_segset
Definition: Seq_id.hpp:248
@ fAcc_nuc
Definition: Seq_id.hpp:226
@ eAcc_tsa
Definition: Seq_id.hpp:247
@ eAcc_unknown
Definition: Seq_id.hpp:294
@ fAcc_master
Definition: Seq_id.hpp:230
@ eAcc_division_mask
Definition: Seq_id.hpp:273
@ e_DIFF
Different SeqId types – cannot be compared.
Definition: Seq_id.hpp:549
@ e_YES
SeqIds compared, and are equivalent.
Definition: Seq_id.hpp:551
@ eContent
Untagged human-readable accession or the like.
Definition: Seq_id.hpp:573
void SetMix(TMix &v)
Definition: Seq_loc.hpp:987
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
Definition: Seq_loc.cpp:3222
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
CConstRef< CSeq_loc > GetRangeAsSeq_loc(void) const
Get seq-loc for the current iterator position.
Definition: Seq_loc.cpp:2585
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
void Add(const CSeq_loc &other)
Simple adding of seq-locs.
Definition: Seq_loc.cpp:3875
bool IsSetStrand(EIsSetStrand flag=eIsSetStrand_Any) const
Check if strand is set for any/all part(s) of the seq-loc depending on the flag.
Definition: Seq_loc.cpp:858
const CSeq_loc & GetEmbeddingSeq_loc(void) const
Get the nearest seq-loc containing the current range.
Definition: Seq_loc.cpp:2573
const CSeq_id * GetId(void) const
Get the id of the location; returns NULL if the location has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
TRange GetRange(void) const
Get the range.
Definition: Seq_loc.hpp:1042
ENa_strand GetStrand(void) const
Definition: Seq_loc.hpp:1056
void GetLabel(string *label) const
Appends a label suitable for display (e.g., error messages) label must point to an existing string ob...
Definition: Seq_loc.cpp:3467
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
CConstBeginInfo ConstBegin(const C &obj)
Get starting point of non-modifiable object hierarchy.
Definition: iterator.hpp:1012
CMappedFeat GetParent(const CMappedFeat &feat)
Return nearest parent of a feature.
Definition: feature.cpp:3067
CMappedFeat GetBestOverlappingFeat(const CMappedFeat &feat, CSeqFeatData::ESubtype need_subtype, sequence::EOverlapType overlap_type, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0)
Definition: feature.cpp:3653
@ fFGL_Content
Include its content if there is any.
Definition: feature.hpp:73
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
EOverlapType
Int8 TestForOverlapEx(const CSeq_loc &loc1, const CSeq_loc &loc2, EOverlapType type, CScope *scope=0, TOverlapFlags flags=fOverlap_Default)
Updated version of TestForOverlap64().
int SeqLocPartialCheck(const CSeq_loc &loc, CScope *scope)
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
bool IsOneBioseq(const CSeq_loc &loc, CScope *scope)
Returns true if all embedded CSeq_ids represent the same CBioseq, else false.
ECompare
bool IsSameBioseq(const CSeq_id &id1, const CSeq_id &id2, CScope *scope, CScope::EGetBioseqFlag get_flag=CScope::eGetBioseq_All)
Determines if two CSeq_ids represent the same CBioseq.
@ eSeqlocPartial_Nostart
@ eSeqlocPartial_Nostop
@ eSeqlocPartial_Nointernal
@ eSeqlocPartial_Complete
@ eSeqlocPartial_Stop
@ eSeqlocPartial_Limwrong
@ eSeqlocPartial_Start
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ eOverlap_SubsetRev
1st is a subset of 2nd ranges
@ eOverlap_CheckIntervals
2nd is a subset of 1st with matching boundaries
@ eOverlap_Contains
2nd contains 1st extremes
@ eOverlap_CheckIntRev
1st is a subset of 2nd with matching boundaries
@ eOverlap_Simple
any overlap of extremes
@ eOverlap_Contained
2nd contained within 1st extremes
@ eOverlap_Subset
2nd is a subset of 1st ranges
@ eSame
CSeq_locs contain each other.
@ eContained
First CSeq_loc contained by second.
@ eNoOverlap
CSeq_locs do not overlap or abut.
const CSeq_feat * GetCDSForProduct(const CBioseq &product, CScope *scope)
Get the encoding CDS feature of a given protein sequence.
Definition: sequence.cpp:2549
bool IsPseudo(const CSeq_feat &feat, CScope &scope)
Determines whether given feature is pseudo, using gene associated with feature if necessary Checks to...
Definition: sequence.cpp:1428
CConstRef< CSeq_feat > GetOverlappingOperon(const CSeq_loc &loc, CScope &scope)
Definition: sequence.cpp:1600
const CSeq_feat * GetPROTForProduct(const CBioseq &product, CScope *scope)
Get the mature peptide feature of a protein.
Definition: sequence.cpp:2593
vector< TFeatScore > TFeatScores
Definition: sequence.hpp:353
void GetOverlappingFeatures(const CSeq_loc &loc, CSeqFeatData::E_Choice feat_type, CSeqFeatData::ESubtype feat_subtype, EOverlapType overlap_type, TFeatScores &feats, CScope &scope, const TBestFeatOpts opts=0, CGetOverlappingFeaturesPlugin *plugin=NULL)
Find all features overlapping the location.
Definition: sequence.cpp:945
static void Translate(const string &seq, string &prot, const CGenetic_code *code, bool include_stop=true, bool remove_trailing_X=false, bool *alt_start=NULL, bool is_5prime_complete=true, bool is_3prime_complete=true)
Translate a string using a specified genetic code.
Definition: sequence.cpp:4095
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
CSeq_feat_Handle GetSeq_featHandle(const CSeq_feat &feat, EMissing action=eMissing_Default)
Definition: scope.cpp:200
vector< CSeq_id_Handle > TIds
Definition: scope.hpp:143
@ eGetBioseq_All
Search bioseq, load if not loaded yet.
Definition: scope.hpp:128
bool IsSetExcept(void) const
const CFeat_id & GetId(void) const
bool IsSetId(void) const
bool IsSetInst_Mol(void) const
bool IsSetComment(void) const
const CTSE_Handle & GetTSE_Handle(void) const
Get CTSE_Handle of containing TSE.
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
TClass GetClass(void) const
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
TBioseqCore GetBioseqCore(void) const
Get bioseq core structure.
virtual CConstRef< CSeq_feat > GetSeq_feat(void) const
const CSeqFeatData & GetData(void) const
TSeqPos GetBioseqLength(void) const
TSet GetSet(void) const
bool IsAa(void) const
CSeq_entry_Handle GetSeq_entry_Handle(void) const
Get parent Seq-entry handle.
bool IsSetExcept_text(void) const
TInst_Mol GetInst_Mol(void) const
bool IsSetProduct(void) const
bool IsSetInst_Length(void) const
TInst_Topology GetInst_Topology(void) const
const string & GetComment(void) const
TInst_Length GetInst_Length(void) const
const string & GetExcept_text(void) const
bool IsSetInst(void) const
void Reset(void)
Reset handle and make it not to point to any bioseq.
bool IsSetInst_Repr(void) const
bool IsSetClass(void) const
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
TInst_Repr GetInst_Repr(void) const
CScope & GetScope(void) const
Get scope this handle belongs to.
CConstRef< CSeq_feat > GetOriginalSeq_feat(void) const
CSeq_entry_Handle GetExactComplexityLevel(CBioseq_set::EClass cls) const
Return level with exact complexity, or empty handle if not found.
bool IsSet(void) const
CSeqFeatData::ESubtype GetFeatSubtype(void) const
bool IsSetInst_Topology(void) const
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
const TId & GetId(void) const
TMol GetBioseqMolType(void) const
Get some values from core:
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence in the requested coding: Iupacna or Iupacaa if coding is eCoding_Iupac.
bool CanGetInst_Mol(void) const
const TInst & GetInst(void) const
@ eCoding_Ncbi
Set coding to binary coding (Ncbi4na or Ncbistdaa)
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
TSeqPos GetEndPosition(void) const
return end position of current segment in sequence (exclusive)
Definition: seq_map_ci.hpp:679
SSeqMapSelector & SetResolveCount(size_t res_cnt)
Set max depth of resolving seq-map.
Definition: seq_map_ci.hpp:151
SAnnotSelector & IncludeFeatSubtype(TFeatSubtype subtype)
Include feature subtype in the search.
bool IsSetPartial(void) const
const CSeq_loc & GetLocation(void) const
bool GetPartial(void) const
SSeqMapSelector & SetFlags(TFlags flags)
Select segment type(s)
Definition: seq_map_ci.hpp:179
const CSeq_feat & GetOriginalFeature(void) const
Get original feature with unmapped location/product.
const CSeq_feat_Handle & GetSeq_feat_Handle(void) const
Get original feature handle.
Definition: mapped_feat.hpp:71
unsigned char TResidue
const CSeq_loc & GetProduct(void) const
SAnnotSelector & SetFeatSubtype(TFeatSubtype subtype)
Set feature subtype (also set annotation and feat type)
TSeqPos GetPosition(void) const
return position of current segment in sequence
Definition: seq_map_ci.hpp:665
CConstRef< CSeq_feat > GetSeq_feat(void) const
Get current seq-feat.
TCoding GetCoding(void) const
Target sequence coding.
Definition: seq_vector.hpp:312
bool IsInGap(TSeqPos pos) const
true if sequence at 0-based position 'pos' has gap Note: this method is not MT-safe,...
Definition: seq_vector.hpp:277
TSeqPos size(void) const
Definition: seq_vector.hpp:291
@ fFindGap
Definition: seq_map.hpp:130
@ fFindLeafRef
Definition: seq_map.hpp:131
@ fFindData
Definition: seq_map.hpp:129
@ fIgnoreUnresolved
Definition: seq_map.hpp:134
@ eSeqData
real sequence data
Definition: seq_map.hpp:98
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:1684
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:1401
CConstRef< C > ConstRef(const C *object)
Template function for conversion of const object pointer to CConstRef.
Definition: ncbiobj.hpp:2024
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define NCBI_FALLTHROUGH
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
CTempString literal(const char(&str)[Size])
Templatized initialization from a string literal.
Definition: tempstr.hpp:441
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2887
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5325
static int Compare(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Compare of a substring with another string.
Definition: ncbistr.hpp:5297
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
Definition: ncbistr.hpp:5109
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:776
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5384
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
static const char label[]
const TKeywords & GetKeywords(void) const
Get the Keywords member data.
bool IsSetExtra_acc(void) const
Check if a value has been assigned to Extra_acc data member.
const TExtra_acc & GetExtra_acc(void) const
Get the Extra_acc member data.
bool IsSetKeywords(void) const
Check if a value has been assigned to Keywords data member.
list< string > TKeywords
bool IsSetExtra_accessions(void) const
Check if a value has been assigned to Extra_accessions data member.
Definition: GB_block_.hpp:442
const TExtra_accessions & GetExtra_accessions(void) const
Get the Extra_accessions member data.
Definition: GB_block_.hpp:454
const TKeywords & GetKeywords(void) const
Get the Keywords member data.
Definition: GB_block_.hpp:526
bool IsSetKeywords(void) const
Check if a value has been assigned to Keywords data member.
Definition: GB_block_.hpp:514
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
TOrigin GetOrigin(void) const
Get the Origin member data.
Definition: BioSource_.hpp:472
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
Definition: BioSource_.hpp:447
TSubtype GetSubtype(void) const
Get the Subtype member data.
Definition: SubSource_.hpp:310
bool IsSetGenome(void) const
Check if a value has been assigned to Genome data member.
Definition: BioSource_.hpp:397
const TName & GetName(void) const
Get the Name member data.
Definition: SubSource_.hpp:350
bool IsSetIs_focus(void) const
to distinguish biological focus Check if a value has been assigned to Is_focus data member.
Definition: BioSource_.hpp:552
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
Definition: SubSource_.hpp:338
@ eOrigin_synthetic
purely synthetic
Definition: BioSource_.hpp:134
@ eOrigin_mut
artificially mutagenized
Definition: BioSource_.hpp:132
@ eOrigin_artificial
artificially engineered
Definition: BioSource_.hpp:133
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
const TDesc & GetDesc(void) const
Get the Desc member data.
Definition: Gene_ref_.hpp:599
bool IsSetPseudo(void) const
pseudogene Check if a value has been assigned to Pseudo data member.
Definition: Gene_ref_.hpp:681
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
Definition: Gene_ref_.hpp:781
bool IsSetLocus(void) const
Official gene symbol Check if a value has been assigned to Locus data member.
Definition: Gene_ref_.hpp:493
bool IsSetDesc(void) const
descriptive name Check if a value has been assigned to Desc data member.
Definition: Gene_ref_.hpp:587
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
Definition: Gene_ref_.hpp:793
const TLocus & GetLocus(void) const
Get the Locus member data.
Definition: Gene_ref_.hpp:505
TPseudo GetPseudo(void) const
Get the Pseudo member data.
Definition: Gene_ref_.hpp:706
const TStr & GetStr(void) const
Get the variant data.
bool IsSetData(void) const
the object itself Check if a value has been assigned to Data data member.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsSetDb(void) const
name of database or system Check if a value has been assigned to Db data member.
Definition: Dbtag_.hpp:208
bool CanGetType(void) const
Check if it is safe to call GetType method.
bool IsSetType(void) const
type of object within class Check if a value has been assigned to Type data member.
bool IsLim(void) const
Check if variant Lim is selected.
Definition: Int_fuzz_.hpp:636
bool IsSetYear(void) const
full year (including 1900) Check if a value has been assigned to Year data member.
Definition: Date_std_.hpp:407
bool IsStd(void) const
Check if variant Std is selected.
Definition: Date_.hpp:320
const TTag & GetTag(void) const
Get the Tag member data.
Definition: Dbtag_.hpp:267
bool IsStrs(void) const
Check if variant Strs is selected.
const TStrs & GetStrs(void) const
Get the variant data.
bool IsId(void) const
Check if variant Id is selected.
Definition: Object_id_.hpp:264
const TData & GetData(void) const
Get the Data member data.
bool IsSetTag(void) const
appropriate tag Check if a value has been assigned to Tag data member.
Definition: Dbtag_.hpp:255
const TDb & GetDb(void) const
Get the Db member data.
Definition: Dbtag_.hpp:220
TLim GetLim(void) const
Get the variant data.
Definition: Int_fuzz_.hpp:642
bool IsStr(void) const
Check if variant Str is selected.
bool IsSetLabel(void) const
field label Check if a value has been assigned to Label data member.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TData & GetData(void) const
Get the Data member data.
const TStr & GetStr(void) const
Get the variant data.
Definition: Date_.hpp:306
const TLabel & GetLabel(void) const
Get the Label member data.
const TType & GetType(void) const
Get the Type member data.
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
vector< CStringUTF8 > TStrs
const TStd & GetStd(void) const
Get the variant data.
Definition: Date_.cpp:109
vector< CRef< CUser_field > > TData
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
bool IsStr(void) const
Check if variant Str is selected.
Definition: Date_.hpp:300
@ eLim_unk
unknown
Definition: Int_fuzz_.hpp:210
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
bool IsSetDb(void) const
ids in taxonomic or culture dbases Check if a value has been assigned to Db data member.
Definition: Org_ref_.hpp:479
const TLineage & GetLineage(void) const
Get the Lineage member data.
Definition: OrgName_.hpp:864
TSubtype GetSubtype(void) const
Get the Subtype member data.
Definition: OrgMod_.hpp:307
bool CanGetDiv(void) const
Check if it is safe to call GetDiv method.
Definition: OrgName_.hpp:999
const TDiv & GetDiv(void) const
Get the Div member data.
Definition: OrgName_.hpp:1005
const TSubname & GetSubname(void) const
Get the Subname member data.
Definition: OrgMod_.hpp:347
bool IsSetLineage(void) const
lineage with semicolon separators Check if a value has been assigned to Lineage data member.
Definition: OrgName_.hpp:852
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
const TDb & GetDb(void) const
Get the Db member data.
Definition: Org_ref_.hpp:491
bool IsSetMod(void) const
Check if a value has been assigned to Mod data member.
Definition: OrgName_.hpp:827
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
bool IsSetTaxname(void) const
preferred formal name Check if a value has been assigned to Taxname data member.
Definition: Org_ref_.hpp:360
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
const TName & GetName(void) const
Get the Name member data.
Definition: Prot_ref_.hpp:378
TProcessed GetProcessed(void) const
Get the Processed member data.
Definition: Prot_ref_.hpp:538
bool IsSetProcessed(void) const
Check if a value has been assigned to Processed data member.
Definition: Prot_ref_.hpp:513
bool IsSetName(void) const
protein name Check if a value has been assigned to Name data member.
Definition: Prot_ref_.hpp:366
list< CRef< CPub > > Tdata
Definition: Pub_equiv_.hpp:90
const Tdata & Get(void) const
Get the member data.
Definition: Pub_equiv_.hpp:165
const TEquiv & GetEquiv(void) const
Get the variant data.
Definition: Pub_.cpp:387
bool IsEquiv(void) const
Check if variant Equiv is selected.
Definition: Pub_.hpp:671
bool IsSub(void) const
Check if variant Sub is selected.
Definition: Pub_.hpp:590
@ e_Pmid
PubMedId.
Definition: Pub_.hpp:114
@ e_Muid
medline uid
Definition: Pub_.hpp:105
TType GetType(void) const
Get the Type member data.
Definition: RNA_ref_.hpp:529
bool IsSetExt(void) const
generic fields for ncRNA, tmRNA, miscRNA Check if a value has been assigned to Ext data member.
Definition: RNA_ref_.hpp:604
bool CanGetExt(void) const
Check if it is safe to call GetExt method.
Definition: RNA_ref_.hpp:610
bool IsSetType(void) const
Check if a value has been assigned to Type data member.
Definition: RNA_ref_.hpp:510
const TName & GetName(void) const
Get the variant data.
Definition: RNA_ref_.hpp:484
const TExt & GetExt(void) const
Get the Ext member data.
Definition: RNA_ref_.hpp:616
bool IsName(void) const
Check if variant Name is selected.
Definition: RNA_ref_.hpp:478
const TKey & GetKey(void) const
Get the Key member data.
Definition: Imp_feat_.hpp:259
bool IsSetData(void) const
the specific data Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
bool IsSetQual(void) const
qualifiers Check if a value has been assigned to Qual data member.
Definition: Seq_feat_.hpp:1135
E_Choice Which(void) const
Which variant is currently selected.
bool IsSetCode(void) const
genetic code used Check if a value has been assigned to Code data member.
Definition: Cdregion_.hpp:700
bool IsSetExt(void) const
user defined structure extension Check if a value has been assigned to Ext data member.
Definition: Seq_feat_.hpp:1207
bool IsCdregion(void) const
Check if variant Cdregion is selected.
bool IsImp(void) const
Check if variant Imp is selected.
const TQual & GetQual(void) const
Get the Qual member data.
Definition: Seq_feat_.hpp:1147
bool IsSetPartial(void) const
incomplete in some way? Check if a value has been assigned to Partial data member.
Definition: Seq_feat_.hpp:943
const TId & GetId(void) const
Get the Id member data.
Definition: Seq_feat_.hpp:904
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
E_Choice
Choice variants.
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Feat_id_.hpp:353
bool IsGene(void) const
Check if variant Gene is selected.
TFrame GetFrame(void) const
Get the Frame member data.
Definition: Cdregion_.hpp:534
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
bool IsSetExcept(void) const
something funny about this? Check if a value has been assigned to Except data member.
Definition: Seq_feat_.hpp:990
const TExcept_text & GetExcept_text(void) const
Get the Except_text member data.
Definition: Seq_feat_.hpp:1405
bool IsSetExcept_text(void) const
explain if except=TRUE Check if a value has been assigned to Except_text data member.
Definition: Seq_feat_.hpp:1393
const TCode & GetCode(void) const
Get the Code member data.
Definition: Cdregion_.hpp:712
const TDbxref & GetDbxref(void) const
Get the Dbxref member data.
Definition: Seq_feat_.hpp:1333
list< CRef< C_E > > Tdata
const TCdregion & GetCdregion(void) const
Get the variant data.
const TBiosrc & GetBiosrc(void) const
Get the variant data.
bool IsSetId(void) const
Check if a value has been assigned to Id data member.
Definition: Seq_feat_.hpp:892
TPseudo GetPseudo(void) const
Get the Pseudo member data.
Definition: Seq_feat_.hpp:1365
const TProduct & GetProduct(void) const
Get the Product member data.
Definition: Seq_feat_.hpp:1096
const Tdata & Get(void) const
Get the member data.
bool IsSetPseudo(void) const
annotated on pseudogene? Check if a value has been assigned to Pseudo data member.
Definition: Seq_feat_.hpp:1346
bool IsBiosrc(void) const
Check if variant Biosrc is selected.
void SetPseudo(TPseudo value)
Assign a value to Pseudo data member.
Definition: Seq_feat_.hpp:1374
const TGene & GetGene(void) const
Get the variant data.
TPartial GetPartial(void) const
Get the Partial member data.
Definition: Seq_feat_.hpp:962
const TProt & GetProt(void) const
Get the variant data.
TExcept GetExcept(void) const
Get the Except member data.
Definition: Seq_feat_.hpp:1009
vector< CRef< CGb_qual > > TQual
Definition: Seq_feat_.hpp:117
const TRna & GetRna(void) const
Get the variant data.
bool IsSetDbxref(void) const
support for xref to other databases Check if a value has been assigned to Dbxref data member.
Definition: Seq_feat_.hpp:1321
bool IsSetProduct(void) const
product of process Check if a value has been assigned to Product data member.
Definition: Seq_feat_.hpp:1084
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_feat_.hpp:1219
bool IsRna(void) const
Check if variant Rna is selected.
bool IsRegion(void) const
Check if variant Region is selected.
const TImp & GetImp(void) const
Get the variant data.
bool IsSetFrame(void) const
Check if a value has been assigned to Frame data member.
Definition: Cdregion_.hpp:509
bool IsSetLocation(void) const
feature made from Check if a value has been assigned to Location data member.
Definition: Seq_feat_.hpp:1105
@ e_Pub
publication applies to this seq
bool IsGenbank(void) const
Check if variant Genbank is selected.
Definition: Seq_id_.hpp:841
TChain GetChain(void) const
Get the Chain member data.
bool IsSetChain_id(void) const
chain identifier; length-independent generalization of 'chain'. Check if a value has been assigned to ...
bool IsSetChain(void) const
Deprecated: 'chain' can't support multiple character PDB chain identifiers (introduced in 2015).
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
bool IsTpg(void) const
Check if variant Tpg is selected.
Definition: Seq_id_.hpp:928
const TName & GetName(void) const
Get the Name member data.
list< CRef< CSeq_interval > > Tdata
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
const Tdata & Get(void) const
Get the member data.
const TId & GetId(void) const
Get the Id member data.
const TPnt & GetPnt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:238
bool IsTpd(void) const
Check if variant Tpd is selected.
Definition: Seq_id_.hpp:940
TPoint GetPoint(void) const
Get the Point member data.
Definition: Seq_point_.hpp:303
bool IsOther(void) const
Check if variant Other is selected.
Definition: Seq_id_.hpp:871
TFrom GetFrom(void) const
Get the From member data.
bool IsGeneral(void) const
Check if variant General is selected.
Definition: Seq_id_.hpp:877
bool IsEmbl(void) const
Check if variant Embl is selected.
Definition: Seq_id_.hpp:847
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
TVersion GetVersion(void) const
Get the Version member data.
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
const TOther & GetOther(void) const
Get the variant data.
Definition: Seq_id_.cpp:347
bool IsPacked_int(void) const
Check if variant Packed_int is selected.
Definition: Seq_loc_.hpp:534
Tdata & Set(void)
Assign a value to data member.
const TChain_id & GetChain_id(void) const
Get the Chain_id member data.
const TGeneral & GetGeneral(void) const
Get the variant data.
Definition: Seq_id_.cpp:369
bool IsGi(void) const
Check if variant Gi is selected.
Definition: Seq_id_.hpp:883
TTo GetTo(void) const
Get the To member data.
bool IsWhole(void) const
Check if variant Whole is selected.
Definition: Seq_loc_.hpp:522
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
bool IsSetVersion(void) const
Check if a value has been assigned to Version data member.
bool IsTpe(void) const
Check if variant Tpe is selected.
Definition: Seq_id_.hpp:934
bool IsPnt(void) const
Check if variant Pnt is selected.
Definition: Seq_loc_.hpp:540
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
const TPacked_int & GetPacked_int(void) const
Get the variant data.
Definition: Seq_loc_.cpp:216
const TAccession & GetAccession(void) const
Get the Accession member data.
bool IsDdbj(void) const
Check if variant Ddbj is selected.
Definition: Seq_id_.hpp:910
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_General
for other databases
Definition: Seq_id_.hpp:105
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Local
local use
Definition: Seq_id_.hpp:95
@ e_Pdb
PDB sequence.
Definition: Seq_id_.hpp:109
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
bool IsSetClass(void) const
Check if a value has been assigned to Class data member.
TClass GetClass(void) const
Get the Class member data.
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSetSeq_set(void) const
Check if a value has been assigned to Seq_set data member.
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
list< CRef< CSeq_entry > > TSeq_set
@ eClass_parts
parts for 2 or 3
@ eClass_pop_set
population study
@ eClass_phy_set
phylogenetic study
@ eClass_mut_set
set of mutations
@ eClass_eco_set
ecological sample study
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
@ eClass_gen_prod_set
genomic products, chrom+mRNA+protein
@ eClass_segset
segmented sequence + parts
const TIupacaa & GetIupacaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:530
bool IsSetLinkage(void) const
Check if a value has been assigned to Linkage data member.
Definition: Seq_gap_.hpp:310
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
bool IsMap(void) const
Check if variant Map is selected.
Definition: Seq_ext_.hpp:330
const TSeg & GetSeg(void) const
Get the variant data.
Definition: Seq_ext_.cpp:114
bool IsSetCompleteness(void) const
Check if a value has been assigned to Completeness data member.
Definition: MolInfo_.hpp:569
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
bool IsRef(void) const
Check if variant Ref is selected.
Definition: Seq_ext_.hpp:324
bool IsSetReplaced_by(void) const
these seqs make this one obsolete. Check if a value has been assigned to Replaced_by data member.
Definition: Seq_hist_.hpp:546
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
bool IsSetSeq_data(void) const
the sequence. Check if a value has been assigned to Seq_data data member.
Definition: Seq_inst_.hpp:805
TLinkage GetLinkage(void) const
Get the Linkage member data.
Definition: Seq_gap_.hpp:329
TStrand GetStrand(void) const
Get the Strand member data.
Definition: Seq_inst_.hpp:777
ERepr
representation class
Definition: Seq_inst_.hpp:91
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const TGap & GetGap(void) const
Get the variant data.
Definition: Seq_data_.cpp:184
bool IsSetAssembly(void) const
how was this assembled? Check if a value has been assigned to Assembly data member.
Definition: Seq_hist_.hpp:500
TTopology GetTopology(void) const
Get the Topology member data.
Definition: Seq_inst_.hpp:733
const TIupacna & GetIupacna(void) const
Get the variant data.
Definition: Seq_data_.hpp:510
const TUpdate_date & GetUpdate_date(void) const
Get the variant data.
Definition: Seqdesc_.cpp:494
const TNcbipna & GetNcbipna(void) const
Get the variant data.
Definition: Seq_data_.hpp:610
bool IsSetRepr(void) const
Check if a value has been assigned to Repr data member.
Definition: Seq_inst_.hpp:546
const TNcbipaa & GetNcbipaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:670
TType GetType(void) const
Get the Type member data.
Definition: Seq_gap_.hpp:282
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
Definition: Seq_inst_.hpp:593
const TTitle & GetTitle(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1032
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
const TPub & GetPub(void) const
Get the variant data.
Definition: Seqdesc_.cpp:356
bool IsSetStrand(void) const
Check if a value has been assigned to Strand data member.
Definition: Seq_inst_.hpp:758
const TNcbi8aa & GetNcbi8aa(void) const
Get the variant data.
Definition: Seq_data_.hpp:630
const TLiteral & GetLiteral(void) const
Get the variant data.
Definition: Delta_seq_.cpp:124
bool IsSetBiomol(void) const
Check if a value has been assigned to Biomol data member.
Definition: MolInfo_.hpp:422
bool IsLoc(void) const
Check if variant Loc is selected.
Definition: Delta_seq_.hpp:257
E_Choice
Choice variants.
Definition: Seq_data_.hpp:102
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
bool IsSetHist(void) const
sequence history. Check if a value has been assigned to Hist data member.
Definition: Seq_inst_.hpp:847
bool IsNcbi4na(void) const
Check if variant Ncbi4na is selected.
Definition: Seq_data_.hpp:564
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
bool IsSetExt(void) const
extensions for special types. Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
bool IsSetReplaces(void) const
seq makes these seqs obsolete. Check if a value has been assigned to Replaces data member.
Definition: Seq_hist_.hpp:525
bool IsSetInst(void) const
the sequence data. Check if a value has been assigned to Inst data member.
Definition: Bioseq_.hpp:324
bool IsNcbi8na(void) const
Check if variant Ncbi8na is selected.
Definition: Seq_data_.hpp:584
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
const TOrg & GetOrg(void) const
Get the variant data.
Definition: Seqdesc_.cpp:240
TLength GetLength(void) const
Get the Length member data.
const TAssembly & GetAssembly(void) const
Get the Assembly member data.
Definition: Seq_hist_.hpp:512
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
const TGenbank & GetGenbank(void) const
Get the variant data.
Definition: Seqdesc_.cpp:334
bool IsSeg(void) const
Check if variant Seg is selected.
Definition: Seq_ext_.hpp:318
list< CRef< CSeq_id > > TIds
bool CanGetLength(void) const
Check if it is safe to call GetLength method.
list< EGIBB_mod > TModif
Definition: Seqdesc_.hpp:191
const TFuzz & GetFuzz(void) const
Get the Fuzz member data.
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
const TIds & GetIds(void) const
Get the Ids member data.
const TLinkage_evidence & GetLinkage_evidence(void) const
Get the Linkage_evidence member data.
Definition: Seq_gap_.hpp:369
bool IsName(void) const
Check if variant Name is selected.
Definition: Seqdesc_.hpp:1006
const TNcbieaa & GetNcbieaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:650
bool IsSetFuzz(void) const
could be unsure. Check if a value has been assigned to Fuzz data member.
TType GetType(void) const
Get the Type member data.
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
const TNcbistdaa & GetNcbistdaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:690
bool IsSetLength(void) const
length of sequence in residues. Check if a value has been assigned to Length data member.
Definition: Seq_inst_.hpp:640
bool CanGetHist(void) const
Check if it is safe to call GetHist method.
Definition: Seq_inst_.hpp:853
const THist & GetHist(void) const
Get the Hist member data.
Definition: Seq_inst_.hpp:859
bool IsSetType(void) const
Check if a value has been assigned to Type data member.
Definition: Seq_gap_.hpp:263
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
bool CanGetRepr(void) const
Check if it is safe to call GetRepr method.
Definition: Seq_inst_.hpp:552
bool IsSetDescr(void) const
descriptors. Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
E_Choice
Choice variants.
Definition: Seqdesc_.hpp:109
TMol_type GetMol_type(void) const
Get the variant data.
Definition: Seqdesc_.hpp:938
const TEmbl & GetEmbl(void) const
Get the variant data.
Definition: Seqdesc_.cpp:450
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
bool CanGetType(void) const
Check if it is safe to call GetType method.
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
bool IsSetLength(void) const
must give a length in residues. Check if a value has been assigned to Length data member.
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
bool IsSetPub(void) const
the citation(s). Check if a value has been assigned to Pub data member.
Definition: Pubdesc_.hpp:593
const TNcbi4na & GetNcbi4na(void) const
Get the variant data.
Definition: Seq_data_.hpp:570
const TLoc & GetLoc(void) const
Get the variant data.
Definition: Delta_seq_.cpp:102
TCompleteness GetCompleteness(void) const
Get the Completeness member data.
Definition: MolInfo_.hpp:594
const TModif & GetModif(void) const
Get the variant data.
Definition: Seqdesc_.hpp:965
bool IsSet(void) const
Check if a value has been assigned to data member.
Definition: Delta_ext_.hpp:152
bool CanGetSeq_data(void) const
Check if it is safe to call GetSeq_data method.
const TNcbi2na & GetNcbi2na(void) const
Get the variant data.
Definition: Seq_data_.hpp:550
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seqdesc_.hpp:903
bool IsSetDate(void) const
Check if a value has been assigned to Date data member.
bool CanGetExt(void) const
Check if it is safe to call GetExt method.
Definition: Seq_inst_.hpp:832
bool IsSetId(void) const
equivalent identifiers. Check if a value has been assigned to Id data member.
Definition: Bioseq_.hpp:278
const TCreate_date & GetCreate_date(void) const
Get the variant data.
Definition: Seqdesc_.cpp:472
bool IsLiteral(void) const
Check if variant Literal is selected.
Definition: Delta_seq_.hpp:263
bool IsSetSeq_data(void) const
may have the data. Check if a value has been assigned to Seq_data data member.
list< CRef< CDelta_seq > > Tdata
Definition: Delta_ext_.hpp:89
const TReplaces & GetReplaces(void) const
Get the Replaces member data.
Definition: Seq_hist_.hpp:537
const Tdata & Get(void) const
Get the member data.
Definition: Seg_ext_.hpp:164
bool IsGap(void) const
Check if variant Gap is selected.
Definition: Seq_data_.hpp:704
const TPub & GetPub(void) const
Get the Pub member data.
Definition: Pubdesc_.hpp:605
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
bool IsNcbi2na(void) const
Check if variant Ncbi2na is selected.
Definition: Seq_data_.hpp:544
const TReplaced_by & GetReplaced_by(void) const
Get the Replaced_by member data.
Definition: Seq_hist_.hpp:558
list< CRef< CSeq_loc > > Tdata
Definition: Seg_ext_.hpp:89
const TNcbi8na & GetNcbi8na(void) const
Get the variant data.
Definition: Seq_data_.hpp:590
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
const TComment & GetComment(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1058
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
bool IsIupacna(void) const
Check if variant Iupacna is selected.
Definition: Seq_data_.hpp:504
const TName & GetName(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1012
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
list< CRef< CLinkage_evidence > > TLinkage_evidence
Definition: Seq_gap_.hpp:118
const TRef & GetRef(void) const
Get the variant data.
Definition: Seq_ext_.cpp:136
bool CanGetInst(void) const
Check if it is safe to call GetInst method.
Definition: Bioseq_.hpp:330
bool IsSetLinkage_evidence(void) const
Check if a value has been assigned to Linkage_evidence data member.
Definition: Seq_gap_.hpp:357
bool IsSetTopology(void) const
Check if a value has been assigned to Topology data member.
Definition: Seq_inst_.hpp:708
bool IsSetFuzz(void) const
length uncertainty. Check if a value has been assigned to Fuzz data member.
Definition: Seq_inst_.hpp:687
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_data_.hpp:475
@ eRepr_const
constructed sequence
Definition: Seq_inst_.hpp:96
@ eRepr_ref
reference to another sequence
Definition: Seq_inst_.hpp:97
@ eRepr_seg
segmented sequence
Definition: Seq_inst_.hpp:95
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_map
ordered map of any kind
Definition: Seq_inst_.hpp:99
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eRepr_virtual
no seq data
Definition: Seq_inst_.hpp:93
@ eRepr_not_set
empty
Definition: Seq_inst_.hpp:92
@ eCompleteness_complete
complete biological entity
Definition: MolInfo_.hpp:156
@ eCompleteness_no_left
missing 5' or NH3 end
Definition: MolInfo_.hpp:158
@ eCompleteness_partial
partial but no details given
Definition: MolInfo_.hpp:157
@ eCompleteness_no_right
missing 3' or COOH end
Definition: MolInfo_.hpp:159
@ eCompleteness_no_ends
missing both ends
Definition: MolInfo_.hpp:160
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_physmap
from physical mapping techniques
Definition: MolInfo_.hpp:129
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_both
concept transl. w/ partial pept. seq.
Definition: MolInfo_.hpp:133
@ eTech_targeted
targeted locus sets/studies
Definition: MolInfo_.hpp:147
@ eTech_seq_pept_homol
sequenced peptide, ordered by homology
Definition: MolInfo_.hpp:135
@ eTech_composite_wgs_htgs
composite of WGS and HTGS
Definition: MolInfo_.hpp:145
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_seq_pept_overlap
sequenced peptide, ordered by overlap
Definition: MolInfo_.hpp:134
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_concept_trans
conceptual translation
Definition: MolInfo_.hpp:131
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_standard
standard sequencing
Definition: MolInfo_.hpp:124
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_seq_pept
peptide was sequenced
Definition: MolInfo_.hpp:132
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_barcode
barcode of life project
Definition: MolInfo_.hpp:144
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ eTech_concept_trans_a
conceptual transl. supplied by author
Definition: MolInfo_.hpp:136
@ eTech_genemap
from genetic mapping techniques
Definition: MolInfo_.hpp:128
@ e_not_set
No variant selected.
Definition: Seq_data_.hpp:103
@ e_Ncbipna
nucleic acid probabilities
Definition: Seq_data_.hpp:109
@ e_Gap
gap types
Definition: Seq_data_.hpp:114
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
@ e_Ncbi2na
2 bit nucleic acid code
Definition: Seq_data_.hpp:106
@ e_Iupacna
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
@ e_Ncbipaa
amino acid probabilities
Definition: Seq_data_.hpp:112
@ e_Ncbi8na
8 bit extended nucleic acid code
Definition: Seq_data_.hpp:108
@ e_Ncbi4na
4 bit nucleic acid code
Definition: Seq_data_.hpp:107
@ e_Iupacaa
IUPAC 1 letter amino acid code.
Definition: Seq_data_.hpp:105
@ e_Ncbi8aa
8 bit extended amino acid codes
Definition: Seq_data_.hpp:110
@ eBiomol_pre_RNA
precursor RNA of any sort really
Definition: MolInfo_.hpp:102
@ eBiomol_cRNA
viral RNA genome copy intermediate
Definition: MolInfo_.hpp:111
@ eBiomol_snoRNA
small nucleolar RNA
Definition: MolInfo_.hpp:112
@ eBiomol_transcribed_RNA
transcribed RNA other than existing classes
Definition: MolInfo_.hpp:113
@ eBiomol_other_genetic
other genetic material
Definition: MolInfo_.hpp:109
@ eGIBB_mod_no_right
missing right end (3' or COOH)
Definition: GIBB_mod_.hpp:82
@ eGIBB_mod_complete
Definition: GIBB_mod_.hpp:76
@ eGIBB_mod_chloroplast
Definition: GIBB_mod_.hpp:70
@ eGIBB_mod_cyanelle
Definition: GIBB_mod_.hpp:72
@ eGIBB_mod_mitochondrial
Definition: GIBB_mod_.hpp:69
@ eGIBB_mod_kinetoplast
Definition: GIBB_mod_.hpp:71
@ eGIBB_mod_macronuclear
Definition: GIBB_mod_.hpp:83
@ eGIBB_mod_dna
Definition: GIBB_mod_.hpp:65
@ eGIBB_mod_rna
Definition: GIBB_mod_.hpp:66
@ eGIBB_mod_partial
Definition: GIBB_mod_.hpp:75
@ eGIBB_mod_no_left
missing left end (5' for na, NH2 for aa)
Definition: GIBB_mod_.hpp:81
@ e_Embl
EMBL specific information.
Definition: Seqdesc_.hpp:127
@ e_Org
if all from one organism
Definition: Seqdesc_.hpp:116
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Update_date
date of last update
Definition: Seqdesc_.hpp:129
@ e_Pub
a reference to the publication
Definition: Seqdesc_.hpp:122
@ e_Pir
PIR specific info.
Definition: Seqdesc_.hpp:120
@ e_Genbank
GenBank specific info.
Definition: Seqdesc_.hpp:121
@ e_Prf
PRF specific information.
Definition: Seqdesc_.hpp:130
@ e_Mol_type
type of molecule
Definition: Seqdesc_.hpp:111
@ e_Sp
SWISSPROT specific info.
Definition: Seqdesc_.hpp:125
@ e_Comment
a more extensive comment
Definition: Seqdesc_.hpp:117
@ e_Method
sequencing method
Definition: Seqdesc_.hpp:113
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Modif
modifiers
Definition: Seqdesc_.hpp:112
@ e_Create_date
date entry first created/released
Definition: Seqdesc_.hpp:128
@ e_Title
a title for this sequence
Definition: Seqdesc_.hpp:115
@ e_Pdb
PDB specific information.
Definition: Seqdesc_.hpp:131
@ e_Name
a name for this sequence
Definition: Seqdesc_.hpp:114
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
@ eType_contamination
Definition: Seq_gap_.hpp:99
@ eType_clone
Deprecated. Used only for AGP 1.1.
Definition: Seq_gap_.hpp:91
@ eType_heterochromatin
Definition: Seq_gap_.hpp:93
@ eType_fragment
Deprecated. Used only for AGP 1.1.
Definition: Seq_gap_.hpp:90
@ eMol_not_set
> cdna = rna
Definition: Seq_inst_.hpp:109
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
@ eStrand_ss
single strand
Definition: Seq_inst_.hpp:135
@ eGIBB_mol_unknown
Definition: GIBB_mol_.hpp:65
@ eGIBB_mol_other
Definition: GIBB_mol_.hpp:76
@ eGIBB_mol_peptide
Definition: GIBB_mol_.hpp:73
@ e_Literal
a piece of sequence
Definition: Delta_seq_.hpp:90
@ e_Loc
point to a sequence
Definition: Delta_seq_.hpp:89
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
Definition of all error codes used in objtools libraries.
where both are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is orig
char * buf
int i
if(yy_accept[yy_current_state])
yy_size_t n
int len
static void text(MDB_val *v)
Definition: mdb_dump.c:62
static MDB_envinfo info
Definition: mdb_load.c:37
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
const char * tag
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
int toupper(Uchar c)
Definition: ncbictype.hpp:73
int islower(Uchar c)
Definition: ncbictype.hpp:66
int isupper(Uchar c)
Definition: ncbictype.hpp:70
Miscellaneous common-use basic types and functionality.
#define nullptr
Definition: ncbimisc.hpp:45
Defines: CTimeFormat - storage class for time format.
T max(T x_, T y_)
Int4 delta(size_t dimension_, const Int4 *score_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
double df(double x_, const double &y_)
Definition: njn_root.hpp:189
bool IsResidue(unsigned char residue)
Definition: utilities.hpp:88
CBioseq_set_Handle GetNucProtSetParent(const CBioseq_Handle &bioseq)
Definition: utilities.cpp:581
CScope::TIds GetSeqIdsForGI(TGi gi)
Definition: utilities.cpp:142
bool ShouldCheckForNsAndGap(const CBioseq_Handle &bsh)
Definition: utilities.cpp:1307
EBioseqEndIsType
Definition: utilities.hpp:156
@ eBioseqEndIsType_None
Definition: utilities.hpp:157
@ eBioseqEndIsType_All
Definition: utilities.hpp:159
bool IsNTNCNWACAccession(const string &acc)
Definition: utilities.cpp:2755
CBioseq_Handle GetNucBioseq(const CBioseq_set_Handle &bioseq_set)
Definition: utilities.cpp:587
void AppendBioseqLabel(string &str, const CBioseq &sq, bool supress_context)
Definition: utilities.cpp:1064
bool s_FeatureIdsMatch(const CFeat_id &f1, const CFeat_id &f2)
Definition: utilities.cpp:717
bool PartialsSame(const CSeq_loc &loc1, const CSeq_loc &loc2)
Definition: utilities.cpp:1463
bool IsLocFullLength(const CSeq_loc &loc, const CBioseq_Handle &bsh)
Definition: utilities.cpp:1451
string GetSequenceStringFromLoc(const CSeq_loc &loc, CScope &scope)
Definition: utilities.cpp:175
bool IsOrganelle(int genome)
Definition: utilities.cpp:2831
EDuplicateFeatureType IsDuplicate(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2, bool check_partials=false, bool case_sensitive=false)
Reports how two features duplicate each other.
Definition: utilities.cpp:1838
bool IsDicistronicGene(const CSeq_feat_Handle &f)
Indicates whether feature is a dicistronic gene.
Definition: utilities.cpp:1818
bool HasBadProteinStart(const CSeqVector &sv)
Definition: utilities.cpp:2467
void CheckBioseqEndsForNAndGap(const CBioseq_Handle &bsh, EBioseqEndIsType &begin_n, EBioseqEndIsType &begin_gap, EBioseqEndIsType &end_n, EBioseqEndIsType &end_gap, bool &begin_ambig, bool &end_ambig)
Definition: utilities.cpp:1422
bool IsNG(const CSeq_id &id)
Definition: utilities.cpp:2790
bool SeqIsPatent(const CBioseq &seq)
Definition: utilities.cpp:1155
bool IsFarLocation(const CSeq_loc &loc, const CSeq_entry_Handle &seh)
Definition: utilities.cpp:159
CRef< CSeqVector > MakeSeqVectorForResidueCounting(const CBioseq_Handle &bsh)
Definition: utilities.cpp:2455
EAccessionFormatError ValidateAccessionString(const string &accession, bool require_version)
Definition: utilities.cpp:624
EDuplicateFeatureType
Definition: utilities.hpp:191
@ eDuplicate_Duplicate
Definition: utilities.hpp:193
@ eDuplicate_DuplicateDifferentTable
Definition: utilities.hpp:195
@ eDuplicate_SameIntervalDifferentLabel
Definition: utilities.hpp:194
@ eDuplicate_Not
Definition: utilities.hpp:192
@ eDuplicate_SameIntervalDifferentLabelDifferentTable
Definition: utilities.hpp:196
const CSeq_feat::TDbxref TDbtags
Definition: utilities.hpp:199
bool IsTemporary(const CSeq_id &id)
Definition: utilities.cpp:2817
bool IsBioseqTSA(const CBioseq &seq, CScope *scope)
Definition: utilities.cpp:884
@ eAccessionFormat_valid
Definition: utilities.hpp:106
bool g_IsMasterAccession(const CSeq_id &id)
Definition: utilities.cpp:2999
size_t CountProteinStops(const CSeqVector &sv)
Definition: utilities.cpp:2504
bool s_PartialAtGapOrNs(CScope *scope, const CSeq_loc &loc, unsigned int tag, bool only_gap=false)
Definition: utilities.cpp:1176
static char tmp[2048]
Definition: utf8.c:42
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
#define fi
static const char * prefix[]
Definition: pcregrep.c:405
#define FOR_EACH_PUB_ON_PUBDESC(Itr, Var)
FOR_EACH_PUB_ON_PUBDESC EDIT_EACH_PUB_ON_PUBDESC.
Definition: pub_macros.hpp:127
int offset
Definition: replacements.h:160
#define FOR_EACH_DESCRIPTOR_ON_BIOSEQ
Definition: seq_macros.hpp:241
#define FOR_EACH_ANNOT_ON_BIOSEQ
Definition: seq_macros.hpp:286
#define FOR_EACH_SEQID_ON_BIOSEQ(Itr, Var)
FOR_EACH_SEQID_ON_BIOSEQ EDIT_EACH_SEQID_ON_BIOSEQ.
Definition: seq_macros.hpp:308
#define IF_EXISTS_CLOSEST_BIOSOURCE(Cref, Var, Lvl)
IF_EXISTS_CLOSEST_BIOSOURCE.
Definition: seq_macros.hpp:159
#define FOR_EACH_KEYWORD_ON_GENBANKBLOCK(Itr, Var)
FOR_EACH_KEYWORD_ON_GENBANKBLOCK EDIT_EACH_KEYWORD_ON_GENBANKBLOCK.
CSubSource::TSubtype TSUBSOURCE_SUBTYPE
#define NCBI_GENOME(Type)
Convenience macros for NCBI objects
#define FOR_EACH_SYNONYM_ON_GENEREF(Itr, Var)
FOR_EACH_SYNONYM_ON_GENEREF EDIT_EACH_SYNONYM_ON_GENEREF.
#define NCBI_ORGMOD(Type)
COrgMod definitions.
#define FOR_EACH_GBQUAL_ON_SEQFEAT(Itr, Var)
FOR_EACH_GBQUAL_ON_SEQFEAT EDIT_EACH_GBQUAL_ON_SEQFEAT.
#define FOR_EACH_SUBSOURCE_ON_BIOSOURCE(Itr, Var)
FOR_EACH_SUBSOURCE_ON_BIOSOURCE EDIT_EACH_SUBSOURCE_ON_BIOSOURCE.
COrgMod::TSubtype TORGMOD_SUBTYPE
#define FOR_EACH_SEQFEATXREF_ON_SEQFEAT(Itr, Var)
FOR_EACH_SEQFEATXREF_ON_SEQFEAT EDIT_EACH_SEQFEATXREF_ON_SEQFEAT.
#define FOR_EACH_DBXREF_ON_FEATURE
#define NCBI_SEQID(Type)
Convenience macros for NCBI objects
#define NCBI_ACCN(Type)
#define FOR_EACH_SEQENTRY_ON_SEQSET(Itr, Var)
FOR_EACH_SEQENTRY_ON_SEQSET EDIT_EACH_SEQENTRY_ON_SEQSET.
#define FIELD_IS_SET_AND_IS(Var, Fld, Chs)
FIELD_IS_SET_AND_IS base macro.
#define FOR_EACH_STRING_IN_LIST(Itr, Var)
FOR_EACH_STRING_IN_LIST EDIT_EACH_STRING_IN_LIST.
#define RAW_FIELD_IS_EMPTY_OR_UNSET(Var, Fld)
RAW_FIELD_IS_EMPTY_OR_UNSET macro.
#define GET_FIELD(Var, Fld)
GET_FIELD base macro.
#define FOR_EACH_CHAR_IN_STRING(Itr, Var)
FOR_EACH_CHAR_IN_STRING EDIT_EACH_CHAR_IN_STRING.
bool seq_mac_is_unique(Iterator iter1, Iterator iter2, Predicate pred)
#define BEGIN_COMMA_END(container)
static const char * str(char *buf, int n)
Definition: stats.c:84
CSeqFeatData::ESubtype feat_subtype
Definition: cache_impl.hpp:158
SAnnotSelector –.
bool operator()(const CTempString &lhs, const CTempString &rhs) const
bool operator()(const CTempString &lhs, const CTempString &rhs) const
Selector used in CSeqMap methods returning iterators.
Definition: seq_map_ci.hpp:113
Definition: type.c:6
#define _ASSERT
else result
Definition: token2.c:20
map< string, string > TViralMap
bool HasExcludedAnnotation(const CSeq_loc &loc, CBioseq_Handle far_bsh)
static bool s_NotPeptideException(const CSeq_feat &curr, const CSeq_feat &prev)
static char CheckForBadFileIDSeqIdChars(const string &id)
bool s_ContainedIn(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
bool s_FieldHasLabel(const CUser_field &field, const string &label)
bool s_AfterIsGapORN(TSeqPos pos, TSeqPos after, TSeqPos len, const CSeqVector &vec)
bool s_CheckIntervals(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
MAKE_CONST_MAP(kViralStrandMap, string, string, { {"root", "dsDNA"}, {"Alphasatellitidae", "ssDNA"}, {"Anelloviridae", "ssDNA(-)"}, {"Bacilladnaviridae", "ssDNA"}, {"Bidnaviridae", "ssDNA"}, {"Circoviridae", "ssDNA(+/-)"}, {"Geminiviridae", "ssDNA(+/-)"}, {"Genomoviridae", "ssDNA"}, {"Hepadnaviridae", "dsDNA-RT"}, {"Inoviridae", "ssDNA(+)"}, {"Microviridae", "ssDNA(+)"}, {"Nanoviridae", "ssDNA(+)"}, {"Ortervirales", "ssRNA-RT"}, {"Caulimoviridae", "dsDNA-RT"}, {"Parvoviridae", "ssDNA(+/-)"}, {"Alphapleolipovirus", "dsDNA; ssDNA"}, {"Riboviria", "RNA"}, {"Albetovirus", "ssRNA(+)"}, {"Alphatetraviridae", "ssRNA(+)"}, {"Alvernaviridae", "ssRNA(+)"}, {"Amalgaviridae", "dsRNA"}, {"Astroviridae", "ssRNA(+)"}, {"Aumaivirus", "ssRNA(+)"}, {"Avsunviroidae", "ssRNA"}, {"Barnaviridae", "ssRNA(+)"}, {"Benyviridae", "ssRNA(+)"}, {"Birnaviridae", "dsRNA"}, {"Botourmiaviridae", "ssRNA(+)"}, {"Botybirnavirus", "dsRNA"}, {"Bromoviridae", "ssRNA(+)"}, {"Caliciviridae", "ssRNA(+)"}, {"Carmotetraviridae", "ssRNA(+)"}, {"Chrysoviridae", "dsRNA"}, {"Closteroviridae", "ssRNA(+)"}, {"Cystoviridae", "dsRNA"}, {"Deltavirus", "ssRNA(-)"}, {"dsRNA viruses", "dsRNA"}, {"Endornaviridae", "dsRNA"}, {"Flaviviridae", "ssRNA(+)"}, {"Hepeviridae", "ssRNA(+)"}, {"Hypoviridae", "ssRNA(+)"}, {"Idaeovirus", "ssRNA(+)"}, {"Kitaviridae", "ssRNA(+)"}, {"Leviviridae", "ssRNA(+)"}, {"Luteoviridae", "ssRNA(+)"}, {"Matonaviridae", "ssRNA(+)"}, {"Megabirnaviridae", "dsRNA"}, {"Narnaviridae", "ssRNA(+)"}, {"Haploviricotina", "ssRNA(-)"}, {"Arenaviridae", "ssRNA(+/-)"}, {"Coguvirus", "ssRNA(-)"}, {"Cruliviridae", "ssRNA(-)"}, {"Fimoviridae", "ssRNA(-)"}, {"Hantaviridae", "ssRNA(-)"}, {"Leishbuviridae", "ssRNA(-)"}, {"Mypoviridae", "ssRNA(-)"}, {"Nairoviridae", "ssRNA(-)"}, {"Peribunyaviridae", "ssRNA(-)"}, {"Phasmaviridae", "ssRNA(-)"}, {"Banyangvirus", "ssRNA(+/-)"}, {"Beidivirus", "ssRNA(-)"}, {"Goukovirus", "ssRNA(-)"}, {"Horwuvirus", "ssRNA(-)"}, {"Hudivirus", "ssRNA(-)"}, {"Hudovirus", "ssRNA(-)"}, 
{"Kabutovirus", "ssRNA(-)"}, {"Laulavirus", "ssRNA(-)"}, {"Mobuvirus", "ssRNA(-)"}, {"Phasivirus", "ssRNA(-)"}, {"Phlebovirus", "ssRNA(+/-)"}, {"Pidchovirus", "ssRNA(-)"}, {"Tenuivirus", "ssRNA(-)"}, {"Wenrivirus", "ssRNA(-)"}, {"Wubeivirus", "ssRNA(-)"}, {"Tospoviridae", "ssRNA(+/-)"}, {"Wupedeviridae", "ssRNA(-)"}, {"Insthoviricetes", "ssRNA(-)"}, {"Nidovirales", "ssRNA(+)"}, {"Nodaviridae", "ssRNA(+)"}, {"Papanivirus", "ssRNA(+)"}, {"Partitiviridae", "dsRNA"}, {"Permutotetraviridae", "ssRNA(+)"}, {"Picobirnaviridae", "dsRNA"}, {"Picornavirales", "ssRNA(+)"}, {"Pospiviroidae", "ssRNA"}, {"Potyviridae", "ssRNA(+)"}, {"Quadriviridae", "dsRNA"}, {"Reoviridae", "dsRNA"}, {"Sarthroviridae", "ssRNA(+)"}, {"Sinaivirus", "ssRNA(+)"}, {"Solemoviridae", "ssRNA(+)"}, {"Solinviviridae", "ssRNA(+)"}, {"Togaviridae", "ssRNA(+)"}, {"Tombusviridae", "ssRNA(+)"}, {"Totiviridae", "dsRNA"}, {"Tymovirales", "ssRNA(+)"}, {"Virgaviridae", "ssRNA(+)"}, {"Virtovirus", "ssRNA(+)"}, {"ssRNA viruses", "ssRNA"}, {"unclassified ssRNA viruses", "ssRNA"}, {"unclassified ssRNA negative-strand viruses", "ssRNA(-)"}, {"unclassified ssRNA positive-strand viruses", "ssRNA(+)"}, {"unclassified viroids", "ssRNA"}, {"DNA satellites", "DNA"}, {"RNA satellites", "RNA"}, {"Smacoviridae", "ssDNA"}, {"Spiraviridae", "ssDNA(+)"}, {"Tolecusatellitidae", "ssDNA"}, {"unclassified viruses", "unknown"}, {"unclassified DNA viruses", "DNA"}, {"unclassified archaeal dsDNA viruses", "dsDNA"}, {"unclassified dsDNA phages", "dsDNA"}, {"unclassified dsDNA viruses", "dsDNA"}, {"unclassified ssDNA bacterial viruses", "ssDNA"}, {"unclassified ssDNA viruses", "ssDNA"}, {"environmental samples", "unknown"}, })
static TViralMap s_InitializeViralMap()
static bool s_MatchPartialType(const CSeq_loc &loc1, const CSeq_loc &loc2, unsigned int partial_type)
@ e_RnaPosition_MIDDLE_RIBOSOMAL_SUBUNIT
@ e_RnaPosition_INTERNAL_SPACER_X
@ e_RnaPosition_LEFT_RIBOSOMAL_SUBUNIT
@ e_RnaPosition_Ignore
@ e_RnaPosition_INTERNAL_SPACER_2
@ e_RnaPosition_RIGHT_RIBOSOMAL_SUBUNIT
@ e_RnaPosition_INTERNAL_SPACER_1
static bool s_SubsequentIntron(CFeat_CI feat_ci_dup, Int4 start, Int4 stop, Int4 max)
#define FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(Itr, Var)
static bool s_GetFlankingGapTypes(const CSeq_inst &inst, CSeq_gap::TType &fst, CSeq_gap::TType &lst)
static bool s_SeqIdMatch(const CConstRef< CSeq_id > &q1, const CConstRef< CSeq_id > &q2)
bool x_IsPseudo(const CGene_ref &ref)
static bool s_LocSortCompare(const CConstRef< CSeq_loc > &q1, const CConstRef< CSeq_loc > &q2)
static int CountNs(const CSeq_data &seq_data, TSeqPos len)
bool s_BeforeIsGapOrN(TSeqPos pos, TSeqPos before, const CSeqVector &vec)
static bool x_BadCDSinVDJC(const CSeq_loc &cdsloc, const CSeq_loc &vdjcloc, CScope *scope)
static bool x_FeatIsVDJC(const CSeq_feat &ft)
static int s_MaxNsInSeqLitForTech(CMolInfo::TTech tech)
unsigned int s_IdXrefsNotReciprocal(const CSeq_feat &cds, const CSeq_feat &mrna)
bool s_IsCDDFeat(const CMappedFeat &feat)
static EDiagSev GetBioseqEndWarning(const CBioseq &seq, bool is_circular, EBioseqEndIsType end_is_char)
bool s_CheckPosNOrGap(TSeqPos pos, const CSeqVector &vec)
bool s_DbtagEqual(const CRef< CDbtag > &dbt1, const CRef< CDbtag > &dbt2)
static bool x_FeatIsCDS(const CSeq_feat &ft)
bool s_HasGI(const CBioseq &seq)
bool s_AfterIsGap(TSeqPos pos, TSeqPos after, TSeqPos len, const CSeqVector &vec)
static optional< int > s_MaxSeqStretchIfLessThanThreshold(const CSeqVector &vec, int threshold)
bool HasUnverified(CBioseq_Handle bsh)
static bool s_OrgModEqual(const CRef< COrgMod > &om1, const CRef< COrgMod > &om2)
string s_GetMrnaProductString(const CSeq_feat &mrna)
static bool s_SubsourceEquivalent(const CRef< CSubSource > &st1, const CRef< CSubSource > &st2)
bool x_HasNamedQual(const CSeq_feat &feat, const string &qual)
static char CheckForBadSeqIdChars(const string &id)
static string s_GetKeywordForStructuredComment(const CUser_object &obj)
TGi GetGIForSeqId(const CSeq_id &id, CScope &scope)
bool StrandsMatch(ENa_strand s1, ENa_strand s2)
static CBioseq_Handle s_GetParent(const CBioseq_Handle &part)
static ERnaPosition s_RnaPosition(const CSeq_feat &feat)
bool s_AreAdjacent(ERnaPosition pos1, ERnaPosition pos2)
bool lists_match(Iterator iter1, Iterator iter1_stop, Iterator iter2, Iterator iter2_stop, Predicate pred)
static bool s_IsConWithGaps(const CBioseq &seq)
static bool s_BiosrcFullLengthIsOk(const CBioSource &src)
static bool s_StandaloneProt(const CBioseq_Handle &bsh)
static TSeqPos s_GetDeltaLen(const CDelta_seq &seg, CScope *scope)
bool s_IdXrefsAreReciprocal(const CSeq_feat &cds, const CSeq_feat &mrna)
static bool HasAssemblyOrNullGap(const CBioseq &seq)
static bool s_IsTPAAssemblyOkForBioseq(const CBioseq &seq, bool has_refseq)
bool s_HasTpaUserObject(CBioseq_Handle bsh)
static bool s_OrgrefEquivalent(const COrg_ref &org1, const COrg_ref &org2)
bool s_GeneralTagsMatch(const string &protein_id, const CDbtag &dbtag)
static bool s_WillReportTerminalGap(const CBioseq &seq, CBioseq_Handle bsh)
string s_GetMrnaProteinLink(const CUser_field &field)
static bool s_ReportableCollision(const CGene_ref &g1, const CGene_ref &g2)
static char CheckForBadLocalIdChars(const string &id)
USING_SCOPE(sequence)
bool s_BeforeIsGap(TSeqPos pos, TSeqPos before, const CSeqVector &vec)
static bool s_IsSkippableDbtag(const CDbtag &dbt)
static void s_MakePubLabelString(const CPubdesc &pd, string &label)
static void s_GetGeneTextLabel(const CSeq_feat &feat, string &label)
static vector< int > s_LocationToStartStopPairs(const CSeq_loc &loc)
static void GetDateString(string &out_date_str, const CDate &date)
static bool s_SeqIdCompare(const CConstRef< CSeq_id > &q1, const CConstRef< CSeq_id > &q2)
static bool s_IsSwissProt(const CBioseq &seq)
bool s_FieldHasNonBlankValue(const CUser_field &field)
static string linkEvStrings[]
static bool s_IsUnspecified(const CSeq_gap &gap)
static bool s_SuppressMultipleEquivBioSources(const CBioSource &src)
bool s_OverlapOrAbut(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
static bool x_IsWgsSecondary(const CBioseq &seq)
Modified on Thu Mar 28 17:07:59 2024 by modify_doxy.py rev. 669887