NCBI C++ ToolKit
validerror_bioseq.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: validerror_bioseq.cpp 100298 2023-07-18 17:48:57Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko, Mati Shomrat ......
27  *
28  * File Description:
29  * validation of bioseq
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/ncbitime.hpp>
37 #include <corelib/ncbimisc.hpp>
38 
42 #include <objtools/error_codes.hpp>
43 
45 
46 #include <objmgr/seqdesc_ci.hpp>
47 #include <objmgr/graph_ci.hpp>
48 #include <objmgr/util/sequence.hpp>
49 
51 
52 #include <optional>
53 
54 
55 #define NCBI_USE_ERRCODE_X Objtools_Validator
56 
59 BEGIN_SCOPE(validator)
60 USING_SCOPE(sequence);
61 USING_SCOPE(feature);
62 
63 class CCdsMatchInfo;
64 
// Bookkeeping record built from an mRNA feature and a scope (see the
// constructor). It tracks whether a match has been recorded for the feature
// (SetMatch / HasMatch) and whether it has been flagged pseudo (SetPseudo).
// NOTE(review): part of the private section is missing from this listing, so
// only the two boolean flags below are visible here.
65 class CMrnaMatchInfo : public CObject {
66 public:
67  CMrnaMatchInfo(const CSeq_feat& mrna, CScope* scope);
    // Accessor for the feature held by this record.
68  const CSeq_feat& GetSeqfeat() const;
    // Overlap test against a coding-region feature.
69  bool Overlaps(const CSeq_feat& cds) const;
70  void SetMatch();
71  bool HasMatch() const;
    // Stores `val` in m_IsPseudo; calling it with no argument sets the flag.
72  void SetPseudo(bool val = true) { m_IsPseudo = val; }
    // NOTE(review): presumably reports whether this mRNA is acceptable with no
    // matching CDS; the definition is outside this listing -- confirm there.
73  bool OkWithoutCds(bool isGenbank = false) const;
74 
75 private:
77 
79  bool m_HasMatch;
80  bool m_IsPseudo;
81 };
82 
83 
// Bookkeeping record built from a CDS feature and a scope (see the
// constructor). It exposes several ways of assigning a matching mRNA record
// (AssignXrefMatch, AssignOverlapMatch, AssignMatch) and keeps a list of
// other mRNA features in m_OtherMrnas.
// NOTE(review): TmRNAList and most private members are not visible in this
// listing; the method semantics below are taken from the declarations only.
84 class CCdsMatchInfo : public CObject {
85 public:
86  CCdsMatchInfo(const CSeq_feat& cds, CScope* scope);
    // Accessor for the feature held by this record.
87  const CSeq_feat& GetSeqfeat() const;
    // Overlap test against an mRNA feature.
88  bool Overlaps(const CSeq_feat& mrna) const;
    // Matching entry points; each takes the list of still-unmatched mRNAs by
    // non-const reference and returns a bool.
89  bool AssignXrefMatch(TmRNAList& unmatched_mrnas, const CTSE_Handle& tse);
90  bool AssignOverlapMatch(TmRNAList& unmatched_mrnas, CScope& scope);
91  void UpdateOtherMrnas(const TmRNAList& unmatched_mrnas);
    // Number of entries currently stored in m_OtherMrnas.
92  size_t CountOtherMrnas() { return m_OtherMrnas.size(); }
95  bool AssignMatch(TmRNAList& mrna_map, CFeatTree& feat_tree, CScope& scope);
96  bool HasMatch() const;
    // Setter / getter pair for the "needs match" state.
97  void NeedsMatch(bool needs_match);
98  bool NeedsMatch() const;
99  const CMrnaMatchInfo& GetMatch() const;
100  bool IsPseudo() const;
101  void SetPseudo();
102 
103 private:
106 
    // Const references to mRNA features; sized by CountOtherMrnas().
111  list<CConstRef<CSeq_feat> > m_OtherMrnas;
113 };
114 
115 
116 // =============================================================================
117 // Public
118 // =============================================================================
119 
121  CValidError_base(imp), m_AnnotValidator(imp), m_DescrValidator(imp), m_FeatValidator(imp), m_GeneIt(nullptr), m_AllFeatIt(nullptr)
122 {
123 }
124 
125 
127 {
128 }
129 
131 {
132 
135 
136  if (bsh.IsSetInst_Repr()) {
137  repr = bsh.GetInst_Repr();
138  }
139 
141  while (m)
142  {
143  const CSeqdesc::TMolinfo& mi = m->GetMolinfo();
144  if (mi.IsSetTech()) {
145  tech = mi.GetTech();
146  }
147 
148  ++m;
149  }
150 
151  for (auto id : bsh.GetId()) {
152  CSeq_id::EAccessionInfo acc_info = id.IdentifyAccession();
153  unsigned int acc_div = acc_info & CSeq_id::eAcc_division_mask;
154  if (acc_div == CSeq_id::eAcc_wgs && tech == CMolInfo::eTech_wgs && repr == CSeq_inst::eRepr_virtual) {
155  bool is_wgs_master = (acc_info & CSeq_id::fAcc_master) != 0;
156  if (is_wgs_master) {
157  m_report_short_seq = false;
158  }
159  }
160  }
161 
163  while (d)
164  {
165  const CSeqdesc::TSource& source = d->GetSource();
166 
167  // look for chromosome, prokaryote, linkage group
169  if ((*it)->IsSetSubtype() && (*it)->IsSetName() && !NStr::IsBlank((*it)->GetName())) {
170  if ((*it)->GetSubtype() == CSubSource::eSubtype_chromosome) {
172  } else if ((*it)->GetSubtype() == CSubSource::eSubtype_linkage_group) {
174  }
175  }
176  }
177  if (source.IsSetLineage()) {
178  string lineage = source.GetLineage();
179  if (NStr::StartsWith(lineage, "Bacteria; ") ||
180  NStr::StartsWith(lineage, "Archaea; ")) {
183  m_is_bact_or_arch = true;
184  }
185  if (NStr::StartsWith(lineage, "Viruses; ")) {
187  }
188  }
189  if (source.IsSetDivision()) {
190  string div = source.GetDivision();
191  if (NStr::Equal(div, "BCT") || NStr::Equal(div, "VRL")) {
194  }
195  }
196  if (source.IsSetGenome()) {
197  CBioSource::TGenome genome = source.GetGenome();
198  // check for organelle
199  if (IsOrganelle(genome)) {
201  }
202  m_is_plasmid = (genome == NCBI_GENOME(plasmid));
203  m_is_chromosome = (genome == NCBI_GENOME(chromosome));
204  m_is_extrachrom = (genome == NCBI_GENOME(extrachrom));
205  }
206 
207  ++d;
208  }
209 
210 }
211 
212 
214  const CBioseq& seq)
215 {
216  m_splicing_not_expected = false;
218  m_report_short_seq = true;
219  m_is_bact_or_arch = false;
220  m_is_plasmid = false;
221  m_is_chromosome = false;
222  m_is_extrachrom = false;
223 
224  try {
226 
228 
229  CSeq_entry_Handle appropriate_parent;
230  if (m_Imp.ShouldSubdivide()) {
232  }
233  if (appropriate_parent) {
234  CRef<CScope> tmp_scope(new CScope(*(CObjectManager::GetInstance())));
235  tmp_scope->AddDefaults();
236  CSeq_entry_Handle this_seh = tmp_scope->AddTopLevelSeqEntry(*(appropriate_parent.GetCompleteSeq_entry()));
237  m_FeatValidator.SetScope(*tmp_scope);
238  m_FeatValidator.SetTSE(this_seh);
239  } else {
242  }
243 
244  try {
245  CCacheImpl::SFeatKey gene_key(
248  m_GeneIt = &GetCache().GetFeatFromCache(gene_key);
249 
250  CCacheImpl::SFeatKey all_feat_key(
253  m_AllFeatIt = &GetCache().GetFeatFromCache(all_feat_key);
254  } catch ( const exception& ) {
255  // sequence might be too broken to validate features
256  m_GeneIt = nullptr;
257  m_AllFeatIt = nullptr;
258  }
259  ValidateSeqIds(seq);
260  ValidateInst(seq);
262  ValidateHistory(seq);
263  FOR_EACH_ANNOT_ON_BIOSEQ (annot, seq) {
266  }
267  if (seq.IsSetDescr()) {
268  if (m_CurrentHandle) {
270  if (ctx) {
271  m_DescrValidator.ValidateSeqDescr (seq.GetDescr(), *(ctx.GetCompleteSeq_entry()));
272  }
273  }
274  }
275  if (IsWGSMaster(seq, m_CurrentHandle.GetScope())) {
277  }
278  if (appropriate_parent) {
281  }
282 
283  } catch ( const exception& e ) {
285  string("Exception while validating bioseq. EXCEPTION: ") +
286  e.what(), seq);
287  }
289  if (m_GeneIt) {
290  m_GeneIt = nullptr;
291  }
292  if (m_AllFeatIt) {
293  m_AllFeatIt = nullptr;
294  }
295 }
296 
297 
298 static bool s_IsSkippableDbtag (const CDbtag& dbt)
299 {
300  if (!dbt.IsSetDb()) {
301  return false;
302  }
303  const string& db = dbt.GetDb();
304  if (NStr::EqualNocase(db, "TMSMART")
305  || NStr::EqualNocase(db, "BankIt")
306  || NStr::EqualNocase (db, "NCBIFILE")) {
307  return true;
308  } else {
309  return false;
310  }
311 }
312 
// Returns the first '|' or ',' found in `id`, scanning left to right,
// or '\0' when the string contains neither character (including when it
// is empty).
static char CheckForBadSeqIdChars(const std::string& id)
{
    // find_first_of replaces the hand-written character loop.
    const std::string::size_type pos = id.find_first_of("|,");
    return (pos == std::string::npos) ? '\0' : id[pos];
}
322 
323 // VR-748
324 static char CheckForBadLocalIdChars(const string& id)
325 {
326  for (size_t i = 0; i < id.length(); i++) {
327  if (!CSeq_id::IsValidLocalID(id.substr(i, 1))) {
328  return id.c_str()[i];
329  }
330  }
331  return '\0';
332 }
333 
334 
// Returns the first '|' or ',' found in `id`, scanning left to right,
// or '\0' when the string contains neither character (including when it
// is empty). The rejected character set is currently the same as the one
// used by CheckForBadSeqIdChars.
static char CheckForBadFileIDSeqIdChars(const std::string& id)
{
    // find_first_of replaces the hand-written character loop.
    const std::string::size_type pos = id.find_first_of("|,");
    return (pos == std::string::npos) ? '\0' : id[pos];
}
343 
344 
345 // validation for individual Seq-id
// Validates one Seq-id (`id`) that belongs to Bioseq `ctx`.
//
// Step 1: check that `id`, looked up inside ctx's TSE, resolves back to ctx
//         itself (and not to a different Bioseq or to nothing).
// Step 2: run format checks chosen by id.Which(). Several case groups fall
//         through on purpose, so checks written for later cases also run for
//         the id types listed in earlier cases.
//
// `longer_general` selects a 100-character limit, instead of
// CSeq_id::kMaxGeneralTagLength, for string tags of general ids.
//
// NOTE(review): in this listing a number of statements are missing their
// first line (the call that consumes the message string and `ctx`), so for
// those diagnostics only the message text is visible here.
346 void CValidError_bioseq::ValidateSeqId(const CSeq_id& id, const CBioseq& ctx, bool longer_general)
347 {
348  // see if ID can be used to find ctx
349  CBioseq_Handle ctx_handle = m_Scope->GetBioseqHandle(ctx);
350  if (!ctx_handle) {
     // ctx itself cannot be resolved in the scope: nothing further is checked.
351  if (!m_Imp.IsPatent()) {
353  "BioseqFind (" + id.AsFastaString() +
354  ") unable to find itself - possible internal error", ctx);
355  }
356  return;
357  }
358  CTSE_Handle tse = ctx_handle.GetTSE_Handle();
359  CBioseq_Handle bsh = tse.GetBioseqHandle(id);
360 
361  if (bsh) {
362  CConstRef<CBioseq> core = bsh.GetBioseqCore();
363  if (!core) {
364  if ( !m_Imp.IsPatent() ) {
366  "BioseqFind (" + id.AsFastaString() +
367  ") unable to find itself - possible internal error", ctx);
368  }
369  } else if ( core.GetPointer() != &ctx ) {
     // The id resolves to a Bioseq object other than ctx.
371  "SeqID " + id.AsFastaString() +
372  " is present on multiple Bioseqs in record", ctx);
373  }
374  } else {
376  "BioseqFind (" + id.AsFastaString() +
377  ") unable to find itself - possible internal error", ctx);
378  }
379 
380  //check formatting
381  const CTextseq_id* tsid = id.GetTextseq_Id();
382 
383  switch (id.Which()) {
384  case CSeq_id::e_Tpg:
385  case CSeq_id::e_Tpe:
386  case CSeq_id::e_Tpd:
387  if ( IsHistAssemblyMissing(ctx) && ctx.IsNa() ) {
389  "TPA record " + ctx.GetId().front()->AsFastaString() +
390  " should have Seq-hist.assembly for PRIMARY block",
391  ctx);
392  }
393  // Fall thru
394  case CSeq_id::e_Genbank:
395  case CSeq_id::e_Embl:
396  case CSeq_id::e_Ddbj:
397  if ( tsid && tsid->IsSetAccession() ) {
398  const string& acc = tsid->GetAccession();
399  const char badch = CheckForBadSeqIdChars (acc);
400  if (badch != '\0') {
402  "Bad character '" + string(1, badch) + "' in accession '" + acc + "'", ctx);
403  }
     // NOTE(review): the declaration of `info` is on a line missing from this
     // listing. The visible test rejects an unknown accession, or one whose
     // protein/nucleotide flag disagrees with the molecule type of ctx.
405  if (info == CSeq_id::eAcc_unknown ||
406  (ctx.IsNa() && (info & CSeq_id::fAcc_prot)) ||
407  (ctx.IsAa() && (info & CSeq_id::fAcc_nuc))) {
409  "Bad accession " + acc, ctx);
410  }
411  // Check for secondary conflicts
414  }
415  // Fall thru
416  case CSeq_id::e_Other:
417  if ( tsid ) {
418  if ( tsid->IsSetName() ) {
419  const string& name = tsid->GetName();
     // Only the first whitespace character is acted on (note the break).
420  ITERATE (string, s, name) {
421  if (isspace((unsigned char)(*s))) {
424  "Seq-id.name '" + name + "' should be a single "
425  "word without any spaces", ctx);
426  break;
427  }
428  }
429  }
430 
     // The accession-shape checks below run only for ids of type Other
     // (id.IsOther()), even though other id types fall through to this case.
431  if ( tsid->IsSetAccession() && id.IsOther() ) {
432  const string& acc = tsid->GetAccession();
433  const char badch = CheckForBadSeqIdChars (acc);
434  if (badch != '\0') {
436  "Bad character '" + string(1, badch) + "' in accession '" + acc + "'", ctx);
437  }
438  size_t num_letters = 0;
439  size_t num_digits = 0;
440  size_t num_underscores = 0;
441  bool bad_id_chars = false;
442  bool is_NZ = (NStr::CompareNocase(acc, 0, 3, "NZ_") == 0);
443  size_t i = 0;
444  bool letter_after_digit = false;
445 
     // For "NZ_" accessions the three-character prefix is excluded from the counts.
446  if ( is_NZ ) {
447  i = 3;
448  }
449 
     // Classify each remaining character: uppercase letter, digit, underscore,
     // or anything else (which marks the accession as bad).
450  for ( ; i < acc.length(); ++i ) {
451  if ( isupper((unsigned char) acc[i]) ) {
452  num_letters++;
453  } else if ( isdigit((unsigned char) acc[i]) ) {
454  num_digits++;
455  } else if ( acc[i] == '_' ) {
456  num_underscores++;
     // Despite its name, this flag is set when an underscore appears after
     // a digit or when more than one underscore has been seen.
457  if ( num_digits > 0 || num_underscores > 1 ) {
458  letter_after_digit = true;
459  }
460  } else {
461  bad_id_chars = true;
462  }
463  }
464 
     // The chain below lists the accepted letter/digit/underscore shapes;
     // anything not matched ends in the final "Bad accession" branch.
465  if ( letter_after_digit || bad_id_chars ) {
467  "Bad accession " + acc, ctx);
468  } else if ( is_NZ && ( num_letters == 4 || num_letters == 6 ) &&
469  ( num_digits >= 8 && num_digits <= 11 ) && num_underscores == 0 ) {
470  // valid accession - do nothing!
471  } else if ( is_NZ && ValidateAccessionString (acc, false) == eAccessionFormat_valid ) {
472  // valid accession - do nothing!
473  } else if ( num_letters == 2 &&
474  (num_digits == 6 || num_digits == 8 || num_digits == 9) &&
475  num_underscores == 1 ) {
476  // valid accession - do nothing!
477  } else if (num_letters == 4 && num_digits == 10 && ctx.IsNa()) {
     // Empty branch: this shape on a nucleotide is accepted without a diagnostic.
478  } else {
480  "Bad accession " + acc, ctx);
481  }
482  }
483  }
484  // Fall thru
485  case CSeq_id::e_Pir:
487  case CSeq_id::e_Prf:
488  if ( tsid ) {
     // Missing/empty accession on a nucleotide. The nested conditions exempt
     // segmented sequences unless m_Imp.IsGI() is true, and always exempt
     // DDBJ ids on segmented sequences.
489  if ( ctx.IsNa() &&
490  (!tsid->IsSetAccession() || tsid->GetAccession().empty())) {
491  if ( ctx.GetInst().GetRepr() != CSeq_inst::eRepr_seg ||
492  m_Imp.IsGI()) {
493  if (!id.IsDdbj() ||
494  ctx.GetInst().GetRepr() != CSeq_inst::eRepr_seg) {
495  string msg = "Missing accession for " + id.AsFastaString();
498  msg, ctx);
499  }
500  }
501  }
502  } else {
504  "Seq-id type not handled", ctx);
505  }
506  break;
507  case CSeq_id::e_Gi:
508  if (id.GetGi() <= ZERO_GI) {
510  "Invalid GI number", ctx);
511  }
512  break;
513  case CSeq_id::e_General:
514  if (!id.GetGeneral().IsSetDb() || NStr::IsBlank(id.GetGeneral().GetDb())) {
515  PostErr(eDiag_Error, eErr_SEQ_INST_BadSeqIdFormat, "General identifier missing database field", ctx);
516  }
517  if (id.GetGeneral().IsSetDb()) {
518  const CDbtag& dbt = id.GetGeneral();
519  size_t dblen = dbt.GetDb().length();
     // Every branch other than the first assigns eDiag_Error, so sev ends up
     // eDiag_Critical only when m_Imp.IsLocalGeneralOnly() is true.
520  EDiagSev sev = eDiag_Error;
521  if (m_Imp.IsLocalGeneralOnly()) {
522  sev = eDiag_Critical;
523  } else if (m_Imp.IsRefSeq()) {
524  sev = eDiag_Error;
525  } else if (m_Imp.IsINSDInSep()) {
526  sev = eDiag_Error;
527  } else if (m_Imp.IsIndexerVersion()) {
528  sev = eDiag_Error;
529  }
530  static const auto max_dblen = CSeq_id::kMaxGeneralDBLength;
531  if (dblen > max_dblen) {
532  PostErr(sev, eErr_SEQ_INST_BadSeqIdFormat, "General database longer than " + NStr::NumericToString(max_dblen) + " characters", ctx);
533  }
     // Tag length limits are skipped for the databases accepted by
     // s_IsSkippableDbtag.
534  if (! s_IsSkippableDbtag(dbt)) {
535  if (dbt.IsSetTag() && dbt.GetTag().IsStr()) {
536  size_t idlen = dbt.GetTag().GetStr().length();
537  static const auto maxlen = CSeq_id::kMaxGeneralTagLength;
     // longer_general switches the limit to 100 characters; neither limit is
     // reported when m_Imp.IsGI() is true.
538  if (longer_general) {
539  if (idlen > 100 && ! m_Imp.IsGI()) {
540  PostErr(sev, eErr_SEQ_INST_BadSeqIdFormat, "General identifier longer than " + NStr::NumericToString(100) + " characters", ctx);
541  }
542  } else {
543  if (idlen > maxlen && ! m_Imp.IsGI()) {
544  PostErr(sev, eErr_SEQ_INST_BadSeqIdFormat, "General identifier longer than " + NStr::NumericToString(maxlen) + " characters", ctx);
545  }
546  }
547  if (idlen == 0) {
548  PostErr(eDiag_Error, eErr_SEQ_INST_BadSeqIdFormat, "General identifier must not be an empty string", ctx);
549  }
550  }
551  }
     // Character check on the tag string: "NCBIFILE" and "BankIt" (exact-case
     // match here) use the file-ID character check; every other database uses
     // the local-ID check on the tag and, if the tag is clean, on the database
     // name as well.
552  if (dbt.IsSetTag() && dbt.GetTag().IsStr()) {
553  const string& acc = dbt.GetTag().GetStr();
554  char badch;
555  if (dbt.IsSetDb() && (NStr::Equal(dbt.GetDb(), "NCBIFILE") || NStr::Equal(dbt.GetDb(), "BankIt"))) {
556  badch = CheckForBadFileIDSeqIdChars(acc);
557  } else {
558  badch = CheckForBadLocalIdChars(acc);
559  if (badch == '\0' && dbt.IsSetDb()) {
560  badch = CheckForBadLocalIdChars(dbt.GetDb());
561  }
562  }
563  if (badch != '\0') {
565  "Bad character '" + string(1, badch) + "' in sequence ID '" + id.AsFastaString() + "'", ctx);
566  }
567  }
568  }
569  break;
570  case CSeq_id::e_Local:
571  if (id.IsLocal() && id.GetLocal().IsStr() && id.GetLocal().GetStr().length() > CSeq_id::kMaxLocalIDLength) {
     // sev becomes eDiag_Critical unless m_Imp.IsINSDInSep() is true; the
     // else-if branch only re-assigns the initial eDiag_Error.
572  EDiagSev sev = eDiag_Error;
573  if (! m_Imp.IsINSDInSep()) {
574  sev = eDiag_Critical;
575  } else if (! m_Imp.IsIndexerVersion()) {
576  sev = eDiag_Error;
577  }
578  PostErr(sev, eErr_SEQ_INST_BadSeqIdFormat, "Local identifier longer than " + NStr::NumericToString(CSeq_id::kMaxLocalIDLength) + " characters", ctx);
579  }
580  if (id.IsLocal() && id.GetLocal().IsStr()) {
581  const string& acc = id.GetLocal().GetStr();
582  const char badch = CheckForBadLocalIdChars(acc);
583  if (badch != '\0') {
585  "Bad character '" + string(1, badch) + "' in local ID '" + acc + "'", ctx);
586  }
587  }
588  break;
589  case CSeq_id::e_Pdb:
590  if (id.IsPdb()) {
591  const CPDB_seq_id& pdb = id.GetPdb();
     // When both the `chain` and `chain-id` slots are set they must agree.
     // The breaks inside the accepted branches leave the switch.
592  if (pdb.IsSetChain() && pdb.IsSetChain_id()) {
593  int chain = pdb.GetChain();
594  const string& chain_id = pdb.GetChain_id();
595  if (chain_id.size() == 1 && chain_id[0] == chain) {
596  break; // OK (straightforward match)
597  } else if (islower(chain) && chain_id.size() == 2
598  && chain_id[0] == chain_id[1]
599  && chain_id[0] == toupper(chain)) {
600  break; // OK (historic special case)
601  } else if (chain == '|' && chain_id == "VB") {
602  break; // OK (likewise)
603  } else {
605  "PDB Seq-id contains mismatched \'chain\' and"
606  " \'chain-id\' slots", ctx);
607  }
608  }
609  }
610  break;
611  default:
612  break;
613  }
614 
     // The disabled code below uses the names `i` and `seq`, which are not
     // parameters of this function; it would need adapting before re-enabling.
615 #if 0
616  // disabled for now
617  if (!IsNCBIFILESeqId(**i)) {
618  string label;
619  (*i)->GetLabel(&label);
620  if (label.length() > 40) {
622  "Sequence ID is unusually long (" +
623  NStr::IntToString(label.length()) + "): " + label,
624  seq);
625  }
626  }
627 #endif
628 
629 }
630 
// Looks through the extra (secondary) accessions recorded in GenBank-block
// and EMBL-block descriptors and returns true as soon as one of them
// satisfies a test that includes the CSeq_id::fAcc_master flag; returns
// false otherwise.
// NOTE(review): the loop header that declares `sd` and the first part of the
// accession test (which declares `info`) are on lines missing from this
// listing -- confirm the full condition against the complete source.
631 static bool x_IsWgsSecondary (const CBioseq& seq)
632 
633 {
     // Points at the descriptor's extra-accession list; stays null for
     // descriptor types other than GenBank/EMBL or when the list is not set.
635  const list< string > *extra_acc = nullptr;
636  const CSeqdesc& desc = **sd;
637  switch (desc.Which()) {
638  case CSeqdesc::e_Genbank:
639  if (desc.GetGenbank().IsSetExtra_accessions()) {
640  extra_acc = &(desc.GetGenbank().GetExtra_accessions());
641  }
642  break;
643  case CSeqdesc::e_Embl:
644  if (desc.GetEmbl().IsSetExtra_acc()) {
645  extra_acc = &(desc.GetEmbl().GetExtra_acc());
646  }
647  break;
648  default:
649  break;
650  }
651  if ( extra_acc ) {
652  FOR_EACH_STRING_IN_LIST (acc, *extra_acc) {
655  && (info & CSeq_id::fAcc_master) != 0) {
656  return true;
657  }
658  }
659  }
660  }
661  return false;
662 }
663 
664 // VR-728
665 // cannot have only seq-ids that will be stripped when loading to ID
667 {
668  bool found_good = false;
669  ITERATE(CBioseq::TId, id_it, seq.GetId()) {
670  if (!IsTemporary(**id_it)) {
671  found_good = true;
672  }
673  }
674  if (!found_good) {
676  "The only ids on this Bioseq will be stripped during ID load", seq);
677  }
678 }
679 
680 
682 (const CBioseq& seq)
683 {
684  // Ensure that CBioseq has at least one CSeq_id
685  if ( !seq.IsSetId() || seq.GetId().empty() ) {
687  "No ids on a Bioseq", seq);
688  return;
689  }
690 
691  CSeq_inst::ERepr repr = seq.GetInst().GetRepr();
692 
693  // Loop thru CSeq_ids for this CBioseq. Determine if seq has
694  // gi, NG, or NC. Check that the same CSeq_id not included more
695  // than once.
696  bool has_gi = false;
697  bool is_lrg = false;
698  bool has_ng = false;
699  bool wgs_tech_needs_wgs_accession = false;
700  bool is_segset_accession = false;
701  bool has_wgs_general = false;
702  bool is_eb_db = false;
703  bool longer_general = false;
704 
705  FOR_EACH_SEQID_ON_BIOSEQ (i, seq) {
706  if ((*i)->IsOther() || (*i)->IsEmbl() || (*i)->IsTpe()) {
707  longer_general = true;
708  }
709  }
710 
711  FOR_EACH_SEQID_ON_BIOSEQ (i, seq) {
712  // first, do standalone validation
713  ValidateSeqId (**i, seq, longer_general);
714 
715  if ((*i)->IsGeneral() && (*i)->GetGeneral().IsSetDb()) {
716  if (NStr::EqualNocase((*i)->GetGeneral().GetDb(), "LRG")) {
717  is_lrg = true;
718  }
719  if (NStr::StartsWith((*i)->GetGeneral().GetDb(), "WGS:")) {
720  has_wgs_general = true;
721  }
722  } else if ((*i)->IsOther() && (*i)->GetOther().IsSetAccession()) {
723  const string& acc = (*i)->GetOther().GetAccession();
724  if (NStr::StartsWith(acc, "NG_")) {
725  has_ng = true;
726  wgs_tech_needs_wgs_accession = true;
727  } else if (NStr::StartsWith(acc, "NM_")
728  || NStr::StartsWith(acc, "NP_")
729  || NStr::StartsWith(acc, "NR_")) {
730  wgs_tech_needs_wgs_accession = true;
731  }
732  } else if ((*i)->IsEmbl() && (*i)->GetEmbl().IsSetAccession()) {
733  is_eb_db = true;
734  } else if ((*i)->IsDdbj() && (*i)->GetDdbj().IsSetAccession()) {
735  is_eb_db = true;
736  }
737 
738  // Check that no two CSeq_ids for same CBioseq are same type
739  CBioseq::TId::const_iterator j;
740  for (j = i, ++j; j != seq.GetId().end(); ++j) {
741  if ((**i).Compare(**j) != CSeq_id::e_DIFF) {
742  CNcbiOstrstream os;
743  os << "Conflicting ids on a Bioseq: (";
744  (**i).WriteAsFasta(os);
745  os << " - ";
746  (**j).WriteAsFasta(os);
747  os << ")";
749  CNcbiOstrstreamToString (os) /* os.str() */, seq);
750  }
751  }
752 
753  if ( (*i)->IsGenbank() || (*i)->IsEmbl() || (*i)->IsDdbj() ) {
754  wgs_tech_needs_wgs_accession = true;
755  }
756 
757  if ( (*i)->IsGi() ) {
758  has_gi = true;
759  }
760 
761  if ( (*i)->IdentifyAccession() == CSeq_id::eAcc_segset) {
762  is_segset_accession = true;
763  }
764 
765  }
766  if (is_lrg && !has_ng) {
768  "LRG sequence needs NG_ accession", seq);
769  }
770 
771 
772  // Loop thru CSeq_ids to check formatting
773  bool is_wgs = false;
774  unsigned int gi_count = 0;
775  unsigned int accn_count = 0;
776  unsigned int lcl_count = 0;
777  FOR_EACH_SEQID_ON_BIOSEQ (k, seq) {
778  const CTextseq_id* tsid = (*k)->GetTextseq_Id();
779  switch ((**k).Which()) {
780  case CSeq_id::e_Local:
781  lcl_count++;
782  break;
783  case CSeq_id::e_Tpg:
784  case CSeq_id::e_Tpe:
785  case CSeq_id::e_Tpd:
786  case CSeq_id::e_Genbank:
787  case CSeq_id::e_Embl:
788  case CSeq_id::e_Ddbj:
789  if ( tsid && tsid->IsSetAccession() ) {
790  if ((*k)->IsGenbank() || (*k)->IsEmbl() || (*k)->IsDdbj()) {
791  is_wgs |= IsWGSAccession(**k);
792  }
793 
794  if ( has_gi ) {
795  if (tsid->IsSetVersion() && tsid->GetVersion() == 0) {
796  const string& acc = tsid->GetAccession();
798  "Accession " + acc + " has 0 version", seq);
799  }
800  }
801  }
802  // Fall thru
803  case CSeq_id::e_Other:
804  if ( tsid ) {
805 
806  if ( has_gi && !tsid->IsSetAccession() && tsid->IsSetName() ) {
807  if ( (*k)->IsDdbj() && repr == CSeq_inst::eRepr_seg ) {
808  // Don't report ddbj segmented sequence missing accessions
809  } else {
811  "Missing accession for " + tsid->GetName(), seq);
812  }
813  }
814  accn_count++;
815  }
816  break;
817 // No fall-through here: the break above ends the preceding case group
818  case CSeq_id::e_Pir:
820  case CSeq_id::e_Prf:
821  if ( tsid) {
822  if ((!tsid->IsSetAccession() || NStr::IsBlank(tsid->GetAccession())) &&
823  (!tsid->IsSetName() || NStr::IsBlank(tsid->GetName())) &&
824  seq.GetInst().IsAa()) {
825  string label = (*k)->AsFastaString();
827  "Missing identifier for " + label, seq);
828  }
829  accn_count++;
830  }
831  break;
832  case CSeq_id::e_Gi:
833  gi_count++;
834  break;
835  default:
836  break;
837  }
838  }
839 
841  if (!SeqIsPatent(seq) && !seq.IsAa()) {
842  if ( is_wgs ) {
843  if ( !mi || !mi->IsSetTech() ||
844  ( mi->GetTech() != CMolInfo::eTech_wgs &&
845  mi->GetTech() != CMolInfo::eTech_tsa &&
846  mi->GetTech() != CMolInfo::eTech_targeted) ) {
848  "WGS accession should have Mol-info.tech of wgs", seq);
849  }
850  } else if ( mi && mi->IsSetTech() &&
851  mi->GetTech() == CMolInfo::eTech_wgs &&
852  wgs_tech_needs_wgs_accession &&
853  !is_segset_accession &&
854  !has_wgs_general &&
855  !x_IsWgsSecondary(seq)) {
856  EDiagSev sev = eDiag_Error;
857  if (is_eb_db) {
858  sev = eDiag_Warning;
859  }
860  if (! is_eb_db) {
862  "Mol-info.tech of wgs should have WGS accession", seq);
863  }
864  }
865 
866  if ((IsNTNCNWACAccession(seq) || IsNG(seq)) && mi && seq.IsNa()
867  && (!mi->IsSetBiomol()
868  || (mi->GetBiomol() != CMolInfo::eBiomol_genomic
869  && mi->GetBiomol() != CMolInfo::eBiomol_cRNA))) {
871  "genomic RefSeq accession should use genomic or cRNA moltype",
872  seq);
873  }
874  }
875  if (seq.GetInst().GetMol() == CSeq_inst::eMol_dna) {
876  if (mi && mi->IsSetBiomol()) {
877  switch (mi->GetBiomol()) {
890  "Molecule type (DNA) does not match biomol (RNA)", seq);
891  break;
892  default:
893  break;
894  }
895  }
896  }
897 
898  // Check that a sequence with a gi number has exactly one accession
899  if ( gi_count > 0 && accn_count == 0 && !m_Imp.IsPDB() &&
900  repr != CSeq_inst::eRepr_virtual ) {
902  "No accession on sequence with gi number", seq);
903  }
904  if (gi_count > 0 && accn_count > 1) {
906  "Multiple accessions on sequence with gi number", seq);
907  }
908 
909  x_CheckGeneralIDs(seq);
910 
911  if ( m_Imp.IsValidateIdSet() ) {
913  }
914 
915  // C toolkit ensures that there is exactly one CBioseq for a CSeq_id
916  // Not done here because object manager will not allow
917  // the same Seq-id on multiple Bioseqs
918 
919 }
920 
921 
923 {
924  bool rval = false;
925  const CSeq_inst& inst = seq.GetInst();
926  if (inst.IsSetHist() && inst.GetHist().IsSetAssembly()) {
927  return false;
928  }
929  CSeq_inst::TRepr repr = inst.CanGetRepr() ?
931 
932  if ( seq.IsNa() && repr != CSeq_inst::eRepr_seg ) {
933  rval = true;
934  // look for keyword
936  CSeqdesc_CI genbank_i(bsh, CSeqdesc::e_Genbank);
937  if (genbank_i && genbank_i->GetGenbank().IsSetKeywords()) {
938  CGB_block::TKeywords::const_iterator keyword = genbank_i->GetGenbank().GetKeywords().begin();
939  while (keyword != genbank_i->GetGenbank().GetKeywords().end() && rval) {
940  if (NStr::EqualNocase(*keyword, "TPA:reassembly")) {
941  rval = false;
942  }
943  ++keyword;
944  }
945  }
946  if (rval) {
947  CSeqdesc_CI embl_i(bsh, CSeqdesc::e_Embl);
948  if (embl_i && embl_i->GetEmbl().IsSetKeywords()) {
949  CEMBL_block::TKeywords::const_iterator keyword = embl_i->GetEmbl().GetKeywords().begin();
950  while (keyword != embl_i->GetEmbl().GetKeywords().end() && rval) {
951  if (NStr::EqualNocase(*keyword, "TPA:reassembly")) {
952  rval = false;
953  }
954  ++keyword;
955  }
956  }
957  }
958  }
959  return rval;
960 }
961 
962 
964 (const string &primary_acc,
965  const CBioseq &seq,
966  int choice)
967 {
968  CSeqdesc_CI sd(m_Scope->GetBioseqHandle(seq), static_cast<CSeqdesc::E_Choice>(choice));
969  for (; sd; ++sd) {
970  const list< string > *extra_acc = nullptr;
971  if ( choice == CSeqdesc::e_Genbank &&
973  extra_acc = &(sd->GetGenbank().GetExtra_accessions());
974  } else if ( choice == CSeqdesc::e_Embl &&
975  sd->GetEmbl().IsSetExtra_acc() ) {
976  extra_acc = &(sd->GetEmbl().GetExtra_acc());
977  }
978 
979  if ( extra_acc ) {
980  FOR_EACH_STRING_IN_LIST (acc, *extra_acc) {
981  if ( NStr::CompareNocase(primary_acc, *acc) == 0 ) {
982  // If the same post error
985  primary_acc + " used for both primary and"
986  " secondary accession", seq);
987  }
988  }
989  }
990  }
991 }
992 
993 
995 {
996  for (CSeqdesc_CI it(bsh, CSeqdesc::e_User); it; ++it) {
997  if (it->GetUser().GetObjectType() == CUser_object::eObjectType_Unverified) {
998  return true;
999  }
1000  }
1001  return false;
1002 }
1003 
1004 
1006 {
1007  CBioseq_Handle bsh = m_Scope->GetBioseqHandle (seq);
1010 
1011  bool has_barcode_tech = false;
1012 
1014  if (di && di->GetMolinfo().IsSetTech() && di->GetMolinfo().GetTech() == CMolInfo::eTech_barcode) {
1015  has_barcode_tech = true;
1016  }
1017 
1018  bool has_barcode_keyword = false;
1019  for (CSeqdesc_CI it(bsh, CSeqdesc::e_Genbank); it; ++it) {
1020  FOR_EACH_KEYWORD_ON_GENBANKBLOCK (k, it->GetGenbank()) {
1021  if (NStr::EqualNocase (*k, "BARCODE")) {
1022  has_barcode_keyword = true;
1023  break;
1024  }
1025  }
1026  if (has_barcode_keyword && !has_barcode_tech) {
1028  "BARCODE keyword without Molinfo.tech barcode",
1029  *ctx, *it);
1030  }
1031  }
1032  if (has_barcode_tech && !has_barcode_keyword && di) {
1034  "Molinfo.tech barcode without BARCODE keyword",
1035  *ctx, *di);
1036  }
1037  if (has_barcode_keyword && HasUnverified(bsh)) {
1039  "Sequence has both BARCODE and UNVERIFIED keywords",
1040  seq);
1041  }
1042 }
1043 
1044 
1046  const CBioseq& seq)
1047 {
1048  const CSeq_inst& inst = seq.GetInst();
1049 
1050 
1051  // Check representation
1052  if ( !ValidateRepr(inst, seq) ) {
1053  return;
1054  }
1055 
1056  // Check molecule, topology, and strand
1057  if (!inst.IsSetMol()) {
1058  PostErr(eDiag_Error, eErr_SEQ_INST_MolNotSet, "Bioseq.mol is 0",
1059  seq);
1060  } else {
1061  const CSeq_inst::EMol& mol = inst.GetMol();
1062  switch (mol) {
1063 
1064  case CSeq_inst::eMol_na:
1066  "Bioseq.mol is type nucleic acid", seq);
1067  break;
1068 
1069  case CSeq_inst::eMol_aa:
1070  if ( inst.IsSetTopology() &&
1074  "Non-linear topology set on protein", seq);
1075  }
1076  if ( inst.IsSetStrand() &&
1077  inst.GetStrand() != CSeq_inst::eStrand_ss &&
1080  "Protein not single stranded", seq);
1081  }
1082  break;
1083 
1084  case CSeq_inst::eMol_dna:
1085  if (seq.IsSetInst() && seq.GetInst().IsSetTopology() && seq.GetInst().GetTopology() == CSeq_inst::eTopology_circular) {
1086  if (m_is_bact_or_arch) {
1087  if (! m_is_plasmid && ! m_is_chromosome && ! m_is_extrachrom) {
1088  EDiagSev sev = eDiag_Error;
1089  if (IsRefSeq(seq) || m_Imp.IsRefSeqConventions()) {
1090  sev = eDiag_Error;
1091  } else if (IsEmblOrDdbj(seq)) {
1092  sev = eDiag_Warning;
1093  }
1095  "Circular Bacteria or Archaea should be chromosome, or plasmid, or extrachromosomal", seq);
1096  }
1097  }
1098  }
1099  break;
1100 
1102  PostErr(eDiag_Error, eErr_SEQ_INST_MolNotSet, "Bioseq.mol is 0",
1103  seq);
1104  break;
1105 
1106  case CSeq_inst::eMol_other:
1108  "Bioseq.mol is type other", seq);
1109  break;
1110 
1111  default:
1112  break;
1113  }
1114  }
1115 
1116  CSeq_inst::ERepr rp = seq.GetInst().GetRepr();
1117 
1118  if (rp == CSeq_inst::eRepr_raw || rp == CSeq_inst::eRepr_const) {
1119  // Validate raw and constructed sequences
1120  ValidateRawConst(seq);
1121  }
1122 
1123  // per VR-779
1124 #if 1
1125  if (rp == CSeq_inst::eRepr_seg) {
1126  PostErr(eDiag_Critical, eErr_SEQ_INST_ReprInvalid, "Segmented set format is not supported", seq);
1127  } else if (rp == CSeq_inst::eRepr_ref) {
1128  PostErr(eDiag_Critical, eErr_SEQ_INST_ReprInvalid, "Repr_ref format is not supported", seq);
1129  }
1130 #else
1131  if (rp == CSeq_inst::eRepr_seg || rp == CSeq_inst::eRepr_ref) {
1132  // Validate segmented and reference sequences
1133  ValidateSegRef(seq);
1134  }
1135 #endif
1136 
1137  if (rp == CSeq_inst::eRepr_delta) {
1138  // Validate delta sequences
1139  ValidateDelta(seq);
1140  }
1141 
1142  if (rp == CSeq_inst::eRepr_seg && seq.GetInst().IsSetExt() &&
1143  seq.GetInst().GetExt().IsSeg()) {
1144  // Validate part of segmented sequence
1145  ValidateSeqParts(seq);
1146  }
1147 
1148  if (rp == CSeq_inst::eRepr_raw || rp == CSeq_inst::eRepr_delta) {
1149  x_ValidateBarcode (seq);
1150  }
1151 
1152  x_ValidateTitle(seq);
1153  /*if ( seq.IsAa() ) {
1154  Validate protein title (amino acids only)
1155  ValidateProteinTitle(seq);
1156  }*/
1157 
1158  if ( seq.IsNa() ) {
1159  // check for N bases at start or stop of sequence,
1160  // or sequence entirely made of Ns
1161  ValidateNsAndGaps(seq);
1162 
1163  GapByGapInst(seq);
1164  }
1165 
1166  // Validate sequence length
1167  ValidateSeqLen(seq);
1168 
1169  // proteins should not have gaps
1170  if (seq.IsAa() && x_HasGap(seq)) {
1171  PostErr(eDiag_Error, eErr_SEQ_INST_ProteinShouldNotHaveGaps, "Protein sequences should not have gaps", seq);
1172  }
1173 }
1174 
1175 
1177 
// NOTE(review): the signature line of this function is missing from this listing. The call
// `x_ShowBioProjectWarning (seq)` elsewhere in the file suggests this is that function's
// body, with `seq` being a `const CBioseq&` -- confirm.
//
// Decides whether the caller should warn that BioProject entries are absent.
// Returns false as soon as any of the following holds, and true otherwise:
//   - a User descriptor carries a "BioProject" field;
//   - an "Other" Seq-id is present (or RefSeq conventions are on) and its accession
//     starts with "NG_";
//   - only GenBank/EMBL/DDBJ ids are present and the record is neither WGS nor "GRC"-titled;
//   - none of the above id types is present and RefSeq conventions are off;
//   - the representation is a literal-only delta, or is neither delta nor map.
1178 {
1179  bool is_wgs = false;
1180  bool is_grc = false;
1181 
      // NOTE(review): listing line 1182 is missing here; it presumably declares `bsh`.
1183  CSeqdesc_CI user(bsh, CSeqdesc::e_User);
1184  while (user) {
      // NOTE(review): the first half of this condition (listing line 1185) is missing.
1186  user->GetUser().HasField("BioProject", ".", NStr::eNocase)) {
1187  // bioproject field found
1188  return false;
1189  }
1190  ++user;
1191  }
1192 
      // is_grc: some title descriptor starts with "GRC"; when there is no title
      // descriptor at all, a freshly generated defline is tested instead.
1193  CSeqdesc_CI ti(bsh, CSeqdesc::e_Title);
1194  if (ti) {
1195  while (ti) {
1196  if (NStr::StartsWith(ti->GetTitle(), "GRC")) {
1197  is_grc = true;
1198  break;
1199  }
1200  ++ti;
1201  }
1202  } else {
1203  sequence::CDeflineGenerator defline_generator;
1204  string title = defline_generator.GenerateDefline(seq, *m_Scope, sequence::CDeflineGenerator::fIgnoreExisting);
1205  if (!NStr::IsBlank(title)) {
1206  if (NStr::StartsWith(title, "GRC")) {
1207  is_grc = true;
1208  }
1209  }
1210  }
1211 
1212  is_wgs = IsWGS(bsh);
1213 
1214  bool is_gb = false, /* is_eb_db = false, */ is_refseq = false, is_ng = false;
1215 
      // Classify the record by the kinds of Seq-id it carries.
1216  FOR_EACH_SEQID_ON_BIOSEQ (sid_itr, seq) {
1217  const CSeq_id& sid = **sid_itr;
1218  switch (sid.Which()) {
1219  case CSeq_id::e_Genbank:
1220  case CSeq_id::e_Embl:
1221  // is_eb_db = true;
1222  // fall through
1223  case CSeq_id::e_Ddbj:
1224  is_gb = true;
1225  break;
1226  case CSeq_id::e_Other:
1227  {
1228  is_refseq = true;
1229  if (sid.GetOther().IsSetAccession()) {
      // Only the first three characters of the accession are inspected.
1230  string acc = sid.GetOther().GetAccession().substr(0, 3);
1231  if (acc == "NG_") {
1232  is_ng = true;
1233  }
1234  }
1235  }
1236  break;
1237  default:
1238  break;
1239  }
1240  }
1241 
      // The RefSeq branch takes precedence over the GenBank/EMBL/DDBJ branch.
1242  if (is_refseq || m_Imp.IsRefSeqConventions()) {
1243  if (is_ng) return false;
1244  } else if (is_gb) {
1245  if (! is_wgs && ! is_grc) return false;
1246  } else {
1247  return false;
1248  }
1249 
1250  const CSeq_inst & inst = seq.GetInst();
1251  CSeq_inst::TRepr repr = inst.GetRepr();
1252 
      // Only delta sequences that are not literal-only, and map sequences, qualify.
1253  if (repr == CSeq_inst::eRepr_delta) {
1254  if (x_IsDeltaLitOnly(inst)) return false;
1255  } else if (repr != CSeq_inst::eRepr_map) {
1256  return false;
1257  }
1258 
1259  return true;
1260 }
1261 
// NOTE(review): the first signature line (listing line 1262) is missing from this listing;
// the exception text below ("...validating BioseqContext") suggests this is
// ValidateBioseqContext -- confirm.
//
// Runs the context-level checks for one Bioseq `seq`, in this order:
//   1. protein in a nuc-prot set with no coding region pointing at it;
//   2. feature-level checks that need Seq-ids, DBLink tallies, pub/source presence
//      and the BioProject warning (all inside one try block);
//   3. graphs, TPA history, orphaned proteins, multiple protein features,
//      missing submission citation, and mat_peptide vs. instantiated product comparison.
// Many lines of this function (mostly the heads of PostErr calls) are missing from the
// listing; the comments below describe only what the visible lines show.
1263  const CBioseq& seq)
1264 {
1266 
1267  // Check that proteins in nuc_prot set have a CdRegion
1268  if ( CdError(bsh) ) {
1269  EDiagSev sev = eDiag_Error;
1271  if (bssh) {
1272  CBioseq_Handle nbsh = GetNucBioseq (bssh);
1273  if (nbsh) {
1274  CSeqdesc_CI desc( nbsh, CSeqdesc::e_Molinfo );
1275  const CMolInfo* mi = desc ? &(desc->GetMolinfo()) : nullptr;
1276  if (mi) {
1277  CMolInfo::TTech tech = mi->IsSetTech() ?
      // Severity is raised when the nucleotide's MolInfo tech is WGS.
1279  if (tech == CMolInfo::eTech_wgs) {
1280  sev = eDiag_Critical;
1281  }
1282  }
1283  }
1284  }
1286  "No CdRegion in nuc-prot set points to this protein",
1287  seq);
1288  }
1289 
      // Captured before the try block so it is still available after the catch.
1290  bool is_patent = SeqIsPatent (seq);
1291 
1292  bool is_complete = false;
1293  CSeqdesc_CI desc(bsh, CSeqdesc::e_Molinfo);
1294  if (desc) {
1295  const CMolInfo& mi = desc->GetMolinfo();
1297  is_complete = true;
1298  }
1299  }
1300 
1301  try {
1302 
1303  // if there are no Seq-ids, the following tests can't be run
1304  if (seq.IsSetId()) {
1305 
1306  ValidateSeqFeatContext(seq, is_complete);
1307 
1308  // Check for duplicate features and overlapping peptide features.
1310 
1311  // Check for introns within introns.
1312  ValidateTwintrons(seq);
1313 
1314  // check for tRNA contained in tmRNA features
1316 
1317  // check for equivalent source features
1319 
1320  // check for equivalent pub features
1321  x_ValidatePubFeatures (bsh);
1322 
1323  // Check for colliding genes
1325 
1326  // Detect absence of BioProject DBLink for complete bacterial genomes
1328  }
1329 
      // Reset the per-Bioseq DBLink tallies before the descriptors are visited.
1330  m_dblink_count = 0;
1331  m_taa_count = 0;
1332  m_bs_count = 0;
1333  m_as_count = 0;
1334  m_pdb_count = 0;
1335  m_sra_count = 0;
1336  m_bp_count = 0;
1337  m_unknown_count = 0;
1338 
1339  // Validate descriptors that affect this bioseq
1341 
1342 
      // Each tally above 1 produces its own message (the PostErr heads are missing
      // from this listing).
1343  if (m_dblink_count > 1) {
1345  NStr::IntToString(m_dblink_count) + " DBLink user objects apply to a Bioseq", seq);
1346  }
1347 
1348  if (m_taa_count > 1) {
1350  "Trace Assembly Archive entries appear in " + NStr::IntToString(m_taa_count) + " DBLink user objects", seq);
1351  }
1352 
1353  if (m_bs_count > 1) {
1355  "BioSample entries appear in " + NStr::IntToString(m_bs_count) + " DBLink user objects", seq);
1356  }
1357 
1358  if (m_as_count > 1) {
1360  "Assembly entries appear in " + NStr::IntToString(m_as_count) + " DBLink user objects", seq);
1361  }
1362 
1363  if (m_pdb_count > 1) {
1365  "ProbeDB entries appear in " + NStr::IntToString(m_pdb_count) + " DBLink user objects", seq);
1366  }
1367 
1368  if (m_sra_count > 1) {
1370  "Sequence Read Archive entries appear in " + NStr::IntToString(m_sra_count) + " DBLink user objects", seq);
1371  }
1372 
1373  if (m_bp_count > 1) {
1375  "BioProject entries appear in " + NStr::IntToString(m_bp_count) + " DBLink user objects", seq);
1376  }
1377 
      // Unrecognized entries are reported even when there is only one; the two
      // branches differ only in "objects" vs. "object".
1378  if (m_unknown_count > 1) {
1380  "Unrecognized entries appear in " + NStr::IntToString(m_unknown_count) + " DBLink user objects", seq);
1381  } else if (m_unknown_count > 0) {
1383  "Unrecognized entries appear in " + NStr::IntToString(m_unknown_count) + " DBLink user object", seq);
1384  }
1385 
1386  // make sure that there is a pub on this bioseq
1388  CheckForPubOnBioseq(seq);
1389  }
1390  // make sure that there is a source on this bioseq
1392  CheckSourceDescriptor(bsh);
1393  //CheckForBiosourceOnBioseq(seq);
1394  }
1395 
1396  if (x_ShowBioProjectWarning (seq)) {
1398  "BioProject entries not present on CON record", seq);
1399  }
1400 
1401  } catch ( const exception& e ) {
      // Exceptions whose text contains "Error: Cannot resolve" are deliberately not reported.
1402  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1404  string("Exception while validating BioseqContext. EXCEPTION: ") +
1405  e.what(), seq);
1406  }
1407  }
1408 
1409  if (!is_patent) {
1410  // flag missing molinfo even if not in Sequin
1412  }
1413 
1414  CValidError_graph graph_validator(m_Imp);
1415  graph_validator.ValidateGraphsOnBioseq(seq);
1416 
1417  CheckTpaHistory(seq);
1418 
1419  // check for multiple publications with identical identifiers
1421 
1422  // look for orphaned proteins
1423  if (seq.IsAa() && bsh && !GetNucProtSetParent(bsh) && !AllowOrphanedProtein(seq, m_Imp.IsRefSeqConventions())) {
1425  "Orphaned stand-alone protein", seq);
1426  }
1427 
1428  // look for extra protein features
1429  if (seq.IsAa()) {
1430  CCacheImpl::SFeatKey prot_key(
1432  const CCacheImpl::TFeatValue & prot_feats =
1433  GetCache().GetFeatFromCache(prot_key);
1434 
      // With more than one cached feature, every one of them is reported.
1435  if (prot_feats.size() > 1) {
1436  ITERATE(CCacheImpl::TFeatValue, feat, prot_feats) {
1438  "Protein sequence has multiple unprocessed protein features",
1439  feat->GetOriginalFeature());
1440  }
1441  }
1442  }
1443 
1444  if (!m_Imp.IsNoCitSubPubs() && !x_HasCitSub(bsh) && !m_Imp.IsSeqSubmitParent()) {
1446  "Expected submission citation is missing for this Bioseq", seq);
1447  }
1448 
1449  // RW-1053 check sig_peptides and mat_peptides with instantiated products
1450  if (seq.IsAa()) {
1451 
      // NOTE(review): the lines that build the selector `sel` are missing from this listing.
1455  try {
1456  for (CFeat_CI feat_ci(bsh, sel); feat_ci; ++feat_ci) {
1457 
1458  const CSeq_feat& matpeptide = feat_ci->GetOriginalFeature();
1459  if (matpeptide.IsSetProduct()) {
1460  const CSeq_loc& loc = matpeptide.GetLocation();
1461  const CSeq_loc& prd = matpeptide.GetProduct();
1462 
1463  int matlen = GetLength(loc, m_Scope);
1464  int prdlen = GetLength(prd, m_Scope);
1465  if (matlen != prdlen) {
1467  "Mat_peptide does not match length of instantiated product",
1468  matpeptide);
1469  }
1470 
      // NOTE(review): the declarations of `mat_vec` and `prd_vec` are missing from this listing.
1473 
      // Compare residues only over the shorter of the two lengths.
1474  int len = matlen;
1475  if (len > prdlen) {
1476  len = prdlen;
1477  }
1478 
      // NOTE(review): `i` is TSeqPos while `len` is int, so this comparison mixes
      // unsigned and signed operands -- confirm `len` can never be negative here.
1479  for (TSeqPos i = 0; i < len; ++i) {
1480  CSeqVectorTypes::TResidue m_res = mat_vec[i];
1481  CSeqVectorTypes::TResidue p_res = prd_vec[i];
1482 
1483  if (m_res != p_res) {
1485  "Mismatch in mat_peptide (" + string(1, (char)m_res) + ") and instantiated product (" + \
1486  string(1, (char)p_res) + ") at position " + NStr::NumericToString(i + 1),
1487  matpeptide);
1488  }
1489  }
1490  }
1491  }
1492 
1493  }
      // Any CException raised by the comparison above is silently discarded.
1494  catch (CException&) {
1495  }
1496  }
1497 }
1498 
1499 
// NOTE(review): the signature line is missing from this listing; `pub.Get()` is iterated
// as a CPub_equiv::Tdata, so `pub` is presumably a `const CPub_equiv&` -- confirm.
// Returns true as soon as x_HasCitSub() accepts any member of the equiv, false otherwise.
1501 {
1502  ITERATE(CPub_equiv::Tdata, it, pub.Get()) {
1503  if (x_HasCitSub(**it)) {
1504  return true;
1505  }
1506  }
1507  return false;
1508 }
1509 
1510 
// NOTE(review): the signature line is missing from this listing; `pub` is used through
// IsSub()/IsEquiv()/GetEquiv(), so it is presumably a `const CPub&` -- confirm.
// Returns true when `pub` is a Sub pub, or is an Equiv for which x_HasCitSub() on the
// contained equiv returns true; false in every other case.
1512 {
1513  if (pub.IsSub()) {
1514  return true;
1515  } else if (pub.IsEquiv() && x_HasCitSub(pub.GetEquiv())) {
1516  return true;
1517  } else {
1518  return false;
1519  }
1520 }
1521 
1522 
// NOTE(review): the signature line is missing from this listing; `bsh` is handed to
// CSeqdesc_CI, so it is presumably a CBioseq_Handle -- confirm.
// Walks the Pub descriptors found through `bsh` and reports whether any of them has a pub
// set for which x_HasCitSub() returns true. Descriptors without a pub set are skipped.
1524 {
1525  bool has_cit_sub = false;
1526  CSeqdesc_CI p(bsh, CSeqdesc::e_Pub);
      // The loop ends at the first hit because the flag is part of the loop condition.
1527  while (p && !has_cit_sub) {
1528  if (p->GetPub().IsSetPub()) {
1529  has_cit_sub = x_HasCitSub(p->GetPub().GetPub());
1530  }
1531  ++p;
1532  }
1533 
1534  return has_cit_sub;
1535 }
1536 
1537 
/// Compare two iterator ranges pairwise with a caller-supplied predicate.
///
/// @param iter1       start of the first range
/// @param iter1_stop  end of the first range
/// @param iter2       start of the second range
/// @param iter2_stop  end of the second range
/// @param pred        binary predicate, invoked as pred(*iter1, *iter2)
/// @return true only when both ranges have the same number of elements and
///         pred holds for every corresponding pair; two empty ranges match.
template <class Iterator, class Predicate>
bool lists_match(Iterator iter1, Iterator iter1_stop, Iterator iter2, Iterator iter2_stop, Predicate pred)
{
    // The four-iterator form of std::equal already treats ranges of different
    // length as unequal, so no separate "did both ranges end?" test is needed.
    return std::equal(iter1, iter1_stop, iter2, iter2_stop, pred);
}
1554 
1555 
1556 static bool s_OrgModEqual (
1557  const CRef<COrgMod>& om1,
1558  const CRef<COrgMod>& om2
1559 )
1560 
1561 {
1562  const COrgMod& omd1 = *(om1);
1563  const COrgMod& omd2 = *(om2);
1564 
1565  const string& str1 = omd1.GetSubname();
1566  const string& str2 = omd2.GetSubname();
1567 
1568  if (NStr::CompareNocase (str1, str2) != 0) return false;
1569 
1570  TORGMOD_SUBTYPE chs1 = omd1.GetSubtype();
1571  TORGMOD_SUBTYPE chs2 = omd2.GetSubtype();
1572 
1573  if (chs1 == chs2) return true;
1574  if (chs2 == NCBI_ORGMOD(other)) return true;
1575 
1576  return false;
1577 }
1578 
1579 
1580 bool s_DbtagEqual (const CRef<CDbtag>& dbt1, const CRef<CDbtag>& dbt2)
1581 {
1582  // is dbt1 == dbt2
1583  return dbt1->Compare(*dbt2) == 0;
1584 }
1585 
1586 
1587 // Two OrgRefs are identical if the taxnames are identical, the dbxrefs are identical,
1588 // and the orgname orgmod lists are identical
1589 static bool s_OrgrefEquivalent (const COrg_ref& org1, const COrg_ref& org2)
1590 {
1591  if ((org1.IsSetTaxname() && !org2.IsSetTaxname())
1592  || (!org1.IsSetTaxname() && org2.IsSetTaxname())
1593  || (org1.IsSetTaxname() && org2.IsSetTaxname()
1594  && !NStr::EqualNocase (org1.GetTaxname(), org2.GetTaxname()))) {
1595  return false;
1596  }
1597 
1598  if ((org1.IsSetDb() && !org2.IsSetDb())
1599  || (!org1.IsSetDb() && org2.IsSetDb())
1600  || (org1.IsSetDb() && org2.IsSetDb()
1601  && !lists_match (org1.GetDb().begin(), org1.GetDb().end(),
1602  org2.GetDb().begin(), org2.GetDb().end(),
1603  s_DbtagEqual))) {
1604  return false;
1605  }
1606 
1607  if ((org1.IsSetOrgname() && !org2.IsSetOrgname())
1608  || (!org1.IsSetOrgname() && org2.IsSetOrgname())) {
1609  return false;
1610  }
1611  if (org1.IsSetOrgname() && org2.IsSetOrgname()) {
1612  const COrgName& on1 = org1.GetOrgname();
1613  const COrgName& on2 = org2.GetOrgname();
1614  if ((on1.IsSetMod() && !on2.IsSetMod())
1615  || (!on1.IsSetMod() && on2.IsSetMod())
1616  || (on1.IsSetMod() && on2.IsSetMod()
1617  && !lists_match (on1.GetMod().begin(), on1.GetMod().end(),
1618  on2.GetMod().begin(), on2.GetMod().end(),
1619  s_OrgModEqual))) {
1620  return false;
1621  }
1622  }
1623 
1624  return true;
1625 }
1626 
1627 
1628 // Two SubSources are equal and duplicates if:
1629 // they have the same subtype
1630 // and the same name (or don't require a name).
1631 
// NOTE(review): the line carrying the return type and function name (listing line 1632)
// is missing from this listing; only the parameter list and body are visible.
1633  const CRef<CSubSource>& st1,
1634  const CRef<CSubSource>& st2
1635 )
1636 
1637 {
1638  const CSubSource& sbs1 = *(st1);
1639  const CSubSource& sbs2 = *(st2);
1640 
1641  TSUBSOURCE_SUBTYPE chs1 = sbs1.GetSubtype();
1642  TSUBSOURCE_SUBTYPE chs2 = sbs2.GetSubtype();
1643 
1644  if (chs1 != chs2) return false;
      // Subtypes for which CSubSource::NeedsNoText() is true match on subtype alone.
1645  if (CSubSource::NeedsNoText(chs2)) return true;
1646 
      // Otherwise the names must agree ignoring case, or be unset on both sides.
1647  if (sbs1.IsSetName() && sbs2.IsSetName()) {
1648  if (NStr::CompareNocase (sbs1.GetName(), sbs2.GetName()) == 0) return true;
1649  }
1650  if (! sbs1.IsSetName() && ! sbs2.IsSetName()) return true;
1651 
1652  return false;
1653 }
1654 
1655 
// Returns true when `src` has its is_focus field set, or when one of the iterated
// elements has subtype CSubSource::eSubtype_transgenic; false otherwise.
1656 static bool s_BiosrcFullLengthIsOk (const CBioSource& src)
1657 {
1658  if (src.IsSetIs_focus()) {
1659  return true;
1660  }
      // NOTE(review): the loop header that introduces `it` (listing line 1661) is missing
      // from this listing; presumably it iterates the SubSources of `src`.
1662  if ((*it)->IsSetSubtype() && (*it)->GetSubtype() == CSubSource::eSubtype_transgenic) {
1663  return true;
1664  }
1665  }
1666  return false;
1667 }
1668 
1669 
// NOTE(review): the signature line is missing from this listing; the caller passes
// `feat->GetData().GetBiosrc()` to s_SuppressMultipleEquivBioSources, so this is
// presumably that function taking a `const CBioSource& src` -- confirm.
//
// Returns true when the source's taxname is "unidentified phage" (ignoring case) or its
// lineage starts with "Viruses" (ignoring case). Returns false otherwise, including when
// no org or no taxname is set.
1671 {
1672  if (!src.IsSetOrg() || !src.GetOrg().IsSetTaxname()) {
1673  return false;
1674  }
1675  if (NStr::EqualNocase(src.GetOrg().GetTaxname(), "unidentified phage")) {
1676  return true;
1677  }
1678  if (src.GetOrg().IsSetOrgname() && src.GetOrg().GetOrgname().IsSetLineage()
1679  && NStr::StartsWith(src.GetOrg().GetOrgname().GetLineage(), "Viruses", NStr::eNocase)) {
1680  return true;
1681  }
      // Disabled debugging output; never compiled.
1682 #if 0
1683  if (!src.GetOrg().IsSetOrgname()) {
1684  printf ("Orgname not set!\n");
1685  } else if (!src.GetOrg().GetOrgname().IsSetLineage()) {
1686  printf ("Lineage not set!\n");
1687  } else {
1688  printf ("Lineage is %s!\n", src.GetOrg().GetOrgname().GetLineage().c_str());
1689  }
1690 #endif
1691  return false;
1692 }
1693 
1694 
1695 bool s_OverlapOrAbut(const CSeq_loc& loc1, const CSeq_loc& loc2, CScope* scope)
1696 {
1697  TSeqPos start1 = loc1.GetStart(eExtreme_Positional);
1698  TSeqPos stop1 = loc1.GetStop(eExtreme_Positional);
1699  TSeqPos start2 = loc2.GetStart(eExtreme_Positional);
1700  TSeqPos stop2 = loc2.GetStop(eExtreme_Positional);
1701 
1702  if (start1 == stop2 + 1 || start2 == stop1 + 1) {
1703  // abut
1704  return true;
1705  } else if (TestForOverlapEx(loc1, loc2, eOverlap_Simple, scope) >= 0) {
1706  return true;
1707  } else {
1708  return false;
1709  }
1710 }
1711 
1712 
1713 bool s_ContainedIn(const CSeq_loc& loc1, const CSeq_loc& loc2, CScope* scope)
1714 {
1715  TSeqPos start1 = loc1.GetStart(eExtreme_Positional);
1716  TSeqPos stop1 = loc1.GetStop(eExtreme_Positional);
1717  TSeqPos start2 = loc2.GetStart(eExtreme_Positional);
1718  TSeqPos stop2 = loc2.GetStop(eExtreme_Positional);
1719 
1720  if (start1 == stop2 + 1 || start2 == stop1 + 1) {
1721  // abut
1722  return false;
1723  } else if (TestForOverlapEx(loc1, loc2, eOverlap_Contained, scope) >= 0) {
1724  return true;
1725  } else {
1726  return false;
1727  }
1728 }
1729 
1730 
1731 bool s_CheckIntervals(const CSeq_loc& loc1, const CSeq_loc& loc2, CScope* scope)
1732 {
1733  TSeqPos start1 = loc1.GetStart(eExtreme_Positional);
1734  TSeqPos stop1 = loc1.GetStop(eExtreme_Positional);
1735  TSeqPos start2 = loc2.GetStart(eExtreme_Positional);
1736  TSeqPos stop2 = loc2.GetStop(eExtreme_Positional);
1737 
1738  if (start1 == stop2 + 1 || start2 == stop1 + 1) {
1739  // abut
1740  return false;
1741  } else if (TestForOverlapEx(loc1, loc2, eOverlap_CheckIntervals, scope) >= 0) {
1742  return true;
1743  } else {
1744  return false;
1745  }
1746 }
1747 
1748 
// NOTE(review): the first signature line (listing line 1749) is missing from this listing.
//
// Walks the cached RNA features pairwise (each feature with the one before it). When the
// two locations overlap or abut, the earlier one is a tmRNA, the later one is a tRNA, and
// s_ContainedIn(earlier, later) holds, "tRNA contained within tmRNA" is reported against
// the later feature. Does nothing when m_AllFeatIt is not available.
1750  const CBioseq_Handle& bsh)
1751 {
1752  // don't bother if can't build all feature iterator
1753  if (!m_AllFeatIt) {
1754  return;
1755  }
1756  try {
1757  CCacheImpl::SFeatKey rna_key(
      // NOTE(review): the arguments of `rna_key` (listing line 1758) are missing from this listing.
1759  const CCacheImpl::TFeatValue & rnas = GetCache().GetFeatFromCache(rna_key);
1760  CCacheImpl::TFeatValue::const_iterator feat = rnas.begin();
1761  if (feat != rnas.end()) {
1762 
      // feat_prev trails feat by exactly one element for the whole loop.
1763  CCacheImpl::TFeatValue::const_iterator feat_prev = feat;
1764  ++feat;
1765  for ( ; feat != rnas.end(); ++feat_prev, ++feat) {
1766 
1767  if (!s_OverlapOrAbut(feat_prev->GetLocation(),
1768  feat->GetLocation(), m_Scope)) {
1769  continue;
1770  }
1771 
1772  const CRNA_ref& tm = feat_prev->GetData().GetRna();
1773  const CRNA_ref& tr = feat->GetData().GetRna();
1774  if ( tm.IsSetType() && tm.GetType() == CRNA_ref::eType_tmRNA ) {
1775  if ( tr.IsSetType() && tr.GetType() == CRNA_ref::eType_tRNA ) {
1776  if (s_ContainedIn(feat_prev->GetLocation(),
1777  feat->GetLocation(), m_Scope)) {
1779  "tRNA contained within tmRNA",
1780  feat->GetOriginalFeature());
1781  }
1782  }
1783  }
1784  }
1785  }
1786  } catch ( const exception& e ) {
      // Exceptions whose text contains "Error: Cannot resolve" are deliberately not reported.
1787  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1789  string("Exception while validating RNA features. EXCEPTION: ") +
1790  e.what(), *(bsh.GetCompleteBioseq()));
1791  }
1792  }
1793 }
1794 
1795 
1796 
// NOTE(review): the first signature line (listing line 1797) is missing from this listing.
//
// Checks the cached BioSource features of `bsh`:
//   - a full-length first source feature is reported unless s_BiosrcFullLengthIsOk()
//     accepts the source obtained through `di`;
//   - every later full-length source feature is reported;
//   - a later feature that overlaps or abuts its predecessor and compares as identical
//     (comment, is_focus, subtype list, org) is reported, unless
//     s_SuppressMultipleEquivBioSources() applies.
// Does nothing when m_AllFeatIt is not available.
1798  const CBioseq_Handle& bsh)
1799 {
1800  // don't bother if can't build all feature iterator
1801  if (!m_AllFeatIt) {
1802  return;
1803  }
1804  try {
1805  CCacheImpl::SFeatKey biosrc_key(
      // NOTE(review): the arguments of `biosrc_key` (listing line 1806) are missing from this listing.
1807  const CCacheImpl::TFeatValue & biosrcs = GetCache().GetFeatFromCache(biosrc_key);
1808  CCacheImpl::TFeatValue::const_iterator feat = biosrcs.begin();
1809  if (feat != biosrcs.end()) {
1810  if (IsLocFullLength(feat->GetLocation(), bsh)) {
      // NOTE(review): the declaration of `di` (listing line 1811) is missing from this
      // listing; `di->GetSource()` below suggests a source-descriptor iterator.
1812  if (di) {
1813  if (!s_BiosrcFullLengthIsOk(di->GetSource())) {
1815  "Source feature is full length, should be descriptor",
1816  feat->GetOriginalFeature());
1817  }
1818  }
1819  }
1820 
      // feat_prev trails feat by exactly one element for the whole loop.
1821  CCacheImpl::TFeatValue::const_iterator feat_prev = feat;
1822  ++feat;
1823  for ( ; feat != biosrcs.end(); ++feat_prev, ++feat) {
1824  if (IsLocFullLength(feat->GetLocation(), bsh)) {
1826  "Multiple full-length source features, should only be one if descriptor is transgenic",
1827  feat->GetOriginalFeature());
1828  }
1829 
1830  if (!s_OverlapOrAbut(feat_prev->GetLocation(),
1831  feat->GetLocation(), m_Scope)) {
1832  // not close enough
1833  continue;
1834  }
1835 
1836  // compare to see if feature sources are identical
1837  bool are_identical = true;
      // Comments only count as a difference when both features have one.
1838  if (feat_prev->IsSetComment() && feat->IsSetComment()
1839  && !NStr::EqualNocase (feat_prev->GetComment(), feat->GetComment())) {
1840  are_identical = false;
1841  } else {
1842  const CBioSource& src_prev = feat_prev->GetData().GetBiosrc();
1843  const CBioSource& src = feat->GetData().GetBiosrc();
1844  if ((src.IsSetIs_focus() && !src_prev.IsSetIs_focus())
1845  || (!src.IsSetIs_focus() && src_prev.IsSetIs_focus())) {
1846  are_identical = false;
1847  } else if ((src.IsSetSubtype() && !src_prev.IsSetSubtype())
1848  || (!src.IsSetSubtype() && src_prev.IsSetSubtype())
1849  || (src.IsSetSubtype() && src_prev.IsSetSubtype()
1850  && !lists_match (src.GetSubtype().begin(), src.GetSubtype().end(),
1851  src_prev.GetSubtype().begin(), src_prev.GetSubtype().end(),
      // NOTE(review): the predicate argument of lists_match (listing line 1852) is missing here.
1853  are_identical = false;
1854  } else if ((src.IsSetOrg() && !src_prev.IsSetOrg())
1855  || (!src.IsSetOrg() && src_prev.IsSetOrg())
1856  || (src.IsSetOrg() && src_prev.IsSetOrg()
1857  && !s_OrgrefEquivalent (src.GetOrg(), src_prev.GetOrg()))) {
1858  are_identical = false;
1859  }
1860  }
1861  if (are_identical && !s_SuppressMultipleEquivBioSources(feat->GetData().GetBiosrc())) {
1863  "Multiple equivalent source features should be combined into one multi-interval feature",
1864  feat->GetOriginalFeature());
1865  }
1866  }
1867  }
1868  } catch ( const exception& e ) {
      // Exceptions whose text contains "Error: Cannot resolve" are deliberately not reported.
1869  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1871  string("Exception while validating source features. EXCEPTION: ") +
1872  e.what(), *(bsh.GetCompleteBioseq()));
1873  }
1874  }
1875 
1876 }
1877 
1878 
1879 static void s_MakePubLabelString (const CPubdesc& pd, string& label)
1880 
1881 {
1882  label = "";
1883 
1884  FOR_EACH_PUB_ON_PUBDESC (it, pd) {
1885  if ((*it)->IsGen() && (*it)->GetGen().IsSetCit()
1886  && !(*it)->GetGen().IsSetCit()
1887  && !(*it)->GetGen().IsSetJournal()
1888  && !(*it)->GetGen().IsSetDate()
1889  && (*it)->GetGen().IsSetSerial_number()) {
1890  // skip over just serial number
1891  } else {
1892  (*it)->GetLabel (&label, CPub::eContent, true);
1893  break;
1894  }
1895  }
1896 }
1897 
1898 
// NOTE(review): the first signature line (listing line 1899) is missing from this listing;
// the call `x_ValidatePubFeatures (bsh)` elsewhere in the file suggests the name.
//
// Checks the cached publication features of `bsh`:
//   - every full-length pub feature is reported as one that should be a descriptor;
//   - a pub feature that compares as identical to its predecessor (comment and pub
//     label, both ignoring case) is reported as a duplicate.
// Unlike the source-feature check, no overlap/abut test is applied before comparing.
// Does nothing when m_AllFeatIt is not available.
1900  const CBioseq_Handle& bsh)
1901 {
1902  // don't bother if can't build feature iterator at all
1903  if (!m_AllFeatIt) {
1904  return;
1905  }
1906  try {
1907  CCacheImpl::SFeatKey pub_key(
      // NOTE(review): the arguments of `pub_key` (listing line 1908) are missing from this listing.
1909  const CCacheImpl::TFeatValue & pubs =
1910  GetCache().GetFeatFromCache(pub_key);
1911  CCacheImpl::TFeatValue::const_iterator feat = pubs.begin();
1912  if (feat != pubs.end()) {
1913  if (IsLocFullLength(feat->GetLocation(), bsh)) {
1915  "Publication feature is full length, should be descriptor",
1916  feat->GetOriginalFeature());
1917  }
1918 
1919  CCacheImpl::TFeatValue::const_iterator feat_prev = feat;
1920  string prev_label;
      // feat_prev equals feat here, which was already checked against end() above.
1921  if( feat_prev != pubs.end()) {
1922  s_MakePubLabelString(feat_prev->GetData().GetPub(), prev_label);
1923  ++feat;
1924  }
1925  for ( ; feat != pubs.end(); ++feat, ++feat_prev) {
1926  if (IsLocFullLength(feat->GetLocation(), bsh)) {
1928  "Publication feature is full length, should be descriptor",
1929  feat->GetOriginalFeature());
1930  }
1931  // compare to see if feature pubs are identical
1932  bool are_identical = true;
      // When the comments differ, the label is not computed and prev_label keeps
      // the value from the last iteration that did compute one.
1933  if (feat_prev->IsSetComment() && feat->IsSetComment()
1934  && !NStr::EqualNocase (feat_prev->GetComment(), feat->GetComment())) {
1935  are_identical = false;
1936  } else {
1937  string label;
1938  s_MakePubLabelString (feat->GetData().GetPub(), label);
      // A blank label on either side is not treated as a difference.
1939  if (!NStr::IsBlank (label) && !NStr::IsBlank(prev_label)
1940  && !NStr::EqualNocase (label, prev_label)) {
1941  are_identical = false;
1942  }
1943 
1944  // swap is faster than assignment
1945  prev_label.swap(label);
1946 
1947  // TODO: also check authors
1948  }
1949 
1950  if (are_identical) {
1952  "Multiple equivalent publication features should be combined into one multi-interval feature",
1953  feat->GetOriginalFeature());
1954  }
1955  }
1956  }
1957  } catch ( const exception& e ) {
      // Exceptions whose text contains "Error: Cannot resolve" are deliberately not reported.
1958  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1960  string("Exception while validating pub features. EXCEPTION: ") +
1961  e.what(), *(bsh.GetCompleteBioseq()));
1962  }
1963  }
1964 
1965 }
1966 
1967 
// NOTE(review): the class-head line (listing line 1968) is missing from this listing, so
// the class name is not visible here.
// Strict-weak-ordering functor over CTempString: shorter strings sort first, and strings
// of equal length are ordered by case-insensitive comparison. The resulting order is
// therefore not alphabetical.
1969 {
1970 public:
1971  // faster than lexicographical order
1972  bool operator()(const CTempString& lhs, const CTempString& rhs) const
1973  {
1974  if( lhs.length() != rhs.length() ) {
1975  return (lhs.length() < rhs.length());
1976  }
1977  return NStr::CompareNocase (lhs, rhs) < 0;
1978  }
1979 };
1980 
// NOTE(review): the class-head line (listing line 1981) is missing from this listing;
// `SCaseInsensitiveLess()` is passed to sort() further down, which is presumably this
// class -- confirm.
// Functor ordering CTempStrings by case-insensitive comparison only.
1982 {
1983 public:
1984  bool operator()(const CTempString& lhs, const CTempString& rhs) const
1985  {
1986  return NStr::CompareNocase (lhs, rhs) < 0;
1987  }
1988 };
1989 
1990 
// NOTE(review): the first signature line (listing line 1991) is missing from this listing;
// the calls `x_ReportDuplicatePubLabels (seq, ...)` elsewhere in the file suggest the name.
//
// Counts how often each label occurs in `labels` and emits one message per label that
// occurs more than once. The message is kWarningPrefix, then the label (cut to
// kMaxSummaryLen characters plus "..." when longer), then "]".
// Returns without reporting when there are fewer than two labels, or when RefSeq
// conventions or RefSeq mode are active.
1992  const CBioseq& seq, const vector<CTempString>& labels)
1993 {
1994  if (labels.size() <= 1) {
1995  // optimize fast case
1996  return;
1997  }
1998  if (m_Imp.IsRefSeqConventions() || m_Imp.IsRefSeq()) {
1999  return;
2000  }
2001 
2002  static const string kWarningPrefix =
2003  "Multiple equivalent publications annotated on this sequence [";
2004  static const string::size_type kMaxSummaryLen = 100;
2005 
2006  // TTempStringCount maps a CTempString to the number of times it appears
2007  // (Note case-insensitivity and non-lexicographical order)
      // NOTE(review): the typedef of TLabelCount (listing line 2008) is missing from this listing.
2009  TLabelCount label_count;
2010 
2011  ITERATE(vector<CTempString>, label_it, labels) {
2012  ++label_count[*label_it];
2013  }
2014 
2015  // put the dups into a vector and sort
2016  vector<CTempString> sorted_dup_labels;
2017  ITERATE(TLabelCount, label_count_it, label_count) {
2018  int num_appearances = label_count_it->second;
2019  _ASSERT(num_appearances > 0);
2020  if( num_appearances > 1 ) {
2021  const CTempString & dup_label = label_count_it->first;
2022  sorted_dup_labels.push_back(dup_label);
2023  }
2024  }
      // Re-sorted with SCaseInsensitiveLess so the messages come out in that order.
2025  sort(BEGIN_COMMA_END(sorted_dup_labels), SCaseInsensitiveLess());
2026 
2027  // find all that appear multiple times
2028  string err_msg = kWarningPrefix; // avoid create and destroy on each iter'n
2029  ITERATE(vector<CTempString>, dup_label_it, sorted_dup_labels) {
2030  const CTempString & summary = *dup_label_it;
2031 
      // Cut the buffer back to just the prefix before appending this label.
2032  err_msg.resize(kWarningPrefix.length());
2033  if (summary.length() > kMaxSummaryLen) {
2034  err_msg += summary.substr(0, kMaxSummaryLen);
2035  err_msg += "...";
2036  } else {
2037  err_msg += summary;
2038  }
2039  err_msg += "]";
2041  err_msg, seq);
2042  }
2043 }
2044 
2045 
// NOTE(review): the first signature line (listing line 2046) is missing from this listing.
//
// Visits every Pub descriptor found through `bsh` and:
//   - collects the published and unpublished labels held in the cache for each pubdesc,
//     then passes both collections to x_ReportDuplicatePubLabels();
//   - for pubdescs that contain at least one pub other than a muid/pmid, reports
//     "Multiple publications with identical PubMed ID" when its muid or pmid was already
//     seen on an earlier such pubdesc (not under RefSeq conventions or RefSeq mode).
2047  const CBioseq_Handle& bsh)
2048 {
2049  // used to check for dups. Currently only deals with cases where
2050  // there's an otherpub, but check if this comment is out of date.
2051  set<TEntrezId> muids_seen;
2052  set<TEntrezId> pmids_seen;
2053 
2054  vector<int> serials;
2055  vector<CTempString> published_labels;
2056  vector<CTempString> unpublished_labels;
2057 
      // NOTE(review): listing lines 2058-2059 are missing here; they presumably declare
      // `ctx`, which is dereferenced in the report further down.
2060 
2061  const CBioseq& seq = *(bsh.GetCompleteBioseq());
2062 
2063  for (CSeqdesc_CI it(bsh, CSeqdesc::e_Pub); it; ++it) {
2064  CConstRef<CPubdesc> pub = ConstRef(&it->GetPub());
2065  // first, try to receive from cache
2066  const CCacheImpl::CPubdescInfo & pubdesc_info =
2067  GetCache().GetPubdescToInfo(pub);
2068  // note that some (e.g. pmids are ignored other than maybe storing
2069  // in the cache above)
2070  copy(BEGIN_COMMA_END(pubdesc_info.m_published_labels),
2071  back_inserter(published_labels));
      // NOTE(review): the first line of the matching copy() for the unpublished labels
      // (listing line 2072) is missing from this listing.
2073  back_inserter(unpublished_labels));
2074 
2075  TEntrezId muid = ZERO_ENTREZ_ID;
2076  TEntrezId pmid = ZERO_ENTREZ_ID;
2077  bool otherpub = false;
      // When a pubdesc has several muids or pmids, the last one of each kind wins.
2078  FOR_EACH_PUB_ON_PUBDESC (pub_it, *pub) {
2079  switch ( (*pub_it)->Which() ) {
2080  case CPub::e_Muid:
2081  muid = (*pub_it)->GetMuid();
2082  break;
2083  case CPub::e_Pmid:
2084  pmid = (*pub_it)->GetPmid();
2085  break;
2086  default:
2087  otherpub = true;
2088  break;
2089  }
2090  }
2091 
2092  if ( otherpub ) {
2093  bool collision = false;
2094  if ( muid > ZERO_ENTREZ_ID ) {
2095  if ( muids_seen.find(muid) != muids_seen.end() ) {
2096  collision = true;
2097  } else {
2098  muids_seen.insert(muid);
2099  }
2100  }
2101  if ( pmid > ZERO_ENTREZ_ID ) {
2102  if ( pmids_seen.find(pmid) != pmids_seen.end() ) {
2103  collision = true;
2104  } else {
2105  pmids_seen.insert(pmid);
2106  }
2107  }
2108  if ( collision && !m_Imp.IsRefSeqConventions() && !m_Imp.IsRefSeq() ) {
2110  "Multiple publications with identical PubMed ID", *ctx, *it);
2111  }
2112  }
2113  }
2114 
2115  x_ReportDuplicatePubLabels (seq, unpublished_labels);
2116  x_ReportDuplicatePubLabels (seq, published_labels);
2117 
2118 }
2119 
2120 
// NOTE(review): the signature line is missing from this listing; the call
// `CheckTpaHistory(seq)` elsewhere in the file suggests this is that function's body.
//
// Reports a Seq-hist that refers to the Bioseq itself: the Bioseq's own GI appearing in
// the dated "replaced-by" ids, or in the dated "replaces" ids. Does nothing when there is
// no history or when the Bioseq has no GI.
2122 {
2123  if ( !seq.GetInst().IsSetHist() ) {
2124  return;
2125  }
2126 
      // Use the first GI found among the Bioseq's ids.
2127  TGi gi = ZERO_GI;
2128  FOR_EACH_SEQID_ON_BIOSEQ (id, seq) {
2129  if ( (*id)->IsGi() ) {
2130  gi = (*id)->GetGi();
2131  break;
2132  }
2133  }
2134  if ( gi == ZERO_GI ) {
2135  return;
2136  }
2137 
2138  const CSeq_hist& hist = seq.GetInst().GetHist();
      // Records without a date are not examined.
2139  if ( hist.IsSetReplaced_by() && hist.GetReplaced_by().IsSetDate() ) {
2140  const CSeq_hist_rec& rec = hist.GetReplaced_by();
2141  ITERATE( CSeq_hist_rec::TIds, id, rec.GetIds() ) {
2142  if ( (*id)->IsGi() ) {
2143  if ( gi == (*id)->GetGi() ) {
2145  "Replaced by gi (" +
2146  NStr::NumericToString(gi) + ") is same as current Bioseq",
2147  seq);
      // One report per record is enough.
2148  break;
2149  }
2150  }
2151  }
2152  }
2153 
2154  if ( hist.IsSetReplaces() && hist.GetReplaces().IsSetDate() ) {
2155  const CSeq_hist_rec& rec = hist.GetReplaces();
2156  ITERATE( CSeq_hist_rec::TIds, id, rec.GetIds() ) {
2157  if ( (*id)->IsGi() ) {
2158  if ( gi == (*id)->GetGi() ) {
2160  "Replaces gi (" +
2161  NStr::NumericToString(gi) + ") is same as current Bioseq",
2162  seq);
2163  break;
2164  }
2165  }
2166  }
2167  }
2168 }
2169 
2170 
2171 // =============================================================================
2172 // Private
2173 // =============================================================================
2174 
2175 
2176 
2177 
2178 // Is the id contained in the bioseq?
2179 bool CValidError_bioseq::IsIdIn(const CSeq_id& id, const CBioseq& seq)
2180 {
2181  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2182  if (id.Match(**it)) {
2183  return true;
2184  }
2185  }
2186  return false;
2187 }
2188 
2189 
// NOTE(review): the signature line is missing from this listing; `inst` is used through
// IsSetSeq_data()/GetSeq_data(), so it is presumably a `const CSeq_inst&` -- confirm.
//
// Returns the size() of the container that stores the sequence data, for whichever
// encoding is present. Returns 0 when no seq-data is set, when the choice is not set,
// or for any choice not listed in the switch.
2191 {
2192  if (!inst.IsSetSeq_data()) {
2193  return 0;
2194  }
2195 
2196  const CSeq_data& seqdata = inst.GetSeq_data();
2197  switch (seqdata.Which()) {
2198  case CSeq_data::e_not_set:
2199  return 0;
2200  case CSeq_data::e_Iupacna:
2201  return seqdata.GetIupacna().Get().size();
2202  case CSeq_data::e_Iupacaa:
2203  return seqdata.GetIupacaa().Get().size();
2204  case CSeq_data::e_Ncbi2na:
2205  return seqdata.GetNcbi2na().Get().size();
2206  case CSeq_data::e_Ncbi4na:
2207  return seqdata.GetNcbi4na().Get().size();
2208  case CSeq_data::e_Ncbi8na:
2209  return seqdata.GetNcbi8na().Get().size();
2210  case CSeq_data::e_Ncbipna:
2211  return seqdata.GetNcbipna().Get().size();
2212  case CSeq_data::e_Ncbi8aa:
2213  return seqdata.GetNcbi8aa().Get().size();
2214  case CSeq_data::e_Ncbieaa:
2215  return seqdata.GetNcbieaa().Get().size();
2216  case CSeq_data::e_Ncbipaa:
2217  return seqdata.GetNcbipaa().Get().size();
      // NOTE(review): the case label for the next return (listing line 2218) is missing
      // from this listing; presumably `case CSeq_data::e_Ncbistdaa:`.
2219  return seqdata.GetNcbistdaa().Get().size();
2220  default:
2221  return 0;
2222  }
2223 }
2224 
2225 
2226 // Returns true if seq derived from translation ending in "*" or
2227 // seq is 3' partial (i.e. the right of the sequence is incomplete)
// NOTE(review): the signature line (listing line 2228) is missing from this listing.
// When a coding region is found for `seq`, the answer depends only on the translation;
// the MolInfo completeness is consulted only when no coding region is found.
// Any CException or std::exception raised along the way results in false.
2229 {
2230 
2231  // Look for the Cdregion feature used to create this aa product
2232  // Use the Cdregion to translate the associated na sequence
2233  // and check if translation has a '*' at the end. If it does.
2234  // message about 'X' at the end of this aa product sequence is suppressed
2235  try {
2236  const CSeq_feat* sfp = m_Imp.GetCDSGivenProduct(seq);
2237  if ( sfp ) {
2238  // Translate na CSeq_data
2239  string prot;
      // NOTE(review): the call that fills `prot` (listing line 2240) is missing from this
      // listing. If `prot` could be empty, `prot.size() - 1` below would wrap around --
      // confirm the translation can never come back empty.
2241  if ( prot[prot.size() - 1] == '*' ) {
2242  return true;
2243  }
2244  return false;
2245  }
2246 
2247  // Get CMolInfo for seq and determine if completeness is
2248  // "eCompleteness_no_right or eCompleteness_no_ends. If so
2249  // suppress message about "X" at end of aa sequence is suppressed
      // NOTE(review): the declaration of `mi` (listing line 2250) is missing from this listing.
2251  if (mi && mi->IsSetCompleteness()) {
2252  if (mi->GetCompleteness() == CMolInfo::eCompleteness_no_right ||
2253  mi->GetCompleteness() == CMolInfo::eCompleteness_no_ends) {
2254  return true;
2255  }
2256  }
2257  } catch (const CException& ) {
2258  } catch (const std::exception& ) {
2259  }
2260  return false;
2261 }
2262 
2263 
2264 //LCOV_EXCL_START
2265 //part of segset validation, no longer used
// NOTE(review): the signature line (listing line 2266) is missing from this listing.
// Builds a Seq-loc from the Seq-ext of `seq`:
//   - Seg ext: a mix location holding the segment locations;
//   - Ref ext: a new location to which the referenced location is added;
//   - no ext, or any other kind of ext: a null CRef is returned.
2267 {
2268  CRef<CSeq_loc> loc;
2269  if (!seq.GetInst().IsSetExt()) {
2270  return loc;
2271  }
2272 
2273  if (seq.GetInst().GetExt().IsSeg()) {
2274  CRef<CSeq_loc> nloc(new CSeq_loc());
2275  loc = nloc;
2276  CSeq_loc_mix& mix = loc->SetMix();
      // The CRefs themselves are pushed, so the mix refers to the segment objects
      // rather than to copies of them.
2277  ITERATE (list< CRef<CSeq_loc> >, it,
2278  seq.GetInst().GetExt().GetSeg().Get()) {
2279  mix.Set().push_back(*it);
2280  }
2281  } else if (seq.GetInst().GetExt().IsRef()) {
2282  CRef<CSeq_loc> nloc(new CSeq_loc());
2283  loc = nloc;
2284  loc->Add(seq.GetInst().GetExt().GetRef());
2285  }
2286  return loc;
2287 }
2288 //LCOV_EXCL_STOP
2289 
2290 
2291 // Check if CdRegion required but not found
// NOTE(review): the signature line (listing line 2292) is missing from this listing.
// Returns true only when `bsh` is a valid protein handle, `nps` is valid, and neither
// GetCDSForProduct() nor GetPROTForProduct() finds a feature for it.
2293 {
2294  if ( bsh && CSeq_inst::IsAa(bsh.GetInst_Mol()) ) {
2295  CSeq_entry_Handle nps =
      // NOTE(review): the initializer of `nps` (listing line 2296) is missing from this
      // listing; presumably it looks up the enclosing nuc-prot set.
2297  if ( nps ) {
2298  const CSeq_feat* cds = GetCDSForProduct(bsh);
2299  if ( !cds ) {
      // With no coding region, a feature from GetPROTForProduct() also counts.
2300  const CSeq_feat* mat = GetPROTForProduct(bsh);
2301  if ( !mat ) {
2302  return true;
2303  }
2304  }
2305  }
2306  }
2307 
2308  return false;
2309 }
2310 
2311 
// NOTE(review): the signature line (listing line 2312) is missing from this listing.
// Returns whether the Bioseq behind `bsh` is an mRNA:
//   - with a MolInfo whose biomol is set: true exactly when the biomol is mRNA;
//   - with a MolInfo whose biomol is not set: false;
//   - without a MolInfo: true when the molecule type is RNA, false otherwise.
2313 {
      // NOTE(review): the declaration of `sd` (listing line 2314) is missing from this
      // listing; `sd->GetMolinfo()` below suggests a MolInfo descriptor iterator.
2315 
2316  if ( sd ) {
2317  const CMolInfo &mi = sd->GetMolinfo();
2318  if ( mi.IsSetBiomol() ) {
2319  return mi.GetBiomol() == CMolInfo::eBiomol_mRNA;
2320  }
2321  } else if (bsh.GetBioseqMolType() == CSeq_inst::eMol_rna) {
2322  // if no molinfo, assume rna is mrna
2323  return true;
2324  }
2325 
2326  return false;
2327 }
2328 
2329 
// NOTE(review): the signature line (listing line 2330) is missing from this listing;
// `loc` is iterated with CSeq_loc_CI, so it is presumably a `const CSeq_loc&` -- confirm.
// Counts the pieces of `loc` visited by CSeq_loc_CI for which IsFarLocation(), evaluated
// against m_Imp.GetTSEH(), returns false.
2331 {
2332  size_t counter = 0;
2333  for ( CSeq_loc_CI slit(loc); slit; ++slit ) {
2334  if ( !IsFarLocation(slit.GetEmbeddingSeq_loc(), m_Imp.GetTSEH()) ) {
2335  ++counter;
2336  }
2337  }
2338  return counter;
2339 }
2340 
2341 
// NOTE(review): the line carrying the return type and function name (listing line 2342)
// is missing from this listing; only the parameter list and body are visible.
//
// Returns false when either feature is flagged as an exception whose text contains
// "alternative processing" (ignoring case), or when the first protein names of the two
// features form one of the name pairs listed below, in either order. Returns true otherwise.
// Both features are read through GetData().GetProt(), so they must be protein features.
2343 (const CSeq_feat& curr,
2344  const CSeq_feat& prev)
2345 {
2346  if (curr.IsSetExcept() && curr.GetExcept() && curr.IsSetExcept_text()) {
2347  if (NStr::FindNoCase(curr.GetExcept_text(), "alternative processing") != NPOS) {
2348  return false;
2349  }
2350  }
2351  if (prev.IsSetExcept() && prev.GetExcept() && prev.IsSetExcept_text()) {
2352  if (NStr::FindNoCase(prev.GetExcept_text(), "alternative processing") != NPOS) {
2353  return false;
2354  }
2355  }
2356 
2357  const CProt_ref& currP = curr.GetData().GetProt();
2358  const CProt_ref& prevP = prev.GetData().GetProt();
2359 
2360  if (currP.IsSetName() && prevP.IsSetName()) {
      // Only the first name of each feature is used; the loops stop after one element.
2361  string currN;
2362  for (auto it : currP.GetName()) {
2363  currN = it;
2364  break;
2365  }
2366  string prevN;
2367  for (auto it : prevP.GetName()) {
2368  prevN = it;
2369  break;
2370  }
      // Each pair below is tested in both orders, ignoring case.
2371  if (NStr::EqualNocase (currN, "anchored capsid protein ancC") && NStr::EqualNocase (prevN, "capsid protein C")) {
2372  return false;
2373  }
2374  if (NStr::EqualNocase (prevN, "anchored capsid protein ancC") && NStr::EqualNocase (currN, "capsid protein C")) {
2375  return false;
2376  }
2377  if (NStr::EqualNocase (currN, "membrane glycoprotein precursor prM") && NStr::EqualNocase (prevN, "protein pr")) {
2378  return false;
2379  }
2380  if (NStr::EqualNocase (prevN, "membrane glycoprotein precursor prM") && NStr::EqualNocase (currN, "protein pr")) {
2381  return false;
2382  }
2383  if (NStr::EqualNocase (currN, "membrane glycoprotein precursor prM") && NStr::EqualNocase (prevN, "membrane glycoprotein M")) {
2384  return false;
2385  }
2386  if (NStr::EqualNocase (prevN, "membrane glycoprotein precursor prM") && NStr::EqualNocase (currN, "membrane glycoprotein M")) {
2387  return false;
2388  }
2389  if (NStr::EqualNocase (currN, "anchored capsid protein C") && NStr::EqualNocase (prevN, "capsid protein C")) {
2390  return false;
2391  }
2392  if (NStr::EqualNocase (prevN, "anchored capsid protein C") && NStr::EqualNocase (currN, "capsid protein C")) {
2393  return false;
2394  }
2395  if (NStr::EqualNocase (currN, "membrane glycoprotein precursor M") && NStr::EqualNocase (prevN, "protein pr")) {
2396  return false;
2397  }
2398  if (NStr::EqualNocase (prevN, "membrane glycoprotein precursor M") && NStr::EqualNocase (currN, "protein pr")) {
2399  return false;
2400  }
2401  if (NStr::EqualNocase (currN, "membrane glycoprotein precursor M") && NStr::EqualNocase (prevN, "membrane glycoprotein M")) {
2402  return false;
2403  }
2404  if (NStr::EqualNocase (prevN, "membrane glycoprotein precursor M") && NStr::EqualNocase (currN, "membrane glycoprotein M")) {
2405  return false;
2406  }
2407  }
2408 
2409 
2410  return true;
2411 }
2412 
2413 
// Loop header that iterates Itr over the CBioseq_Handle::TId container
// returned by Var.GetId(). Var is parenthesised so that an argument which is
// itself an expression (e.g. a dereference) still binds to .GetId() correctly.
#define FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(Itr, Var) \
ITERATE (CBioseq_Handle::TId, Itr, (Var).GetId())
2416 
2418 {
2419  if (!IsMaster(seq)) {
2420  return false;
2421  }
2422  CBioseq_Handle bsh = scope.GetBioseqHandle(seq);
2423  return IsWGS(bsh);
2424 }
2425 
2426 
2428 {
2429  bool rval = false;
2430  if (entry.IsSeq()) {
2431  if (IsMaster(entry.GetSeq()) && IsWGS(entry.GetSeq())) {
2432  rval = true;
2433  }
2434  } else if (entry.IsSet() && entry.GetSet().IsSetSeq_set()) {
2436  if (IsWGSMaster(**it)) {
2437  rval = true;
2438  break;
2439  }
2440  }
2441  }
2442  return rval;
2443 }
2444 
2445 
2447 {
2448  if (!seq.IsSetDescr()) {
2449  return false;
2450  }
2451  ITERATE(CBioseq::TDescr::Tdata, it, seq.GetDescr().Get()) {
2452  if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech() && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_wgs) {
2453  return true;
2454  }
2455  }
2456  return false;
2457 }
2458 
2459 
2461 {
2462  CSeqdesc_CI molinfo(bsh, CSeqdesc::e_Molinfo);
2463  if (molinfo && molinfo->GetMolinfo().IsSetTech() && molinfo->GetMolinfo().GetTech() == CMolInfo::eTech_wgs) {
2464  return true;
2465  }
2466  return false;
2467 }
2468 
2469 
2471 {
2472  bool rval = false;
2473  if (entry.IsSeq()) {
2474  rval = IsWGS(entry.GetSeq());
2475  } else if (entry.IsSet() && entry.GetSet().IsSetSeq_set()) {
2476  for (auto it : entry.GetSet().GetSeq_set()) {
2477  if (IsWGS(*it)) {
2478  rval = true;
2479  break;
2480  }
2481  }
2482  }
2483  return rval;
2484 }
2485 
2486 
2488 {
2489  const CTextseq_id* txt = id.GetTextseq_Id();
2490  if (!txt || !txt->IsSetAccession()) {
2491  return false;
2492  }
2495  return true;
2496  } else {
2497  return false;
2498  }
2499 }
2500 
2501 
2503 {
2504  if (!seq.IsSetId()) {
2505  return false;
2506  }
2507  ITERATE(CBioseq::TId, id, seq.GetId()) {
2508  if (IsWGSAccession(**id)) {
2509  return true;
2510  }
2511  }
2512  return false;
2513 }
2514 
2515 
2517 {
2518  const CTextseq_id* txt = id.GetTextseq_Id();
2519  if (!txt || !txt->IsSetAccession()) {
2520  return false;
2521  }
2524  return true;
2525  } else {
2526  return false;
2527  }
2528 }
2529 
2530 
2532 {
2533  if (!seq.IsSetId()) {
2534  return false;
2535  }
2536  ITERATE(CBioseq::TId, id, seq.GetId()) {
2537  if (IsTSAAccession(**id)) {
2538  return true;
2539  }
2540  }
2541  return false;
2542 }
2543 
2544 
2546 {
2547  CBioseq_Handle bsh = scope.GetBioseqHandle(seq);
2548  CSeqdesc_CI desc(bsh, CSeqdesc::e_Molinfo);
2549  if (desc && desc->GetMolinfo().IsSetCompleteness()) {
2550  CMolInfo::TCompleteness completeness = desc->GetMolinfo().GetCompleteness();
2551  if (completeness == CMolInfo::eCompleteness_partial
2552  || completeness == CMolInfo::eCompleteness_no_left
2553  || completeness == CMolInfo::eCompleteness_no_right
2554  || completeness == CMolInfo::eCompleteness_no_ends) {
2555  return true;
2556  }
2557  }
2558  return false;
2559 }
2560 
2561 
2563 {
2564  FOR_EACH_SEQID_ON_BIOSEQ(id, seq) {
2565  if ((*id)->IsPdb()) {
2566  return true;
2567  }
2568  }
2569  return false;
2570 }
2571 
2572 
2574 {
2575  if (IsPdb(seq) || IsWGSMaster(seq, *m_Scope)) {
2576  return;
2577  }
2578  const CSeq_inst& inst = seq.GetInst();
2579 
2580  TSeqPos len = inst.IsSetLength() ? inst.GetLength() : 0;
2581  if ( seq.IsAa() ) {
2582  if (len <= 3 && !IsPartial(seq, *m_Scope)) {
2583  PostErr(eDiag_Warning, eErr_SEQ_INST_ShortSeq, "Sequence only " +
2584  NStr::IntToString(len) + " residues", seq);
2585  }
2586  } else {
2587  if ( len <= 10 && m_report_short_seq) {
2588  PostErr(eDiag_Warning, eErr_SEQ_INST_ShortSeq, "Sequence only " +
2589  NStr::IntToString(len) + " residues", seq);
2590  }
2591  }
2592 }
2593 
2594 
2595 // Assumes that seq is segmented and has Seq-ext data
2597 {
2598  // Get parent CSeq_entry of seq and then find the next
2599  // CSeq_entry in the set. This CSeq_entry should be a CBioseq_set
2600  // of class parts.
2601  const CSeq_entry* se = seq.GetParentEntry();
2602  if (!se) {
2603  return;
2604  }
2605  const CSeq_entry* parent = se->GetParentEntry ();
2606  if (!parent) {
2607  return;
2608  }
2609  if ( !parent->IsSet() || !parent->GetSet().IsSetClass() || parent->GetSet().GetClass() != CBioseq_set::eClass_segset) {
2610  return;
2611  }
2612 
2613  // Loop through seq_set looking for the parts set.
2614  FOR_EACH_SEQENTRY_ON_SEQSET (it, parent->GetSet()) {
2615  if ((*it)->Which() == CSeq_entry::e_Set
2616  && (*it)->GetSet().IsSetClass()
2617  && (*it)->GetSet().GetClass() == CBioseq_set::eClass_parts) {
2618  const CBioseq_set::TSeq_set& parts = (*it)->GetSet().GetSeq_set();
2619  const CSeg_ext::Tdata& locs = seq.GetInst().GetExt().GetSeg().Get();
2620 
2621  // Make sure the number of locations (excluding null locations)
2622  // match the number of parts
2623  size_t nulls = 0;
2624  ITERATE ( CSeg_ext::Tdata, loc, locs ) {
2625  if ( (*loc)->IsNull() ) {
2626  nulls++;
2627  }
2628  }
2629  if ( locs.size() - nulls < parts.size() ) {
2631  "Parts set contains too many Bioseqs", seq);
2632  return;
2633  } else if ( locs.size() - nulls > parts.size() ) {
2635  "Parts set does not contain enough Bioseqs", seq);
2636  return;
2637  }
2638 
2639  // Now, simultaneously loop through the parts of se_parts and CSeq_locs of
2640  // seq's CSseq-ext. If don't compare, post error.
2641  size_t size = locs.size(); // == parts.size()
2642  CSeg_ext::Tdata::const_iterator loc_it = locs.begin();
2643  CBioseq_set::TSeq_set::const_iterator part_it = parts.begin();
2644  for ( size_t i = 0; i < size; ++i ) {
2645  try {
2646  if ( (*loc_it)->IsNull() ) {
2647  ++loc_it;
2648  continue;
2649  }
2650  if ( !(*part_it)->IsSeq() ) {
2652  "Parts set component is not Bioseq", seq);
2653  return;
2654  }
2655  const CSeq_id& loc_id = GetId(**loc_it, m_Scope);
2656  if ( !IsIdIn(loc_id, (*part_it)->GetSeq()) ) {
2658  "Segmented bioseq seq_ext does not correspond to parts "
2659  "packaging order", seq);
2660  return;
2661  }
2662 
2663  // advance both iterators
2664  ++part_it;
2665  ++loc_it;
2666  } catch (const CObjmgrUtilException&) {
2667  ERR_POST_X(4, "Seq-loc not for unique sequence");
2668  return;
2669  } catch (CException &x1) {
2670  string err_msg = "Unknown error:";
2671  err_msg += x1.what();
2672  ERR_POST_X(5, err_msg);
2673  return;
2674  } catch (std::exception &x2) {
2675  string err_msg = "Unknown error:";
2676  err_msg += x2.what();
2677  ERR_POST_X(5, err_msg);
2678  return;
2679  }
2680  }
2681  }
2682  }
2683 }
2684 
2685 static bool s_IsConWithGaps(const CBioseq& seq)
2686 
2687 {
2688 
2689  if (! seq.IsSetInst ()) return false;
2690  const CSeq_inst& inst = seq.GetInst();
2691  if (! inst.IsSetExt ()) return false;
2692  if (! inst.GetExt().IsDelta()) return false;
2693 
2694  ITERATE(CDelta_ext::Tdata, iter, inst.GetExt().GetDelta().Get()) {
2695  if (! (*iter)->IsLiteral() ) continue;
2696  const CSeq_literal& lit = (*iter)->GetLiteral();
2697  if (!lit.IsSetSeq_data()) return true;
2698  if (lit.GetSeq_data().IsGap() && lit.GetLength() > 0) return true;
2699  }
2700 
2701  return false;
2702 }
2703 
2704 
2706 {
2707  bool has_gap = false;
2708  if (seq.GetInst().IsSetExt() && seq.GetInst().GetExt().IsDelta()) {
2709  ITERATE(CDelta_ext::Tdata, iter, seq.GetInst().GetExt().GetDelta().Get()) {
2710  if ((*iter)->IsLiteral() &&
2711  (!(*iter)->GetLiteral().IsSetSeq_data() || (*iter)->GetLiteral().GetSeq_data().IsGap())) {
2712  has_gap = true;
2713  break;
2714  }
2715  }
2716  }
2717  return has_gap;
2718 }
2719 
2721 {
2723  if (!bsh) {
2724  return;
2725  }
2726 
2727  string title = sequence::CDeflineGenerator().GenerateDefline(bsh);
2728 
2729 /*bsv
2730  CMolInfo::TTech tech = CMolInfo::eTech_unknown;
2731 */
2732  CSeqdesc_CI desc(bsh, CSeqdesc::e_Molinfo);
2733  if (desc) {
2734  const CMolInfo& mi = desc->GetMolinfo();
2735 /*bsv
2736  tech = mi.GetTech();
2737 */
2739  if (m_Imp.IsGenbank()) {
2740  if (NStr::Find(title, "complete genome") != NPOS) {
2741  const CSeq_entry& ctx = *seq.GetParentEntry();
2743  "Complete genome in title without complete flag set",
2744  ctx, *desc);
2745  }
2746  }
2748  (! s_IsConWithGaps (seq)) &&
2749  !m_Imp.IsEmbl() && !m_Imp.IsDdbj()) {
2750  const CSeq_entry& ctx = *seq.GetParentEntry();
2752  "Circular topology without complete flag set", ctx, *desc);
2753  }
2754  }
2755  }
2756 
2757  // warning if title contains complete genome but sequence contains gap features
2758  if (NStr::FindNoCase (title, "complete genome") != NPOS && x_HasGap(seq)) {
2760  "Title contains 'complete genome' but sequence has gaps", seq);
2761  }
2762 
2763 
2764  // note - test for protein titles was moved to CValidError_bioseqset::ValidateNucProtSet
2765  // because it only applied for protein sequences in nuc-prot sets and it's more efficient
2766  // to create the defline generator once per nuc-prot set
2767 }
2768 
2769 static bool HasAssemblyOrNullGap (const CBioseq& seq)
2770 {
2771  const CSeq_inst& inst = seq.GetInst();
2772  if (inst.CanGetRepr() && inst.GetRepr() == CSeq_inst::eRepr_delta && inst.CanGetExt() && inst.GetExt().IsDelta()) {
2773  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
2774  if ( !(*sg) ) continue;
2775  if ((**sg).Which() != CDelta_seq::e_Literal) continue;
2776  const CSeq_literal& lit = (*sg)->GetLiteral();
2777  if (! lit.IsSetSeq_data()) return true;
2778  if (lit.GetSeq_data().IsGap()) return true;
2779  }
2780  }
2781 
2782  return false;
2783 }
2784 
2785 
2787 {
2788  const CSeq_inst& inst = seq.GetInst();
2789  if (inst.CanGetRepr() && inst.GetRepr() == CSeq_inst::eRepr_delta && inst.CanGetExt() && inst.GetExt().IsDelta()) {
2790  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
2791  if ( !(*sg) ) continue;
2792  if ((**sg).Which() != CDelta_seq::e_Literal) continue;
2793  const CSeq_literal& lit = (*sg)->GetLiteral();
2794  if (! lit.IsSetSeq_data()) {
2795  PostErr(eDiag_Warning, eErr_SEQ_INST_SeqGapProblem, "TSA Seq_data NULL", seq);
2796  } else {
2797  const CSeq_data& data = lit.GetSeq_data();
2798  if (data.Which() == CSeq_data::e_Gap) {
2799  const CSeq_gap& gap = data.GetGap();
2800  if (gap.IsSetType()) {
2801  int gaptype = gap.GetType();
2802  if (gaptype == CSeq_gap::eType_unknown) {
2803  PostErr(eDiag_Warning, eErr_SEQ_INST_TSAseqGapProblem, "TSA Seq_gap.unknown", seq);
2804  } else if (gaptype == CSeq_gap::eType_other) {
2805  PostErr(eDiag_Warning, eErr_SEQ_INST_TSAseqGapProblem, "TSA Seq_gap.other", seq);
2806  }
2807  } else {
2808  PostErr(eDiag_Warning, eErr_SEQ_INST_TSAseqGapProblem, "TSA Seq_gap NULL", seq);
2809  }
2810  }
2811  }
2812  }
2813  }
2814 }
2815 
2816 
2818 {
2819  const CSeq_inst& inst = seq.GetInst();
2820  if (inst.CanGetRepr() && inst.GetRepr() == CSeq_inst::eRepr_delta && inst.CanGetExt() && inst.GetExt().IsDelta()) {
2821  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
2822  if (!(*sg)) continue;
2823  // CON division - far delta - suppresses errors
2824  if ((**sg).Which() != CDelta_seq::e_Literal) /* continue */ return false;
2825  const CSeq_literal& lit = (*sg)->GetLiteral();
2826  if (!lit.IsSetSeq_data()) {
2827  return true;
2828  } else {
2829  const CSeq_data& data = lit.GetSeq_data();
2830  if (data.Which() == CSeq_data::e_Gap) {
2831  const CSeq_gap& gap = data.GetGap();
2832  CSeq_gap::TType gap_type = gap.IsSetType() ? gap.GetType() : CSeq_gap::eType_unknown;
2833 
2834  if (gap_type != CSeq_gap::eType_centromere && gap_type != CSeq_gap::eType_heterochromatin &&
2835  gap_type != CSeq_gap::eType_short_arm && gap_type != CSeq_gap::eType_telomere &&
2836  gap_type != CSeq_gap::eType_contig) {
2837 
2838  if (!gap.IsSetLinkage_evidence() || gap.GetLinkage_evidence().empty()) {
2839  return true;
2840  }
2841  }
2842  }
2843  }
2844  }
2845  }
2846  return false;
2847 }
2848 
2849 
2851 {
2852  if (HasBadWGSGap(seq)) {
2854  "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence.", seq);
2855  }
2856 }
2857 
2858 
2860 {
2861  if (HasBadWGSGap(seq)) {
2863  "TSA submission includes wrong gap type. Gaps for TSA should be Assembly Gaps with linkage evidence.", seq);
2864  }
2865 }
2866 
2867 
2869 {
2870  if (HasBadWGSGap(seq)) {
2872  "Genome submission includes wrong gap type. Gaps for genomes should be Assembly Gaps with linkage evidence.", seq);
2873  }
2874 }
2875 
2876 
2877 bool s_FieldHasLabel(const CUser_field& field, const string& label)
2878 {
2879  if (field.IsSetLabel() && field.GetLabel().IsStr() &&
2880  NStr::EqualNocase(field.GetLabel().GetStr(), label)) {
2881  return true;
2882  } else {
2883  return false;
2884  }
2885 }
2886 
2887 
2889 {
2890  if (!field.IsSetData()) {
2891  return false;
2892  }
2893  bool rval = false;
2894  if (field.GetData().IsStr()) {
2895  if (!NStr::IsBlank(field.GetData().GetStr())) {
2896  rval = true;
2897  }
2898  } else if (field.GetData().IsStrs()) {
2900  if (!NStr::IsBlank(*s)) {
2901  rval = true;
2902  break;
2903  }
2904  }
2905  }
2906  return rval;
2907 }
2908 
2909 
2911 {
2912  bool has_biosample = false;
2913  bool has_bioproject = false;
2914 
2915  CSeqdesc_CI d(bsh, CSeqdesc::e_User);
2916  while (d) {
2918  for (auto it: d->GetUser().GetData()) {
2919  if (s_FieldHasLabel(*it, "BioSample")) {
2920  if (s_FieldHasNonBlankValue(*it)) {
2921  has_biosample = true;
2922  }
2923  } else if (s_FieldHasLabel(*it, "BioProject")) {
2924  if (s_FieldHasNonBlankValue(*it)) {
2925  has_bioproject = true;
2926  }
2927  }
2928  }
2929  } else if (m_Imp.IsGenbank()) {
2930  const CUser_object& uo = d->GetUser();
2931  if (uo.GetType().IsStr()) {
2932  const string& type = uo.GetType().GetStr();
2933  if ( NStr::CompareNocase(type, "WGSProjects") == 0 ) {
2934  int fr = 0;
2935  int to = 0;
2936 
2937  for (auto it: uo.GetData()) {
2938  if ( !it->GetLabel().IsStr() ) {
2939  continue;
2940  }
2941  const string& label = it->GetLabel().GetStr();
2942  if ( NStr::CompareNocase(label, "WGS_accession_first") == 0 ||
2943  NStr::CompareNocase(label, "Accession_first") == 0 ) {
2944  const string& str = it->GetData().GetStr();
2945  auto fst = str.find_first_of("0123456789");
2946  fr = NStr::StringToInt (str.substr(fst));
2947  } else if ( NStr::CompareNocase(label, "WGS_accession_last") == 0 ||
2948  NStr::CompareNocase(label, "Accession_last") == 0 ) {
2949  const string& str = it->GetData().GetStr();
2950  auto lst = str.find_first_of("0123456789");
2951  to = NStr::StringToInt (str.substr(lst));
2952  }
2953  }
2954  if ( (fr != 0) && (to != 0) ) {
2955  int df = to - fr + 1;
2956  int blen = bsh.GetBioseqLength();
2957  if (df != blen) {
2959  "Number of accessions (" + NStr::IntToString(df) + ") does not correspond to number of records (" + NStr::IntToString(blen) +")",
2960  *(bsh.GetCompleteBioseq()));
2961  }
2962  }
2963  }
2964  }
2965  }
2966  ++d;
2967  }
2968  if (!has_biosample && !has_bioproject) {
2970  "WGS master lacks both BioSample and BioProject",
2971  *(bsh.GetCompleteBioseq()));
2972  } else if (!has_biosample) {
2974  "WGS master lacks BioSample",
2975  *(bsh.GetCompleteBioseq()));
2976  } else if (!has_bioproject) {
2978  "WGS master lacks BioProject",
2979  *(bsh.GetCompleteBioseq()));
2980  }
2981  if (!has_biosample || !has_bioproject) {
2982  }
2983 }
2984 
2985 
2986 static EDiagSev GetBioseqEndWarning (const CBioseq& seq, bool is_circular, EBioseqEndIsType end_is_char)
2987 {
2988  EDiagSev sev;
2989  bool only_local = true;
2990  bool is_NCACNTNW = false;
2991  bool is_patent = false;
2992  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
2993  if (!(*id_it)->IsLocal()) {
2994  only_local = false;
2995  if ((*id_it)->IsPatent()) {
2996  is_patent = true;
2997  } else if (IsNTNCNWACAccession(**id_it)) {
2998  is_NCACNTNW = true;
2999  }
3000  }
3001  }
3002 
3003  if (is_NCACNTNW || is_patent) {
3004  sev = eDiag_Warning;
3005  } else if (is_circular) {
3006  sev = eDiag_Warning;
3007  } else if (only_local) {
3008  sev = eDiag_Warning;
3009  } else if (end_is_char == eBioseqEndIsType_All) {
3010  sev = eDiag_Error;
3011  } else {
3012  sev = eDiag_Warning;
3013  }
3014  return sev;
3015 }
3016 
3017 
3018 void CValidError_bioseq::x_CalculateNsStretchAndTotal(const CSeqVector& vec, TSeqPos& num_ns, TSeqPos& max_stretch, bool& n5, bool& n3)
3019 {
3020 
3022 
3023  num_ns = 0;
3024  max_stretch = 0;
3025  n5 = false;
3026  n3 = false;
3027 
3028  TSeqPos this_stretch = 0;
3029  for (TSeqPos i = 0; i < vec.size(); i++) {
3030  if (vec[i] == 'N') {
3031  num_ns++;
3032  if (vec.IsInGap(i)) {
3033  if (max_stretch < this_stretch) {
3034  max_stretch = this_stretch;
3035  }
3036  this_stretch = 0;
3037  } else {
3038  this_stretch++;
3039  if (this_stretch >= 10) {
3040  if (i < 20) {
3041  n5 = true;
3042  }
3043  if (vec.size() > 20 && i > vec.size() - 10) {
3044  n3 = true;
3045  }
3046  }
3047  }
3048  } else {
3049  if (max_stretch < this_stretch) {
3050  max_stretch = this_stretch;
3051  }
3052  this_stretch = 0;
3053  }
3054  }
3055  if (max_stretch < this_stretch) {
3056  max_stretch = this_stretch;
3057  }
3058 }
3059 
3060 
3062 {
3063 
3064  bool rval = false;
3065  if (HasAssemblyOrNullGap(seq)) {
3066  return rval;
3067  }
3068 
3070  if ( !bsh ) {
3071  return rval;
3072  }
3073 
3074 
3075  TSeqPos num_ns = 0;
3076  TSeqPos max_stretch = 0;
3077  bool n5 = false;
3078  bool n3 = false;
3079 
3080 
3082  x_CalculateNsStretchAndTotal(vec, num_ns, max_stretch, n5, n3);
3083 
3084  if (max_stretch >= 15) {
3086  "Sequence has a stretch of " + NStr::IntToString(max_stretch) + " Ns", seq);
3087  rval = true;
3088  } else {
3089  if (n5) {
3091  "Sequence has a stretch of at least 10 Ns within the first 20 bases", seq);
3092  rval = true;
3093  }
3094  if (n3) {
3096  "Sequence has a stretch of at least 10 Ns within the last 20 bases", seq);
3097  rval = true;
3098  }
3099  }
3100  return rval;
3101 }
3102 
3103 
3104 // check to see if sequence is all Ns
3106 {
3107  bool rval = true;
3108  bool at_least_one = false;
3109  try {
3110  for (CSeqVector_CI sv_iter(vec); (sv_iter) && rval; ++sv_iter) {
3111  if (*sv_iter != 'N') {
3112  rval = false;
3113  }
3114  at_least_one = true;
3115  }
3116  } catch (CException& ) {
3117 
3118  }
3119  return (rval && at_least_one);
3120 }
3121 
3122 
3123 static int CountNs(const CSeq_data& seq_data, TSeqPos len)
3124 {
3125  int total = 0;
3126  switch (seq_data.Which()) {
3127  case CSeq_data::e_Ncbi4na:
3128  {
3129  vector<char>::const_iterator it = seq_data.GetNcbi4na().Get().begin();
3130  unsigned char mask = 0xf0;
3131  unsigned char shift = 4;
3132  for (size_t n = 0; n < len; n++) {
3133  unsigned char c = ((*it) & mask) >> shift;
3134  mask >>= 4;
3135  shift -= 4;
3136  if (!mask) {
3137  mask = 0xf0;
3138  shift = 4;
3139  ++it;
3140  }
3141  if (c == 15) {
3142  total++;
3143  }
3144  }
3145  }
3146  return total;
3147  case CSeq_data::e_Iupacna:
3148  {
3149  const string& s = seq_data.GetIupacna().Get();
3150  for (size_t n = 0; n < len && n < s.length(); n++) {
3151  if (s[n] == 'N') {
3152  total++;
3153  }
3154  }
3155  }
3156  return total;
3157  case CSeq_data::e_Ncbi8na:
3158  case CSeq_data::e_Ncbipna:
3159  {
3160  CSeq_data iupacna;
3161  if (!CSeqportUtil::Convert(seq_data, &iupacna, CSeq_data::e_Iupacna)) {
3162  return total;
3163  }
3164  const string& s = iupacna.GetIupacna().Get();
3165  for (size_t n = 0; n < len; n++) {
3166  if (s[n] == 'N') {
3167  total++;
3168  }
3169  }
3170  }
3171  return total;
3172  default:
3173  return total;
3174  }
3175 }
3176 
3177 
3179 {
3180  int count = 0;
3181  SSeqMapSelector sel;
3183  for (CSeqMap_CI seq_iter(bsh, sel); seq_iter; ++seq_iter) {
3184  switch (seq_iter.GetType()) {
3185  case CSeqMap::eSeqData:
3186  count += CountNs(seq_iter.GetData(), seq_iter.GetLength());
3187  break;
3188  default:
3189  break;
3190  }
3191  }
3192 /*
3193  int pct_n = 0;
3194  try {
3195  CSeqVector vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
3196  TSeqPos num_ns = 0;
3197  for (size_t i = 0; i < vec.size(); i++) {
3198  try {
3199  if (vec[i] == 'N' && !vec.IsInGap(i)) {
3200  num_ns++;
3201  }
3202  } catch (CException& e2) {
3203  //bad character
3204  }
3205  }
3206  pct_n = (num_ns * 100) / bsh.GetBioseqLength();
3207  } catch (CException& e) {
3208  pct_n = 100;
3209  }
3210 */
3211  return bsh.GetBioseqLength() ? count * 100 / bsh.GetBioseqLength() : 100;
3212 }
3213 
3214 static
3216 {
3219  bool is_first = true;
3220 
3221  if ( inst.CanGetExt() && inst.GetExt().IsDelta() ) {
3222  ITERATE(CDelta_ext::Tdata, iter, inst.GetExt().GetDelta().Get()) {
3223  if ( (*iter)->IsLoc() ) {
3224  return false;
3225  }
3226  if ( (*iter)->IsLiteral() ) {
3227  const CSeq_literal& lit = (*iter)->GetLiteral();
3228  if (lit.IsSetSeq_data() && lit.GetSeq_data().IsGap()) {
3229  const CSeq_gap& gap = lit.GetSeq_data().GetGap();
3231  if (gap.IsSetType()) {
3232  gaptype = gap.GetType();
3233  }
3234  if (is_first) {
3235  first = gaptype;
3236  } else {
3237  last = gaptype;
3238  }
3239  } else {
3241  }
3242  }
3243  is_first = false;
3244  }
3245  }
3246  fst = first;
3247  lst = last;
3248  return true;
3249 }
3250 
3251 static bool s_WillReportTerminalGap(const CBioseq& seq, CBioseq_Handle bsh)
3252 {
3253  if (!seq.IsSetInst() || !seq.GetInst().IsSetRepr()) {
3254  return false;
3255  }
3256  if (!seq.GetInst().IsSetMol() || seq.GetInst().GetMol() == CSeq_inst::eMol_aa) {
3257  return false;
3258  }
3259  CSeq_inst::TRepr repr = seq.GetInst().GetRepr();
3260 
3261  if (repr != CSeq_inst::eRepr_delta) {
3262  return false;
3263  }
3264 
3265  if ( !bsh ) {
3266  return false;
3267  }
3268 
3269  if (!seq.GetInst().IsSetLength() || seq.GetInst().GetLength() < 10) {
3270  return false;
3271  }
3272 
3273  if (!ShouldCheckForNsAndGap(bsh)) {
3274  return false;
3275  }
3276 
3277  return true;
3278 }
3279 
3280 
3281 static optional<int> s_MaxSeqStretchIfLessThanThreshold(const CSeqVector& vec, int threshold)
3282 {
3283  int max_stretch = 0;
3284  auto IsN = [](char c) { return c == 'N'; };
3285 
3286  for (auto begin_it = find_if_not(begin(vec), end(vec), IsN);
3287  begin_it != end(vec);) {
3288  auto distanceToEnd = distance(begin_it, end(vec));
3289  // check a sequence interval no longer than the threshold length
3290  auto interval = (distanceToEnd > threshold) ? threshold : distanceToEnd;
3291  auto end_it = find_if(begin_it, next(begin_it, interval), IsN);
3292  const auto current_stretch = distance(begin_it, end_it);
3293  if (current_stretch >= threshold) { // No Ns in the interval
3294  return {};
3295  }
3296 
3297  if (current_stretch > max_stretch) {
3298  max_stretch = current_stretch;
3299  }
3300  begin_it = find_if_not(end_it, end(vec), IsN);
3301  }
3302  return max_stretch;
3303 }
3304 
3305 
3307 {
3308  if (!seq.IsSetInst() || !seq.GetInst().IsSetRepr()) {
3309  // can't check if no Inst or Repr
3310  return;
3311  }
3312  if (!seq.GetInst().IsSetMol() || seq.GetInst().GetMol() == CSeq_inst::eMol_aa) {
3313  // don't check proteins here
3314  return;
3315  }
3316  CSeq_inst::TRepr repr = seq.GetInst().GetRepr();
3317 
3318  // only check for raw or for delta sequences that are delta lit only
3319  if (repr == CSeq_inst::eRepr_virtual || repr == CSeq_inst::eRepr_map) {
3320  return;
3321  }
3322 
3324  if ( !bsh ) {
3325  // no check if Bioseq not in scope
3326  return;
3327  }
3328 
3329  try {
3331 
3332  if (IsAllNs(vec)) {
3333  PostErr(m_Imp.IsPDB() ? eDiag_Warning : eDiag_Critical, eErr_SEQ_INST_AllNs, "Sequence is all Ns", seq);
3334  return;
3335  }
3336 
3337  // don't bother checking if length is less than 10
3338  if (!seq.IsSetInst() || !seq.GetInst().IsSetRepr()
3339  || !seq.GetInst().IsSetLength() || seq.GetInst().GetLength() < 10) {
3340  return;
3341  }
3342 
3343  if (const auto oMaxLength = s_MaxSeqStretchIfLessThanThreshold(vec, 10); oMaxLength.has_value()) {
3345  "Maximum contig length is " + NStr::IntToString(*oMaxLength) + " bases", seq);
3346  }
3347 
3352  bool begin_ambig = false, end_ambig = false;
3355  if (ShouldCheckForNsAndGap(bsh) && x_IsDeltaLitOnly(seq.GetInst())) {
3356  CheckBioseqEndsForNAndGap(vec, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
3357  s_GetFlankingGapTypes(seq.GetInst(), fst, lst);
3358  }
3359 
3360  bool is_circular = false;
3362  is_circular = true;
3363  }
3364  EDiagSev sev;
3365  if (begin_n != eBioseqEndIsType_None) {
3366  sev = GetBioseqEndWarning(seq, is_circular, begin_n);
3367  PostErr(sev, eErr_SEQ_INST_TerminalNs, "N at beginning of sequence", seq);
3368  } else if (begin_gap != eBioseqEndIsType_None && fst != CSeq_gap::eType_contamination) {
3369  sev = GetBioseqEndWarning(seq, is_circular, begin_gap);
3370  PostErr (sev, eErr_SEQ_INST_TerminalGap, "Gap at beginning of sequence", seq);
3371  }
3372 
3373  if (end_n != eBioseqEndIsType_None) {
3374  sev = GetBioseqEndWarning(seq, is_circular, end_n);
3375  PostErr(sev, eErr_SEQ_INST_TerminalNs, "N at end of sequence", seq);
3376  } else if (end_gap != eBioseqEndIsType_None && lst != CSeq_gap::eType_contamination) {
3377  sev = GetBioseqEndWarning(seq, is_circular, end_gap);
3378  PostErr (sev, eErr_SEQ_INST_TerminalGap, "Gap at end of sequence", seq);
3379  }
3380 
3381  if (begin_ambig && !s_WillReportTerminalGap(seq, bsh)) {
3383  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases",
3384  seq);
3385  }
3386  if (end_ambig && !s_WillReportTerminalGap(seq, bsh)) {
3388  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases",
3389  seq);
3390  }
3391 
3392  // don't check N content for patent sequences
3393  if (SeqIsPatent(seq)) {
3394  return;
3395  }
3396 
3397  // if TSA, check for percentage of Ns and max stretch of Ns
3398  if (IsBioseqTSA(seq, m_Scope)) {
3399  ReportBadAssemblyGap (seq);
3400  if (!HasAssemblyOrNullGap(seq)) {
3401  bool n5 = false;
3402  bool n3 = false;
3403  TSeqPos num_ns = 0, max_stretch = 0;
3404  x_CalculateNsStretchAndTotal(vec, num_ns, max_stretch, n5, n3);
3405 
3406  int pct_n = (num_ns * 100) / seq.GetLength();
3407  if (pct_n > 10) {
3409  "Sequence contains " + NStr::IntToString(pct_n) + " percent Ns", seq);
3410  }
3411 
3412  if (max_stretch >= 15) {
3414  "Sequence has a stretch of " + NStr::IntToString(max_stretch) + " Ns", seq);
3415  } else {
3416  if (n5) {
3418  "Sequence has a stretch of at least 10 Ns within the first 20 bases", seq);
3419  }
3420  if (n3) {
3422  "Sequence has a stretch of at least 10 Ns within the last 20 bases", seq);
3423  }
3424  }
3425  }
3426  } else {
3427  // not TSA, just check for really high N percent
3428  int pct_n = PctNs(bsh);
3429  if (pct_n > 50) {
3431  "Sequence contains " + NStr::IntToString(pct_n) + " percent Ns", seq);
3432  }
3433  }
3434 
3435  if (!m_Imp.IsRefSeqConventions() && !IsRefSeq(seq) && !IsEmblOrDdbj(seq)) {
3436  if (IsWGS(bsh)) {
3437  ReportBadWGSGap(seq);
3438  } else if (IsBioseqTSA(seq, m_Scope)) {
3439  ReportBadTSAGap(seq);
3440  } else if (m_Imp.IsGenomeSubmission()) {
3441  ReportBadGenomeGap(seq);
3442  }
3443  }
3444  } catch ( exception& ) {
3445  // just ignore, and continue with the validation process.
3446  }
3447 }
3448 
3450 {
3451  // rough measure of where exception occurs - triggered by certain conditions set up in unit_test_validator
3452  int errPt = 0;
3453 
3454  try {
3455 
3456  errPt++;
3457 
3458  if (!seq.IsSetInst() || !seq.GetInst().IsSetRepr()) {
3459  // can't check if no Inst or Repr
3460  return;
3461  }
3462  if (!seq.GetInst().IsSetMol() || seq.GetInst().GetMol() == CSeq_inst::eMol_aa) {
3463  // don't check proteins here
3464  return;
3465  }
3466  CSeq_inst::TRepr repr = seq.GetInst().GetRepr();
3467 
3468  // only check for raw or for delta sequences that are delta lit only
3469  if (repr == CSeq_inst::eRepr_virtual || repr == CSeq_inst::eRepr_map) {
3470  return;
3471  }
3472 
3474  if ( !bsh ) {
3475  // no check if Bioseq not in scope
3476  return;
3477  }
3478 
3479  errPt++;
3480 
3481  vector<TSeqPos> gapPositions;
3482 
3483  SSeqMapSelector sel;
3484 
3486 
3487  for (CSeqMap_CI gap_it(bsh, sel); gap_it; ++gap_it) {
3488 
3489  TSeqPos gp_start = gap_it.GetPosition();
3490  TSeqPos gp_end = gap_it.GetEndPosition() - 1;
3491 
3492  gapPositions.push_back(gp_start);
3493  gapPositions.push_back(gp_end);
3494 
3495  // cout << "gap start: " << gp_start << ", end: " << gp_end << endl;
3496  }
3497 
3498  errPt++;
3499 
3500  vector<TSeqPos> featPositions;
3501 
3502  for (CFeat_CI feat_it(bsh); feat_it; ++feat_it) {
3503 
3504  CSeq_feat_Handle feat = feat_it->GetSeq_feat_Handle();
3505  CSeqFeatData::ESubtype subtype = feat.GetFeatSubtype();
3506  if (subtype != CSeqFeatData::eSubtype_gap) continue;
3507 
3508  CConstRef<CSeq_loc> feat_loc(&feat_it->GetLocation());
3509 
3510  int ft_start = feat_loc->GetStart(eExtreme_Positional);
3511  int ft_end = feat_loc->GetStop(eExtreme_Positional);
3512 
3513  featPositions.push_back(ft_start);
3514  featPositions.push_back(ft_end);
3515 
3516  // cout << "feat start: " << ft_start << ", end: " << ft_end << endl;
3517  }
3518 
3519  errPt++;
3520 
3521  int remaininig_gaps = (int) gapPositions.size() / 2;
3522  int remaining_feats = (int) featPositions.size() / 2;
3523 
3524  if (remaininig_gaps < 1 || remaining_feats < 1) {
3525  return;
3526  }
3527 
3528  int gap_idx = 0;
3529  int feat_idx = 0;
3530 
3531  TSeqPos gap_start = gapPositions[gap_idx];
3532  gap_idx++;
3533  TSeqPos gap_end = gapPositions[gap_idx];
3534  gap_idx++;
3535  remaininig_gaps--;
3536 
3537  TSeqPos feat_start = featPositions[feat_idx];
3538  feat_idx++;
3539  TSeqPos feat_end = featPositions[feat_idx];
3540  feat_idx++;
3541  remaining_feats--;
3542 
3543  errPt++;
3544 
3545  while (remaininig_gaps >= 0 && remaining_feats >= 0) {
3546  if (gap_end < feat_start) {
3547  if (remaininig_gaps <= 0) {
3548  return;
3549  }
3550  gap_start = gapPositions[gap_idx];
3551  gap_idx++;
3552  gap_end = gapPositions[gap_idx];
3553  gap_idx++;
3554  remaininig_gaps--;
3555  } else if (feat_end < gap_start) {
3556  if (remaining_feats <= 0) {
3557  return;
3558  }
3559  feat_start = featPositions[feat_idx];
3560  feat_idx++;
3561  feat_end = featPositions[feat_idx];
3562  feat_idx++;
3563  remaining_feats--;
3564  } else {
3565  // cout << "overlap gap start: " << gap_start << ", end: " << gap_end << ", feat start: " << feat_start << ", end: " << feat_end << endl;
3566  if (feat_start != gap_start || feat_end != gap_end) {
3567  PostErr(eDiag_Warning, eErr_SEQ_INST_InstantiatedGapMismatch, "Gap feature location does not match delta gap coordinates", seq);
3568  }
3569  if (remaininig_gaps <= 0) {
3570  return;
3571  }
3572  gap_start = gapPositions[gap_idx];
3573  gap_idx++;
3574  gap_end = gapPositions[gap_idx];
3575  gap_idx++;
3576  remaininig_gaps--;
3577  if (remaining_feats <= 0) {
3578  return;
3579  }
3580  feat_start = featPositions[feat_idx];
3581  feat_idx++;
3582  feat_end = featPositions[feat_idx];
3583  feat_idx++;
3584  remaining_feats--;
3585  }
3586  }
3587 
3588  errPt++;
3589 
3590  } catch ( const exception& ) {
3592  string("Exception " + NStr::IntToString(errPt) + " in GapByGapInst"), seq);
3593  }
3594 }
3595 
// Validates the instance data of a Bioseq that carries its own sequence data.
// Visible checks, in order: fuzz on the length, missing or zero length, HTGS 2
// raw records without graphs, alphabet versus molecule type, declared length
// versus stored data length, per-residue validity (with leading/trailing X,
// lower-case, gap-symbol and termination-symbol reporting), and runs of N or
// gap characters in raw nucleotide sequence.
// NOTE(review): this listing omits some source lines (the first line of the
// signature, the declarations of bsh, mi_desc, sv and sv_res, and the opening
// line of most error-posting calls), so the error codes and severities of those
// reports cannot be read from here.
3596 // Assumes that seq is eRepr_raw or eRepr_inst
3598  const CBioseq& seq)
3599 {
3600  const CSeq_inst& inst = seq.GetInst();
3601  const CEnumeratedTypeValues* tv = CSeq_inst::GetTypeInfo_enum_ERepr();
3602  const string& rpr = tv->FindName(inst.GetRepr(), true);
3603 
// Fuzz on the length is only left unreported when the sequence data is a gap.
3604  if (inst.IsSetFuzz() && (!inst.IsSetSeq_data() || !inst.GetSeq_data().IsGap())) {
3606  "Fuzzy length on " + rpr + " Bioseq", seq);
3607  }
3608 
// An unset length is reported the same way as a zero length, printed as [0].
3609  if (!inst.IsSetLength() || inst.GetLength() == 0) {
3610  string len = inst.IsSetLength() ?
3611  NStr::IntToString(inst.GetLength()) : "0";
3613  "Invalid Bioseq length [" + len + "]", seq);
3614  }
3615 
// Raw HTGS 2 records: reported when there are no graphs on the Bioseq and
// x_IsActiveFin(seq) is false. Tech defaults to eTech_unknown without MolInfo.
3616  if (inst.GetRepr() == CSeq_inst::eRepr_raw) {
3617  const CMolInfo* mi = nullptr;
3619  if ( mi_desc ) {
3620  mi = &(mi_desc->GetMolinfo());
3621  }
3622  CMolInfo::TTech tech = mi ? mi->GetTech() : CMolInfo::eTech_unknown;
3623  if (tech == CMolInfo::eTech_htgs_2 &&
3624  !GraphsOnBioseq(seq) &&
3625  !x_IsActiveFin(seq)) {
3627  "HTGS 2 raw seq has no gaps and no graphs", seq);
3628  }
3629  }
3630 
3632 
// NOTE(review): the second half of this conditional expression is not present
// in this listing.
3633  CSeq_data::E_Choice seqtyp = inst.IsSetSeq_data() ?
3635  if (seqtyp != CSeq_data::e_Gap) {
// First switch: stop with a report when the alphabet contradicts the molecule
// type, or when no recognised alphabet is set.
3636  switch (seqtyp) {
3637  case CSeq_data::e_Iupacna:
3638  case CSeq_data::e_Ncbi2na:
3639  case CSeq_data::e_Ncbi4na:
3640  case CSeq_data::e_Ncbi8na:
3641  case CSeq_data::e_Ncbipna:
3642  if (inst.IsAa()) {
3644  "Using a nucleic acid alphabet on a protein sequence",
3645  seq);
3646  return;
3647  }
3648  break;
3649  case CSeq_data::e_Iupacaa:
3650  case CSeq_data::e_Ncbi8aa:
3651  case CSeq_data::e_Ncbieaa:
3652  case CSeq_data::e_Ncbipaa:
3654  if (inst.IsNa()) {
3656  "Using a protein alphabet on a nucleic acid",
3657  seq);
3658  return;
3659  }
3660  break;
3661  case CSeq_data::e_Gap:
3662  break;
3663  default:
3665  "Sequence alphabet not set",
3666  seq);
3667  return;
3668  }
3669 
// Second switch: choose whether residues are inspected one by one
// (check_alphabet) and the divisor (factor) applied to the declared length
// before it is compared with GetDataLen(inst).
3670  bool check_alphabet = false;
3671  unsigned int factor = 1;
3672  switch (seqtyp) {
3673  case CSeq_data::e_Iupacaa:
3674  case CSeq_data::e_Iupacna:
3675  case CSeq_data::e_Ncbieaa:
3677  check_alphabet = true;
3678  break;
3679  case CSeq_data::e_Ncbi8na:
3680  case CSeq_data::e_Ncbi8aa:
3681  break;
3682  case CSeq_data::e_Ncbi4na:
3683  factor = 2;
3684  break;
3685  case CSeq_data::e_Ncbi2na:
3686  factor = 4;
3687  break;
3688  case CSeq_data::e_Ncbipna:
3689  factor = 5;
3690  break;
3691  case CSeq_data::e_Ncbipaa:
3692  factor = 21;
3693  break;
3694  default:
3695  // Logically, should not occur
3697  "Sequence alphabet not set",
3698  seq);
3699  return;
3700  }
// Ceiling division of the declared length by factor.
3701  TSeqPos calc_len = inst.IsSetLength() ? inst.GetLength() : 0;
3702  if (calc_len % factor) {
3703  calc_len += factor;
3704  }
3705  calc_len /= factor;
3706 
3707  string s_len = NStr::UIntToString(inst.GetLength());
3708 
// Too little data ends validation here; too much data is reported and the
// remaining checks still run. The printed data length is data_len * factor.
3709  size_t data_len = GetDataLen(inst);
3710  string data_len_str = NStr::NumericToString(data_len * factor);
3711  if (calc_len > data_len) {
3713  "Bioseq.seq_data too short [" + data_len_str +
3714  "] for given length [" + s_len + "]", seq);
3715  return;
3716  } else if (calc_len < data_len) {
3718  "Bioseq.seq_data is larger [" + data_len_str +
3719  "] than given length [" + s_len + "]", seq);
3720  }
3721 
3722  if (check_alphabet) {
3723  unsigned int trailingX = 0;
3724  size_t dashes = 0;
3725  bool leading_x = false, found_lower = false, cds_5_prime = false;
3726 
3729 
// Walk two sequence vectors in lock step; pos is the 1-based position of the
// current residue (IsInGap is queried with pos - 1).
// NOTE(review): sv and sv_res are created on lines missing from this listing,
// so how the two vectors differ is not visible here.
3730  size_t bad_cnt = 0;
3731  TSeqPos pos = 1;
3732  for ( CSeqVector_CI sv_iter(*sv), sv_res_iter(sv_res); (sv_iter) && (sv_res_iter); ++sv_iter, ++sv_res_iter ) {
3733  CSeqVector::TResidue res = *sv_iter;
3734  CSeqVector::TResidue n_res = *sv_res_iter;
3735  if ( !IsResidue(n_res) ) {
3736  if (res == 'U' && bsh.IsSetInst_Mol() && bsh.GetInst_Mol() == CSeq_inst::eMol_rna) {
3737  // U is ok for RNA
3738  }
3739  else if (res == '*' && bsh.IsAa()) {
3740  trailingX = 0;
3741  } else if (res == '-' && bsh.IsAa()) {
3742  dashes++;
3744  "Invalid residue [" + NStr::UIntToString(res)
3745  + "] at position [" + NStr::UIntToString(pos) + "]",
3746  seq);
3747  } else {
// Checking stops (function returns) once more than 10 values fail IsResidue(res).
3748  if ( ! IsResidue(res)) {
3749  if ( ++bad_cnt > 10 ) {
3751  "More than 10 invalid residues. Checking stopped",
3752  seq);
3753  return;
3754  } else {
3756  "Invalid residue [" + NStr::UIntToString(res)
3757  + "] at position [" + NStr::UIntToString(pos) + "]",
3758  seq);
3759  }
3760  } else if (islower (res)) {
3761  found_lower = true;
3762  } else {
// Build the message; the residue is printed as a number for Ncbistdaa and as
// a quoted character otherwise.
3763  string msg = "Invalid";
3764  if (seq.IsNa() && strchr ("EFIJLOPQXZ", res) != NULL) {
3765  msg += " nucleotide";
3766  } else if (seq.IsNa() && res == 'U') {
3767  msg += " nucleotide";
3768  }
3769  msg += " residue ";
3770  if (seqtyp == CSeq_data::e_Ncbistdaa) {
3771  msg += "[" + NStr::UIntToString(res) + "]";
3772  } else {
3773  msg += "'";
3774  msg += res;
3775  msg += "'";
3776  }
3777  msg += " at position [" + NStr::UIntToString(pos) + "]";
3778 
3780  msg, seq);
3781  }
3782  }
// Valid residue: count gap symbols, and track the run of X at the end of the
// sequence ('*' and any other letter reset the run).
3783  } else if ( res == '-' || sv->IsInGap(pos - 1) ) {
3784  dashes++;
3785  } else if ( res == '*') {
3786  trailingX = 0;
3787  } else if ( res == 'X' ) {
3788  trailingX++;
3789  if (pos == 1) {
3790  leading_x = true;
3791  }
3792  } else if (!isalpha (res)) {
3793  string msg = "Invalid residue [";
3794  msg += res;
3795  msg += "] in position [" + NStr::UIntToString(pos) + "]";
3797  msg, seq);
3798  } else {
3799  trailingX = 0;
3800  }
3801  ++pos;
3802  }
3803 
3804  bool gap_at_start = HasBadProteinStart(*sv);
3805  size_t terminations = CountProteinStops(*sv);
3806 
3807  // only show leading or trailing X if product of NNN in nucleotide
// When a CDS is found for the protein and its location is longer than 5 bases,
// the flags are cleared unless the frame-adjusted CDS sequence starts with NNN
// (leading X) or its last three bases are NNN (trailing X).
3808  if (seq.IsAa() && (leading_x || trailingX > 0)) {
3809  CBioseq_Handle bsh = m_Scope->GetBioseqHandle (seq);
3810  const CSeq_feat* cds = GetCDSForProduct(bsh);
3811  if (cds && cds->IsSetLocation()) {
3812  const CSeq_loc& cdsloc = cds->GetLocation();
3813  size_t dna_len = GetLength (cdsloc, m_Scope);
3814  if (dna_len > 5) {
3815  string cds_seq = GetSequenceStringFromLoc (cdsloc, *m_Scope);
3816  if (cds->GetData().GetCdregion().IsSetFrame()) {
3817  if (cds->GetData().GetCdregion().GetFrame() == 2) {
3818  cds_seq = cds_seq.substr(1);
3819  } else if (cds->GetData().GetCdregion().GetFrame() == 3) {
3820  cds_seq = cds_seq.substr(2);
3821  }
3822  }
3823 
3824  if (!NStr::StartsWith (cds_seq, "NNN")) {
3825  leading_x = false;
3826  }
3827  if (cds_seq.length() >= 3) {
3828  string lastcodon = cds_seq.substr(cds_seq.length() - 3);
3829  if (!NStr::StartsWith(lastcodon, "NNN")) {
3830  trailingX = 0;
3831  }
3832  }
3833  }
3834  // only need to calculate cds_5_prime to set severity for subsequent eErr_SEQ_INST_LeadingX message
3835  if (leading_x) {
3836  if (cdsloc.IsPartialStart(eExtreme_Biological)) {
3837  cds_5_prime = true;
3838  }
3839  }
3840  }
3841  }
3842 
// Severity drops from Warning to Info when the CDS is partial at its
// biological start.
3843  if (leading_x) {
3844  EDiagSev sev = eDiag_Warning;
3845  if (cds_5_prime) {
3846  sev = eDiag_Info;
3847  }
3849  "Sequence starts with leading X", seq);
3850  }
3851 
3852  if ( trailingX > 0 && !SuppressTrailingXMsg(seq) ) {
3853  // Suppress if cds ends in "*" or 3' partial
3854  string msg = "Sequence ends in " +
3855  NStr::IntToString(trailingX) + " trailing X";
3856  if ( trailingX > 1 ) {
3857  msg += "s";
3858  }
3860  }
3861 
3862  if (found_lower) {
3864  "Sequence contains lower-case characters", seq);
3865  }
3866 
3867  if (terminations > 0 || dashes > 0) {
3868  // Post error indicating terminations found in protein sequence
3869  // if possible, get gene and protein names
3870  CBioseq_Handle bsh = m_Scope->GetBioseqHandle (seq);
3871  // First get gene label
// Label lookups are best effort: exceptions are swallowed and the placeholders
// "gene?" / "prot?" are used when a label stays blank.
3872  string gene_label;
3873  try {
3874  const CSeq_feat* cds = GetCDSForProduct(bsh);
3875  if (cds) {
3877  if (gene && gene->IsSetData() && gene->GetData().IsGene()) {
3878  gene->GetData().GetGene().GetLabel(&gene_label);
3879  }
3880  }
3881  } catch (...) {
3882  }
3883  // get protein label
3884  string protein_label;
3885  try {
3886  CCacheImpl::SFeatKey prot_key(
3888  const CCacheImpl::TFeatValue & prots =
3889  GetCache().GetFeatFromCache(prot_key);
3890  if( ! prots.empty() ) {
3891  const CSeqFeatData_Base::TProt & first_prot =
3892  prots[0].GetData().GetProt();
3893  if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(first_prot, Name) ) {
3894  protein_label = first_prot.GetName().front();
3895  }
3896  }
3897  } catch (const CException& ) {
3898  } catch (const std::exception& ) {
3899  }
3900 
3901  if (NStr::IsBlank(gene_label)) {
3902  gene_label = "gene?";
3903  }
3904  if (NStr::IsBlank(protein_label)) {
3905  protein_label = "prot?";
3906  }
3907 
// A gap at the start is reported separately; when it is present the internal
// gap count is dashes - 1, otherwise it is dashes.
3908  if (dashes > 0) {
3909  if (gap_at_start && dashes == 1) {
3911  "gap symbol at start of protein sequence (" + gene_label + " - " + protein_label + ")",
3912  seq);
3913  } else if (gap_at_start) {
3915  "gap symbol at start of protein sequence (" + gene_label + " - " + protein_label + ")",
3916  seq);
3918  "[" + NStr::SizetToString (dashes - 1) + "] internal gap symbols in protein sequence (" + gene_label + " - " + protein_label + ")",
3919  seq);
3920  } else {
3922  "[" + NStr::SizetToString (dashes) + "] internal gap symbols in protein sequence (" + gene_label + " - " + protein_label + ")",
3923  seq);
3924  }
3925  }
3926 
// NOTE(review): both branches below are empty in this listing; presumably the
// message is posted differently depending on whether a CDS exists — confirm
// against the full source.
3927  if (terminations > 0) {
3928  string msg = "[" + NStr::SizetToString(terminations) + "] termination symbols in protein sequence";
3929  msg += " (" + gene_label + " - " + protein_label + ")";
3930  const CSeq_feat* cds = GetCDSForProduct(bsh);
3931  if (cds) {
3933  } else {
3935  }
3936  }
3937  }
3938  }
3939 
3940  bool is_wgs = IsWGS(bsh);
3941 
3942  if (seq.IsNa() && seq.GetInst().GetRepr() == CSeq_inst::eRepr_raw) {
3943  // look for runs of Ns and gap characters
// A run is evaluated only when a non-N residue ends it, and only reported when
// it is at least run_len_cutoff long (20 for WGS, 100 otherwise) and starts
// after position 1. No check follows the loop, so a run that extends to the
// end of the sequence is not reported here.
3944  bool has_gap_char = false;
3945  size_t run_len = 0;
3946  TSeqPos start_pos = 0;
3947  TSeqPos pos = 1;
3949  const size_t run_len_cutoff = ( is_wgs ? 20 : 100 );
3950  for ( CSeqVector_CI sv_iter(sv); (sv_iter); ++sv_iter, ++pos ) {
3951  CSeqVector::TResidue res = *sv_iter;
3952  switch(res) {
3953  case 'N':
3954  if (run_len == 0) {
3955  start_pos = pos;
3956  }
3957  run_len++;
3958  break;
3959  case '-':
3960  has_gap_char = true;
3961  ///////////////////////////////////
3962  ////////// FALL-THROUGH! //////////
3963  ///////////////////////////////////
3964  default:
3965  if (run_len >= run_len_cutoff && start_pos > 1)
3966  {
3968  "Run of " + NStr::SizetToString (run_len) + " Ns in raw sequence starting at base "
3969  + NStr::IntToString (start_pos),
3970  seq);
3971  }
3972  run_len = 0;
3973  break;
3974  }
3975  }
3976  if (has_gap_char) {
3978  "Raw nucleotide should not contain gap characters", seq);
3979  }
3980  }
3981  }
3982 }
3983 
3984 
3985 //LCOV_EXCL_START
3986 //part of segset validation, no longer used
3987 // Assumes seq is eRepr_seg or eRepr_ref
// Visible checks: the location built from the Bioseq (GetLocFromSeq), declared
// length versus the length of that location, repeated references to the same
// Bioseq among the segments, and (proteins only) agreement between MolInfo
// completeness and the partialness of the location.
// NOTE(review): the signature, the bsh declaration, the case labels of the
// completeness switch and the opening line of most error-posting calls are
// missing from this listing.
3989 {
3990  string id_test_label;
3991  seq.GetLabel(&id_test_label, CBioseq::eContent);
3992 
3994  const CSeq_inst& inst = seq.GetInst();
3995 
3996  // Validate extension data -- wrap in CSeq_loc_mix for convenience
3997  CRef<CSeq_loc> loc = GetLocFromSeq(seq);
3998  if (loc) {
3999  if (inst.IsSetRepr() && inst.GetRepr() == CSeq_inst::eRepr_seg) {
4000  m_Imp.ValidateSeqLoc(*loc, bsh, true, "Segmented Bioseq", seq);
4001  }
4002 
4003  // Validate Length
// An unset Bioseq length is compared as 0. A CObjmgrUtilException from
// GetLength is logged through ERR_POST_X and not rethrown.
4004  try {
4005  TSeqPos loclen = GetLength(*loc, m_Scope);
4006  TSeqPos seqlen = inst.IsSetLength() ? inst.GetLength() : 0;
4007  if (seqlen > loclen) {
4009  "Bioseq.seq_data too short [" + NStr::IntToString(loclen) +
4010  "] for given length [" + NStr::IntToString(seqlen) + "]",
4011  seq);
4012  } else if (seqlen < loclen) {
4014  "Bioseq.seq_data is larger [" + NStr::IntToString(loclen) +
4015  "] than given length [" + NStr::IntToString(seqlen) + "]",
4016  seq);
4017  }
4018  } catch (const CObjmgrUtilException&) {
4019  ERR_POST_X(6, Critical << "Unable to calculate length: ");
4020  }
4021  }
4022 
4023  // Check for multiple references to the same Bioseq
// Every pair of single-Bioseq segments is compared (i2 starts after i1); the
// message differs depending on whether both locations are whole.
4024  if (inst.IsSetExt() && inst.GetExt().IsSeg()) {
4025  const list< CRef<CSeq_loc> >& locs = inst.GetExt().GetSeg().Get();
4026  ITERATE(list< CRef<CSeq_loc> >, i1, locs) {
4027  if (!IsOneBioseq(**i1, m_Scope)) {
4028  continue;
4029  }
4030  const CSeq_id& id1 = GetId(**i1, m_Scope);
4031  list< CRef<CSeq_loc> >::const_iterator i2 = i1;
4032  for (++i2; i2 != locs.end(); ++i2) {
4033  if (!IsOneBioseq(**i2, m_Scope)) {
4034  continue;
4035  }
4036  const CSeq_id& id2 = GetId(**i2, m_Scope);
4037  if (IsSameBioseq(id1, id2, m_Scope)) {
4038  string sid;
4039  id1.GetLabel(&sid);
4040  if ((**i1).IsWhole() && (**i2).IsWhole()) {
4043  "Segmented sequence has multiple references to " +
4044  sid, seq);
4045  } else {
4048  "Segmented sequence has multiple references to " +
4049  sid + " that are not SEQLOC_WHOLE", seq);
4050  }
4051  }
4052  }
4053  }
4054  }
4055 
4056  // Check that partial sequence info on sequence segments is consistent with
4057  // partial sequence info on sequence -- aa sequences only
// NOTE(review): loc is dereferenced here although the block above treats a
// null loc as possible — confirm GetLocFromSeq cannot return null here.
4058  int partial = SeqLocPartialCheck(*loc, m_Scope);
4059  if (seq.IsAa()) {
4060  bool got_partial = false;
4061  FOR_EACH_DESCRIPTOR_ON_BIOSEQ (sd, seq) {
4062  if (!(*sd)->IsMolinfo() || !(*sd)->GetMolinfo().IsSetCompleteness()) {
4063  continue;
4064  }
4065 
// NOTE(review): the case labels are missing from this listing; each visible
// branch tests the eSeqlocPartial_Start / eSeqlocPartial_Stop bits of
// `partial` and sets got_partial.
4066  switch ((*sd)->GetMolinfo().GetCompleteness()) {
4068  got_partial = true;
4069  if (!partial) {
4071  "Complete segmented sequence with MolInfo partial", seq);
4072  }
4073  break;
4075  if (!(partial & eSeqlocPartial_Start) || (partial & eSeqlocPartial_Stop)) {
4077  "No-left inconsistent with segmented SeqLoc",
4078  seq);
4079  }
4080  got_partial = true;
4081  break;
4083  if (!(partial & eSeqlocPartial_Stop) || (partial & eSeqlocPartial_Start)) {
4085  "No-right inconsistent with segmented SeqLoc",
4086  seq);
4087  }
4088  got_partial = true;
4089  break;
4091  if (!(partial & eSeqlocPartial_Start) || !(partial & eSeqlocPartial_Stop)) {
4093  "No-ends inconsistent with segmented SeqLoc",
4094  seq);
4095  }
4096  got_partial = true;
4097  break;
4098  default:
4099  break;
4100  }
4101  }
4102  if (!got_partial) {
4104  "Partial segmented sequence without MolInfo partial", seq);
4105  }
4106  }
4107 }
4108 //LCOV_EXCL_STOP
4109 
4110 
// Returns the threshold for a given tech value: 19 for eTech_wgs, 99 by
// default, and 80 for the case labels that are missing from this listing.
// The visible caller reports a run of Ns in a Seq-literal when the run is
// longer than the returned value.
// NOTE(review): the signature line is not present in this listing.
4112 {
4113  int max_ns = -1;
4114 
4115  switch (tech) {
4119  max_ns = 80;
4120  break;
4121  case CMolInfo::eTech_wgs:
4122  max_ns = 19;
4123  break;
4124  default:
4125  max_ns = 99;
4126  break;
4127  }
4128  return max_ns;
4129 }
4130 
4131 
4132 static bool s_IsSwissProt (const CBioseq& seq)
4133 {
4134  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
4135  if ((*it)->IsSwissprot()) {
4136  return true;
4137  }
4138  }
4139  return false;
4140 }
4141 
// Strict "less than" ordering for two locations q1 and q2: first by Seq-id
// (CompareOrdered), then by positional start, then by positional stop.
// Returns false when all three are equal.
// NOTE(review): the signature line is not present in this listing.
4143 {
4144  TIntId cmp = q1->GetId()->CompareOrdered(*(q2->GetId()));
4145  if (cmp < 0) {
4146  return true;
4147  } else if (cmp > 0) {
4148  return false;
4149  }
4150 
// Same id: order by start position.
4151  TSeqPos start1 = q1->GetStart(eExtreme_Positional);
4152  TSeqPos start2 = q2->GetStart(eExtreme_Positional);
4153  if (start1 < start2) {
4154  return true;
4155  } else if (start2 < start1) {
4156  return false;
4157  }
4158 
// Same id and start: order by stop position.
4159  TSeqPos stop1 = q1->GetStop(eExtreme_Positional);
4160  TSeqPos stop2 = q2->GetStop(eExtreme_Positional);
4161 
4162  if (stop1 < stop2) {
4163  return true;
4164  } else {
4165  return false;
4166  }
4167 }
4168 
4169 
// Returns true when any Seq-loc component of the Bioseq's delta extension
// refers to one of the Bioseq's own Seq-ids (CSeq_id::Compare == e_YES).
// Returns false when the Bioseq has no Seq-inst, no extension, or a non-delta
// extension.
// NOTE(review): the signature line is not present in this listing.
4171 {
4172  bool rval = false;
4173 
4174  if (!seq.IsSetInst() || !seq.GetInst().IsSetExt() ||
4175  !seq.GetInst().GetExt().IsDelta()) {
4176  return false;
4177  }
4178 
4179  ITERATE(CDelta_ext::Tdata, sg, seq.GetInst().GetExt().GetDelta().Get()) {
4180  if (!(*sg)) {
4181  // skip NULL element
4182  } else if ((*sg)->IsLoc()) {
// Components whose location yields no single id (GetId() returns null) are
// not compared.
4183  const CSeq_id *id = (*sg)->GetLoc().GetId();
4184  if (id) {
4185  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
4186  if ((*id_it)->Compare(*id) == CSeq_id::e_YES) {
4187  rval = true;
4188  break;
4189  }
4190  }
4191  }
// Stop scanning components after the first match.
4192  if (rval) break;
4193  }
4194  }
4195  return rval;
4196 }
4197 
4198 
// For an interval location on a far sequence, returns true when a feature is
// found (CFeat_CI) in the two positions just before the interval or in the two
// positions just after it. Returns false for any non-interval location.
// NOTE(review): the signature line is not present in this listing; the visible
// caller passes the component location and the far Bioseq handle.
4200 {
4201  if (!loc.IsInt()) {
4202  return false;
4203  }
4204 
4205  TSeqPos stop = loc.GetStop(eExtreme_Positional);
4206  TSeqPos start = loc.GetStart(eExtreme_Positional);
4207 
// Probe [start - 2, start - 1]; only done when start > 1 so both positions exist.
4208  if (start > 1) {
4209  CRef<CSeq_loc> far_loc(new CSeq_loc());
4210  far_loc->SetInt().SetFrom(start - 2);
4211  far_loc->SetInt().SetTo(start - 1);
4212  far_loc->SetInt().SetId().Assign(loc.GetInt().GetId());
4213  CFeat_CI f(far_bsh.GetScope(), *far_loc);
4214  if (f) {
4215  return true;
4216  }
4217  }
// Probe [stop + 1, stop + 2].
// NOTE(review): if GetBioseqLength() returns an unsigned value smaller than 2,
// the subtraction below would wrap — confirm this cannot happen for far
// components.
4218  if (stop < far_bsh.GetBioseqLength() - 2) {
4219  CRef<CSeq_loc> far_loc(new CSeq_loc());
4220  far_loc->SetInt().SetFrom(stop + 1);
4221  far_loc->SetInt().SetTo(stop + 2);
4222  far_loc->SetInt().SetId().Assign(loc.GetInt().GetId());
4223  CFeat_CI f(far_bsh.GetScope(), *far_loc);
4224  if (f) {
4225  return true;
4226  }
4227  }
4228  return false;
4229 }
4230 
4231 
// Validates one Seq-loc component of a delta sequence and adds its length to
// `len` (in/out) when that length is known.
//   loc - the component location
//   seq - the delta Bioseq being validated (used as the error context)
//   len - running total of component lengths, updated here
// NOTE(review): the first line of the signature, the bsh declaration and the
// opening line of the error-posting calls are missing from this listing.
4233 (const CSeq_loc& loc,
4234  const CBioseq& seq,
4235  TSeqPos& len)
4236 {
4237  if (loc.IsWhole()) {
4239  "Delta seq component should not be of type whole", seq);
4240  }
4241 
4242  const CSeq_id *id = loc.GetId();
4243  if (id) {
4244  if (id->IsGi() && loc.GetId()->GetGi() == ZERO_GI) {
4246  "Delta component is gi|0", seq);
4247  }
// For non-whole locations on the listed id types, check the far sequence:
// the location must not extend past its end. Exceptions raised while doing
// so are swallowed.
4248  if (!loc.IsWhole()
4249  && (id->IsGi()
4250  || id->IsGenbank()
4251  || id->IsEmbl()
4252  || id->IsDdbj() || id->IsTpg()
4253  || id->IsTpe()
4254  || id->IsTpd()
4255  || id->IsOther())) {
4256  TSeqPos stop = loc.GetStop(eExtreme_Positional);
4257  try {
4259  if (bsh) {
4260  TSeqPos seq_len = bsh.GetBioseqLength();
4261  if (seq_len <= stop) {
4262  string id_label = id->AsFastaString();
4264  "Seq-loc extent (" + NStr::IntToString (stop + 1)
4265  + ") greater than length of " + id_label
4266  + " (" + NStr::IntToString(seq_len) + ")",
4267  seq);
4268  }
// Non-RefSeq WGS only: report when features lie just outside the referenced range.
4269  if (!m_Imp.IsRefSeq() && IsWGS(seq) && HasExcludedAnnotation(loc, bsh)) {
4270  string id_label = id->AsFastaString();
4272  "Scaffold points to some but not all of " +
4273  id_label + ", excluded portion contains features", seq);
4274  }
4275  } else {
4277  "Unable to find far delta sequence component", seq);
4278  }
4279  } catch (const CException& ) {
4280  } catch (const std::exception& ) {
4281  }
4282  }
4283  }
4284 
4285  try {
4286  if (seq.IsSetInst ()) {
4287  const CSeq_inst& inst = seq.GetInst();
4288  TSeqPos loc_len = GetLength(loc, m_Scope);
// The maximum TSeqPos value is treated as "-1 length"; such a component is
// reported and not added to len.
4289  if (loc_len == numeric_limits<TSeqPos>::max()) {
4291  "-1 length on seq-loc of delta seq_ext", seq);
4292  string loc_str;
4293  loc.GetLabel(&loc_str);
4294  if ( loc_str.empty() ) {
4295  loc_str = "?";
4296  }
4297  if (x_IsDeltaLitOnly(inst)) {
4299  "Short length (-1) on seq-loc (" + loc_str + ") of delta seq_ext", seq);
4300  }
4301  } else {
4302  len += loc_len;
4303  }
// Lengths of 10 or less are reported only when x_IsDeltaLitOnly(inst) is true.
4304  if ( loc_len <= 10 ) {
4305  string loc_str;
4306  loc.GetLabel(&loc_str);
4307  if ( loc_str.empty() ) {
4308  loc_str = "?";
4309  }
4310  if (x_IsDeltaLitOnly(inst)) {
4312  "Short length (" + NStr::SizetToString(loc_len) +
4313  ") on seq-loc (" + loc_str + ") of delta seq_ext", seq);
4314  }
4315  }
4316  }
4317 
// A CObjmgrUtilException from the length calculation is turned into a report.
4318  } catch (const CObjmgrUtilException&) {
4319  string loc_str;
4320  loc.GetLabel(&loc_str);
4321  if ( loc_str.empty() ) {
4322  loc_str = "?";
4323  }
4325  "No length for Seq-loc (" + loc_str + ") of delta seq-ext",
4326  seq);
4327  }
4328 }
4329 
4330 
4331 static size_t s_GetDeltaLen (const CDelta_seq& seg, CScope* scope)
4332 {
4333  if (seg.IsLiteral()) {
4334  return seg.GetLiteral().GetLength();
4335  } else if (seg.IsLoc()) {
4336  return GetLength (seg.GetLoc(), scope);
4337  } else {
4338  return 0;
4339  }
4340 }
4341 
4342 
// Display names for linkage-evidence categories; 13 entries (indices 0..12).
// The counters in the Seq-gap validation code that follows in this file use
// the same 13-slot layout: slots 0..10 are indexed directly by the numeric
// evidence type, slot 11 ("other") collects type 255, and slot 12
// ("UNKNOWN VALUE") collects any other out-of-range type.
// NOTE(review): presumably this table is indexed in parallel with those
// counters — the code that reads it is outside this listing; confirm.
4343 static string linkEvStrings [] = {
4344  "paired-ends",
4345  "align genus",
4346  "align xgenus",
4347  "align trnscpt",
4348  "within clone",
4349  "clone contig",
4350  "map",
4351  "strobe",
4352  "unspecified",
4353  "pcr",
4354  "proximity ligation",
4355  "other",
4356  "UNKNOWN VALUE"
4357 };
4358 
4359 /*bsv
4360 static bool s_IsGapComponent (const CDelta_seq& seg)
4361 {
4362  if (! seg.IsLiteral()) return false;
4363  const CSeq_literal& lit = seg.GetLiteral();
4364  if (! lit.IsSetSeq_data()) return true;
4365  if (lit.GetSeq_data().IsGap() && lit.GetLength() > 0) return true;
4366  return false;
4367 }
4368 */
4369 
// Returns true when at least one linkage-evidence entry examined by the loop
// has type 8, which is the index of "unspecified" in linkEvStrings.
// Entries whose type cannot be read are skipped.
// NOTE(review): the loop header (which introduces ev_itr) is missing from this
// listing; presumably it iterates the gap's linkage-evidence list — confirm.
4370 static bool s_IsUnspecified(const CSeq_gap& gap)
4371 {
4372  bool is_unspec = false;
4374  const CLinkage_evidence & evidence = **ev_itr;
4375  if (!evidence.CanGetType()) continue;
4376  int linktype = evidence.GetType();
4377  if (linktype == 8) {
4378  is_unspec = true;
4379  }
4380  }
4381  return is_unspec;
4382 }
4383 
4384 
// Decides whether a gap at the start or end of a delta sequence should be left
// unreported. Returns true for the topology case below, or when the gap type
// is one of centromere, telomere, heterochromatin, short_arm or contamination
// AND the Bioseq's source descriptor has genome == chromosome.
// NOTE(review): the signature line and the second half of the topology
// condition are missing from this listing; per the existing comment it is the
// circular-topology test.
4386 {
4387  // always ignore for circular sequences
4388  if (bsh.GetInst().IsSetTopology() &&
4390  return true;
4391  }
4392 
4393  // ignore if location is genomic and gap is of certain type
4394  if (gap_type != CSeq_gap::eType_centromere &&
4395  gap_type != CSeq_gap::eType_telomere &&
4396  gap_type != CSeq_gap::eType_heterochromatin &&
4397  gap_type != CSeq_gap::eType_short_arm &&
4398  gap_type != CSeq_gap::eType_contamination) {
4399  return false;
4400  }
4401 
// Only the first source descriptor found by the iterator is consulted.
4402  CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
4403  if (src && src->GetSource().IsSetGenome() && src->GetSource().GetGenome() == CBioSource::eGenome_chromosome) {
4404  return true;
4405  } else {
4406  return false;
4407  }
4408 }
4409 
4410 
// Validates a delta-sequence Bioseq by walking the components of its delta
// extension. Visible checks: presence of delta data, technique allowed for
// delta sequences, every Seq-loc component (via ValidateDeltaLoc), every
// Seq-literal (zero length, invalid residues, runs of N, gap properties),
// first/last component being a gap, adjacent gaps, declared length versus the
// summed component length, HTGS records lacking gaps between sequence runs,
// overlapping component locations, self reference, and N residues next to gaps.
// NOTE(review): this listing omits some source lines (the signature, the
// declarations of bsh, mi_desc, gap_type, desc_i and sv, part of the technique
// condition, and the opening line of most error-posting calls), so error codes
// and severities of those reports cannot be read from here.
4411 // Assumes seq is a delta sequence
4413 {
4414  const CSeq_inst& inst = seq.GetInst();
4415 
4416  // Get CMolInfo and tech used for validating technique and gap positioning
4417  const CMolInfo* mi = nullptr;
4419  if ( mi_desc ) {
4420  mi = &(mi_desc->GetMolinfo());
4421  }
4422  CMolInfo::TTech tech = mi ? mi->GetTech() : CMolInfo::eTech_unknown;
4423 
// NOTE(review): no return is visible after this report, yet the delta data is
// accessed again below — confirm how an unset extension is handled.
4424  if (!inst.IsSetExt() || !inst.GetExt().IsDelta() ||
4425  inst.GetExt().GetDelta().Get().empty()) {
4427  "No CDelta_ext data for delta Bioseq", seq);
4428  }
4429 
// The id scan stops at the first id accepted by IsNTNCNWACAccession, so has_gi
// only reflects ids seen before that point.
4430  bool any_tech_ok = false;
4431  bool has_gi = false;
4432  FOR_EACH_SEQID_ON_BIOSEQ (id_it, seq) {
4433  if (IsNTNCNWACAccession(**id_it)) {
4434  any_tech_ok = true;
4435  break;
4436  } else if ((*id_it)->IsGi()) {
4437  has_gi = true;
4438  }
4439  }
// Technique check for nucleotide delta sequences (part of the list of accepted
// techniques is missing from this listing).
4441  if (!any_tech_ok && seq.IsNa()
4442  && tech != CMolInfo::eTech_htgs_0 && tech != CMolInfo::eTech_htgs_1
4443  && tech != CMolInfo::eTech_htgs_2 && tech != CMolInfo::eTech_htgs_3
4446  && tech != CMolInfo::eTech_htc && tech != CMolInfo::eTech_barcode
4447  && tech != CMolInfo::eTech_tsa) {
4449  "Delta seq technique should not be [" + NStr::IntToString(tech) + "]", seq);
4450  }
4451 
4452  // set severity for first / last gap error
// State carried across components: len is the running total length, seg the
// 1-based component number, and the prev_* / last_is_gap values describe the
// component handled in the previous iteration.
4453  TSeqPos len = 0;
4454  TSeqPos seg = 0;
4455  bool last_is_gap = false;
4456  int prev_gap_linkage = -1;
4457  CSeq_gap::TType prev_gap_type = CSeq_gap::eType_unknown;
4458  int gap_linkage = -1;
4460  size_t num_gaps = 0;
4461  size_t num_adjacent_gaps = 0;
4462  bool non_interspersed_gaps = false;
4463  bool first = true;
4464  int num_gap_known_or_spec = 0;
4465  int num_gap_unknown_unspec = 0;
4466 
4467  vector<CConstRef<CSeq_loc> > delta_locs;
4468 
4469  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
4470  ++seg;
4471  if ( !(*sg) ) {
4473  "NULL pointer in delta seq_ext valnode (segment " +
4474  NStr::IntToString(seg) + ")", seq);
4475  continue;
4476  }
4477  switch ( (**sg).Which() ) {
4478  case CDelta_seq::e_Loc:
4479  {
// Seq-loc component: remember it for the overlap check and validate it
// (ValidateDeltaLoc also adds its length to len).
4480  const CSeq_loc& loc = (**sg).GetLoc();
4481  CConstRef<CSeq_loc> tmp(&loc);
4482  delta_locs.push_back (tmp);
4483 
4484  ValidateDeltaLoc (loc, seq, len);
4485 
// Two sequence components in a row means gaps are not interspersed.
4486  if ( !last_is_gap && !first) {
4487  non_interspersed_gaps = true;
4488  }
4489  last_is_gap = false;
4490  prev_gap_linkage = -1;
4491  prev_gap_type = CSeq_gap::eType_unknown;
// NOTE(review): gap_linkage is reset with a gap *type* constant here, whereas
// the other resets of linkage variables use -1 — confirm this is intended.
4492  gap_linkage = CSeq_gap::eType_unknown;
4493  first = false;
4494  break;
4495  }
4496  case CDelta_seq::e_Literal:
4497  {
4498  // The C toolkit code checks for valid alphabet here
4499  // The C++ object serialization will not load if invalid alphabet
4500  // so no check needed here
4501  const CSeq_literal& lit = (*sg)->GetLiteral();
4502  TSeqPos start_len = len;
4503  len += lit.CanGetLength() ? lit.GetLength() : 0;
4504  if (lit.IsSetSeq_data() && ! lit.GetSeq_data().IsGap()
4505  && (!lit.IsSetLength() || lit.GetLength() == 0)) {
4507  "Seq-lit of length 0 in delta chain", seq);
4508  }
4509 
4510  // Check for invalid residues
4511  if ( lit.IsSetSeq_data() && !lit.GetSeq_data().IsGap() ) {
4512  if ( !last_is_gap && !first) {
4513  non_interspersed_gaps = true;
4514  }
4515  last_is_gap = false;
4516  prev_gap_linkage = -1;
4517  prev_gap_type = CSeq_gap::eType_unknown;
// CSeqportUtil::Validate fills badIdx with the offsets of invalid residues;
// they are reported as text for the string alphabets (through ss) or as
// numbers for the vector<char> alphabet handled in the braced case below.
4518  const CSeq_data& data = lit.GetSeq_data();
4519  vector<TSeqPos> badIdx;
4520  CSeqportUtil::Validate(data, &badIdx);
4521  const string* ss = nullptr;
4522  switch (data.Which()) {
4523  case CSeq_data::e_Iupacaa:
4524  ss = &data.GetIupacaa().Get();
4525  break;
4526  case CSeq_data::e_Iupacna:
4527  ss = &data.GetIupacna().Get();
4528  break;
4529  case CSeq_data::e_Ncbieaa:
4530  ss = &data.GetNcbieaa().Get();
4531  break;
// NOTE(review): the case label for this branch is missing from this listing;
// presumably it is the Ncbistdaa alphabet, given the accessor used.
4533  {
4534  const vector<char>& c = data.GetNcbistdaa().Get();
4535  ITERATE (vector<TSeqPos>, ci, badIdx) {
4537  "Invalid residue [" +
4538  NStr::IntToString((int)c[*ci]) + "] at position [" +
4539  NStr::IntToString((*ci) + 1) + "]", seq);
4540  }
4541  break;
4542  }
4543  default:
4544  break;
4545  }
4546 
4547  if ( ss ) {
4548  ITERATE (vector<TSeqPos>, it, badIdx) {
4550  "Invalid residue [" +
4551  ss->substr(*it, 1) + "] at position [" +
4552  NStr::IntToString((*it) + 1) + "]", seq);
4553  }
4554  }
4555 
// The N-run limit depends on tech, so it is only applied when MolInfo exists.
4556  if (mi) {
4557  // Count adjacent Ns in Seq-lit
4558  int max_ns = s_MaxNsInSeqLitForTech (tech);
4559  size_t adjacent_ns = x_CountAdjacentNs(lit);
4560  if (max_ns > -1 && adjacent_ns > max_ns) {
4562  "Run of " + NStr::NumericToString(adjacent_ns) +
4563  " Ns in delta component " + NStr::UIntToString(seg) +
4564  " that starts at base " + NStr::UIntToString(start_len + 1),
4565  seq);
4566  }
4567  }
4568  } else {
// Literal without sequence data, or with gap data: treated as a gap.
4569  gap_linkage = -1;
4570  gap_type = CSeq_gap::eType_unknown;
4571  if ( lit.IsSetSeq_data() && lit.GetSeq_data().IsGap() ) {
4572  const CSeq_data& data = lit.GetSeq_data();
4573  if (data.Which() == CSeq_data::e_Gap) {
4574  const CSeq_gap& gap = data.GetGap();
4575 
// Gaps with a type set are split into "unknown type with unspecified
// evidence" and everything else; gaps without a type are not counted.
4576  if (gap.IsSetType()) {
4577  gap_type = gap.GetType();
4578  if (gap_type == CSeq_gap::eType_unknown && s_IsUnspecified(gap)) {
4579  num_gap_unknown_unspec++;
4580  }
4581  else {
4582  num_gap_known_or_spec++;
4583  }
4584  }
4585  if(gap.IsSetLinkage())
4586  gap_linkage = gap.GetLinkage();
4587  }
4588  }
// Leading gap: sev starts as Error and is lowered to Warning unless tech is
// one of HTGS 0-3 (presumably passed to the report on the missing line).
4589  if (first && !x_IgnoreEndGap(bsh, gap_type) && !s_WillReportTerminalGap(seq, bsh)) {
4590  EDiagSev sev = eDiag_Error;
4591  if (tech != CMolInfo::eTech_htgs_0 && tech != CMolInfo::eTech_htgs_1
4592  && tech != CMolInfo::eTech_htgs_2 && tech != CMolInfo::eTech_htgs_3) {
4593  sev = eDiag_Warning;
4594  }
4596  "First delta seq component is a gap", seq);
4597  }
4598 
// Count a gap that directly follows another gap, unless either of the two has
// type contamination.
4599  if(last_is_gap &&
4600  (prev_gap_type == gap_type ||
4601  prev_gap_linkage != gap_linkage ||
4602  gap_linkage != CSeq_gap::eLinkage_unlinked)) {
4603  if (prev_gap_type != CSeq_gap::eType_contamination && gap_type != CSeq_gap::eType_contamination) {
4604  ++num_adjacent_gaps;
4605  }
4606  }
4607 
// Explicit Seq-gap data gets its own validation; otherwise check the length
// and fuzz of the data-less literal.
4608  if (lit.IsSetSeq_data() && lit.GetSeq_data().IsGap()) {
4609  ValidateSeqGap(lit.GetSeq_data().GetGap(), seq);
4610  } else if (!lit.CanGetLength() || lit.GetLength() == 0) {
4611  if (!lit.IsSetFuzz() || !lit.GetFuzz().IsLim() || lit.GetFuzz().GetLim() != CInt_fuzz::eLim_unk) {
4613  "Gap of length 0 in delta chain", seq);
4614  } else {
4616  "Gap of length 0 with unknown fuzz in delta chain", seq);
4617  }
4618  } else if (lit.CanGetLength() && lit.GetLength() != 100) {
4619  if (lit.IsSetFuzz()) {
4621  "Gap of unknown length should have length 100", seq);
4622  }
4623  }
4624  last_is_gap = true;
4625  prev_gap_type = gap_type;
4626  prev_gap_linkage = gap_linkage;
4627  ++num_gaps;
4628  }
4629  first = false;
4630  break;
4631  }
4632  default:
4634  "CDelta_seq::Which() is e_not_set", seq);
4635  }
4636  }
4637 
// Reported only when every typed gap is of unknown type with unspecified evidence.
4638  if (num_gap_unknown_unspec > 0 && num_gap_known_or_spec == 0) {
4639  if (num_gap_unknown_unspec > 1) {
4641  "All " + NStr::IntToString(num_gap_unknown_unspec) +
4642  " Seq-gaps have unknown type and unspecified linkage", seq);
4643  } else {
4645  "Single Seq-gap has unknown type and unspecified linkage", seq);
4646  }
4647  }
4648 
// Declared Bioseq length versus the sum of component lengths.
4649  if (inst.GetLength() > len) {
4651  "Bioseq.seq_data too short [" + NStr::IntToString(len) +
4652  "] for given length [" + NStr::IntToString(inst.GetLength()) +
4653  "]", seq);
4654  } else if (inst.GetLength() < len) {
4656  "Bioseq.seq_data is larger [" + NStr::IntToString(len) +
4657  "] than given length [" + NStr::IntToString(inst.GetLength()) +
4658  "]", seq);
4659  }
// HTGS 0/1/2 without a GI: adjacent sequence runs are an Error, lowered to
// Info when a user descriptor reports IsRefGeneTracking().
4660  if ( non_interspersed_gaps && !has_gi && mi &&
4661  (tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
4662  tech == CMolInfo::eTech_htgs_2) ) {
4663  EDiagSev missing_gaps_sev = eDiag_Error;
4665  while (desc_i) {
4666  if (desc_i->GetUser().IsRefGeneTracking()) {
4667  missing_gaps_sev = eDiag_Info;
4668  break;
4669  }
4670  ++desc_i;
4671  }
4672 
4673  PostErr(missing_gaps_sev, eErr_SEQ_INST_MissingGaps,
4674  "HTGS delta seq should have gaps between all sequence runs", seq);
4675  }
4676  if ( num_adjacent_gaps >= 1 ) {
4677  string msg = (num_adjacent_gaps == 1) ?
4678  "There is 1 adjacent gap in delta seq" :
4679  "There are " + NStr::SizetToString(num_adjacent_gaps) +
4680  " adjacent gaps in delta seq";
4682  }
// Trailing gap: same severity rule as for the leading gap; gap_type still
// holds the type of the last gap seen.
4683  if (last_is_gap && !x_IgnoreEndGap(bsh, gap_type) && !s_WillReportTerminalGap(seq, bsh)) {
4684  EDiagSev sev = eDiag_Error;
4685  if (tech != CMolInfo::eTech_htgs_0 && tech != CMolInfo::eTech_htgs_1
4686  && tech != CMolInfo::eTech_htgs_2 && tech != CMolInfo::eTech_htgs_3) {
4687  sev = eDiag_Warning;
4688  }
4690  "Last delta seq component is a gap", seq);
4691  }
4692 
4693  // Validate technique
4694  if (num_gaps == 0 && mi) {
4695  if ( tech == CMolInfo::eTech_htgs_2 &&
4696  !GraphsOnBioseq(seq) &&
4697  !x_IsActiveFin(seq) ) {
4699  "HTGS 2 delta seq has no gaps and no graphs", seq);
4700  }
4701  }
4702 
4703  // look for multiple delta locs overlapping
// After sorting with s_LocSortCompare, only neighbouring locations are compared.
4704  if (delta_locs.size() > 1) {
4705  stable_sort (delta_locs.begin(), delta_locs.end(), s_LocSortCompare);
4706  vector<CConstRef<CSeq_loc> >::iterator it1 = delta_locs.begin();
4707  vector<CConstRef<CSeq_loc> >::iterator it2 = it1;
4708  ++it2;
4709  while (it2 != delta_locs.end()) {
4710  if ((*it1)->GetId()->Compare(*(*it2)->GetId()) == CSeq_id::e_YES
4711  && Compare (**it1, **it2, m_Scope, fCompareOverlapping) != eNoOverlap) {
4712  string seq_label = (*it1)->GetId()->AsFastaString();
4714  "Overlapping delta range " + NStr::IntToString((*it2)->GetStart(eExtreme_Positional) + 1)
4715  + "-" + NStr::IntToString((*it2)->GetStop(eExtreme_Positional) + 1)
4716  + " and " + NStr::IntToString((*it1)->GetStart(eExtreme_Positional) + 1)
4717  + "-" + NStr::IntToString((*it1)->GetStop(eExtreme_Positional) + 1)
4718  + " on a Bioseq " + seq_label,
4719  seq);
4720  }
4721  ++it1;
4722  ++it2;
4723  }
4724  }
4725 
4726  if (IsSelfReferential(seq)) {
4728  "Self-referential delta sequence", seq);
4729  }
4730 
4731  // look for Ns next to gaps
// pos is the 0-based offset at which the current component starts. At each
// boundary the residue on the non-gap side of a gap is checked for 'N'.
// Exceptions raised during this scan are swallowed.
4732  if (seq.IsNa() && seq.GetLength() > 1 && x_IsDeltaLitOnly(inst)) {
4733  try {
4734  TSeqPos pos = 0;
4736  ITERATE (CDelta_ext::Tdata, delta_i, seq.GetInst().GetExt().GetDelta().Get()) {
4737  if (delta_i->Empty()) {
4738  continue; // Ignore NULLs, reported separately above.
4739  }
4740  const CDelta_seq& seg = **delta_i;
4741  TSeqPos delta_len = (TSeqPos)s_GetDeltaLen (seg, m_Scope);
// Component starts in a gap: look at the residue just before it.
4742  if (pos > 0) {
4743  if (sv.IsInGap (pos)) {
4744  CSeqVector::TResidue res = sv [pos - 1];
4745  if (res == 'N' && !sv.IsInGap(pos - 1)) {
4747  "Ambiguous residue N is adjacent to a gap around position " + NStr::SizetToString (pos + 1),
4748  seq);
4749  }
4750  }
4751  }
// Component ends in a gap: look at the residue just after it.
4752  if (delta_len > 0 && pos + delta_len < len) {
4753  if (sv.IsInGap(pos + delta_len - 1)) {
4754  CSeqVector::TResidue res = sv[pos + delta_len];
4755  if (res == 'N' && !sv.IsInGap(pos + delta_len)) {
4757  "Ambiguous residue N is adjacent to a gap around position " + NStr::SizetToString(pos + delta_len + 1),
4758  seq);
4759  }
4760  }
4761  }
4762  pos += delta_len;
4763  }
4764  } catch (const CException& ) {
4765  } catch (const std::exception& ) {
4766  }
4767  }
4768 
4769 }
4770 
4771 
4772 bool s_HasGI(const CBioseq& seq)
4773 {
4774  bool has_gi = false;
4775  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
4776  if ((*id_it)->IsGi()) {
4777  has_gi = true;
4778  break;
4779  }
4780  }
4781  return has_gi;
4782 }
4783 
4784 
4786 {
     // Checks one Seq-gap (gap) for agreement between its gap type, its
     // linkage field and its linkage evidence. seq is passed along as the
     // last argument of every diagnostic visible below.
     // NOTE(review): the statements that open the diagnostic calls and the
     // loops are not visible in this view; only their continuation lines are.
     // Severity and error code of each diagnostic therefore cannot be
     // confirmed from here.

     // Case 1: the gap carries linkage evidence.
4787  if (gap.IsSetLinkage_evidence()) {
     // linkcount is the number of evidence entries that have a type.
4788  int linkcount = 0;
     // Tally of evidence entries per type: slots 0-10 are indexed directly by
     // the type value, slot 11 counts type 255, and slot 12 counts every other
     // value outside 0..10.
4789  int linkevarray[13];
4790  for (int i = 0; i < 13; i++) {
4791  linkevarray[i] = 0;
4792  }
     // Becomes true when at least one entry has type 8, which the messages
     // below call "unspecified".
4793  bool is_unspec = false;
     // NOTE(review): ev_itr presumably iterates over the linkage-evidence list
     // of gap; the loop header is not visible here, confirm.
4795  const CLinkage_evidence & evidence = **ev_itr;
     // Entries without a type are skipped and not counted.
4796  if (!evidence.CanGetType()) continue;
4797  int linktype = evidence.GetType();
4798  if (linktype == 8) {
4799  is_unspec = true;
4800  }
4801  linkcount++;
4802  if (linktype == 255) {
4803  (linkevarray[11])++;
4804  }
4805  else if (linktype < 0 || linktype > 10) {
4806  (linkevarray[12])++;
4807  }
4808  else {
4809  (linkevarray[linktype])++;
4810  }
4811  }
     // Type-8 ("unspecified") evidence is present together with other evidence.
4812  if (linkevarray[8] > 0 && linkcount > linkevarray[8]) {
4814  "Seq-gap type has unspecified and additional linkage evidence", seq);
4815  }
     // Report every slot counted more than once. linkEvStrings is defined
     // outside this block and supplies the display text for slot i.
4816  for (int i = 0; i < 13; i++) {
4817  if (linkevarray[i] > 1) {
4819  "Linkage evidence '" + linkEvStrings[i] + "' appears " +
4820  NStr::IntToString(linkevarray[i]) + " times", seq);
4821  }
4822  }
     // Evidence is only consistent with a linkage field explicitly set to linked.
4823  if (!gap.IsSetLinkage() || gap.GetLinkage() != CSeq_gap::eLinkage_linked) {
4825  "Seq-gap with linkage evidence must have linkage field set to linked", seq);
4826  }
4827  if (gap.IsSetType()) {
4828  int gaptype = gap.GetType();
     // Gap types fragment, clone, repeat and scaffold pass without further
     // checks; every other type is examined below.
4829  if (gaptype != CSeq_gap::eType_fragment &&
4830  gaptype != CSeq_gap::eType_clone &&
4831  gaptype != CSeq_gap::eType_repeat &&
4832  gaptype != CSeq_gap::eType_scaffold) {
     // Unknown gap type with at least one type-8 entry: nothing is reported.
4833  if (gaptype == CSeq_gap::eType_unknown && is_unspec) {
4834  /* suppress for legacy records */
4835  } else if (gaptype == CSeq_gap::eType_contamination) {
     // Contamination is accepted only when every counted entry is type 8.
4836  if (linkevarray[8] > 0 && linkcount == linkevarray[8]) {
4837  /* contamination can only have linked unspecified */
4838  } else {
4840  "Contamination gaps must have linkage evidence 'unspecified'", seq);
4841  }
4842  } else {
     // Any remaining gap type is reported with its numeric type value.
4844  "Seq-gap of type " + NStr::IntToString(gaptype) +
4845  " should not have linkage evidence", seq);
4846  }
4847  }
4848  }
4849  }
     // Case 2: the gap carries no linkage evidence.
4850  else {
4851  if (gap.IsSetType()) {
4852  int gaptype = gap.GetType();
     // Scaffold gaps without evidence are always reported.
4853  if (gaptype == CSeq_gap::eType_scaffold) {
4855  "Seq-gap type == scaffold is missing required linkage evidence", seq);
4856  }
     // Repeat gaps marked as linked are reported unless the exemption below applies.
4857  if (gaptype == CSeq_gap::eType_repeat && gap.IsSetLinkage() && gap.GetLinkage() == CSeq_gap::eLinkage_linked)
4858  {
4859  bool suppress_SEQ_INST_SeqGapProblem = false;
     // The exemption is considered only for a Bioseq that has descriptors and
     // a GI among its ids.
4860  if (seq.IsSetDescr() && s_HasGI(seq))
4861  {
     // NOTE(review): it presumably walks the descriptors of seq; the loop
     // header is not visible here, confirm.
4863  {
4864  if ((**it).IsCreate_date())
4865  {
     // A create date earlier than 2012-10-01 suppresses the report. Only the
     // first create-date descriptor is consulted, because the break below
     // leaves the loop either way.
4866  CDate threshold_date(CTime(2012, 10, 1));
4867  if ((**it).GetCreate_date().Compare(threshold_date) == CDate::eCompare_before)
4868  suppress_SEQ_INST_SeqGapProblem = true;
4869  break;
4870  }
4871  }
4872  }
4873  if (!suppress_SEQ_INST_SeqGapProblem)
4875  "Seq-gap type == repeat and linkage == linked is missing required linkage evidence", seq);
4876 
4877  }
     // Contamination gaps without evidence are always reported.
4878  if (gaptype == CSeq_gap::eType_contamination) {
4880  "Contamination gap-types must be linked and have linkage-evidence of type 'unspecified'", seq);
4881  }
4882  }
4883  }
4884 }
4885 
4886 
4888 (const CSeq_inst& inst,
4889  const CBioseq& seq)
4890 {
4891  bool rtn = true;
4892  const CEnumeratedTypeValues* tv = CSeq_inst::GetTypeInfo_enum_ERepr();
4893  string rpr = tv->FindName(inst.GetRepr(), true);
4894  if (NStr::Equal(rpr, "ref")) {
4895  rpr = "reference";
4896  } else if (NStr::Equal(rpr, "const")) {
4897  rpr = "constructed";
4898  }
4899  const string err0 = "Bioseq-ext not allowed on " + rpr + " Bioseq";
4900  const string err1 = "Missing or incorrect Bioseq-ext on " + rpr + " Bioseq";
4901  const string err2 = "Missing Seq-data on " + rpr + " Bioseq";
4902  const string err3 = "Seq-data not allowed on " + rpr + " Bioseq";
4903  switch (inst.GetRepr()) {
4905  if (inst.IsSetExt()) {
4907  rtn = false;
4908  }
4909  if (inst.IsSetSeq_data()) {
4911  rtn = false;
4912  }
4913  break;
4914  case CSeq_inst::eRepr_map:
4915  if (!inst.IsSetExt() || !inst.GetExt().IsMap()) {
4916  PostErr(