NCBI C++ ToolKit
validerror_bioseq.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: validerror_bioseq.cpp 101515 2023-12-22 19:02:03Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko, Mati Shomrat ......
27  *
28  * File Description:
29  * validation of bioseq
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/ncbitime.hpp>
37 #include <corelib/ncbimisc.hpp>
38 
42 #include <objtools/error_codes.hpp>
43 
45 
46 #include <objmgr/seqdesc_ci.hpp>
47 #include <objmgr/graph_ci.hpp>
48 #include <objmgr/util/sequence.hpp>
49 
51 
52 #include <optional>
53 
54 
55 #define NCBI_USE_ERRCODE_X Objtools_Validator
56 
59 BEGIN_SCOPE(validator)
60 USING_SCOPE(sequence);
61 USING_SCOPE(feature);
62 
63 class CCdsMatchInfo;
64 
// Per-mRNA bookkeeping record (a CObject subclass) built from an mRNA feature
// and a scope. Only declarations are visible here; apart from the inline
// SetPseudo(), the member functions are defined elsewhere.
// NOTE(review): the embedded listing numbers skip 77 and 79, so some private
// member declarations appear to be missing from this view.
65 class CMrnaMatchInfo : public CObject
66 {
67 public:
68  CMrnaMatchInfo(const CSeq_feat& mrna, CScope* scope);
69  const CSeq_feat& GetSeqfeat() const;
70  bool Overlaps(const CSeq_feat& cds) const;
71  void SetMatch();
72  bool HasMatch() const;
    // Sets the pseudo flag; calling with no argument sets it to true.
73  void SetPseudo(bool val = true) { m_IsPseudo = val; }
74  bool OkWithoutCds(bool isGenbank = false) const;
75 
76 private:
78 
80  bool m_HasMatch;
81  bool m_IsPseudo;
82 };
83 
84 
// Per-CDS bookkeeping record (a CObject subclass) built from a CDS feature and
// a scope; the Assign*Match() members take a list of still-unmatched mRNAs.
// Only declarations are visible here; apart from the inline CountOtherMrnas(),
// the member functions are defined elsewhere.
// NOTE(review): the embedded listing numbers skip 95-96, 106-107, 109-112 and
// 114, so the TmRNAList definition and most private members are not visible.
85 class CCdsMatchInfo : public CObject
86 {
87 public:
88  CCdsMatchInfo(const CSeq_feat& cds, CScope* scope);
89  const CSeq_feat& GetSeqfeat() const;
90  bool Overlaps(const CSeq_feat& mrna) const;
91  bool AssignXrefMatch(TmRNAList& unmatched_mrnas, const CTSE_Handle& tse);
92  bool AssignOverlapMatch(TmRNAList& unmatched_mrnas, CScope& scope);
93  void UpdateOtherMrnas(const TmRNAList& unmatched_mrnas);
    // Number of entries currently held in m_OtherMrnas.
94  size_t CountOtherMrnas() { return m_OtherMrnas.size(); }
97  bool AssignMatch(TmRNAList& mrna_map, CFeatTree& feat_tree, CScope& scope);
98  bool HasMatch() const;
    // Setter/getter pair overloaded on the same name.
99  void NeedsMatch(bool needs_match);
100  bool NeedsMatch() const;
101  const CMrnaMatchInfo& GetMatch() const;
102  bool IsPseudo() const;
103  void SetPseudo();
104 
105 private:
108 
113  list<CConstRef<CSeq_feat>> m_OtherMrnas;
115 };
116 
117 
118 // =============================================================================
119 // Public
120 // =============================================================================
121 
123  CValidError_base(imp), m_AnnotValidator(imp), m_DescrValidator(imp), m_FeatValidator(imp), m_GeneIt(nullptr), m_AllFeatIt(nullptr)
124 {
125 }
126 
127 
129 {
130 }
131 
133 {
136 
137  if (bsh.IsSetInst_Repr()) {
138  repr = bsh.GetInst_Repr();
139  }
140 
142  while (m) {
143  const CSeqdesc::TMolinfo& mi = m->GetMolinfo();
144  if (mi.IsSetTech()) {
145  tech = mi.GetTech();
146  }
147 
148  ++m;
149  }
150 
151  for (auto id : bsh.GetId()) {
152  CSeq_id::EAccessionInfo acc_info = id.IdentifyAccession();
153  unsigned int acc_div = acc_info & CSeq_id::eAcc_division_mask;
154  if (acc_div == CSeq_id::eAcc_wgs && tech == CMolInfo::eTech_wgs && repr == CSeq_inst::eRepr_virtual) {
155  bool is_wgs_master = (acc_info & CSeq_id::fAcc_master) != 0;
156  if (is_wgs_master) {
157  m_report_short_seq = false;
158  }
159  }
160  }
161 
163  while (d) {
164  const CSeqdesc::TSource& source = d->GetSource();
165 
166  // look for chromosome, prokaryote, linkage group
168  if ((*it)->IsSetSubtype() && (*it)->IsSetName() && !NStr::IsBlank((*it)->GetName())) {
169  if ((*it)->GetSubtype() == CSubSource::eSubtype_chromosome) {
171  } else if ((*it)->GetSubtype() == CSubSource::eSubtype_linkage_group) {
173  }
174  }
175  }
176  if (source.IsSetLineage()) {
177  string lineage = source.GetLineage();
178  if (NStr::StartsWith(lineage, "Bacteria; ") ||
179  NStr::StartsWith(lineage, "Archaea; ")) {
182  m_is_bact_or_arch = true;
183  }
184  if (NStr::StartsWith(lineage, "Viruses; ")) {
186  }
187  }
188  if (source.IsSetDivision()) {
189  string div = source.GetDivision();
190  if (NStr::Equal(div, "BCT") || NStr::Equal(div, "VRL")) {
193  }
194  }
195  if (source.IsSetGenome()) {
196  CBioSource::TGenome genome = source.GetGenome();
197  // check for organelle
198  if (IsOrganelle(genome)) {
200  }
201  m_is_plasmid = (genome == NCBI_GENOME(plasmid));
202  m_is_chromosome = (genome == NCBI_GENOME(chromosome));
203  m_is_extrachrom = (genome == NCBI_GENOME(extrachrom));
204  }
205 
206  ++d;
207  }
208 }
209 
210 
212 {
213  m_splicing_not_expected = false;
215  m_report_short_seq = true;
216  m_is_bact_or_arch = false;
217  m_is_plasmid = false;
218  m_is_chromosome = false;
219  m_is_extrachrom = false;
220 
221  try {
223 
225 
226  CSeq_entry_Handle appropriate_parent;
227  if (m_Imp.ShouldSubdivide()) {
229  }
230  if (appropriate_parent) {
231  CRef<CScope> tmp_scope(new CScope(*(CObjectManager::GetInstance())));
232  tmp_scope->AddDefaults();
233  CSeq_entry_Handle this_seh = tmp_scope->AddTopLevelSeqEntry(*(appropriate_parent.GetCompleteSeq_entry()));
234  m_FeatValidator.SetScope(*tmp_scope);
235  m_FeatValidator.SetTSE(this_seh);
236  } else {
239  }
240 
241  try {
242  CCacheImpl::SFeatKey gene_key(
244  m_GeneIt = &GetCache().GetFeatFromCache(gene_key);
245 
246  CCacheImpl::SFeatKey all_feat_key(
248  m_AllFeatIt = &GetCache().GetFeatFromCache(all_feat_key);
249  } catch (const exception&) {
250  // sequence might be too broken to validate features
251  m_GeneIt = nullptr;
252  m_AllFeatIt = nullptr;
253  }
254  ValidateSeqIds(seq);
255  ValidateInst(seq);
257  ValidateHistory(seq);
258  FOR_EACH_ANNOT_ON_BIOSEQ (annot, seq) {
261  }
262  if (seq.IsSetDescr()) {
263  if (m_CurrentHandle) {
265  if (ctx) {
266  m_DescrValidator.ValidateSeqDescr(seq.GetDescr(), *(ctx.GetCompleteSeq_entry()));
267  }
268  }
269  }
270  if (IsWGSMaster(seq, m_CurrentHandle.GetScope())) {
272  }
273  if (appropriate_parent) {
276  }
277 
278  } catch (const exception& e) {
280  string("Exception while validating bioseq. EXCEPTION: ") +
281  e.what(), seq);
282  }
284  if (m_GeneIt) {
285  m_GeneIt = nullptr;
286  }
287  if (m_AllFeatIt) {
288  m_AllFeatIt = nullptr;
289  }
290 }
291 
292 
293 static bool s_IsSkippableDbtag(const CDbtag& dbt)
294 {
295  if (! dbt.IsSetDb()) {
296  return false;
297  }
298  const string& db = dbt.GetDb();
299  if (NStr::EqualNocase(db, "TMSMART")
300  || NStr::EqualNocase(db, "BankIt")
301  || NStr::EqualNocase(db, "NCBIFILE")) {
302  return true;
303  } else {
304  return false;
305  }
306 }
307 
// Looks for characters that must not appear in an accession string.
// Returns the first '|' or ',' found in `id` (scanning left to right),
// or '\0' when the string contains neither.
static char CheckForBadSeqIdChars(const string& id)
{
    // find_first_of scans the whole string once, replacing the manual loop.
    const string::size_type bad_pos = id.find_first_of("|,");
    return bad_pos == string::npos ? '\0' : id[bad_pos];
}
316 
317 // VR-748
318 static char CheckForBadLocalIdChars(const string& id)
319 {
320  for (size_t i = 0; i < id.length(); i++) {
321  if (! CSeq_id::IsValidLocalID(id.substr(i, 1))) {
322  return id.c_str()[i];
323  }
324  }
325  return '\0';
326 }
327 
328 
// Looks for characters that must not appear in the tag of an NCBIFILE or
// BankIt general identifier (the rule set currently matches the one used by
// CheckForBadSeqIdChars).
// Returns the first '|' or ',' found in `id` (scanning left to right),
// or '\0' when the string contains neither.
static char CheckForBadFileIDSeqIdChars(const string& id)
{
    // find_first_of scans the whole string once, replacing the manual loop.
    const string::size_type bad_pos = id.find_first_of("|,");
    return bad_pos == string::npos ? '\0' : id[bad_pos];
}
337 
338 
339 // validation for individual Seq-id
// Validates a single Seq-id (`id`) that is carried by the Bioseq `ctx`.
//
// Step 1: confirm that `id` resolves, inside ctx's TSE, to `ctx` itself
//         (and not to nothing, or to some other Bioseq in the record).
// Step 2: run format checks chosen by id.Which(). The text-id cases
//         deliberately fall through: TPA -> GenBank/EMBL/DDBJ -> Other ->
//         PIR/PRF, so later checks also apply to the earlier id types.
//
// Parameters:
//   id             - the Seq-id being checked.
//   ctx            - the Bioseq that owns `id`; also the object problems are
//                    reported against.
//   longer_general - when true, string tags of general ids are checked against
//                    a fixed limit of 100 characters instead of
//                    CSeq_id::kMaxGeneralTagLength.
//
// NOTE(review): in this listing the opening line of most PostErr(...) calls is
// missing (the embedded numbering skips), so the severity and error code of
// those reports cannot be read here; only the message text is visible.
340 void CValidError_bioseq::ValidateSeqId(const CSeq_id& id, const CBioseq& ctx, bool longer_general)
341 {
342  // see if ID can be used to find ctx
343  CBioseq_Handle ctx_handle = m_Scope->GetBioseqHandle(ctx);
    // ctx is not known to the scope: nothing else can be checked, so return.
    // The report is suppressed when m_Imp.IsPatent() is true.
344  if (! ctx_handle) {
345  if (! m_Imp.IsPatent()) {
347  "BioseqFind (" + id.AsFastaString() +
348  ") unable to find itself - possible internal error", ctx);
349  }
350  return;
351  }
    // Resolve the id within ctx's own TSE only.
352  CTSE_Handle tse = ctx_handle.GetTSE_Handle();
353  CBioseq_Handle bsh = tse.GetBioseqHandle(id);
354 
355  if (bsh) {
356  CConstRef<CBioseq> core = bsh.GetBioseqCore();
357  if (! core) {
358  if (! m_Imp.IsPatent()) {
360  "BioseqFind (" + id.AsFastaString() +
361  ") unable to find itself - possible internal error", ctx);
362  }
    // The id resolved, but to a different Bioseq object than ctx.
363  } else if (core.GetPointer() != &ctx) {
365  "SeqID " + id.AsFastaString() +
366  " is present on multiple Bioseqs in record", ctx);
367  }
368  } else {
    // NOTE(review): unlike the two branches above, this one has no
    // IsPatent() guard visible.
370  "BioseqFind (" + id.AsFastaString() +
371  ") unable to find itself - possible internal error", ctx);
372  }
373 
374  //check formatting
    // tsid is null for id types that are not text seq-ids.
375  const CTextseq_id* tsid = id.GetTextseq_Id();
376 
377  switch (id.Which()) {
378  case CSeq_id::e_Tpg:
379  case CSeq_id::e_Tpe:
380  case CSeq_id::e_Tpd:
    // TPA ids on nucleotide sequences are expected to carry Seq-hist.assembly.
381  if (IsHistAssemblyMissing(ctx) && ctx.IsNa()) {
383  "TPA record " + ctx.GetId().front()->AsFastaString() +
384  " should have Seq-hist.assembly for PRIMARY block",
385  ctx);
386  }
387  // Fall thru
389  case CSeq_id::e_Genbank:
390  case CSeq_id::e_Embl:
391  case CSeq_id::e_Ddbj:
392  if (tsid && tsid->IsSetAccession()) {
393  const string& acc = tsid->GetAccession();
394  const char badch = CheckForBadSeqIdChars (acc);
395  if (badch != '\0') {
397  "Bad character '" + string(1, badch) + "' in accession '" + acc + "'", ctx);
398  }
    // NOTE(review): the declaration of `info` is on a line missing from this
    // listing. The visible test rejects an unknown accession type and a
    // protein/nucleotide accession flag that contradicts the molecule type.
400  if (info == CSeq_id::eAcc_unknown ||
401  (ctx.IsNa() && (info & CSeq_id::fAcc_prot)) ||
402  (ctx.IsAa() && (info & CSeq_id::fAcc_nuc))) {
404  "Bad accession " + acc, ctx);
405  }
406  // Check for secondary conflicts
409  }
410  // Fall thru
412  case CSeq_id::e_Other:
413  if (tsid) {
    // Seq-id.name must not contain whitespace; report once and stop scanning.
414  if (tsid->IsSetName()) {
415  const string& name = tsid->GetName();
416  ITERATE (string, s, name) {
417  if (isspace((unsigned char)(*s))) {
420  "Seq-id.name '" + name + "' should be a single "
421  "word without any spaces", ctx);
422  break;
423  }
424  }
425  }
426 
    // The accession-shape checks below run only for ids of type Other
    // (id.IsOther()), even though earlier cases fall through to here.
427  if (tsid->IsSetAccession() && id.IsOther()) {
428  const string& acc = tsid->GetAccession();
429  const char badch = CheckForBadSeqIdChars (acc);
430  if (badch != '\0') {
432  "Bad character '" + string(1, badch) + "' in accession '" + acc + "'", ctx);
433  }
    // Count upper-case letters, digits and underscores; for accessions that
    // start with "NZ_" (case-insensitive) counting starts after that prefix.
434  size_t num_letters = 0;
435  size_t num_digits = 0;
436  size_t num_underscores = 0;
437  bool bad_id_chars = false;
438  bool is_NZ = (NStr::CompareNocase(acc, 0, 3, "NZ_") == 0);
439  size_t i = 0;
440  bool letter_after_digit = false;
441 
442  if (is_NZ) {
443  i = 3;
444  }
445 
446  for (; i < acc.length(); ++i) {
447  if (isupper((unsigned char)acc[i])) {
448  num_letters++;
449  } else if (isdigit((unsigned char)acc[i])) {
450  num_digits++;
451  } else if (acc[i] == '_') {
452  num_underscores++;
    // NOTE(review): despite its name, letter_after_digit is set when an
    // underscore follows a digit or when a second underscore is seen; a
    // letter following a digit does not set it.
453  if (num_digits > 0 || num_underscores > 1) {
454  letter_after_digit = true;
455  }
456  } else {
    // Anything other than an upper-case letter, digit or '_' (lower-case
    // letters included) marks the accession as bad.
457  bad_id_chars = true;
458  }
459  }
460 
    // Accepted shapes, tested in order; anything else is a bad accession.
461  if (letter_after_digit || bad_id_chars) {
463  "Bad accession " + acc, ctx);
464  } else if (is_NZ && (num_letters == 4 || num_letters == 6) &&
465  (num_digits >= 8 && num_digits <= 11) && num_underscores == 0) {
466  // valid accession - do nothing!
467  } else if (is_NZ && ValidateAccessionString(acc, false) == eAccessionFormat_valid) {
468  // valid accession - do nothing!
469  } else if (num_letters == 2 &&
470  (num_digits == 6 || num_digits == 8 || num_digits == 9) &&
471  num_underscores == 1) {
472  // valid accession - do nothing!
    // NOTE(review): intentionally empty branch - 4 letters + 10 digits on a
    // nucleotide is accepted silently; the sibling branches carry a
    // "valid accession" comment, this one does not.
473  } else if (num_letters == 4 && num_digits == 10 && ctx.IsNa()) {
474  } else {
476  "Bad accession " + acc, ctx);
477  }
478  }
479  }
480  // Fall thru
482  case CSeq_id::e_Pir:
484  case CSeq_id::e_Prf:
485  if (tsid) {
    // Nucleotide text ids need a non-empty accession. Segmented
    // representations are exempt unless m_Imp.IsGI(); segmented DDBJ ids are
    // always exempt.
486  if (ctx.IsNa() &&
487  (! tsid->IsSetAccession() || tsid->GetAccession().empty())) {
488  if (ctx.GetInst().GetRepr() != CSeq_inst::eRepr_seg ||
489  m_Imp.IsGI()) {
490  if (! id.IsDdbj() ||
491  ctx.GetInst().GetRepr() != CSeq_inst::eRepr_seg) {
492  string msg = "Missing accession for " + id.AsFastaString();
495  msg, ctx);
496  }
497  }
498  }
499  } else {
501  "Seq-id type not handled", ctx);
502  }
503  break;
504  case CSeq_id::e_Gi:
    // GI numbers must be strictly greater than ZERO_GI.
505  if (id.GetGi() <= ZERO_GI) {
507  "Invalid GI number", ctx);
508  }
509  break;
510  case CSeq_id::e_General:
511  if (! id.GetGeneral().IsSetDb() || NStr::IsBlank(id.GetGeneral().GetDb())) {
512  PostErr(eDiag_Error, eErr_SEQ_INST_BadSeqIdFormat, "General identifier missing database field", ctx);
513  }
514  if (id.GetGeneral().IsSetDb()) {
515  const CDbtag& dbt = id.GetGeneral();
516  size_t dblen = dbt.GetDb().length();
    // NOTE(review): sev starts as eDiag_Error and every branch except
    // IsLocalGeneralOnly() assigns eDiag_Error again, so those three
    // branches have no effect on the result.
517  EDiagSev sev = eDiag_Error;
518  if (m_Imp.IsLocalGeneralOnly()) {
519  sev = eDiag_Critical;
520  } else if (m_Imp.IsRefSeq()) {
521  sev = eDiag_Error;
522  } else if (m_Imp.IsINSDInSep()) {
523  sev = eDiag_Error;
524  } else if (m_Imp.IsIndexerVersion()) {
525  sev = eDiag_Error;
526  }
527  static const auto max_dblen = CSeq_id::kMaxGeneralDBLength;
528  if (dblen > max_dblen) {
529  PostErr(sev, eErr_SEQ_INST_BadSeqIdFormat, "General database longer than " + NStr::NumericToString(max_dblen) + " characters", ctx);
530  }
    // Tag length checks are skipped for the databases accepted by
    // s_IsSkippableDbtag(); the over-length reports are also suppressed
    // when m_Imp.IsGI() is true.
531  if (! s_IsSkippableDbtag(dbt)) {
532  if (dbt.IsSetTag() && dbt.GetTag().IsStr()) {
533  size_t idlen = dbt.GetTag().GetStr().length();
534  static const auto maxlen = CSeq_id::kMaxGeneralTagLength;
535  if (longer_general) {
536  if (idlen > 100 && ! m_Imp.IsGI()) {
537  PostErr(sev, eErr_SEQ_INST_BadSeqIdLength, "General identifier longer than " + NStr::NumericToString(100) + " characters", ctx);
538  }
539  } else {
540  if (idlen > maxlen && ! m_Imp.IsGI()) {
541  PostErr(sev, eErr_SEQ_INST_BadSeqIdLength, "General identifier longer than " + NStr::NumericToString(maxlen) + " characters", ctx);
542  }
543  }
544  if (idlen == 0) {
545  PostErr(eDiag_Error, eErr_SEQ_INST_BadSeqIdFormat, "General identifier must not be an empty string", ctx);
546  }
547  }
548  }
    // Character checks: NCBIFILE/BankIt tags (exact-case match here) only
    // reject '|' and ','; all other tags - and then their database name -
    // are checked with the local-ID character rules.
549  if (dbt.IsSetTag() && dbt.GetTag().IsStr()) {
550  const string& acc = dbt.GetTag().GetStr();
551  char badch;
552  if (dbt.IsSetDb() && (NStr::Equal(dbt.GetDb(), "NCBIFILE") || NStr::Equal(dbt.GetDb(), "BankIt"))) {
553  badch = CheckForBadFileIDSeqIdChars(acc);
554  } else {
555  badch = CheckForBadLocalIdChars(acc);
556  if (badch == '\0' && dbt.IsSetDb()) {
557  badch = CheckForBadLocalIdChars(dbt.GetDb());
558  }
559  }
560  if (badch != '\0') {
562  "Bad character '" + string(1, badch) + "' in sequence ID '" + id.AsFastaString() + "'", ctx);
563  }
564  }
565  }
566  break;
567  case CSeq_id::e_Local:
568  if (id.IsLocal() && id.GetLocal().IsStr() && id.GetLocal().GetStr().length() > CSeq_id::kMaxLocalIDLength) {
    // NOTE(review): the else-if branch re-assigns eDiag_Error, the value sev
    // already holds, so the net effect is: Critical unless
    // m_Imp.IsINSDInSep(), otherwise Error.
569  EDiagSev sev = eDiag_Error;
570  if (! m_Imp.IsINSDInSep()) {
571  sev = eDiag_Critical;
572  } else if (! m_Imp.IsIndexerVersion()) {
573  sev = eDiag_Error;
574  }
575  PostErr(sev, eErr_SEQ_INST_BadSeqIdFormat, "Local identifier longer than " + NStr::NumericToString(CSeq_id::kMaxLocalIDLength) + " characters", ctx);
576  }
577  if (id.IsLocal() && id.GetLocal().IsStr()) {
578  const string& acc = id.GetLocal().GetStr();
579  const char badch = CheckForBadLocalIdChars(acc);
580  if (badch != '\0') {
582  "Bad character '" + string(1, badch) + "' in local ID '" + acc + "'", ctx);
583  }
584  }
585  break;
586  case CSeq_id::e_Pdb:
587  if (id.IsPdb()) {
588  const CPDB_seq_id& pdb = id.GetPdb();
    // Only when both the 'chain' and 'chain-id' slots are set are they
    // required to agree; the `break`s below leave the switch early.
589  if (pdb.IsSetChain() && pdb.IsSetChain_id()) {
590  int chain = pdb.GetChain();
591  const string& chain_id = pdb.GetChain_id();
592  if (chain_id.size() == 1 && chain_id[0] == chain) {
593  break; // OK (straightforward match)
594  } else if (islower(chain) && chain_id.size() == 2
595  && chain_id[0] == chain_id[1]
596  && chain_id[0] == toupper(chain)) {
597  break; // OK (historic special case)
598  } else if (chain == '|' && chain_id == "VB") {
599  break; // OK (likewise)
600  } else {
602  "PDB Seq-id contains mismatched \'chain\' and"
603  " \'chain-id\' slots", ctx);
604  }
605  }
606  }
607  break;
608  default:
609  break;
610  }
611 
    // NOTE(review): the disabled code below refers to `i` and `seq`, neither
    // of which is a parameter of this function, so it would need adapting
    // before it could be re-enabled.
612 #if 0
613  // disabled for now
614  if (! IsNCBIFILESeqId(**i)) {
615  string label;
616  (*i)->GetLabel(&label);
617  if (label.length() > 40) {
619  "Sequence ID is unusually long (" +
620  NStr::IntToString(label.length()) + "): " + label,
621  seq);
622  }
623  }
624 #endif
625 
626 }
627 
// Scans the extra (secondary) accession lists of GenBank and EMBL block
// descriptors and returns true as soon as one accession passes the test that
// ends in `(info & CSeq_id::fAcc_master) != 0`; returns false otherwise.
// NOTE(review): the line that opens the descriptor loop (and defines `sd`) and
// the start of the accession test (which defines `info`) are missing from this
// listing - the embedded numbering skips 630 and 649-650.
628 static bool x_IsWgsSecondary(const CBioseq& seq)
629 {
    // Points at the accession list of the current descriptor, if it has one.
631  const list< string > *extra_acc = nullptr;
632  const CSeqdesc& desc = **sd;
    // Only GenBank and EMBL block descriptors carry extra accessions here.
633  switch (desc.Which()) {
634  case CSeqdesc::e_Genbank:
635  if (desc.GetGenbank().IsSetExtra_accessions()) {
636  extra_acc = &(desc.GetGenbank().GetExtra_accessions());
637  }
638  break;
639  case CSeqdesc::e_Embl:
640  if (desc.GetEmbl().IsSetExtra_acc()) {
641  extra_acc = &(desc.GetEmbl().GetExtra_acc());
642  }
643  break;
644  default:
645  break;
646  }
647  if (extra_acc) {
648  FOR_EACH_STRING_IN_LIST (acc, *extra_acc) {
651  && (info & CSeq_id::fAcc_master) != 0) {
652  return true;
653  }
654  }
655  }
656  }
657  return false;
658 }
659 
660 // VR-728
661 // cannot have only seq-ids that will be stripped when loading to ID
663 {
664  bool found_good = false;
665  ITERATE(CBioseq::TId, id_it, seq.GetId()) {
666  if (! IsTemporary(**id_it)) {
667  found_good = true;
668  }
669  }
670  if (! found_good) {
672  "The only ids on this Bioseq will be stripped during ID load", seq);
673  }
674 }
675 
676 
678 {
679  // Ensure that CBioseq has at least one CSeq_id
680  if (! seq.IsSetId() || seq.GetId().empty()) {
682  "No ids on a Bioseq", seq);
683  return;
684  }
685 
686  CSeq_inst::ERepr repr = seq.GetInst().GetRepr();
687 
688  // Loop thru CSeq_ids for this CBioseq. Determine if seq has
689  // gi, NG, or NC. Check that the same CSeq_id not included more
690  // than once.
691  bool has_gi = false;
692  bool is_lrg = false;
693  bool has_ng = false;
694  bool wgs_tech_needs_wgs_accession = false;
695  bool is_segset_accession = false;
696  bool has_wgs_general = false;
697  bool is_eb_db = false;
698  bool longer_general = false;
699 
700  FOR_EACH_SEQID_ON_BIOSEQ (i, seq) {
701  if ((*i)->IsOther() || (*i)->IsEmbl() || (*i)->IsTpe()) {
702  longer_general = true;
703  }
704  }
705 
706  FOR_EACH_SEQID_ON_BIOSEQ (i, seq) {
707  // first, do standalone validation
708  ValidateSeqId(**i, seq, longer_general);
709 
710  if ((*i)->IsGeneral() && (*i)->GetGeneral().IsSetDb()) {
711  if (NStr::EqualNocase((*i)->GetGeneral().GetDb(), "LRG")) {
712  is_lrg = true;
713  }
714  if (NStr::StartsWith((*i)->GetGeneral().GetDb(), "WGS:")) {
715  has_wgs_general = true;
716  }
717  } else if ((*i)->IsOther() && (*i)->GetOther().IsSetAccession()) {
718  const string& acc = (*i)->GetOther().GetAccession();
719  if (NStr::StartsWith(acc, "NG_")) {
720  has_ng = true;
721  wgs_tech_needs_wgs_accession = true;
722  } else if (NStr::StartsWith(acc, "NM_")
723  || NStr::StartsWith(acc, "NP_")
724  || NStr::StartsWith(acc, "NR_")) {
725  wgs_tech_needs_wgs_accession = true;
726  }
727  } else if ((*i)->IsEmbl() && (*i)->GetEmbl().IsSetAccession()) {
728  is_eb_db = true;
729  } else if ((*i)->IsDdbj() && (*i)->GetDdbj().IsSetAccession()) {
730  is_eb_db = true;
731  }
732 
733  // Check that no two CSeq_ids for same CBioseq are same type
734  CBioseq::TId::const_iterator j;
735  for (j = i, ++j; j != seq.GetId().end(); ++j) {
736  if ((**i).Compare(**j) != CSeq_id::e_DIFF) {
737  CNcbiOstrstream os;
738  os << "Conflicting ids on a Bioseq: (";
739  (**i).WriteAsFasta(os);
740  os << " - ";
741  (**j).WriteAsFasta(os);
742  os << ")";
744  CNcbiOstrstreamToString (os) /* os.str() */, seq);
745  }
746  }
747 
748  if ((*i)->IsGenbank() || (*i)->IsEmbl() || (*i)->IsDdbj()) {
749  wgs_tech_needs_wgs_accession = true;
750  }
751 
752  if ((*i)->IsGi()) {
753  has_gi = true;
754  }
755 
756  if ((*i)->IdentifyAccession() == CSeq_id::eAcc_segset) {
757  is_segset_accession = true;
758  }
759 
760  }
761  if (is_lrg && ! has_ng) {
763  "LRG sequence needs NG_ accession", seq);
764  }
765 
766 
767  // Loop thru CSeq_ids to check formatting
768  bool is_wgs = false;
769  unsigned int gi_count = 0;
770  unsigned int accn_count = 0;
771  unsigned int lcl_count = 0;
772  FOR_EACH_SEQID_ON_BIOSEQ (k, seq) {
773  const CTextseq_id* tsid = (*k)->GetTextseq_Id();
774  switch ((**k).Which()) {
775  case CSeq_id::e_Local:
776  lcl_count++;
777  break;
778  case CSeq_id::e_Tpg:
779  case CSeq_id::e_Tpe:
780  case CSeq_id::e_Tpd:
781  case CSeq_id::e_Genbank:
782  case CSeq_id::e_Embl:
783  case CSeq_id::e_Ddbj:
784  if (tsid && tsid->IsSetAccession()) {
785  if ((*k)->IsGenbank() || (*k)->IsEmbl() || (*k)->IsDdbj()) {
786  is_wgs |= IsWGSAccession(**k);
787  }
788 
789  if (has_gi) {
790  if (tsid->IsSetVersion() && tsid->GetVersion() == 0) {
791  const string& acc = tsid->GetAccession();
793  "Accession " + acc + " has 0 version", seq);
794  }
795  }
796  }
797  // Fall thru
799  case CSeq_id::e_Other:
800  if (tsid) {
801  if (has_gi && ! tsid->IsSetAccession() && tsid->IsSetName()) {
802  if ((*k)->IsDdbj() && repr == CSeq_inst::eRepr_seg) {
803  // Don't report ddbj segmented sequence missing accessions
804  } else {
806  "Missing accession for " + tsid->GetName(), seq);
807  }
808  }
809  accn_count++;
810  }
811  break;
812 
813  case CSeq_id::e_Pir:
815  case CSeq_id::e_Prf:
816  if (tsid) {
817  if ((! tsid->IsSetAccession() || NStr::IsBlank(tsid->GetAccession())) &&
818  (! tsid->IsSetName() || NStr::IsBlank(tsid->GetName())) &&
819  seq.GetInst().IsAa()) {
820  string label = (*k)->AsFastaString();
822  "Missing identifier for " + label, seq);
823  }
824  accn_count++;
825  }
826  break;
827  case CSeq_id::e_Gi:
828  gi_count++;
829  break;
830  default:
831  break;
832  }
833  }
834 
836  if (! SeqIsPatent(seq) && ! seq.IsAa()) {
837  if (is_wgs) {
838  if (! mi || ! mi->IsSetTech() ||
839  (mi->GetTech() != CMolInfo::eTech_wgs &&
840  mi->GetTech() != CMolInfo::eTech_tsa &&
841  mi->GetTech() != CMolInfo::eTech_targeted)) {
843  "WGS accession should have Mol-info.tech of wgs", seq);
844  }
845  } else if (mi && mi->IsSetTech() &&
846  mi->GetTech() == CMolInfo::eTech_wgs &&
847  wgs_tech_needs_wgs_accession &&
848  ! is_segset_accession &&
849  ! has_wgs_general &&
850  ! x_IsWgsSecondary(seq)) {
851  EDiagSev sev = eDiag_Error;
852  if (is_eb_db) {
853  sev = eDiag_Warning;
854  }
855  if (! is_eb_db) {
857  "Mol-info.tech of wgs should have WGS accession", seq);
858  }
859  }
860 
861  if ((IsNTNCNWACAccession(seq) || IsNG(seq)) && mi && seq.IsNa()
862  && (! mi->IsSetBiomol()
863  || (mi->GetBiomol() != CMolInfo::eBiomol_genomic
864  && mi->GetBiomol() != CMolInfo::eBiomol_cRNA))) {
866  "genomic RefSeq accession should use genomic or cRNA moltype",
867  seq);
868  }
869  }
870  if (seq.GetInst().GetMol() == CSeq_inst::eMol_dna) {
871  if (mi && mi->IsSetBiomol()) {
872  switch (mi->GetBiomol()) {
885  "Molecule type (DNA) does not match biomol (RNA)", seq);
886  break;
887  default:
888  break;
889  }
890  }
891  }
892 
893  // Check that a sequence with a gi number has exactly one accession
894  if (gi_count > 0 && accn_count == 0 && ! m_Imp.IsPDB() &&
895  repr != CSeq_inst::eRepr_virtual) {
897  "No accession on sequence with gi number", seq);
898  }
899  if (gi_count > 0 && accn_count > 1) {
901  "Multiple accessions on sequence with gi number", seq);
902  }
903 
904  x_CheckGeneralIDs(seq);
905 
906  if (m_Imp.IsValidateIdSet()) {
908  }
909 
910  // C toolkit ensures that there is exactly one CBioseq for a CSeq_id
911  // Not done here because object manager will not allow
912  // the same Seq-id on multiple Bioseqs
913 
914 }
915 
916 
918 {
919  bool rval = false;
920  const CSeq_inst& inst = seq.GetInst();
921  if (inst.IsSetHist() && inst.GetHist().IsSetAssembly()) {
922  return false;
923  }
924  CSeq_inst::TRepr repr = inst.CanGetRepr() ?
926 
927  if (seq.IsNa() && repr != CSeq_inst::eRepr_seg) {
928  rval = true;
929  // look for keyword
931  CSeqdesc_CI genbank_i(bsh, CSeqdesc::e_Genbank);
932  if (genbank_i && genbank_i->GetGenbank().IsSetKeywords()) {
933  CGB_block::TKeywords::const_iterator keyword = genbank_i->GetGenbank().GetKeywords().begin();
934  while (keyword != genbank_i->GetGenbank().GetKeywords().end() && rval) {
935  if (NStr::EqualNocase(*keyword, "TPA:reassembly")) {
936  rval = false;
937  }
938  ++keyword;
939  }
940  }
941  if (rval) {
942  CSeqdesc_CI embl_i(bsh, CSeqdesc::e_Embl);
943  if (embl_i && embl_i->GetEmbl().IsSetKeywords()) {
944  CEMBL_block::TKeywords::const_iterator keyword = embl_i->GetEmbl().GetKeywords().begin();
945  while (keyword != embl_i->GetEmbl().GetKeywords().end() && rval) {
946  if (NStr::EqualNocase(*keyword, "TPA:reassembly")) {
947  rval = false;
948  }
949  ++keyword;
950  }
951  }
952  }
953  }
954  return rval;
955 }
956 
957 
959  const string& primary_acc,
960  const CBioseq& seq,
961  int choice)
962 {
963  CSeqdesc_CI sd(m_Scope->GetBioseqHandle(seq), static_cast<CSeqdesc::E_Choice>(choice));
964  for (; sd; ++sd) {
965  const list<string>* extra_acc = nullptr;
966  if (choice == CSeqdesc::e_Genbank &&
968  extra_acc = &(sd->GetGenbank().GetExtra_accessions());
969  } else if (choice == CSeqdesc::e_Embl &&
970  sd->GetEmbl().IsSetExtra_acc()) {
971  extra_acc = &(sd->GetEmbl().GetExtra_acc());
972  }
973 
974  if (extra_acc) {
975  FOR_EACH_STRING_IN_LIST (acc, *extra_acc) {
976  if (NStr::CompareNocase(primary_acc, *acc) == 0) {
977  // If the same post error
980  primary_acc + " used for both primary and"
981  " secondary accession", seq);
982  }
983  }
984  }
985  }
986 }
987 
988 
990 {
991  for (CSeqdesc_CI it(bsh, CSeqdesc::e_User); it; ++it) {
992  if (it->GetUser().GetObjectType() == CUser_object::eObjectType_Unverified) {
993  return true;
994  }
995  }
996  return false;
997 }
998 
999 
1001 {
1005 
1006  bool has_barcode_tech = false;
1007 
1009  if (di && di->GetMolinfo().IsSetTech() && di->GetMolinfo().GetTech() == CMolInfo::eTech_barcode) {
1010  has_barcode_tech = true;
1011  }
1012 
1013  bool has_barcode_keyword = false;
1014  for (CSeqdesc_CI it(bsh, CSeqdesc::e_Genbank); it; ++it) {
1015  FOR_EACH_KEYWORD_ON_GENBANKBLOCK (k, it->GetGenbank()) {
1016  if (NStr::EqualNocase(*k, "BARCODE")) {
1017  has_barcode_keyword = true;
1018  break;
1019  }
1020  }
1021  if (has_barcode_keyword && ! has_barcode_tech) {
1023  "BARCODE keyword without Molinfo.tech barcode",
1024  *ctx, *it);
1025  }
1026  }
1027  if (has_barcode_tech && ! has_barcode_keyword && di) {
1029  "Molinfo.tech barcode without BARCODE keyword",
1030  *ctx, *di);
1031  }
1032  if (has_barcode_keyword && HasUnverified(bsh)) {
1034  "Sequence has both BARCODE and UNVERIFIED keywords",
1035  seq);
1036  }
1037 }
1038 
1039 
1041 {
1042  const CSeq_inst& inst = seq.GetInst();
1043 
1044  // Check representation
1045  if (! ValidateRepr(inst, seq)) {
1046  return;
1047  }
1048 
1049  // Check molecule, topology, and strand
1050  if (! inst.IsSetMol()) {
1051  PostErr(eDiag_Error, eErr_SEQ_INST_MolNotSet, "Bioseq.mol is 0",
1052  seq);
1053  } else {
1054  const CSeq_inst::EMol& mol = inst.GetMol();
1055  switch (mol) {
1056 
1057  case CSeq_inst::eMol_na:
1059  "Bioseq.mol is type nucleic acid", seq);
1060  break;
1061 
1062  case CSeq_inst::eMol_aa:
1063  if (inst.IsSetTopology() &&
1067  "Non-linear topology set on protein", seq);
1068  }
1069  if (inst.IsSetStrand() &&
1070  inst.GetStrand() != CSeq_inst::eStrand_ss &&
1073  "Protein not single stranded", seq);
1074  }
1075  break;
1076 
1077  case CSeq_inst::eMol_dna:
1078  if (seq.IsSetInst() && seq.GetInst().IsSetTopology() && seq.GetInst().GetTopology() == CSeq_inst::eTopology_circular) {
1079  if (m_is_bact_or_arch) {
1080  if (! m_is_plasmid && ! m_is_chromosome && ! m_is_extrachrom) {
1081  EDiagSev sev = eDiag_Error;
1082  if (IsRefSeq(seq) || m_Imp.IsRefSeqConventions()) {
1083  sev = eDiag_Error;
1084  } else if (IsEmblOrDdbj(seq)) {
1085  sev = eDiag_Warning;
1086  }
1088  "Circular Bacteria or Archaea should be chromosome, or plasmid, or extrachromosomal", seq);
1089  }
1090  }
1091  }
1092  break;
1093 
1095  PostErr(eDiag_Error, eErr_SEQ_INST_MolNotSet, "Bioseq.mol is 0",
1096  seq);
1097  break;
1098 
1099  case CSeq_inst::eMol_other:
1101  "Bioseq.mol is type other", seq);
1102  break;
1103 
1104  default:
1105  break;
1106  }
1107  }
1108 
1109  CSeq_inst::ERepr rp = seq.GetInst().GetRepr();
1110 
1111  if (rp == CSeq_inst::eRepr_raw || rp == CSeq_inst::eRepr_const) {
1112  // Validate raw and constructed sequences
1113  ValidateRawConst(seq);
1114  }
1115 
1116  // per VR-779
1117 #if 1
1118  if (rp == CSeq_inst::eRepr_seg) {
1119  PostErr(eDiag_Critical, eErr_SEQ_INST_ReprInvalid, "Segmented set format is not supported", seq);
1120  } else if (rp == CSeq_inst::eRepr_ref) {
1121  PostErr(eDiag_Critical, eErr_SEQ_INST_ReprInvalid, "Repr_ref format is not supported", seq);
1122  }
1123 #else
1124  if (rp == CSeq_inst::eRepr_seg || rp == CSeq_inst::eRepr_ref) {
1125  // Validate segmented and reference sequences
1126  ValidateSegRef(seq);
1127  }
1128 #endif
1129 
1130  if (rp == CSeq_inst::eRepr_delta) {
1131  // Validate delta sequences
1132  ValidateDelta(seq);
1133  }
1134 
1135  if (rp == CSeq_inst::eRepr_seg && seq.GetInst().IsSetExt() &&
1136  seq.GetInst().GetExt().IsSeg()) {
1137  // Validate part of segmented sequence
1138  ValidateSeqParts(seq);
1139  }
1140 
1141  if (rp == CSeq_inst::eRepr_raw || rp == CSeq_inst::eRepr_delta) {
1142  x_ValidateBarcode(seq);
1143  }
1144 
1145  x_ValidateTitle(seq);
1146  /*if (seq.IsAa()) {
1147  Validate protein title(amino acids only)
1148  ValidateProteinTitle(seq);
1149  }*/
1150 
1151  if (seq.IsNa()) {
1152  // check for N bases at start or stop of sequence,
1153  // or sequence entirely made of Ns
1154  ValidateNsAndGaps(seq);
1155 
1156  GapByGapInst(seq);
1157  }
1158 
1159  // Validate sequence length
1160  ValidateSeqLen(seq);
1161 
1162  // proteins should not have gaps
1163  if (seq.IsAa() && x_HasGap(seq)) {
1164  PostErr(eDiag_Error, eErr_SEQ_INST_ProteinShouldNotHaveGaps, "Protein sequences should not have gaps", seq);
1165  }
1166 }
1167 
1168 
1170 {
1171  bool is_wgs = false;
1172  bool is_grc = false;
1173 
1175  CSeqdesc_CI user(bsh, CSeqdesc::e_User);
1176  while (user) {
1178  user->GetUser().HasField("BioProject", ".", NStr::eNocase)) {
1179  // bioproject field found
1180  return false;
1181  }
1182  ++user;
1183  }
1184 
1185  CSeqdesc_CI ti(bsh, CSeqdesc::e_Title);
1186  if (ti) {
1187  while (ti) {
1188  if (NStr::StartsWith(ti->GetTitle(), "GRC")) {
1189  is_grc = true;
1190  break;
1191  }
1192  ++ti;
1193  }
1194  } else {
1195  sequence::CDeflineGenerator defline_generator;
1196  string title = defline_generator.GenerateDefline(seq, *m_Scope, sequence::CDeflineGenerator::fIgnoreExisting);
1197  if (! NStr::IsBlank(title)) {
1198  if (NStr::StartsWith(title, "GRC")) {
1199  is_grc = true;
1200  }
1201  }
1202  }
1203 
1204  is_wgs = IsWGS(bsh);
1205 
1206  bool is_gb = false, /* is_eb_db = false, */ is_refseq = false, is_ng = false;
1207 
1208  FOR_EACH_SEQID_ON_BIOSEQ (sid_itr, seq) {
1209  const CSeq_id& sid = **sid_itr;
1210  switch (sid.Which()) {
1211  case CSeq_id::e_Genbank:
1212  case CSeq_id::e_Embl:
1213  // is_eb_db = true;
1214  // fall through
1215  case CSeq_id::e_Ddbj:
1216  is_gb = true;
1217  break;
1218  case CSeq_id::e_Other: {
1219  is_refseq = true;
1220  if (sid.GetOther().IsSetAccession()) {
1221  string acc = sid.GetOther().GetAccession().substr(0, 3);
1222  if (acc == "NG_") {
1223  is_ng = true;
1224  }
1225  }
1226  } break;
1227  default:
1228  break;
1229  }
1230  }
1231 
1232  if (is_refseq || m_Imp.IsRefSeqConventions()) {
1233  if (is_ng)
1234  return false;
1235  } else if (is_gb) {
1236  if (! is_wgs && ! is_grc)
1237  return false;
1238  } else {
1239  return false;
1240  }
1241 
1242  const CSeq_inst& inst = seq.GetInst();
1243  CSeq_inst::TRepr repr = inst.GetRepr();
1244 
1245  if (repr == CSeq_inst::eRepr_delta) {
1246  if (x_IsDeltaLitOnly(inst))
1247  return false;
1248  } else if (repr != CSeq_inst::eRepr_map) {
1249  return false;
1250  }
1251 
1252  return true;
1253 }
1254 
1256 {
1258 
1259  // Check that proteins in nuc_prot set have a CdRegion
1260  if (CdError(bsh)) {
1261  EDiagSev sev = eDiag_Error;
1263  if (bssh) {
1264  CBioseq_Handle nbsh = GetNucBioseq (bssh);
1265  if (nbsh) {
1266  CSeqdesc_CI desc( nbsh, CSeqdesc::e_Molinfo );
1267  const CMolInfo* mi = desc ? &(desc->GetMolinfo()) : nullptr;
1268  if (mi) {
1269  CMolInfo::TTech tech = mi->IsSetTech() ?
1271  if (tech == CMolInfo::eTech_wgs) {
1272  sev = eDiag_Critical;
1273  }
1274  }
1275  }
1276  }
1278  "No CdRegion in nuc-prot set points to this protein",
1279  seq);
1280  }
1281 
1282  bool is_patent = SeqIsPatent (seq);
1283 
1284  bool is_complete = false;
1285  CSeqdesc_CI desc(bsh, CSeqdesc::e_Molinfo);
1286  if (desc) {
1287  const CMolInfo& mi = desc->GetMolinfo();
1289  is_complete = true;
1290  }
1291  }
1292 
1293  try {
1294 
1295  // if there are no Seq-ids, the following tests can't be run
1296  if (seq.IsSetId()) {
1297 
1298  ValidateSeqFeatContext(seq, is_complete);
1299 
1300  // Check for duplicate features and overlapping peptide features.
1302 
1303  // Check for introns within introns.
1304  ValidateTwintrons(seq);
1305 
1306  // check for tRNA contained in tmRNA features
1308 
1309  // check for equivalent source features
1311 
1312  // check for equivalent pub features
1313  x_ValidatePubFeatures(bsh);
1314 
1315  // Check for colliding genes
1317 
1318  // Detect absence of BioProject DBLink for complete bacterial genomes
1320  }
1321 
1322  m_dblink_count = 0;
1323  m_taa_count = 0;
1324  m_bs_count = 0;
1325  m_as_count = 0;
1326  m_pdb_count = 0;
1327  m_sra_count = 0;
1328  m_bp_count = 0;
1329  m_unknown_count = 0;
1330 
1331  // Validate descriptors that affect this bioseq
1333 
1334 
1335  if (m_dblink_count > 1) {
1337  NStr::IntToString(m_dblink_count) + " DBLink user objects apply to a Bioseq", seq);
1338  }
1339 
1340  if (m_taa_count > 1) {
1342  "Trace Assembly Archive entries appear in " + NStr::IntToString(m_taa_count) + " DBLink user objects", seq);
1343  }
1344 
1345  if (m_bs_count > 1) {
1347  "BioSample entries appear in " + NStr::IntToString(m_bs_count) + " DBLink user objects", seq);
1348  }
1349 
1350  if (m_as_count > 1) {
1352  "Assembly entries appear in " + NStr::IntToString(m_as_count) + " DBLink user objects", seq);
1353  }
1354 
1355  if (m_pdb_count > 1) {
1357  "ProbeDB entries appear in " + NStr::IntToString(m_pdb_count) + " DBLink user objects", seq);
1358  }
1359 
1360  if (m_sra_count > 1) {
1362  "Sequence Read Archive entries appear in " + NStr::IntToString(m_sra_count) + " DBLink user objects", seq);
1363  }
1364 
1365  if (m_bp_count > 1) {
1367  "BioProject entries appear in " + NStr::IntToString(m_bp_count) + " DBLink user objects", seq);
1368  }
1369 
1370  if (m_unknown_count > 1) {
1372  "Unrecognized entries appear in " + NStr::IntToString(m_unknown_count) + " DBLink user objects", seq);
1373  } else if (m_unknown_count > 0) {
1375  "Unrecognized entries appear in " + NStr::IntToString(m_unknown_count) + " DBLink user object", seq);
1376  }
1377 
1378  // make sure that there is a pub on this bioseq
1380  CheckForPubOnBioseq(seq);
1381  }
1382  // make sure that there is a source on this bioseq
1384  CheckSourceDescriptor(bsh);
1385  // CheckForBiosourceOnBioseq(seq);
1386  }
1387 
1388  if (x_ShowBioProjectWarning(seq)) {
1390  "BioProject entries not present on CON record", seq);
1391  }
1392 
1393  } catch (const exception& e) {
1394  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1396  string("Exception while validating BioseqContext. EXCEPTION: ") +
1397  e.what(), seq);
1398  }
1399  }
1400 
1401  if (! is_patent) {
1402  // flag missing molinfo even if not in Sequin
1404  }
1405 
1406  CValidError_graph graph_validator(m_Imp);
1407  graph_validator.ValidateGraphsOnBioseq(seq);
1408 
1409  CheckTpaHistory(seq);
1410 
1411  // check for multiple publications with identical identifiers
1413 
1414  // look for orphaned proteins
1415  if (seq.IsAa() && bsh && ! GetNucProtSetParent(bsh) && ! AllowOrphanedProtein(seq, m_Imp.IsRefSeqConventions())) {
1417  "Orphaned stand-alone protein", seq);
1418  }
1419 
1420  // look for extra protein features
1421  if (seq.IsAa()) {
1422  CCacheImpl::SFeatKey prot_key(
1424  const CCacheImpl::TFeatValue& prot_feats =
1425  GetCache().GetFeatFromCache(prot_key);
1426 
1427  if (prot_feats.size() > 1) {
1428  ITERATE(CCacheImpl::TFeatValue, feat, prot_feats) {
1430  "Protein sequence has multiple unprocessed protein features",
1431  feat->GetOriginalFeature());
1432  }
1433  }
1434  }
1435 
1436  if (! m_Imp.IsNoCitSubPubs() && ! x_HasCitSub(bsh) && ! m_Imp.IsSeqSubmitParent()) {
1438  "Expected submission citation is missing for this Bioseq", seq);
1439  }
1440 
1441  // RW-1053 check sig_peptides and mat_peptides with instantiated products
1442  if (seq.IsAa()) {
1443 
1447  try {
1448  for (CFeat_CI feat_ci(bsh, sel); feat_ci; ++feat_ci) {
1449 
1450  const CSeq_feat& matpeptide = feat_ci->GetOriginalFeature();
1451  if (matpeptide.IsSetProduct()) {
1452  const CSeq_loc& loc = matpeptide.GetLocation();
1453  const CSeq_loc& prd = matpeptide.GetProduct();
1454 
1455  TSeqPos matlen = GetLength(loc, m_Scope);
1456  TSeqPos prdlen = GetLength(prd, m_Scope);
1457  if (matlen != prdlen) {
1459  "Mat_peptide does not match length of instantiated product",
1460  matpeptide);
1461  }
1462 
1465 
1466  TSeqPos len = matlen;
1467  if (len > prdlen) {
1468  len = prdlen;
1469  }
1470 
1471  for (TSeqPos i = 0; i < len; ++i) {
1472  CSeqVectorTypes::TResidue m_res = mat_vec[i];
1473  CSeqVectorTypes::TResidue p_res = prd_vec[i];
1474 
1475  if (m_res != p_res) {
1477  "Mismatch in mat_peptide (" + string(1, (char)m_res) + ") and instantiated product (" + \
1478  string(1, (char)p_res) + ") at position " + NStr::NumericToString(i + 1),
1479  matpeptide);
1480  }
1481  }
1482  }
1483  }
1484  } catch (CException&) {
1485  }
1486  }
1487 }
1488 
1489 
1491 {
1492  ITERATE(CPub_equiv::Tdata, it, pub.Get()) {
1493  if (x_HasCitSub(**it)) {
1494  return true;
1495  }
1496  }
1497  return false;
1498 }
1499 
1500 
1502 {
1503  if (pub.IsSub()) {
1504  return true;
1505  } else if (pub.IsEquiv() && x_HasCitSub(pub.GetEquiv())) {
1506  return true;
1507  } else {
1508  return false;
1509  }
1510 }
1511 
1512 
1514 {
1515  bool has_cit_sub = false;
1516  CSeqdesc_CI p(bsh, CSeqdesc::e_Pub);
1517  while (p && !has_cit_sub) {
1518  if (p->GetPub().IsSetPub()) {
1519  has_cit_sub = x_HasCitSub(p->GetPub().GetPub());
1520  }
1521  ++p;
1522  }
1523 
1524  return has_cit_sub;
1525 }
1526 
1527 
// Walk two iterator ranges in lock-step and compare them element by element.
//
// Returns true only when pred(a, b) holds for every corresponding pair AND
// both ranges run out at the same time (i.e. they have equal length).
// Two empty ranges therefore match.
template <class Iterator, class Predicate>
bool lists_match(Iterator iter1, Iterator iter1_stop, Iterator iter2, Iterator iter2_stop, Predicate pred)
{
    for (; iter1 != iter1_stop && iter2 != iter2_stop; ++iter1, ++iter2) {
        if (! pred(*iter1, *iter2)) {
            // first mismatching pair decides the result
            return false;
        }
    }

    // Anything left over on either side means the lengths differ.
    const bool leftovers = (iter1 != iter1_stop) || (iter2 != iter2_stop);
    return ! leftovers;
}
1544 
1545 
1546 static bool s_OrgModEqual(
1547  const CRef<COrgMod>& om1,
1548  const CRef<COrgMod>& om2)
1549 {
1550  const COrgMod& omd1 = *(om1);
1551  const COrgMod& omd2 = *(om2);
1552 
1553  const string& str1 = omd1.GetSubname();
1554  const string& str2 = omd2.GetSubname();
1555 
1556  if (NStr::CompareNocase (str1, str2) != 0) return false;
1557 
1558  TORGMOD_SUBTYPE chs1 = omd1.GetSubtype();
1559  TORGMOD_SUBTYPE chs2 = omd2.GetSubtype();
1560 
1561  if (chs1 == chs2) return true;
1562  if (chs2 == NCBI_ORGMOD(other)) return true;
1563 
1564  return false;
1565 }
1566 
1567 
1568 bool s_DbtagEqual(const CRef<CDbtag>& dbt1, const CRef<CDbtag>& dbt2)
1569 {
1570  // is dbt1 == dbt2
1571  return dbt1->Compare(*dbt2) == 0;
1572 }
1573 
1574 
1575 // Two OrgRefs are identical if the taxnames are identical, the dbxrefs are identical,
1576 // and the orgname orgmod lists are identical
1577 static bool s_OrgrefEquivalent(const COrg_ref& org1, const COrg_ref& org2)
1578 {
1579  if ((org1.IsSetTaxname() && ! org2.IsSetTaxname())
1580  || (! org1.IsSetTaxname() && org2.IsSetTaxname())
1581  || (org1.IsSetTaxname() && org2.IsSetTaxname()
1582  && ! NStr::EqualNocase(org1.GetTaxname(), org2.GetTaxname()))) {
1583  return false;
1584  }
1585 
1586  if ((org1.IsSetDb() && ! org2.IsSetDb())
1587  || (! org1.IsSetDb() && org2.IsSetDb())
1588  || (org1.IsSetDb() && org2.IsSetDb()
1589  && !lists_match (org1.GetDb().begin(), org1.GetDb().end(),
1590  org2.GetDb().begin(), org2.GetDb().end(),
1591  s_DbtagEqual))) {
1592  return false;
1593  }
1594 
1595  if ((org1.IsSetOrgname() && ! org2.IsSetOrgname())
1596  || (! org1.IsSetOrgname() && org2.IsSetOrgname())) {
1597  return false;
1598  }
1599  if (org1.IsSetOrgname() && org2.IsSetOrgname()) {
1600  const COrgName& on1 = org1.GetOrgname();
1601  const COrgName& on2 = org2.GetOrgname();
1602  if ((on1.IsSetMod() && ! on2.IsSetMod())
1603  || (! on1.IsSetMod() && on2.IsSetMod())
1604  || (on1.IsSetMod() && on2.IsSetMod()
1605  && !lists_match (on1.GetMod().begin(), on1.GetMod().end(),
1606  on2.GetMod().begin(), on2.GetMod().end(),
1607  s_OrgModEqual))) {
1608  return false;
1609  }
1610  }
1611 
1612  return true;
1613 }
1614 
1615 
1616 // Two SubSources are equal and duplicates if:
1617 // they have the same subtype
1618 // and the same name (or don't require a name).
1619 
1621  const CRef<CSubSource>& st1,
1622  const CRef<CSubSource>& st2)
1623 {
1624  const CSubSource& sbs1 = *(st1);
1625  const CSubSource& sbs2 = *(st2);
1626 
1627  TSUBSOURCE_SUBTYPE chs1 = sbs1.GetSubtype();
1628  TSUBSOURCE_SUBTYPE chs2 = sbs2.GetSubtype();
1629 
1630  if (chs1 != chs2)
1631  return false;
1632  if (CSubSource::NeedsNoText(chs2))
1633  return true;
1634 
1635  if (sbs1.IsSetName() && sbs2.IsSetName()) {
1636  if (NStr::CompareNocase(sbs1.GetName(), sbs2.GetName()) == 0)
1637  return true;
1638  }
1639  if (! sbs1.IsSetName() && ! sbs2.IsSetName())
1640  return true;
1641 
1642  return false;
1643 }
1644 
1645 
// Returns true when the given BioSource has its is_focus field set, or when
// one of the iterated entries has subtype "transgenic"; returns false
// otherwise. The caller uses a false result to report a full-length source
// feature that should have been a descriptor.
1646 static bool s_BiosrcFullLengthIsOk(const CBioSource& src)
1647 {
1648  if (src.IsSetIs_focus()) {
1649  return true;
1650  }
// NOTE(review): the loop header that declares `it` (listing line 1651) is
// missing from this listing; presumably it iterates the subsources of
// `src` -- confirm against the repository copy.
1652  if ((*it)->IsSetSubtype() && (*it)->GetSubtype() == CSubSource::eSubtype_transgenic) {
1653  return true;
1654  }
1655  }
// neither condition held
1656  return false;
1657 }
1658 
1659 
1661 {
1662  if (! src.IsSetOrg() || ! src.GetOrg().IsSetTaxname()) {
1663  return false;
1664  }
1665  if (NStr::EqualNocase(src.GetOrg().GetTaxname(), "unidentified phage")) {
1666  return true;
1667  }
1668  if (src.GetOrg().IsSetOrgname() && src.GetOrg().GetOrgname().IsSetLineage()
1669  && NStr::StartsWith(src.GetOrg().GetOrgname().GetLineage(), "Viruses", NStr::eNocase)) {
1670  return true;
1671  }
1672 #if 0
1673  if (! src.GetOrg().IsSetOrgname()) {
1674  printf ("Orgname not set!\n");
1675  } else if (! src.GetOrg().GetOrgname().IsSetLineage()) {
1676  printf ("Lineage not set!\n");
1677  } else {
1678  printf ("Lineage is %s!\n", src.GetOrg().GetOrgname().GetLineage().c_str());
1679  }
1680 #endif
1681  return false;
1682 }
1683 
1684 
1685 bool s_OverlapOrAbut(const CSeq_loc& loc1, const CSeq_loc& loc2, CScope* scope)
1686 {
1687  TSeqPos start1 = loc1.GetStart(eExtreme_Positional);
1688  TSeqPos stop1 = loc1.GetStop(eExtreme_Positional);
1689  TSeqPos start2 = loc2.GetStart(eExtreme_Positional);
1690  TSeqPos stop2 = loc2.GetStop(eExtreme_Positional);
1691 
1692  if (start1 == stop2 + 1 || start2 == stop1 + 1) {
1693  // abut
1694  return true;
1695  } else if (TestForOverlapEx(loc1, loc2, eOverlap_Simple, scope) >= 0) {
1696  return true;
1697  } else {
1698  return false;
1699  }
1700 }
1701 
1702 
1703 bool s_ContainedIn(const CSeq_loc& loc1, const CSeq_loc& loc2, CScope* scope)
1704 {
1705  TSeqPos start1 = loc1.GetStart(eExtreme_Positional);
1706  TSeqPos stop1 = loc1.GetStop(eExtreme_Positional);
1707  TSeqPos start2 = loc2.GetStart(eExtreme_Positional);
1708  TSeqPos stop2 = loc2.GetStop(eExtreme_Positional);
1709 
1710  if (start1 == stop2 + 1 || start2 == stop1 + 1) {
1711  // abut
1712  return false;
1713  } else if (TestForOverlapEx(loc1, loc2, eOverlap_Contained, scope) >= 0) {
1714  return true;
1715  } else {
1716  return false;
1717  }
1718 }
1719 
1720 
1721 bool s_CheckIntervals(const CSeq_loc& loc1, const CSeq_loc& loc2, CScope* scope)
1722 {
1723  TSeqPos start1 = loc1.GetStart(eExtreme_Positional);
1724  TSeqPos stop1 = loc1.GetStop(eExtreme_Positional);
1725  TSeqPos start2 = loc2.GetStart(eExtreme_Positional);
1726  TSeqPos stop2 = loc2.GetStop(eExtreme_Positional);
1727 
1728  if (start1 == stop2 + 1 || start2 == stop1 + 1) {
1729  // abut
1730  return false;
1731  } else if (TestForOverlapEx(loc1, loc2, eOverlap_CheckIntervals, scope) >= 0) {
1732  return true;
1733  } else {
1734  return false;
1735  }
1736 }
1737 
1738 
1740 {
1741  // don't bother if can't build all feature iterator
1742  if (! m_AllFeatIt) {
1743  return;
1744  }
1745  try {
1746  CCacheImpl::SFeatKey rna_key(
1748  const CCacheImpl::TFeatValue & rnas = GetCache().GetFeatFromCache(rna_key);
1749  CCacheImpl::TFeatValue::const_iterator feat = rnas.begin();
1750  if (feat != rnas.end()) {
1751 
1752  CCacheImpl::TFeatValue::const_iterator feat_prev = feat;
1753  ++feat;
1754  for (; feat != rnas.end(); ++feat_prev, ++feat) {
1755 
1756  if (! s_OverlapOrAbut(feat_prev->GetLocation(),
1757  feat->GetLocation(), m_Scope)) {
1758  continue;
1759  }
1760 
1761  const CRNA_ref& tm = feat_prev->GetData().GetRna();
1762  const CRNA_ref& tr = feat->GetData().GetRna();
1763  if (tm.IsSetType() && tm.GetType() == CRNA_ref::eType_tmRNA) {
1764  if (tr.IsSetType() && tr.GetType() == CRNA_ref::eType_tRNA) {
1765  if (s_ContainedIn(feat_prev->GetLocation(),
1766  feat->GetLocation(), m_Scope)) {
1768  "tRNA contained within tmRNA",
1769  feat->GetOriginalFeature());
1770  }
1771  }
1772  }
1773  }
1774  }
1775  } catch (const exception& e) {
1776  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1778  string("Exception while validating RNA features. EXCEPTION: ") +
1779  e.what(), *(bsh.GetCompleteBioseq()));
1780  }
1781  }
1782 }
1783 
1784 
1786 {
1787  // don't bother if can't build all feature iterator
1788  if (! m_AllFeatIt) {
1789  return;
1790  }
1791  try {
1792  CCacheImpl::SFeatKey biosrc_key(
1794  const CCacheImpl::TFeatValue & biosrcs = GetCache().GetFeatFromCache(biosrc_key);
1795  CCacheImpl::TFeatValue::const_iterator feat = biosrcs.begin();
1796  if (feat != biosrcs.end()) {
1797  if (IsLocFullLength(feat->GetLocation(), bsh)) {
1799  if (di) {
1800  if (! s_BiosrcFullLengthIsOk(di->GetSource())) {
1802  "Source feature is full length, should be descriptor",
1803  feat->GetOriginalFeature());
1804  }
1805  }
1806  }
1807 
1808  CCacheImpl::TFeatValue::const_iterator feat_prev = feat;
1809  ++feat;
1810  for (; feat != biosrcs.end(); ++feat_prev, ++feat) {
1811  if (IsLocFullLength(feat->GetLocation(), bsh)) {
1813  "Multiple full-length source features, should only be one if descriptor is transgenic",
1814  feat->GetOriginalFeature());
1815  }
1816 
1817  if (! s_OverlapOrAbut(feat_prev->GetLocation(),
1818  feat->GetLocation(), m_Scope)) {
1819  // not close enough
1820  continue;
1821  }
1822 
1823  // compare to see if feature sources are identical
1824  bool are_identical = true;
1825  if (feat_prev->IsSetComment() && feat->IsSetComment()
1826  && ! NStr::EqualNocase(feat_prev->GetComment(), feat->GetComment())) {
1827  are_identical = false;
1828  } else {
1829  const CBioSource& src_prev = feat_prev->GetData().GetBiosrc();
1830  const CBioSource& src = feat->GetData().GetBiosrc();
1831  if ((src.IsSetIs_focus() && ! src_prev.IsSetIs_focus())
1832  || (! src.IsSetIs_focus() && src_prev.IsSetIs_focus())) {
1833  are_identical = false;
1834  } else if ((src.IsSetSubtype() && ! src_prev.IsSetSubtype())
1835  || (! src.IsSetSubtype() && src_prev.IsSetSubtype())
1836  || (src.IsSetSubtype() && src_prev.IsSetSubtype()
1837  && ! lists_match(src.GetSubtype().begin(), src.GetSubtype().end(),
1838  src_prev.GetSubtype().begin(), src_prev.GetSubtype().end(),
1840  are_identical = false;
1841  } else if ((src.IsSetOrg() && ! src_prev.IsSetOrg())
1842  || (! src.IsSetOrg() && src_prev.IsSetOrg())
1843  || (src.IsSetOrg() && src_prev.IsSetOrg()
1844  && ! s_OrgrefEquivalent (src.GetOrg(), src_prev.GetOrg()))) {
1845  are_identical = false;
1846  }
1847  }
1848  if (are_identical && ! s_SuppressMultipleEquivBioSources(feat->GetData().GetBiosrc())) {
1850  "Multiple equivalent source features should be combined into one multi-interval feature",
1851  feat->GetOriginalFeature());
1852  }
1853  }
1854  }
1855  } catch (const exception& e) {
1856  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1858  string("Exception while validating source features. EXCEPTION: ") +
1859  e.what(), *(bsh.GetCompleteBioseq()));
1860  }
1861  }
1862 
1863 }
1864 
1865 
1866 static void s_MakePubLabelString(const CPubdesc& pd, string& label)
1867 {
1868  label = "";
1869 
1870  FOR_EACH_PUB_ON_PUBDESC (it, pd) {
1871  if ((*it)->IsGen() && (*it)->GetGen().IsSetCit()
1872  && ! (*it)->GetGen().IsSetCit()
1873  && ! (*it)->GetGen().IsSetJournal()
1874  && ! (*it)->GetGen().IsSetDate()
1875  && (*it)->GetGen().IsSetSerial_number()) {
1876  // skip over just serial number
1877  } else {
1878  (*it)->GetLabel(&label, CPub::eContent, CPub::fLabel_Unique);
1879  break;
1880  }
1881  }
1882 }
1883 
1884 
1886 {
1887  // don't bother if can't build feature iterator at all
1888  if (! m_AllFeatIt) {
1889  return;
1890  }
1891  try {
1892  CCacheImpl::SFeatKey pub_key(
1894  const CCacheImpl::TFeatValue& pubs =
1895  GetCache().GetFeatFromCache(pub_key);
1896  CCacheImpl::TFeatValue::const_iterator feat = pubs.begin();
1897  if (feat != pubs.end()) {
1898  if (IsLocFullLength(feat->GetLocation(), bsh)) {
1900  "Publication feature is full length, should be descriptor",
1901  feat->GetOriginalFeature());
1902  }
1903 
1904  CCacheImpl::TFeatValue::const_iterator feat_prev = feat;
1905  string prev_label;
1906  if (feat_prev != pubs.end()) {
1907  s_MakePubLabelString(feat_prev->GetData().GetPub(), prev_label);
1908  ++feat;
1909  }
1910  for (; feat != pubs.end(); ++feat, ++feat_prev) {
1911  if (IsLocFullLength(feat->GetLocation(), bsh)) {
1913  "Publication feature is full length, should be descriptor",
1914  feat->GetOriginalFeature());
1915  }
1916  // compare to see if feature sources are identical
1917  bool are_identical = true;
1918  if (feat_prev->IsSetComment() && feat->IsSetComment()
1919  && ! NStr::EqualNocase(feat_prev->GetComment(), feat->GetComment())) {
1920  are_identical = false;
1921  } else {
1922  string label;
1923  s_MakePubLabelString(feat->GetData().GetPub(), label);
1924  if (! NStr::IsBlank(label) && ! NStr::IsBlank(prev_label)
1925  && ! NStr::EqualNocase(label, prev_label)) {
1926  are_identical = false;
1927  }
1928 
1929  // swap is faster than assignment
1930  prev_label.swap(label);
1931 
1932  // TODO: also check authors
1933  }
1934 
1935  if (are_identical) {
1937  "Multiple equivalent publication features should be combined into one multi-interval feature",
1938  feat->GetOriginalFeature());
1939  }
1940  }
1941  }
1942  } catch (const exception& e) {
1943  if (NStr::Find(e.what(), "Error: Cannot resolve") == string::npos) {
1945  string("Exception while validating pub features. EXCEPTION: ") +
1946  e.what(), *(bsh.GetCompleteBioseq()));
1947  }
1948  }
1949 
1950 }
1951 
1952 
1954 {
1955 public:
1956  // faster than lexicographical order
1957  bool operator()(const CTempString& lhs, const CTempString& rhs) const
1958  {
1959  if (lhs.length() != rhs.length()) {
1960  return (lhs.length() < rhs.length());
1961  }
1962  return NStr::CompareNocase(lhs, rhs) < 0;
1963  }
1964 };
1965 
1967 {
1968 public:
1969  bool operator()(const CTempString& lhs, const CTempString& rhs) const
1970  {
1971  return NStr::CompareNocase(lhs, rhs) < 0;
1972  }
1973 };
1974 
1975 
1977  const CBioseq& seq, const vector<CTempString>& labels)
1978 {
1979  if (labels.size() <= 1) {
1980  // optimize fast case
1981  return;
1982  }
1983  if (m_Imp.IsRefSeqConventions() || m_Imp.IsRefSeq()) {
1984  return;
1985  }
1986 
1987  static const string kWarningPrefix =
1988  "Multiple equivalent publications annotated on this sequence [";
1989  static const string::size_type kMaxSummaryLen = 100;
1990 
1991  // TTempStringCount maps a CTempString to the number of times it appears
1992  // (Note case-insensitivity and non-lexicographical order)
1994  TLabelCount label_count;
1995 
1996  ITERATE(vector<CTempString>, label_it, labels) {
1997  ++label_count[*label_it];
1998  }
1999 
2000  // put the dups into a vector and sort
2001  vector<CTempString> sorted_dup_labels;
2002  ITERATE(TLabelCount, label_count_it, label_count) {
2003  int num_appearances = label_count_it->second;
2004  _ASSERT(num_appearances > 0);
2005  if (num_appearances > 1) {
2006  const CTempString& dup_label = label_count_it->first;
2007  sorted_dup_labels.push_back(dup_label);
2008  }
2009  }
2010  sort(BEGIN_COMMA_END(sorted_dup_labels), SCaseInsensitiveLess());
2011 
2012  // find all that appear multiple times
2013  string err_msg = kWarningPrefix; // avoid create and destroy on each iter'n
2014  ITERATE(vector<CTempString>, dup_label_it, sorted_dup_labels) {
2015  const CTempString& summary = *dup_label_it;
2016 
2017  err_msg.resize(kWarningPrefix.length());
2018  if (summary.length() > kMaxSummaryLen) {
2019  err_msg += summary.substr(0, kMaxSummaryLen);
2020  err_msg += "...";
2021  } else {
2022  err_msg += summary;
2023  }
2024  err_msg += "]";
2026  err_msg, seq);
2027  }
2028 }
2029 
2030 
2032 {
2033  // used to check for dups. Currently only deals with cases where
2034  // there's an otherpub, but check if this comment is out of date.
2035  set<TEntrezId> muids_seen;
2036  set<TEntrezId> pmids_seen;
2037 
2038  vector<int> serials;
2039  vector<CTempString> published_labels;
2040  vector<CTempString> unpublished_labels;
2041 
2044 
2045  const CBioseq& seq = *(bsh.GetCompleteBioseq());
2046 
2047  for (CSeqdesc_CI it(bsh, CSeqdesc::e_Pub); it; ++it) {
2048  CConstRef<CPubdesc> pub = ConstRef(&it->GetPub());
2049  // first, try to receive from cache
2050  const CCacheImpl::CPubdescInfo& pubdesc_info =
2051  GetCache().GetPubdescToInfo(pub);
2052  // note that some (e.g. pmids are ignored other than maybe storing
2053  // in the cache above)
2054  copy(BEGIN_COMMA_END(pubdesc_info.m_published_labels),
2055  back_inserter(published_labels));
2057  back_inserter(unpublished_labels));
2058 
2059  TEntrezId muid = ZERO_ENTREZ_ID;
2060  TEntrezId pmid = ZERO_ENTREZ_ID;
2061  bool otherpub = false;
2062  FOR_EACH_PUB_ON_PUBDESC (pub_it, *pub) {
2063  switch ((*pub_it)->Which()) {
2064  case CPub::e_Muid:
2065  muid = (*pub_it)->GetMuid();
2066  break;
2067  case CPub::e_Pmid:
2068  pmid = (*pub_it)->GetPmid();
2069  break;
2070  default:
2071  otherpub = true;
2072  break;
2073  }
2074  }
2075 
2076  if (otherpub) {
2077  bool collision = false;
2078  if (muid > ZERO_ENTREZ_ID) {
2079  if (muids_seen.find(muid) != muids_seen.end()) {
2080  collision = true;
2081  } else {
2082  muids_seen.insert(muid);
2083  }
2084  }
2085  if (pmid > ZERO_ENTREZ_ID) {
2086  if (pmids_seen.find(pmid) != pmids_seen.end()) {
2087  collision = true;
2088  } else {
2089  pmids_seen.insert(pmid);
2090  }
2091  }
2092  if (collision && ! m_Imp.IsRefSeqConventions() && ! m_Imp.IsRefSeq()) {
2094  "Multiple publications with identical PubMed ID", *ctx, *it);
2095  }
2096  }
2097  }
2098 
2099  x_ReportDuplicatePubLabels(seq, unpublished_labels);
2100  x_ReportDuplicatePubLabels(seq, published_labels);
2101 
2102 }
2103 
2104 
2106 {
2107  if (! seq.GetInst().IsSetHist()) {
2108  return;
2109  }
2110 
2111  TGi gi = ZERO_GI;
2112  FOR_EACH_SEQID_ON_BIOSEQ (id, seq) {
2113  if ((*id)->IsGi()) {
2114  gi = (*id)->GetGi();
2115  break;
2116  }
2117  }
2118  if (gi == ZERO_GI) {
2119  return;
2120  }
2121 
2122  const CSeq_hist& hist = seq.GetInst().GetHist();
2123  if (hist.IsSetReplaced_by() && hist.GetReplaced_by().IsSetDate()) {
2124  const CSeq_hist_rec& rec = hist.GetReplaced_by();
2125  ITERATE(CSeq_hist_rec::TIds, id, rec.GetIds()) {
2126  if ((*id)->IsGi()) {
2127  if (gi == (*id)->GetGi()) {
2129  "Replaced by gi (" +
2130  NStr::NumericToString(gi) + ") is same as current Bioseq",
2131  seq);
2132  break;
2133  }
2134  }
2135  }
2136  }
2137 
2138  if (hist.IsSetReplaces() && hist.GetReplaces().IsSetDate()) {
2139  const CSeq_hist_rec& rec = hist.GetReplaces();
2140  ITERATE(CSeq_hist_rec::TIds, id, rec.GetIds()) {
2141  if ((*id)->IsGi()) {
2142  if (gi == (*id)->GetGi()) {
2144  "Replaces gi (" +
2145  NStr::NumericToString(gi) + ") is same as current Bioseq",
2146  seq);
2147  break;
2148  }
2149  }
2150  }
2151  }
2152 }
2153 
2154 
2155 // =============================================================================
2156 // Private
2157 // =============================================================================
2158 
2159 
2160 
2161 
2162 // Is the id contained in the bioseq?
2163 bool CValidError_bioseq::IsIdIn(const CSeq_id& id, const CBioseq& seq)
2164 {
2165  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2166  if (id.Match(**it)) {
2167  return true;
2168  }
2169  }
2170  return false;
2171 }
2172 
2173 
2175 {
2176  if (! inst.IsSetSeq_data()) {
2177  return 0;
2178  }
2179 
2180  const CSeq_data& seqdata = inst.GetSeq_data();
2181  switch (seqdata.Which()) {
2182  case CSeq_data::e_not_set:
2183  return 0;
2184  case CSeq_data::e_Iupacna:
2185  return seqdata.GetIupacna().Get().size();
2186  case CSeq_data::e_Iupacaa:
2187  return seqdata.GetIupacaa().Get().size();
2188  case CSeq_data::e_Ncbi2na:
2189  return seqdata.GetNcbi2na().Get().size();
2190  case CSeq_data::e_Ncbi4na:
2191  return seqdata.GetNcbi4na().Get().size();
2192  case CSeq_data::e_Ncbi8na:
2193  return seqdata.GetNcbi8na().Get().size();
2194  case CSeq_data::e_Ncbipna:
2195  return seqdata.GetNcbipna().Get().size();
2196  case CSeq_data::e_Ncbi8aa:
2197  return seqdata.GetNcbi8aa().Get().size();
2198  case CSeq_data::e_Ncbieaa:
2199  return seqdata.GetNcbieaa().Get().size();
2200  case CSeq_data::e_Ncbipaa:
2201  return seqdata.GetNcbipaa().Get().size();
2203  return seqdata.GetNcbistdaa().Get().size();
2204  default:
2205  return 0;
2206  }
2207 }
2208 
2209 
2210 // Returns true if seq derived from translation ending in "*" or
2211 // seq is 3' partial (i.e. the right of the sequence is incomplete)
2213 {
2214 
2215  // Look for the Cdregion feature used to create this aa product
2216  // Use the Cdregion to translate the associated na sequence
2217  // and check if translation has a '*' at the end. If it does, the
2218  // message about 'X' at the end of this aa product sequence is suppressed.
2219  try {
2220  const CSeq_feat* sfp = m_Imp.GetCDSGivenProduct(seq);
2221  if (sfp) {
2222  // Translate na CSeq_data
2223  string prot;
2225  if (prot[prot.size() - 1] == '*') {
2226  return true;
2227  }
2228  return false;
2229  }
2230 
2231  // Get CMolInfo for seq and determine if completeness is
2232  // eCompleteness_no_right or eCompleteness_no_ends. If so, the
2233  // message about "X" at end of aa sequence is suppressed
2235  if (mi && mi->IsSetCompleteness()) {
2236  if (mi->GetCompleteness() == CMolInfo::eCompleteness_no_right ||
2237  mi->GetCompleteness() == CMolInfo::eCompleteness_no_ends) {
2238  return true;
2239  }
2240  }
2241  } catch (const CException&) {
2242  } catch (const std::exception&) {
2243  }
2244  return false;
2245 }
2246 
2247 
2248 //LCOV_EXCL_START
2249 //part of segset validation, no longer used
2251 {
2252  CRef<CSeq_loc> loc;
2253  if (! seq.GetInst().IsSetExt()) {
2254  return loc;
2255  }
2256 
2257  if (seq.GetInst().GetExt().IsSeg()) {
2258  CRef<CSeq_loc> nloc(new CSeq_loc());
2259  loc = nloc;
2260  CSeq_loc_mix& mix = loc->SetMix();
2261  ITERATE (list< CRef<CSeq_loc> >, it,
2262  seq.GetInst().GetExt().GetSeg().Get()) {
2263  mix.Set().push_back(*it);
2264  }
2265  } else if (seq.GetInst().GetExt().IsRef()) {
2266  CRef<CSeq_loc> nloc(new CSeq_loc());
2267  loc = nloc;
2268  loc->Add(seq.GetInst().GetExt().GetRef());
2269  }
2270  return loc;
2271 }
2272 //LCOV_EXCL_STOP
2273 
2274 
2275 // Check if CdRegion required but not found
2277 {
2278  if (bsh && CSeq_inst::IsAa(bsh.GetInst_Mol())) {
2279  CSeq_entry_Handle nps =
2281  if (nps) {
2282  const CSeq_feat* cds = GetCDSForProduct(bsh);
2283  if (! cds) {
2284  const CSeq_feat* mat = GetPROTForProduct(bsh);
2285  if (! mat) {
2286  return true;
2287  }
2288  }
2289  }
2290  }
2291 
2292  return false;
2293 }
2294 
2295 
2297 {
2299 
2300  if (sd) {
2301  const CMolInfo& mi = sd->GetMolinfo();
2302  if (mi.IsSetBiomol()) {
2303  return mi.GetBiomol() == CMolInfo::eBiomol_mRNA;
2304  }
2305  } else if (bsh.GetBioseqMolType() == CSeq_inst::eMol_rna) {
2306  // if no molinfo, assume rna is mrna
2307  return true;
2308  }
2309 
2310  return false;
2311 }
2312 
2313 
2315 {
2316  size_t counter = 0;
2317  for (CSeq_loc_CI slit(loc); slit; ++slit) {
2318  if (! IsFarLocation(slit.GetEmbeddingSeq_loc(), m_Imp.GetTSEH())) {
2319  ++counter;
2320  }
2321  }
2322  return counter;
2323 }
2324 
2325 
2327  const CSeq_feat& curr,
2328  const CSeq_feat& prev)
2329 {
2330  if (curr.IsSetExcept() && curr.GetExcept() && curr.IsSetExcept_text()) {
2331  if (NStr::FindNoCase(curr.GetExcept_text(), "alternative processing") != NPOS) {
2332  return false;
2333  }
2334  }
2335  if (prev.IsSetExcept() && prev.GetExcept() && prev.IsSetExcept_text()) {
2336  if (NStr::FindNoCase(prev.GetExcept_text(), "alternative processing") != NPOS) {
2337  return false;
2338  }
2339  }
2340 
2341  const CProt_ref& currP = curr.GetData().GetProt();
2342  const CProt_ref& prevP = prev.GetData().GetProt();
2343 
2344  if (currP.IsSetName() && prevP.IsSetName()) {
2345  string currN;
2346  for (auto it : currP.GetName()) {
2347  currN = it;
2348  break;
2349  }
2350  string prevN;
2351  for (auto it : prevP.GetName()) {
2352  prevN = it;
2353  break;
2354  }
2355  if (NStr::EqualNocase(currN, "anchored capsid protein ancC") && NStr::EqualNocase(prevN, "capsid protein C")) {
2356  return false;
2357  }
2358  if (NStr::EqualNocase(prevN, "anchored capsid protein ancC") && NStr::EqualNocase(currN, "capsid protein C")) {
2359  return false;
2360  }
2361  if (NStr::EqualNocase(currN, "membrane glycoprotein precursor prM") && NStr::EqualNocase(prevN, "protein pr")) {
2362  return false;
2363  }
2364  if (NStr::EqualNocase(prevN, "membrane glycoprotein precursor prM") && NStr::EqualNocase(currN, "protein pr")) {
2365  return false;
2366  }
2367  if (NStr::EqualNocase(currN, "membrane glycoprotein precursor prM") && NStr::EqualNocase(prevN, "membrane glycoprotein M")) {
2368  return false;
2369  }
2370  if (NStr::EqualNocase(prevN, "membrane glycoprotein precursor prM") && NStr::EqualNocase(currN, "membrane glycoprotein M")) {
2371  return false;
2372  }
2373  if (NStr::EqualNocase(currN, "anchored capsid protein C") && NStr::EqualNocase(prevN, "capsid protein C")) {
2374  return false;
2375  }
2376  if (NStr::EqualNocase(prevN, "anchored capsid protein C") && NStr::EqualNocase(currN, "capsid protein C")) {
2377  return false;
2378  }
2379  if (NStr::EqualNocase(currN, "membrane glycoprotein precursor M") && NStr::EqualNocase(prevN, "protein pr")) {
2380  return false;
2381  }
2382  if (NStr::EqualNocase(prevN, "membrane glycoprotein precursor M") && NStr::EqualNocase(currN, "protein pr")) {
2383  return false;
2384  }
2385  if (NStr::EqualNocase(currN, "membrane glycoprotein precursor M") && NStr::EqualNocase(prevN, "membrane glycoprotein M")) {
2386  return false;
2387  }
2388  if (NStr::EqualNocase(prevN, "membrane glycoprotein precursor M") && NStr::EqualNocase(currN, "membrane glycoprotein M")) {
2389  return false;
2390  }
2391  }
2392 
2393 
2394  return true;
2395 }
2396 
2397 
// Loop header that visits every element of the CBioseq_Handle::TId container
// returned by Var.GetId().
//   Itr - name of the iterator variable the loop declares
//   Var - expression yielding an object with GetId(); it is parenthesized so
//         that any expression (not just a plain identifier) expands correctly
#define FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(Itr, Var) \
ITERATE (CBioseq_Handle::TId, Itr, (Var).GetId())
2400 
2402 {
2403  if (! IsMaster(seq)) {
2404  return false;
2405  }
2406  CBioseq_Handle bsh = scope.GetBioseqHandle(seq);
2407  return IsWGS(bsh);
2408 }
2409 
2410 
2412 {
2413  bool rval = false;
2414  if (entry.IsSeq()) {
2415  if (IsMaster(entry.GetSeq()) && IsWGS(entry.GetSeq())) {
2416  rval = true;
2417  }
2418  } else if (entry.IsSet() && entry.GetSet().IsSetSeq_set()) {
2420  if (IsWGSMaster(**it)) {
2421  rval = true;
2422  break;
2423  }
2424  }
2425  }
2426  return rval;
2427 }
2428 
2429 
2431 {
2432  if (! seq.IsSetDescr()) {
2433  return false;
2434  }
2435  ITERATE(CBioseq::TDescr::Tdata, it, seq.GetDescr().Get()) {
2436  if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech() && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_wgs) {
2437  return true;
2438  }
2439  }
2440  return false;
2441 }
2442 
2443 
2445 {
2446  CSeqdesc_CI molinfo(bsh, CSeqdesc::e_Molinfo);
2447  if (molinfo && molinfo->GetMolinfo().IsSetTech() && molinfo->GetMolinfo().GetTech() == CMolInfo::eTech_wgs) {
2448  return true;
2449  }
2450  return false;
2451 }
2452 
2453 
2455 {
2456  bool rval = false;
2457  if (entry.IsSeq()) {
2458  rval = IsWGS(entry.GetSeq());
2459  } else if (entry.IsSet() && entry.GetSet().IsSetSeq_set()) {
2460  for (auto it : entry.GetSet().GetSeq_set()) {
2461  if (IsWGS(*it)) {
2462  rval = true;
2463  break;
2464  }
2465  }
2466  }
2467  return rval;
2468 }
2469 
2470 
2472 {
2473  const CTextseq_id* txt = id.GetTextseq_Id();
2474  if (! txt || ! txt->IsSetAccession()) {
2475  return false;
2476  }
2479  return true;
2480  } else {
2481  return false;
2482  }
2483 }
2484 
2485 
2487 {
2488  if (! seq.IsSetId()) {
2489  return false;
2490  }
2491  ITERATE(CBioseq::TId, id, seq.GetId()) {
2492  if (IsWGSAccession(**id)) {
2493  return true;
2494  }
2495  }
2496  return false;
2497 }
2498 
2499 
2501 {
2502  const CTextseq_id* txt = id.GetTextseq_Id();
2503  if (! txt || ! txt->IsSetAccession()) {
2504  return false;
2505  }
2508  return true;
2509  } else {
2510  return false;
2511  }
2512 }
2513 
2514 
2516 {
2517  if (! seq.IsSetId()) {
2518  return false;
2519  }
2520  ITERATE(CBioseq::TId, id, seq.GetId()) {
2521  if (IsTSAAccession(**id)) {
2522  return true;
2523  }
2524  }
2525  return false;
2526 }
2527 
2528 
2530 {
2531  CBioseq_Handle bsh = scope.GetBioseqHandle(seq);
2532  CSeqdesc_CI desc(bsh, CSeqdesc::e_Molinfo);
2533  if (desc && desc->GetMolinfo().IsSetCompleteness()) {
2534  CMolInfo::TCompleteness completeness = desc->GetMolinfo().GetCompleteness();
2535  if (completeness == CMolInfo::eCompleteness_partial
2536  || completeness == CMolInfo::eCompleteness_no_left
2537  || completeness == CMolInfo::eCompleteness_no_right
2538  || completeness == CMolInfo::eCompleteness_no_ends) {
2539  return true;
2540  }
2541  }
2542  return false;
2543 }
2544 
2545 
2547 {
2548  FOR_EACH_SEQID_ON_BIOSEQ(id, seq) {
2549  if ((*id)->IsPdb()) {
2550  return true;
2551  }
2552  }
2553  return false;
2554 }
2555 
2556 
2558 {
2559  if (IsPdb(seq) || IsWGSMaster(seq, *m_Scope)) {
2560  return;
2561  }
2562  const CSeq_inst& inst = seq.GetInst();
2563 
2564  TSeqPos len = inst.IsSetLength() ? inst.GetLength() : 0;
2565  if (seq.IsAa()) {
2566  if (len <= 3 && ! IsPartial(seq, *m_Scope)) {
2567  PostErr(eDiag_Warning, eErr_SEQ_INST_ShortSeq, "Sequence only " +
2568  NStr::IntToString(len) + " residues", seq);
2569  }
2570  } else {
2571  if (len <= 10 && m_report_short_seq) {
2572  PostErr(eDiag_Warning, eErr_SEQ_INST_ShortSeq, "Sequence only " +
2573  NStr::IntToString(len) + " residues", seq);
2574  }
2575  }
2576 }
2577 
2578 
2579 // Assumes that seq is segmented and has Seq-ext data
2581 {
2582  // Get parent CSeq_entry of seq and then find the next
2583  // CSeq_entry in the set. This CSeq_entry should be a CBioseq_set
2584  // of class parts.
2585  const CSeq_entry* se = seq.GetParentEntry();
2586  if (! se) {
2587  return;
2588  }
2589  const CSeq_entry* parent = se->GetParentEntry();
2590  if (! parent) {
2591  return;
2592  }
2593  if (! parent->IsSet() || ! parent->GetSet().IsSetClass() || parent->GetSet().GetClass() != CBioseq_set::eClass_segset) {
2594  return;
2595  }
2596 
2597  // Loop through seq_set looking for the parts set.
2598  FOR_EACH_SEQENTRY_ON_SEQSET (it, parent->GetSet()) {
2599  if ((*it)->Which() == CSeq_entry::e_Set
2600  && (*it)->GetSet().IsSetClass()
2601  && (*it)->GetSet().GetClass() == CBioseq_set::eClass_parts) {
2602  const CBioseq_set::TSeq_set& parts = (*it)->GetSet().GetSeq_set();
2603  const CSeg_ext::Tdata& locs = seq.GetInst().GetExt().GetSeg().Get();
2604 
2605  // Make sure the number of locations (excluding null locations)
2606  // match the number of parts
2607  size_t nulls = 0;
2608  ITERATE (CSeg_ext::Tdata, loc, locs) {
2609  if ((*loc)->IsNull()) {
2610  nulls++;
2611  }
2612  }
2613  if (locs.size() - nulls < parts.size()) {
2615  "Parts set contains too many Bioseqs", seq);
2616  return;
2617  } else if (locs.size() - nulls > parts.size()) {
2619  "Parts set does not contain enough Bioseqs", seq);
2620  return;
2621  }
2622 
2623  // Now, simultaneously loop through the parts of se_parts and CSeq_locs of
2624  // seq's CSseq-ext. If don't compare, post error.
2625  size_t size = locs.size(); // == parts.size()
2626  CSeg_ext::Tdata::const_iterator loc_it = locs.begin();
2627  CBioseq_set::TSeq_set::const_iterator part_it = parts.begin();
2628  for (size_t i = 0; i < size; ++i) {
2629  try {
2630  if ((*loc_it)->IsNull()) {
2631  ++loc_it;
2632  continue;
2633  }
2634  if (! (*part_it)->IsSeq()) {
2636  "Parts set component is not Bioseq", seq);
2637  return;
2638  }
2639  const CSeq_id& loc_id = GetId(**loc_it, m_Scope);
2640  if (! IsIdIn(loc_id, (*part_it)->GetSeq())) {
2642  "Segmented bioseq seq_ext does not correspond to parts "
2643  "packaging order", seq);
2644  return;
2645  }
2646 
2647  // advance both iterators
2648  ++part_it;
2649  ++loc_it;
2650  } catch (const CObjmgrUtilException&) {
2651  ERR_POST_X(4, "Seq-loc not for unique sequence");
2652  return;
2653  } catch (CException& x1) {
2654  string err_msg = "Unknown error:";
2655  err_msg += x1.what();
2656  ERR_POST_X(5, err_msg);
2657  return;
2658  } catch (std::exception& x2) {
2659  string err_msg = "Unknown error:";
2660  err_msg += x2.what();
2661  ERR_POST_X(5, err_msg);
2662  return;
2663  }
2664  }
2665  }
2666  }
2667 }
2668 
2669 static bool s_IsConWithGaps(const CBioseq& seq)
2670 {
2671  if (! seq.IsSetInst()) return false;
2672  const CSeq_inst& inst = seq.GetInst();
2673  if (! inst.IsSetExt()) return false;
2674  if (! inst.GetExt().IsDelta()) return false;
2675 
2676  ITERATE(CDelta_ext::Tdata, iter, inst.GetExt().GetDelta().Get()) {
2677  if (! (*iter)->IsLiteral()) continue;
2678  const CSeq_literal& lit = (*iter)->GetLiteral();
2679  if (! lit.IsSetSeq_data()) return true;
2680  if (lit.GetSeq_data().IsGap() && lit.GetLength() > 0) return true;
2681  }
2682 
2683  return false;
2684 }
2685 
2686 
2688 {
2689  bool has_gap = false;
2690  if (seq.GetInst().IsSetExt() && seq.GetInst().GetExt().IsDelta()) {
2691  ITERATE(CDelta_ext::Tdata, iter, seq.GetInst().GetExt().GetDelta().Get()) {
2692  if ((*iter)->IsLiteral() &&
2693  (! (*iter)->GetLiteral().IsSetSeq_data() || (*iter)->GetLiteral().GetSeq_data().IsGap())) {
2694  has_gap = true;
2695  break;
2696  }
2697  }
2698  }
2699  return has_gap;
2700 }
2701 
2703 {
2705  if (! bsh) {
2706  return;
2707  }
2708 
2709  string title = sequence::CDeflineGenerator().GenerateDefline(bsh);
2710 
2711 /*bsv
2712  CMolInfo::TTech tech = CMolInfo::eTech_unknown;
2713 */
2714  CSeqdesc_CI desc(bsh, CSeqdesc::e_Molinfo);
2715  if (desc) {
2716  const CMolInfo& mi = desc->GetMolinfo();
2717 /*bsv
2718  tech = mi.GetTech();
2719 */
2721  if (m_Imp.IsGenbank()) {
2722  if (NStr::Find(title, "complete genome") != NPOS) {
2723  const CSeq_entry& ctx = *seq.GetParentEntry();
2725  "Complete genome in title without complete flag set",
2726  ctx, *desc);
2727  }
2728  }
2730  (! s_IsConWithGaps(seq)) &&
2731  ! m_Imp.IsEmbl() && ! m_Imp.IsDdbj()) {
2732  const CSeq_entry& ctx = *seq.GetParentEntry();
2734  "Circular topology without complete flag set", ctx, *desc);
2735  }
2736  }
2737  }
2738 
2739  // warning if title contains complete genome but sequence contains gap features
2740  if (NStr::FindNoCase(title, "complete genome") != NPOS && x_HasGap(seq)) {
2742  "Title contains 'complete genome' but sequence has gaps", seq);
2743  }
2744 
2745 
2746  // note - test for protein titles was moved to CValidError_bioseqset::ValidateNucProtSet
2747  // because it only applied for protein sequences in nuc-prot sets and it's more efficient
2748  // to create the defline generator once per nuc-prot set
2749 }
2750 
2751 static bool HasAssemblyOrNullGap (const CBioseq& seq)
2752 {
2753  const CSeq_inst& inst = seq.GetInst();
2754  if (inst.CanGetRepr() && inst.GetRepr() == CSeq_inst::eRepr_delta && inst.CanGetExt() && inst.GetExt().IsDelta()) {
2755  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
2756  if (! (*sg)) continue;
2757  if ((**sg).Which() != CDelta_seq::e_Literal) continue;
2758  const CSeq_literal& lit = (*sg)->GetLiteral();
2759  if (! lit.IsSetSeq_data()) return true;
2760  if (lit.GetSeq_data().IsGap()) return true;
2761  }
2762  }
2763 
2764  return false;
2765 }
2766 
2767 
2769 {
2770  const CSeq_inst& inst = seq.GetInst();
2771  if (inst.CanGetRepr() && inst.GetRepr() == CSeq_inst::eRepr_delta && inst.CanGetExt() && inst.GetExt().IsDelta()) {
2772  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
2773  if (! (*sg) ) continue;
2774  if ((**sg).Which() != CDelta_seq::e_Literal) continue;
2775  const CSeq_literal& lit = (*sg)->GetLiteral();
2776  if (! lit.IsSetSeq_data()) {
2777  PostErr(eDiag_Warning, eErr_SEQ_INST_SeqGapProblem, "TSA Seq_data NULL", seq);
2778  } else {
2779  const CSeq_data& data = lit.GetSeq_data();
2780  if (data.Which() == CSeq_data::e_Gap) {
2781  const CSeq_gap& gap = data.GetGap();
2782  if (gap.IsSetType()) {
2783  int gaptype = gap.GetType();
2784  if (gaptype == CSeq_gap::eType_unknown) {
2785  PostErr(eDiag_Warning, eErr_SEQ_INST_TSAseqGapProblem, "TSA Seq_gap.unknown", seq);
2786  } else if (gaptype == CSeq_gap::eType_other) {
2787  PostErr(eDiag_Warning, eErr_SEQ_INST_TSAseqGapProblem, "TSA Seq_gap.other", seq);
2788  }
2789  } else {
2790  PostErr(eDiag_Warning, eErr_SEQ_INST_TSAseqGapProblem, "TSA Seq_gap NULL", seq);
2791  }
2792  }
2793  }
2794  }
2795  }
2796 }
2797 
2798 
2800 {
2801  const CSeq_inst& inst = seq.GetInst();
2802  if (inst.CanGetRepr() && inst.GetRepr() == CSeq_inst::eRepr_delta && inst.CanGetExt() && inst.GetExt().IsDelta()) {
2803  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
2804  if (! (*sg)) continue;
2805  // CON division - far delta - suppresses errors
2806  if ((**sg).Which() != CDelta_seq::e_Literal) /* continue */ return false;
2807  const CSeq_literal& lit = (*sg)->GetLiteral();
2808  if (! lit.IsSetSeq_data()) {
2809  return true;
2810  } else {
2811  const CSeq_data& data = lit.GetSeq_data();
2812  if (data.Which() == CSeq_data::e_Gap) {
2813  const CSeq_gap& gap = data.GetGap();
2814  CSeq_gap::TType gap_type = gap.IsSetType() ? gap.GetType() : CSeq_gap::eType_unknown;
2815 
2816  if (gap_type != CSeq_gap::eType_centromere && gap_type != CSeq_gap::eType_heterochromatin &&
2817  gap_type != CSeq_gap::eType_short_arm && gap_type != CSeq_gap::eType_telomere &&
2818  gap_type != CSeq_gap::eType_contig) {
2819 
2820  if (! gap.IsSetLinkage_evidence() || gap.GetLinkage_evidence().empty()) {
2821  return true;
2822  }
2823  }
2824  }
2825  }
2826  }
2827  }
2828  return false;
2829 }
2830 
2831 
2833 {
2834  if (HasBadWGSGap(seq)) {
2836  "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence.", seq);
2837  }
2838 }
2839 
2840 
2842 {
2843  if (HasBadWGSGap(seq)) {
2845  "TSA submission includes wrong gap type. Gaps for TSA should be Assembly Gaps with linkage evidence.", seq);
2846  }
2847 }
2848 
2849 
2851 {
2852  if (HasBadWGSGap(seq)) {
2854  "Genome submission includes wrong gap type. Gaps for genomes should be Assembly Gaps with linkage evidence.", seq);
2855  }
2856 }
2857 
2858 
2859 bool s_FieldHasLabel(const CUser_field& field, const string& label)
2860 {
2861  if (field.IsSetLabel() && field.GetLabel().IsStr() &&
2862  NStr::EqualNocase(field.GetLabel().GetStr(), label)) {
2863  return true;
2864  } else {
2865  return false;
2866  }
2867 }
2868 
2869 
2871 {
2872  if (! field.IsSetData()) {
2873  return false;
2874  }
2875  bool rval = false;
2876  if (field.GetData().IsStr()) {
2877  if (! NStr::IsBlank(field.GetData().GetStr())) {
2878  rval = true;
2879  }
2880  } else if (field.GetData().IsStrs()) {
2882  if (! NStr::IsBlank(*s)) {
2883  rval = true;
2884  break;
2885  }
2886  }
2887  }
2888  return rval;
2889 }
2890 
2891 
2893 {
2894  bool has_biosample = false;
2895  bool has_bioproject = false;
2896 
2897  CSeqdesc_CI d(bsh, CSeqdesc::e_User);
2898  while (d) {
2900  for (auto it : d->GetUser().GetData()) {
2901  if (s_FieldHasLabel(*it, "BioSample")) {
2902  if (s_FieldHasNonBlankValue(*it)) {
2903  has_biosample = true;
2904  }
2905  } else if (s_FieldHasLabel(*it, "BioProject")) {
2906  if (s_FieldHasNonBlankValue(*it)) {
2907  has_bioproject = true;
2908  }
2909  }
2910  }
2911  } else if (m_Imp.IsGenbank()) {
2912  const CUser_object& uo = d->GetUser();
2913  if (uo.GetType().IsStr()) {
2914  const string& type = uo.GetType().GetStr();
2915  if (NStr::CompareNocase(type, "WGSProjects") == 0) {
2916  int fr = 0;
2917  int to = 0;
2918 
2919  for (auto it : uo.GetData()) {
2920  if (! it->GetLabel().IsStr()) {
2921  continue;
2922  }
2923  const string& label = it->GetLabel().GetStr();
2924  if (NStr::CompareNocase(label, "WGS_accession_first") == 0 ||
2925  NStr::CompareNocase(label, "Accession_first") == 0) {
2926  const string& str = it->GetData().GetStr();
2927  auto fst = str.find_first_of("0123456789");
2928  fr = NStr::StringToInt (str.substr(fst));
2929  } else if (NStr::CompareNocase(label, "WGS_accession_last") == 0 ||
2930  NStr::CompareNocase(label, "Accession_last") == 0) {
2931  const string& str = it->GetData().GetStr();
2932  auto lst = str.find_first_of("0123456789");
2933  to = NStr::StringToInt (str.substr(lst));
2934  }
2935  }
2936  if ((fr != 0) && (to != 0)) {
2937  int df = to - fr + 1;
2938  int blen = bsh.GetBioseqLength();
2939  if (df != blen) {
2941  "Number of accessions (" + NStr::IntToString(df) + ") does not correspond to number of records (" + NStr::IntToString(blen) +")",
2942  *(bsh.GetCompleteBioseq()));
2943  }
2944  }
2945  }
2946  }
2947  }
2948  ++d;
2949  }
2950  if (! has_biosample && ! has_bioproject) {
2952  "WGS master lacks both BioSample and BioProject",
2953  *(bsh.GetCompleteBioseq()));
2954  } else if (! has_biosample) {
2956  "WGS master lacks BioSample",
2957  *(bsh.GetCompleteBioseq()));
2958  } else if (! has_bioproject) {
2960  "WGS master lacks BioProject",
2961  *(bsh.GetCompleteBioseq()));
2962  }
2963  if (! has_biosample || ! has_bioproject) {
2964  }
2965 }
2966 
2967 
2968 static EDiagSev GetBioseqEndWarning (const CBioseq& seq, bool is_circular, EBioseqEndIsType end_is_char)
2969 {
2970  EDiagSev sev;
2971  bool only_local = true;
2972  bool is_NCACNTNW = false;
2973  bool is_patent = false;
2974  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
2975  if (! (*id_it)->IsLocal()) {
2976  only_local = false;
2977  if ((*id_it)->IsPatent()) {
2978  is_patent = true;
2979  } else if (IsNTNCNWACAccession(**id_it)) {
2980  is_NCACNTNW = true;
2981  }
2982  }
2983  }
2984 
2985  if (is_NCACNTNW || is_patent) {
2986  sev = eDiag_Warning;
2987  } else if (is_circular) {
2988  sev = eDiag_Warning;
2989  } else if (only_local) {
2990  sev = eDiag_Warning;
2991  } else if (end_is_char == eBioseqEndIsType_All) {
2992  sev = eDiag_Error;
2993  } else {
2994  sev = eDiag_Warning;
2995  }
2996  return sev;
2997 }
2998 
2999 
3000 void CValidError_bioseq::x_CalculateNsStretchAndTotal(const CSeqVector& vec, TSeqPos& num_ns, TSeqPos& max_stretch, bool& n5, bool& n3)
3001 {
3003 
3004  num_ns = 0;
3005  max_stretch = 0;
3006  n5 = false;
3007  n3 = false;
3008 
3009  TSeqPos this_stretch = 0;
3010  for (TSeqPos i = 0; i < vec.size(); i++) {
3011  if (vec[i] == 'N') {
3012  num_ns++;
3013  if (vec.IsInGap(i)) {
3014  if (max_stretch < this_stretch) {
3015  max_stretch = this_stretch;
3016  }
3017  this_stretch = 0;
3018  } else {
3019  this_stretch++;
3020  if (this_stretch >= 10) {
3021  if (i < 20) {
3022  n5 = true;
3023  }
3024  if (vec.size() > 20 && i > vec.size() - 10) {
3025  n3 = true;
3026  }
3027  }
3028  }
3029  } else {
3030  if (max_stretch < this_stretch) {
3031  max_stretch = this_stretch;
3032  }
3033  this_stretch = 0;
3034  }
3035  }
3036  if (max_stretch < this_stretch) {
3037  max_stretch = this_stretch;
3038  }
3039 }
3040 
3041 
3043 {
3044  bool rval = false;
3045  if (HasAssemblyOrNullGap(seq)) {
3046  return rval;
3047  }
3048 
3050  if (! bsh) {
3051  return rval;
3052  }
3053 
3054  TSeqPos num_ns = 0;
3055  TSeqPos max_stretch = 0;
3056  bool n5 = false;
3057  bool n3 = false;
3058 
3060  x_CalculateNsStretchAndTotal(vec, num_ns, max_stretch, n5, n3);
3061 
3062  if (max_stretch >= 15) {
3064  "Sequence has a stretch of " + NStr::IntToString(max_stretch) + " Ns", seq);
3065  rval = true;
3066  } else {
3067  if (n5) {
3069  "Sequence has a stretch of at least 10 Ns within the first 20 bases", seq);
3070  rval = true;
3071  }
3072  if (n3) {
3074  "Sequence has a stretch of at least 10 Ns within the last 20 bases", seq);
3075  rval = true;
3076  }
3077  }
3078  return rval;
3079 }
3080 
3081 
3082 // check to see if sequence is all Ns
3084 {
3085  bool rval = true;
3086  bool at_least_one = false;
3087  try {
3088  for (CSeqVector_CI sv_iter(vec); (sv_iter) && rval; ++sv_iter) {
3089  if (*sv_iter != 'N') {
3090  rval = false;
3091  }
3092  at_least_one = true;
3093  }
3094  } catch (CException&) {
3095  }
3096  return (rval && at_least_one);
3097 }
3098 
3099 
3100 static int CountNs(const CSeq_data& seq_data, TSeqPos len)
3101 {
3102  int total = 0;
3103  switch (seq_data.Which()) {
3104  case CSeq_data::e_Ncbi4na: {
3105  vector<char>::const_iterator it = seq_data.GetNcbi4na().Get().begin();
3106  unsigned char mask = 0xf0;
3107  unsigned char shift = 4;
3108  for (size_t n = 0; n < len; n++) {
3109  unsigned char c = ((*it) & mask) >> shift;
3110  mask >>= 4;
3111  shift -= 4;
3112  if (! mask) {
3113  mask = 0xf0;
3114  shift = 4;
3115  ++it;
3116  }
3117  if (c == 15) {
3118  total++;
3119  }
3120  }
3121  }
3122  return total;
3123  case CSeq_data::e_Iupacna: {
3124  const string& s = seq_data.GetIupacna().Get();
3125  for (size_t n = 0; n < len && n < s.length(); n++) {
3126  if (s[n] == 'N') {
3127  total++;
3128  }
3129  }
3130  }
3131  return total;
3132  case CSeq_data::e_Ncbi8na:
3133  case CSeq_data::e_Ncbipna: {
3134  CSeq_data iupacna;
3135  if (! CSeqportUtil::Convert(seq_data, &iupacna, CSeq_data::e_Iupacna)) {
3136  return total;
3137  }
3138  const string& s = iupacna.GetIupacna().Get();
3139  for (size_t n = 0; n < len; n++) {
3140  if (s[n] == 'N') {
3141  total++;
3142  }
3143  }
3144  }
3145  return total;
3146  default:
3147  return total;
3148  }
3149 }
3150 
3151 
3153 {
3154  int count = 0;
3155  SSeqMapSelector sel;
3157  for (CSeqMap_CI seq_iter(bsh, sel); seq_iter; ++seq_iter) {
3158  switch (seq_iter.GetType()) {
3159  case CSeqMap::eSeqData:
3160  count += CountNs(seq_iter.GetData(), seq_iter.GetLength());
3161  break;
3162  default:
3163  break;
3164  }
3165  }
3166 /*
3167  int pct_n = 0;
3168  try {
3169  CSeqVector vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
3170  TSeqPos num_ns = 0;
3171  for (size_t i = 0; i < vec.size(); i++) {
3172  try {
3173  if (vec[i] == 'N' && !vec.IsInGap(i)) {
3174  num_ns++;
3175  }
3176  } catch (CException& e2) {
3177  //bad character
3178  }
3179  }
3180  pct_n = (num_ns * 100) / bsh.GetBioseqLength();
3181  } catch (CException& e) {
3182  pct_n = 100;
3183  }
3184 */
3185  return bsh.GetBioseqLength() ? count * 100 / bsh.GetBioseqLength() : 100;
3186 }
3187 
3188 static
3190 {
3193  bool is_first = true;
3194 
3195  if (inst.CanGetExt() && inst.GetExt().IsDelta()) {
3196  ITERATE(CDelta_ext::Tdata, iter, inst.GetExt().GetDelta().Get()) {
3197  if ((*iter)->IsLoc()) {
3198  return false;
3199  }
3200  if ((*iter)->IsLiteral()) {
3201  const CSeq_literal& lit = (*iter)->GetLiteral();
3202  if (lit.IsSetSeq_data() && lit.GetSeq_data().IsGap()) {
3203  const CSeq_gap& gap = lit.GetSeq_data().GetGap();
3205  if (gap.IsSetType()) {
3206  gaptype = gap.GetType();
3207  }
3208  if (is_first) {
3209  first = gaptype;
3210  } else {
3211  last = gaptype;
3212  }
3213  } else {
3215  }
3216  }
3217  is_first = false;
3218  }
3219  }
3220  fst = first;
3221  lst = last;
3222  return true;
3223 }
3224 
3225 static bool s_WillReportTerminalGap(const CBioseq& seq, CBioseq_Handle bsh)
3226 {
3227  if (! seq.IsSetInst() || ! seq.GetInst().IsSetRepr()) {
3228  return false;
3229  }
3230  if (! seq.GetInst().IsSetMol() || seq.GetInst().GetMol() == CSeq_inst::eMol_aa) {
3231  return false;
3232  }
3233  CSeq_inst::TRepr repr = seq.GetInst().GetRepr();
3234 
3235  if (repr != CSeq_inst::eRepr_delta) {
3236  return false;
3237  }
3238 
3239  if (! bsh) {
3240  return false;
3241  }
3242 
3243  if (! seq.GetInst().IsSetLength() || seq.GetInst().GetLength() < 10) {
3244  return false;
3245  }
3246 
3247  if (! ShouldCheckForNsAndGap(bsh)) {
3248  return false;
3249  }
3250 
3251  return true;
3252 }
3253 
3254 
3255 static optional<int> s_MaxSeqStretchIfLessThanThreshold(const CSeqVector& vec, int threshold)
3256 {
3257  int max_stretch = 0;
3258  auto IsN = [](char c) { return c == 'N'; };
3259 
3260  for (auto begin_it = find_if_not(begin(vec), end(vec), IsN);
3261  begin_it != end(vec);) {
3262  auto distanceToEnd = distance(begin_it, end(vec));
3263  // check a sequence interval no longer than the threshold length
3264  auto interval = (distanceToEnd > threshold) ? threshold : distanceToEnd;
3265  auto end_it = find_if(begin_it, next(begin_it, interval), IsN);
3266  const auto current_stretch = distance(begin_it, end_it);
3267  if (current_stretch >= threshold) { // No Ns in the interval
3268  return {};
3269  }
3270 
3271  if (current_stretch > max_stretch) {
3272  max_stretch = current_stretch;
3273  }
3274  begin_it = find_if_not(end_it, end(vec), IsN);
3275  }
3276  return max_stretch;
3277 }
3278 
3279 
3281 {
3282  if (! seq.IsSetInst() || ! seq.GetInst().IsSetRepr()) {
3283  // can't check if no Inst or Repr
3284  return;
3285  }
3286  if (! seq.GetInst().IsSetMol() || seq.GetInst().GetMol() == CSeq_inst::eMol_aa) {
3287  // don't check proteins here
3288  return;
3289  }
3290  CSeq_inst::TRepr repr = seq.GetInst().GetRepr();
3291 
3292  // only check for raw or for delta sequences that are delta lit only
3293  if (repr == CSeq_inst::eRepr_virtual || repr == CSeq_inst::eRepr_map) {
3294  return;
3295  }
3296 
3298  if (! bsh) {
3299  // no check if Bioseq not in scope
3300  return;
3301  }
3302 
3303  try {
3305 
3306  if (IsAllNs(vec)) {
3307  PostErr(m_Imp.IsPDB() ? eDiag_Warning : eDiag_Critical, eErr_SEQ_INST_AllNs, "Sequence is all Ns", seq);
3308  return;
3309  }
3310 
3311  // don't bother checking if length is less than 10
3312  if (! seq.IsSetInst() || ! seq.GetInst().IsSetRepr()
3313  || ! seq.GetInst().IsSetLength() || seq.GetInst().GetLength() < 10) {
3314  return;
3315  }
3316 
3317  if (const auto oMaxLength = s_MaxSeqStretchIfLessThanThreshold(vec, 10); oMaxLength.has_value()) {
3319  "Maximum contig length is " + NStr::IntToString(*oMaxLength) + " bases", seq);
3320  }
3321 
3326  bool begin_ambig = false, end_ambig = false;
3329  if (ShouldCheckForNsAndGap(bsh) && x_IsDeltaLitOnly(seq.GetInst())) {
3330  CheckBioseqEndsForNAndGap(vec, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
3331  s_GetFlankingGapTypes(seq.GetInst(), fst, lst);
3332  }
3333 
3334  bool is_circular = false;
3336  is_circular = true;
3337  }
3338  EDiagSev sev;
3339  if (begin_n != eBioseqEndIsType_None) {
3340  sev = GetBioseqEndWarning(seq, is_circular, begin_n);
3341  PostErr(sev, eErr_SEQ_INST_TerminalNs, "N at beginning of sequence", seq);
3342  } else if (begin_gap != eBioseqEndIsType_None && fst != CSeq_gap::eType_contamination) {
3343  sev = GetBioseqEndWarning(seq, is_circular, begin_gap);
3344  PostErr(sev, eErr_SEQ_INST_TerminalGap, "Gap at beginning of sequence", seq);
3345  }
3346 
3347  if (end_n != eBioseqEndIsType_None) {
3348  sev = GetBioseqEndWarning(seq, is_circular, end_n);
3349  PostErr(sev, eErr_SEQ_INST_TerminalNs, "N at end of sequence", seq);
3350  } else if (end_gap != eBioseqEndIsType_None && lst != CSeq_gap::eType_contamination) {
3351  sev = GetBioseqEndWarning(seq, is_circular, end_gap);
3352  PostErr(sev, eErr_SEQ_INST_TerminalGap, "Gap at end of sequence", seq);
3353  }
3354 
3355  if (begin_ambig && ! s_WillReportTerminalGap(seq, bsh)) {
3357  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases",
3358  seq);
3359  }
3360  if (end_ambig && ! s_WillReportTerminalGap(seq, bsh)) {
3362  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases",
3363  seq);
3364  }
3365 
3366  // don't check N content for patent sequences
3367  if (SeqIsPatent(seq)) {
3368  return;
3369  }
3370 
3371  // if TSA, check for percentage of Ns and max stretch of Ns
3372  if (IsBioseqTSA(seq, m_Scope)) {
3373  ReportBadAssemblyGap(seq);
3374  if (! HasAssemblyOrNullGap(seq)) {
3375  bool n5 = false;
3376  bool n3 = false;
3377  TSeqPos num_ns = 0, max_stretch = 0;
3378  x_CalculateNsStretchAndTotal(vec, num_ns, max_stretch, n5, n3);
3379 
3380  int pct_n = (num_ns * 100) / seq.GetLength();
3381  if (pct_n > 10) {
3383  "Sequence contains " + NStr::IntToString(pct_n) + " percent Ns", seq);
3384  }
3385 
3386  if (max_stretch >= 15) {
3388  "Sequence has a stretch of " + NStr::IntToString(max_stretch) + " Ns", seq);
3389  } else {
3390  if (n5) {
3392  "Sequence has a stretch of at least 10 Ns within the first 20 bases", seq);
3393  }
3394  if (n3) {
3396  "Sequence has a stretch of at least 10 Ns within the last 20 bases", seq);
3397  }
3398  }
3399  }
3400  } else {
3401  // not TSA, just check for really high N percent
3402  int pct_n = PctNs(bsh);
3403  if (pct_n > 50) {
3405  "Sequence contains " + NStr::IntToString(pct_n) + " percent Ns", seq);
3406  }
3407  }
3408 
3409  if (! m_Imp.IsRefSeqConventions() && ! IsRefSeq(seq) && ! IsEmblOrDdbj(seq)) {
3410  if (IsWGS(bsh)) {
3411  ReportBadWGSGap(seq);
3412  } else if (IsBioseqTSA(seq, m_Scope)) {
3413  ReportBadTSAGap(seq);
3414  } else if (m_Imp.IsGenomeSubmission()) {
3415  ReportBadGenomeGap(seq);
3416  }
3417  }
3418  } catch (exception&) {
3419  // just ignore, and continue with the validation process.
3420  }
3421 }
3422 
3424 {
3425  // rough measure of where exception occurs - triggered by certain conditions set up in unit_test_validator
3426  int errPt = 0;
3427 
3428  try {
3429 
3430  errPt++;
3431 
3432  if (! seq.IsSetInst() || ! seq.GetInst().IsSetRepr()) {
3433  // can't check if no Inst or Repr
3434  return;
3435  }
3436  if (! seq.GetInst().IsSetMol() || seq.GetInst().GetMol() == CSeq_inst::eMol_aa) {
3437  // don't check proteins here
3438  return;
3439  }
3440  CSeq_inst::TRepr repr = seq.GetInst().GetRepr();
3441 
3442  // only check for raw or for delta sequences that are delta lit only
3443  if (repr == CSeq_inst::eRepr_virtual || repr == CSeq_inst::eRepr_map) {
3444  return;
3445  }
3446 
3448  if (! bsh) {
3449  // no check if Bioseq not in scope
3450  return;
3451  }
3452 
3453  errPt++;
3454 
3455  vector<TSeqPos> gapPositions;
3456 
3457  SSeqMapSelector sel;
3459 
3460  CSeqMap_CI gap_it(bsh, sel);
3461 
3462  errPt++;
3463 
3464  for (; gap_it; ++gap_it) {
3465 
3466  TSeqPos gp_start = gap_it.GetPosition();
3467  TSeqPos gp_end = gap_it.GetEndPosition() - 1;
3468 
3469  gapPositions.push_back(gp_start);
3470  gapPositions.push_back(gp_end);
3471 
3472  // cout << "gap start: " << gp_start << ", end: " << gp_end << endl;
3473  }
3474 
3475  errPt++;
3476 
3477  vector<TSeqPos> featPositions;
3478 
3479  CFeat_CI feat_it(bsh);
3480 
3481  errPt++;
3482 
3483  for (; feat_it; ++feat_it) {
3484 
3485  CSeq_feat_Handle feat = feat_it->GetSeq_feat_Handle();
3486  CSeqFeatData::ESubtype subtype = feat.GetFeatSubtype();
3487  if (subtype != CSeqFeatData::eSubtype_gap) continue;
3488 
3489  CConstRef<CSeq_loc> feat_loc(&feat_it->GetLocation());
3490 
3491  int ft_start = feat_loc->GetStart(eExtreme_Positional);
3492  int ft_end = feat_loc->GetStop(eExtreme_Positional);
3493 
3494  featPositions.push_back(ft_start);
3495  featPositions.push_back(ft_end);
3496 
3497  // cout << "feat start: " << ft_start << ", end: " << ft_end << endl;
3498  }
3499 
3500  errPt++;
3501 
3502  int remaininig_gaps = (int) gapPositions.size() / 2;
3503  int remaining_feats = (int) featPositions.size() / 2;
3504 
3505  if (remaininig_gaps < 1 || remaining_feats < 1) {
3506  return;
3507  }
3508 
3509  int gap_idx = 0;
3510  int feat_idx = 0;
3511 
3512  TSeqPos gap_start = gapPositions[gap_idx];
3513  gap_idx++;
3514  TSeqPos gap_end = gapPositions[gap_idx];
3515  gap_idx++;
3516  remaininig_gaps--;
3517 
3518  TSeqPos feat_start = featPositions[feat_idx];
3519  feat_idx++;
3520  TSeqPos feat_end = featPositions[feat_idx];
3521  feat_idx++;
3522  remaining_feats--;
3523 
3524  errPt++;
3525 
3526  while (remaininig_gaps >= 0 && remaining_feats >= 0) {
3527  if (gap_end < feat_start) {
3528  if (remaininig_gaps <= 0) {
3529  return;
3530  }
3531  gap_start = gapPositions[gap_idx];
3532  gap_idx++;
3533  gap_end = gapPositions[gap_idx];
3534  gap_idx++;
3535  remaininig_gaps--;
3536  } else if (feat_end < gap_start) {
3537  if (remaining_feats <= 0) {
3538  return;
3539  }
3540  feat_start = featPositions[feat_idx];
3541  feat_idx++;
3542  feat_end = featPositions[feat_idx];
3543  feat_idx++;
3544  remaining_feats--;
3545  } else {
3546  // cout << "overlap gap start: " << gap_start << ", end: " << gap_end << ", feat start: " << feat_start << ", end: " << feat_end << endl;
3547  if (feat_start != gap_start || feat_end != gap_end) {
3548  PostErr(eDiag_Warning, eErr_SEQ_INST_InstantiatedGapMismatch, "Gap feature location does not match delta gap coordinates", seq);
3549  }
3550  if (remaininig_gaps <= 0) {
3551  return;
3552  }
3553  gap_start = gapPositions[gap_idx];
3554  gap_idx++;
3555  gap_end = gapPositions[gap_idx];
3556  gap_idx++;
3557  remaininig_gaps--;
3558  if (remaining_feats <= 0) {
3559  return;
3560  }
3561  feat_start = featPositions[feat_idx];
3562  feat_idx++;
3563  feat_end = featPositions[feat_idx];
3564  feat_idx++;
3565  remaining_feats--;
3566  }
3567  }
3568 
3569  errPt++;
3570 
3571  } catch (const exception&) {
3573  string("Exception " + NStr::IntToString(errPt) + " in GapByGapInst"), seq);
3574  }
3575 }
3576 
3577 // Assumes that seq is eRepr_raw or eRepr_inst
// Checks the Seq-inst of a Bioseq that carries its own sequence data: fuzz,
// declared length, alphabet versus molecule type, stored data length versus
// declared length, per-residue validity, leading/trailing X, lower-case
// residues, termination and gap symbols in proteins, and runs of Ns or gap
// characters in raw nucleotide sequences.
// NOTE(review): in this listing the function signature and a number of
// statements are not shown (the embedded line numbers skip); the orphaned
// string-literal lines below are presumably the trailing arguments of
// error-posting calls whose first line is missing.
3579 {
3580  const CSeq_inst& inst = seq.GetInst();
3582  const string& rpr = tv->FindName(inst.GetRepr(), true);
3583 
 // Fuzz is only tolerated when the Seq-data is a gap.
3584  if (inst.IsSetFuzz() && (!inst.IsSetSeq_data() || !inst.GetSeq_data().IsGap())) {
3586  "Fuzzy length on " + rpr + " Bioseq", seq);
3587  }
3588 
 // An unset length is reported the same way as a length of 0.
3589  if (! inst.IsSetLength() || inst.GetLength() == 0) {
3590  string len = inst.IsSetLength() ?
3591  NStr::IntToString(inst.GetLength()) : "0";
3593  "Invalid Bioseq length [" + len + "]", seq);
3594  }
3595 
 // Raw HTGS 2 sequence: reported when there are no graphs on the Bioseq
 // and x_IsActiveFin() is false.
3596  if (inst.GetRepr() == CSeq_inst::eRepr_raw) {
3597  const CMolInfo* mi = nullptr;
3599  if (mi_desc) {
3600  mi = &(mi_desc->GetMolinfo());
3601  }
3602  CMolInfo::TTech tech = mi ? mi->GetTech() : CMolInfo::eTech_unknown;
3603  if (tech == CMolInfo::eTech_htgs_2 &&
3604  ! GraphsOnBioseq() &&
3605  ! x_IsActiveFin()) {
3607  "HTGS 2 raw seq has no gaps and no graphs", seq);
3608  }
3609  }
3610 
3612 
3613  CSeq_data::E_Choice seqtyp = inst.IsSetSeq_data() ?
 // Everything below applies only when the Seq-data is not a gap.
3615  if (seqtyp != CSeq_data::e_Gap) {
 // First pass over the alphabet: a nucleic acid alphabet on a protein,
 // a protein alphabet on a nucleic acid, or an unrecognised alphabet
 // ends validation of this Bioseq.
3616  switch (seqtyp) {
3617  case CSeq_data::e_Iupacna:
3618  case CSeq_data::e_Ncbi2na:
3619  case CSeq_data::e_Ncbi4na:
3620  case CSeq_data::e_Ncbi8na:
3621  case CSeq_data::e_Ncbipna:
3622  if (inst.IsAa()) {
3624  "Using a nucleic acid alphabet on a protein sequence",
3625  seq);
3626  return;
3627  }
3628  break;
3629  case CSeq_data::e_Iupacaa:
3630  case CSeq_data::e_Ncbi8aa:
3631  case CSeq_data::e_Ncbieaa:
3632  case CSeq_data::e_Ncbipaa:
3634  if (inst.IsNa()) {
3636  "Using a protein alphabet on a nucleic acid",
3637  seq);
3638  return;
3639  }
3640  break;
3641  case CSeq_data::e_Gap:
3642  break;
3643  default:
3645  "Sequence alphabet not set",
3646  seq);
3647  return;
3648  }
3649 
 // factor scales between the declared length and the stored data length
 // in the comparison below; check_alphabet enables the per-residue scan
 // further down.
3650  bool check_alphabet = false;
3651  unsigned int factor = 1;
3652  switch (seqtyp) {
3653  case CSeq_data::e_Iupacaa:
3654  case CSeq_data::e_Iupacna:
3655  case CSeq_data::e_Ncbieaa:
3657  check_alphabet = true;
3658  break;
3659  case CSeq_data::e_Ncbi8na:
3660  case CSeq_data::e_Ncbi8aa:
3661  break;
3662  case CSeq_data::e_Ncbi4na:
3663  factor = 2;
3664  break;
3665  case CSeq_data::e_Ncbi2na:
3666  factor = 4;
3667  break;
3668  case CSeq_data::e_Ncbipna:
3669  factor = 5;
3670  break;
3671  case CSeq_data::e_Ncbipaa:
3672  factor = 21;
3673  break;
3674  default:
3675  // Logically, should not occur
3677  "Sequence alphabet not set",
3678  seq);
3679  return;
3680  }
 // calc_len = declared length divided by factor, rounded up.
3681  TSeqPos calc_len = inst.IsSetLength() ? inst.GetLength() : 0;
3682  if (calc_len % factor) {
3683  calc_len += factor;
3684  }
3685  calc_len /= factor;
3686 
 // NOTE(review): GetLength() is called here without the IsSetLength()
 // guard used just above -- confirm this is safe for an unset length.
3687  string s_len = NStr::UIntToString(inst.GetLength());
3688 
3689  size_t data_len = GetDataLen(inst);
3690  string data_len_str = NStr::NumericToString(data_len * factor);
 // Too-short data ends validation; too-long data is reported and
 // checking continues.
3691  if (calc_len > data_len) {
3693  "Bioseq.seq_data too short [" + data_len_str +
3694  "] for given length [" + s_len + "]", seq);
3695  return;
3696  } else if (calc_len < data_len) {
3698  "Bioseq.seq_data is larger [" + data_len_str +
3699  "] than given length [" + s_len + "]", seq);
3700  }
3701 
3702  if (check_alphabet) {
3703  unsigned int trailingX = 0;
3704  size_t dashes = 0;
3705  bool leading_x = false, found_lower = false, cds_5_prime = false;
3706 
3709 
3710  size_t bad_cnt = 0;
 // pos is the 1-based position used in messages; pos - 1 is used where
 // a 0-based index is needed (see IsInGap below).
3711  TSeqPos pos = 1;
 // Two vectors are walked in lock-step: res comes from *sv and n_res
 // from sv_res (their declarations are not visible in this listing).
3712  for (CSeqVector_CI sv_iter(*sv), sv_res_iter(sv_res); (sv_iter) && (sv_res_iter); ++sv_iter, ++sv_res_iter) {
3713  CSeqVector::TResidue res = *sv_iter;
3714  CSeqVector::TResidue n_res = *sv_res_iter;
3715  if (! IsResidue(n_res)) {
3716  if (res == 'U' && bsh.IsSetInst_Mol() && bsh.GetInst_Mol() == CSeq_inst::eMol_rna) {
3717  // U is ok for RNA
3718  } else if (res == '*' && bsh.IsAa()) {
3719  trailingX = 0;
3720  } else if (res == '-' && bsh.IsAa()) {
3721  dashes++;
 // NOTE(review): the statement that consumes the message text
 // below is not visible in this listing.
3723  "Invalid residue [" + NStr::UIntToString(res)
3724  + "] at position [" + NStr::UIntToString(pos) + "]",
3725  seq);
3726  } else {
3727  if (! IsResidue(res)) {
 // Stop after more than 10 invalid residues.
3728  if (++bad_cnt > 10) {
3730  "More than 10 invalid residues. Checking stopped",
3731  seq);
3732  return;
3733  } else {
3735  "Invalid residue [" + NStr::UIntToString(res)
3736  + "] at position [" + NStr::UIntToString(pos) + "]",
3737  seq);
3738  }
3739  } else if (islower(res)) {
3740  found_lower = true;
3741  } else {
3742  string msg = "Invalid";
3743  if (seq.IsNa() && strchr ("EFIJLOPQXZ", res) != NULL) {
3744  msg += " nucleotide";
3745  } else if (seq.IsNa() && res == 'U') {
3746  msg += " nucleotide";
3747  }
3748  msg += " residue ";
3749  if (seqtyp == CSeq_data::e_Ncbistdaa) {
3750  msg += "[" + NStr::UIntToString(res) + "]";
3751  } else {
3752  msg += "'";
3753  msg += res;
3754  msg += "'";
3755  }
3756  msg += " at position [" + NStr::UIntToString(pos) + "]";
3757 
3759  msg, seq);
3760  }
3761  }
3762  } else if (res == '-' || sv->IsInGap(pos - 1)) {
3763  dashes++;
3764  } else if (res == '*') {
3765  trailingX = 0;
3766  } else if (res == 'X') {
3767  trailingX++;
3768  if (pos == 1) {
3769  leading_x = true;
3770  }
3771  } else if (! isalpha(res)) {
3772  string msg = "Invalid residue [";
3773  msg += res;
3774  msg += "] in position [" + NStr::UIntToString(pos) + "]";
3776  msg, seq);
3777  } else {
3778  trailingX = 0;
3779  }
3780  ++pos;
3781  }
3782 
3783  bool gap_at_start = HasBadProteinStart(*sv);
3784  size_t terminations = CountProteinStops(*sv);
3785 
3786  // only show leading or trailing X if product of NNN in nucleotide
3787  if (seq.IsAa() && (leading_x || trailingX > 0)) {
3788  CBioseq_Handle bsh2 = m_Scope->GetBioseqHandle(seq);
3789  const CSeq_feat* cds = GetCDSForProduct(bsh2);
3790  if (cds && cds->IsSetLocation()) {
3791  const CSeq_loc& cdsloc = cds->GetLocation();
3792  size_t dna_len = GetLength(cdsloc, m_Scope);
3793  if (dna_len > 5) {
3794  string cds_seq = GetSequenceStringFromLoc(cdsloc, *m_Scope);
 // Skip one leading base for frame 2 and two for frame 3 before
 // looking at the first three bases.
3795  if (cds->GetData().GetCdregion().IsSetFrame()) {
3796  if (cds->GetData().GetCdregion().GetFrame() == 2) {
3797  cds_seq = cds_seq.substr(1);
3798  } else if (cds->GetData().GetCdregion().GetFrame() == 3) {
3799  cds_seq = cds_seq.substr(2);
3800  }
3801  }
3802 
3803  if (! NStr::StartsWith(cds_seq, "NNN")) {
3804  leading_x = false;
3805  }
3806  if (cds_seq.length() >= 3) {
3807  string lastcodon = cds_seq.substr(cds_seq.length() - 3);
3808  if (! NStr::StartsWith(lastcodon, "NNN")) {
3809  trailingX = 0;
3810  }
3811  }
3812  }
3813  // only need to calculate cds_5_prime to set severity for subsequent eErr_SEQ_INST_LeadingX message
3814  if (leading_x) {
3815  if (cdsloc.IsPartialStart(eExtreme_Biological)) {
3816  cds_5_prime = true;
3817  }
3818  }
3819  }
3820  }
3821 
 // Leading X is a warning, downgraded to info when the CDS location has
 // a partial biological start.
3822  if (leading_x) {
3823  EDiagSev sev = eDiag_Warning;
3824  if (cds_5_prime) {
3825  sev = eDiag_Info;
3826  }
3828  "Sequence starts with leading X", seq);
3829  }
3830 
3831  if (trailingX > 0 && ! SuppressTrailingXMsg(seq)) {
3832  // Suppress if cds ends in "*" or 3' partial
3833  string msg = "Sequence ends in " +
3834  NStr::IntToString(trailingX) + " trailing X";
3835  if (trailingX > 1) {
3836  msg += "s";
3837  }
3839  }
3840 
3841  if (found_lower) {
3843  "Sequence contains lower-case characters", seq);
3844  }
3845 
3846  if (terminations > 0 || dashes > 0) {
3847  // Post error indicating terminations found in protein sequence
3848  // if possible, get gene and protein names
3849  CBioseq_Handle bsh3 = m_Scope->GetBioseqHandle(seq);
3850  // First get gene label
3851  string gene_label;
 // Label lookup is best-effort: any exception leaves the label empty
 // and the "gene?" / "prot?" placeholders are used instead.
3852  try {
3853  const CSeq_feat* cds = GetCDSForProduct(bsh3);
3854  if (cds) {
3856  if (gene && gene->IsSetData() && gene->GetData().IsGene()) {
3857  gene->GetData().GetGene().GetLabel(&gene_label);
3858  }
3859  }
3860  } catch (...) {
3861  }
3862  // get protein label
3863  string protein_label;
3864  try {
3865  CCacheImpl::SFeatKey prot_key(
3867  const CCacheImpl::TFeatValue& prots =
3868  GetCache().GetFeatFromCache(prot_key);
3869  if (! prots.empty()) {
3870  const CSeqFeatData_Base::TProt& first_prot =
3871  prots[0].GetData().GetProt();
3872  if (! RAW_FIELD_IS_EMPTY_OR_UNSET(first_prot, Name)) {
3873  protein_label = first_prot.GetName().front();
3874  }
3875  }
3876  } catch (const CException&) {
3877  } catch (const std::exception&) {
3878  }
3879 
3880  if (NStr::IsBlank(gene_label)) {
3881  gene_label = "gene?";
3882  }
3883  if (NStr::IsBlank(protein_label)) {
3884  protein_label = "prot?";
3885  }
3886 
 // A gap at the start is reported on its own; any remaining dashes are
 // reported as internal gap symbols (dashes - 1 when one of them is the
 // gap at the start).
3887  if (dashes > 0) {
3888  if (gap_at_start && dashes == 1) {
3890  "gap symbol at start of protein sequence (" + gene_label + " - " + protein_label + ")",
3891  seq);
3892  } else if (gap_at_start) {
3894  "gap symbol at start of protein sequence (" + gene_label + " - " + protein_label + ")",
3895  seq);
3897  "[" + NStr::SizetToString (dashes - 1) + "] internal gap symbols in protein sequence (" + gene_label + " - " + protein_label + ")",
3898  seq);
3899  } else {
3901  "[" + NStr::SizetToString (dashes) + "] internal gap symbols in protein sequence (" + gene_label + " - " + protein_label + ")",
3902  seq);
3903  }
3904  }
3905 
3906  if (terminations > 0) {
3907  string msg = "[" + NStr::SizetToString(terminations) + "] termination symbols in protein sequence";
3908  msg += " (" + gene_label + " - " + protein_label + ")";
 // The two branches differ by whether a CDS was found for the
 // product; the calls themselves are not visible in this listing.
3909  const CSeq_feat* cds = GetCDSForProduct(bsh3);
3910  if (cds) {
3912  } else {
3914  }
3915  }
3916  }
3917  }
3918 
3919  bool is_wgs = IsWGS(bsh);
3920 
3921  if (seq.IsNa() && seq.GetInst().GetRepr() == CSeq_inst::eRepr_raw) {
3922  // look for runs of Ns and gap characters
3923  bool has_gap_char = false;
3924  size_t run_len = 0;
3925  TSeqPos start_pos = 0;
3926  TSeqPos pos = 1;
 // Minimum run length that gets reported: 20 for WGS, 100 otherwise.
3928  const size_t run_len_cutoff = ( is_wgs ? 20 : 100 );
3929  for (CSeqVector_CI sv_iter(sv); (sv_iter); ++sv_iter, ++pos) {
3930  CSeqVector::TResidue res = *sv_iter;
3931  switch (res) {
3932  case 'N':
3933  if (run_len == 0) {
3934  start_pos = pos;
3935  }
3936  run_len++;
3937  break;
3938  case '-':
3939  has_gap_char = true;
3940  ///////////////////////////////////
3941  ////////// FALL-THROUGH! //////////
3942  ///////////////////////////////////
3944  default:
 // A run is only evaluated when a non-N residue ends it, and runs
 // that start at position 1 are skipped. As written, a run that
 // reaches the end of the sequence is not evaluated.
3945  if (run_len >= run_len_cutoff && start_pos > 1) {
3947  "Run of " + NStr::SizetToString (run_len) + " Ns in raw sequence starting at base "
3948  + NStr::IntToString (start_pos),
3949  seq);
3950  }
3951  run_len = 0;
3952  break;
3953  }
3954  }
3955  if (has_gap_char) {
3957  "Raw nucleotide should not contain gap characters", seq);
3958  }
3959  }
3960  }
3961 }
3962 
3963 
3964 //LCOV_EXCL_START
3965 //part of segset validation, no longer used
3966 // Assumes seq is eRepr_seg or eRepr_ref
// Checks a segmented / reference Bioseq: validates the location built from
// the extension data, compares its length with the declared Bioseq length,
// looks for several segments pointing at the same Bioseq, and (for proteins)
// compares MolInfo completeness with the partialness of the location.
// NOTE(review): the function signature, the case labels of the completeness
// switch and the first line of the error-posting calls are not visible in
// this listing (the embedded line numbers skip).
3968 {
3969  string id_test_label;
3970  seq.GetLabel(&id_test_label, CBioseq::eContent);
3971 
3973  const CSeq_inst& inst = seq.GetInst();
3974 
3975  // Validate extension data -- wrap in CSeq_loc_mix for convenience
3976  CRef<CSeq_loc> loc = GetLocFromSeq(seq);
3977  if (loc) {
3978  if (inst.IsSetRepr() && inst.GetRepr() == CSeq_inst::eRepr_seg) {
3979  m_Imp.ValidateSeqLoc(*loc, bsh, true, "Segmented Bioseq", seq);
3980  }
3981 
3982  // Validate Length
 // An unset Bioseq length is treated as 0 for the comparison.
3983  try {
3984  TSeqPos loclen = GetLength(*loc, m_Scope);
3985  TSeqPos seqlen = inst.IsSetLength() ? inst.GetLength() : 0;
3986  if (seqlen > loclen) {
3988  "Bioseq.seq_data too short [" + NStr::IntToString(loclen) +
3989  "] for given length [" + NStr::IntToString(seqlen) + "]",
3990  seq);
3991  } else if (seqlen < loclen) {
3993  "Bioseq.seq_data is larger [" + NStr::IntToString(loclen) +
3994  "] than given length [" + NStr::IntToString(seqlen) + "]",
3995  seq);
3996  }
3997  } catch (const CObjmgrUtilException&) {
3998  ERR_POST_X(6, Critical << "Unable to calculate length: ");
3999  }
4000  }
4001 
4002  // Check for multiple references to the same Bioseq
 // Every pair of single-Bioseq segments is compared; the message differs
 // depending on whether both locations are "whole".
4003  if (inst.IsSetExt() && inst.GetExt().IsSeg()) {
4004  const list<CRef<CSeq_loc>>& locs = inst.GetExt().GetSeg().Get();
4005  ITERATE(list<CRef<CSeq_loc>>, i1, locs) {
4006  if (! IsOneBioseq(**i1, m_Scope)) {
4007  continue;
4008  }
4009  const CSeq_id& id1 = GetId(**i1, m_Scope);
4010  list<CRef<CSeq_loc>>::const_iterator i2 = i1;
4011  for (++i2; i2 != locs.end(); ++i2) {
4012  if (! IsOneBioseq(**i2, m_Scope)) {
4013  continue;
4014  }
4015  const CSeq_id& id2 = GetId(**i2, m_Scope);
4016  if (IsSameBioseq(id1, id2, m_Scope)) {
4017  string sid;
4018  id1.GetLabel(&sid);
4019  if ((**i1).IsWhole() && (**i2).IsWhole()) {
4022  "Segmented sequence has multiple references to " +
4023  sid, seq);
4024  } else {
4027  "Segmented sequence has multiple references to " +
4028  sid + " that are not SEQLOC_WHOLE", seq);
4029  }
4030  }
4031  }
4032  }
4033  }
4034 
4035  // Check that partial sequence info on sequence segments is consistent with
4036  // partial sequence info on sequence -- aa sequences only
 // NOTE(review): loc is dereferenced here outside the "if (loc)" guard used
 // above -- confirm GetLocFromSeq() cannot return an empty CRef on this path.
4037  int partial = SeqLocPartialCheck(*loc, m_Scope);
4038  if (seq.IsAa()) {
4039  bool got_partial = false;
4040  FOR_EACH_DESCRIPTOR_ON_BIOSEQ (sd, seq) {
4041  if (! (*sd)->IsMolinfo() || ! (*sd)->GetMolinfo().IsSetCompleteness()) {
4042  continue;
4043  }
4044 
 // The case labels are missing from this listing; judging by the
 // messages the branches presumably handle partial, no-left, no-right
 // and no-ends completeness, in that order.
4045  switch ((*sd)->GetMolinfo().GetCompleteness()) {
4047  got_partial = true;
4048  if (! partial) {
4050  "Complete segmented sequence with MolInfo partial", seq);
4051  }
4052  break;
4054  if (! (partial & eSeqlocPartial_Start) || (partial & eSeqlocPartial_Stop)) {
4056  "No-left inconsistent with segmented SeqLoc",
4057  seq);
4058  }
4059  got_partial = true;
4060  break;
4062  if (! (partial & eSeqlocPartial_Stop) || (partial & eSeqlocPartial_Start)) {
4064  "No-right inconsistent with segmented SeqLoc",
4065  seq);
4066  }
4067  got_partial = true;
4068  break;
4070  if (! (partial & eSeqlocPartial_Start) || ! (partial & eSeqlocPartial_Stop)) {
4072  "No-ends inconsistent with segmented SeqLoc",
4073  seq);
4074  }
4075  got_partial = true;
4076  break;
4077  default:
4078  break;
4079  }
4080  }
4081  if (! got_partial) {
4083  "Partial segmented sequence without MolInfo partial", seq);
4084  }
4085  }
4086 }
4087 //LCOV_EXCL_STOP
4088 
4089 
4091 {
 // Picks a limit on the number of Ns keyed on `tech` and returns it.
 // Presumably this is the helper invoked as s_MaxNsInSeqLitForTech(tech)
 // when Seq-literals of a delta sequence are checked -- the signature is not
 // visible in this listing, confirm.
 // -1 is only the initial value: every switch branch overwrites it.
4092  int max_ns = -1;
4093 
4094  switch (tech) {
 // NOTE(review): the case labels that select the limit of 80 are not
 // visible in this listing.
4098  max_ns = 80;
4099  break;
4100  case CMolInfo::eTech_wgs:
4101  max_ns = 19;
4102  break;
4103  default:
4104  max_ns = 99;
4105  break;
4106  }
4107  return max_ns;
4108 }
4109 
4110 
4111 static bool s_IsSwissProt(const CBioseq& seq)
4112 {
4113  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
4114  if ((*it)->IsSwissprot()) {
4115  return true;
4116  }
4117  }
4118  return false;
4119 }
4120 
4122 {
 // Strict "less than" ordering of q1 versus q2: first by Seq-id
 // (CompareOrdered), then by positional start, then by positional stop.
 // Returns false when all three compare equal.
 // NOTE(review): the signature is not visible in this listing, and the
 // results of GetId() are dereferenced without a null check -- confirm the
 // callers only pass locations that have a single id.
4123  TIntId cmp = q1->GetId()->CompareOrdered(*(q2->GetId()));
4124  if (cmp < 0) {
4125  return true;
4126  } else if (cmp > 0) {
4127  return false;
4128  }
4129 
 // Same id: order by start position.
4130  TSeqPos start1 = q1->GetStart(eExtreme_Positional);
4131  TSeqPos start2 = q2->GetStart(eExtreme_Positional);
4132  if (start1 < start2) {
4133  return true;
4134  } else if (start2 < start1) {
4135  return false;
4136  }
4137 
 // Same id and start: order by stop position.
4138  TSeqPos stop1 = q1->GetStop(eExtreme_Positional);
4139  TSeqPos stop2 = q2->GetStop(eExtreme_Positional);
4140 
4141  if (stop1 < stop2) {
4142  return true;
4143  } else {
4144  return false;
4145  }
4146 }
4147 
4148 
4150 {
 // Returns true when some Seq-loc component of seq's delta extension has an
 // id that compares equal (CSeq_id::e_YES) to one of seq's own Seq-ids.
 // Returns false when seq has no inst, no ext, or a non-delta ext.
 // (The signature is not visible in this listing.)
4151  bool rval = false;
4152 
4153  if (! seq.IsSetInst() || ! seq.GetInst().IsSetExt() ||
4154  ! seq.GetInst().GetExt().IsDelta()) {
4155  return false;
4156  }
4157 
4158  ITERATE(CDelta_ext::Tdata, sg, seq.GetInst().GetExt().GetDelta().Get()) {
4159  if (! (*sg)) {
4160  // skip NULL element
4161  } else if ((*sg)->IsLoc()) {
 // Locations for which GetId() yields no id are skipped.
4162  const CSeq_id* id = (*sg)->GetLoc().GetId();
4163  if (id) {
4164  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
4165  if ((*id_it)->Compare(*id) == CSeq_id::e_YES) {
4166  rval = true;
4167  break;
4168  }
4169  }
4170  }
 // The break above only leaves the inner id loop; this one stops the
 // scan over delta components once a match was found.
4171  if (rval) break;
4172  }
4173  }
4174  return rval;
4175 }
4176 
4177 
4179 {
 // For an interval location, looks for features on the two positions just
 // before the interval and on the two positions just after it, using the
 // scope of far_bsh. Returns true as soon as either probe finds a feature;
 // returns false for non-interval locations.
 // (The signature is not visible in this listing.)
4180  if (! loc.IsInt()) {
4181  return false;
4182  }
4183 
4184  TSeqPos stop = loc.GetStop(eExtreme_Positional);
4185  TSeqPos start = loc.GetStart(eExtreme_Positional);
4186 
 // Probe [start - 2, start - 1]; skipped when start is 0 or 1.
4187  if (start > 1) {
4188  CRef<CSeq_loc> far_loc(new CSeq_loc());
4189  far_loc->SetInt().SetFrom(start - 2);
4190  far_loc->SetInt().SetTo(start - 1);
4191  far_loc->SetInt().SetId().Assign(loc.GetInt().GetId());
4192  CFeat_CI f(far_bsh.GetScope(), *far_loc);
4193  if (f) {
4194  return true;
4195  }
4196  }
 // Probe [stop + 1, stop + 2].
 // NOTE(review): if GetBioseqLength() returns an unsigned type (it is stored
 // in a TSeqPos elsewhere in this file), "length - 2" wraps for lengths
 // below 2 and the probe runs past the end of the sequence -- confirm
 // whether "stop + 2 < length" was intended.
4197  if (stop < far_bsh.GetBioseqLength() - 2) {
4198  CRef<CSeq_loc> far_loc(new CSeq_loc());
4199  far_loc->SetInt().SetFrom(stop + 1);
4200  far_loc->SetInt().SetTo(stop + 2);
4201  far_loc->SetInt().SetId().Assign(loc.GetInt().GetId());
4202  CFeat_CI f(far_bsh.GetScope(), *far_loc);
4203  if (f) {
4204  return true;
4205  }
4206  }
4207  return false;
4208 }
4209 
4210 
4212  const CSeq_loc& loc,
4213  const CBioseq& seq,
4214  TSeqPos& len)
4215 {
 // Validates one Seq-loc component of a delta sequence and adds its length
 // to len (the running total kept by the caller).
 //   loc - the delta component location
 //   seq - the delta Bioseq the component belongs to (used for reporting)
 //   len - in/out: incremented by the length of loc unless that length is
 //         the TSeqPos maximum
 // NOTE(review): the first line of the signature and the first line of the
 // error-posting calls are not visible in this listing.
4216  if (loc.IsWhole()) {
4218  "Delta seq component should not be of type whole", seq);
4219  }
4220 
4221  const CSeq_id* id = loc.GetId();
4222  if (id) {
4223  if (id->IsGi() && loc.GetId()->GetGi() == ZERO_GI) {
4225  "Delta component is gi|0", seq);
4226  }
 // The far-sequence checks only run for non-whole locations whose id is
 // one of the types listed here.
4227  if (! loc.IsWhole()
4228  && (id->IsGi()
4229  || id->IsGenbank()
4230  || id->IsEmbl()
4231  || id->IsDdbj() || id->IsTpg()
4232  || id->IsTpe()
4233  || id->IsTpd()
4234  || id->IsOther())) {
4235  TSeqPos stop = loc.GetStop(eExtreme_Positional);
 // Exceptions raised while looking at the far sequence are swallowed.
4236  try {
 // bsh is declared on a line that is missing from this listing;
 // presumably it is the handle of the far component sequence.
4238  if (bsh) {
4239  TSeqPos seq_len = bsh.GetBioseqLength();
 // stop is 0-based, so it must be strictly below the length.
4240  if (seq_len <= stop) {
4241  string id_label = id->AsFastaString();
4243  "Seq-loc extent (" + NStr::IntToString (stop + 1)
4244  + ") greater than length of " + id_label
4245  + " (" + NStr::IntToString(seq_len) + ")",
4246  seq);
4247  }
4248  if (! m_Imp.IsRefSeq() && IsWGS(seq) && HasExcludedAnnotation(loc, bsh)) {
4249  string id_label = id->AsFastaString();
4251  "Scaffold points to some but not all of " +
4252  id_label + ", excluded portion contains features", seq);
4253  }
4254  } else {
4256  "Unable to find far delta sequence component", seq);
4257  }
4258  } catch (const CException&) {
4259  } catch (const std::exception&) {
4260  }
4261  }
4262  }
4263 
4264  try {
4265  if (seq.IsSetInst()) {
4266  const CSeq_inst& inst = seq.GetInst();
4267  TSeqPos loc_len = GetLength(loc, m_Scope);
 // The TSeqPos maximum is reported as a "-1" length and is not added
 // to the running total.
4268  if (loc_len == numeric_limits<TSeqPos>::max()) {
4270  "-1 length on seq-loc of delta seq_ext", seq);
4271  string loc_str;
4272  loc.GetLabel(&loc_str);
4273  if (loc_str.empty()) {
4274  loc_str = "?";
4275  }
4276  if (x_IsDeltaLitOnly(inst)) {
4278  "Short length (-1) on seq-loc (" + loc_str + ") of delta seq_ext", seq);
4279  }
4280  } else {
4281  len += loc_len;
4282  }
 // Lengths of 10 or less are flagged as short, but only when
 // x_IsDeltaLitOnly(inst) is true.
4283  if (loc_len <= 10) {
4284  string loc_str;
4285  loc.GetLabel(&loc_str);
4286  if (loc_str.empty()) {
4287  loc_str = "?";
4288  }
4289  if (x_IsDeltaLitOnly(inst)) {
4291  "Short length (" + NStr::SizetToString(loc_len) +
4292  ") on seq-loc (" + loc_str + ") of delta seq_ext", seq);
4293  }
4294  }
4295  }
4296 
4297  } catch (const CObjmgrUtilException&) {
 // Reached when the length of the location could not be computed.
4298  string loc_str;
4299  loc.GetLabel(&loc_str);
4300  if (loc_str.empty()) {
4301  loc_str = "?";
4302  }
4304  "No length for Seq-loc (" + loc_str + ") of delta seq-ext",
4305  seq);
4306  }
4307 }
4308 
4309 
4310 static TSeqPos s_GetDeltaLen(const CDelta_seq& seg, CScope* scope)
4311 {
4312  if (seg.IsLiteral()) {
4313  return seg.GetLiteral().GetLength();
4314  } else if (seg.IsLoc()) {
4315  return GetLength(seg.GetLoc(), scope);
4316  } else {
4317  return 0;
4318  }
4319 }
4320 
4321 
// Display names for linkage-evidence kinds, used when reporting repeated
// linkage evidence on a Seq-gap. Slots 0-10 are indexed directly by the
// integer linkage-evidence type; slot 11 ("other") is used for type value
// 255 and slot 12 ("UNKNOWN VALUE") for any value outside 0-10 that is not
// 255. The table is only read, so it is const.
static const string linkEvStrings[] = {
    "paired-ends",
    "align genus",
    "align xgenus",
    "align trnscpt",
    "within clone",
    "clone contig",
    "map",
    "strobe",
    "unspecified",
    "pcr",
    "proximity ligation",
    "other",
    "UNKNOWN VALUE"
};
4337 
4338 /*bsv
4339 static bool s_IsGapComponent(const CDelta_seq& seg)
4340 {
4341  if (! seg.IsLiteral()) return false;
4342  const CSeq_literal& lit = seg.GetLiteral();
4343  if (! lit.IsSetSeq_data()) return true;
4344  if (lit.GetSeq_data().IsGap() && lit.GetLength() > 0) return true;
4345  return false;
4346 }
4347 */
4348 
// Returns true when at least one linkage-evidence entry that can report a
// type has type value 8, the slot named "unspecified" in linkEvStrings.
// NOTE(review): the loop header that declares ev_itr is not visible in this
// listing; presumably it iterates over the linkage evidence of `gap`.
4349 static bool s_IsUnspecified(const CSeq_gap& gap)
4350 {
4351  bool is_unspec = false;
4353  const CLinkage_evidence& evidence = **ev_itr;
 // Entries without a retrievable type are ignored.
4354  if (! evidence.CanGetType())
4355  continue;
4356  int linktype = evidence.GetType();
 // The scan does not stop at the first match; the flag just stays set.
4357  if (linktype == 8) {
4358  is_unspec = true;
4359  }
4360  }
4361  return is_unspec;
4362 }
4363 
4364 
4366 {
 // Decides whether a gap at the end of a delta sequence should be ignored,
 // given the Bioseq handle (bsh) and the gap type (gap_type).
 // (The signature and the second half of the first condition are not
 // visible in this listing.)
4367  // always ignore for circular sequences
4368  if (bsh.GetInst().IsSetTopology() &&
4370  return true;
4371  }
4372 
4373  // ignore if location is genomic and gap is of certain type
 // Any gap type other than the five listed here is never ignored.
4374  if (gap_type != CSeq_gap::eType_centromere &&
4375  gap_type != CSeq_gap::eType_telomere &&
4376  gap_type != CSeq_gap::eType_heterochromatin &&
4377  gap_type != CSeq_gap::eType_short_arm &&
4378  gap_type != CSeq_gap::eType_contamination) {
4379  return false;
4380  }
4381 
 // For the listed gap types, ignore only when the source descriptor found
 // on the Bioseq has its genome set to chromosome.
4382  CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
4383  if (src && src->GetSource().IsSetGenome() && src->GetSource().GetGenome() == CBioSource::eGenome_chromosome) {
4384  return true;
4385  } else {
4386  return false;
4387  }
4388 }
4389 
4390 
4391 // Assumes seq is a delta sequence
// Walks the components of a delta Bioseq and reports: a disallowed
// technique, NULL or unset components, bad residues and long runs of Ns in
// literals, zero-length or oddly sized gaps, gaps at either end, adjacent
// gaps, missing gaps in HTGS records, a mismatch between the summed component
// length and the declared length, overlapping far locations,
// self-reference, and Ns next to gaps.
// NOTE(review): the function signature and a number of statements are not
// shown in this listing (the embedded line numbers skip); the orphaned
// string-literal lines are presumably the trailing arguments of
// error-posting calls whose first line is missing.
4393 {
4394  const CSeq_inst& inst = seq.GetInst();
4395 
4396  // Get CMolInfo and tech used for validating technique and gap positioning
4397  const CMolInfo* mi = nullptr;
4399  if (mi_desc) {
4400  mi = &(mi_desc->GetMolinfo());
4401  }
4402  CMolInfo::TTech tech = mi ? mi->GetTech() : CMolInfo::eTech_unknown;
4403 
 // NOTE(review): no early return is visible after this report, yet the
 // component loop below calls inst.GetExt().GetDelta() unconditionally --
 // confirm this cannot be reached with an unset or non-delta ext.
4404  if (! inst.IsSetExt() || ! inst.GetExt().IsDelta() ||
4405  inst.GetExt().GetDelta().Get().empty()) {
4407  "No CDelta_ext data for delta Bioseq", seq);
4408  }
4409 
 // any_tech_ok: set when an id passes IsNTNCNWACAccession(), which disables
 // the technique check below. has_gi: set when a GI id is seen before the
 // scan stops.
4410  bool any_tech_ok = false;
4411  bool has_gi = false;
4412  FOR_EACH_SEQID_ON_BIOSEQ (id_it, seq) {
4413  if (IsNTNCNWACAccession(**id_it)) {
4414  any_tech_ok = true;
4415  break;
4416  } else if ((*id_it)->IsGi()) {
4417  has_gi = true;
4418  }
4419  }
 // Nucleotide delta sequences are expected to use one of the listed
 // techniques (part of the list is missing from this listing).
4421  if (! any_tech_ok && seq.IsNa()
4422  && tech != CMolInfo::eTech_htgs_0 && tech != CMolInfo::eTech_htgs_1
4423  && tech != CMolInfo::eTech_htgs_2 && tech != CMolInfo::eTech_htgs_3
4426  && tech != CMolInfo::eTech_htc && tech != CMolInfo::eTech_barcode
4427  && tech != CMolInfo::eTech_tsa) {
4429  "Delta seq technique should not be [" + NStr::IntToString(tech) + "]", seq);
4430  }
4431 
4432  // set severity for first / last gap error
 // State carried across components:
 //   len  - running sum of component lengths
 //   seg  - 1-based component counter used in messages
 //   last_is_gap / prev_gap_* - describe the previous component
4433  TSeqPos len = 0;
4434  TSeqPos seg = 0;
4435  bool last_is_gap = false;
4436  int prev_gap_linkage = -1;
4437  CSeq_gap::TType prev_gap_type = CSeq_gap::eType_unknown;
4438  int gap_linkage = -1;
4440  size_t num_gaps = 0;
4441  size_t num_adjacent_gaps = 0;
4442  bool non_interspersed_gaps = false;
4443  bool first = true;
4444  int num_gap_known_or_spec = 0;
4445  int num_gap_unknown_unspec = 0;
4446 
4447  vector<CConstRef<CSeq_loc> > delta_locs;
4448 
4449  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
4450  ++seg;
4451  if (! (*sg)) {
4453  "NULL pointer in delta seq_ext valnode (segment " +
4454  NStr::IntToString(seg) + ")", seq);
4455  continue;
4456  }
4457  switch ((**sg).Which()) {
4458  case CDelta_seq::e_Loc: {
4459  const CSeq_loc& loc = (**sg).GetLoc();
4460  CConstRef<CSeq_loc> tmp(&loc);
4461  delta_locs.push_back(tmp);
4462 
4463  ValidateDeltaLoc(loc, seq, len);
4464 
 // Two consecutive non-gap components mean gaps are not interspersed
 // between all sequence runs.
4465  if (! last_is_gap && ! first) {
4466  non_interspersed_gaps = true;
4467  }
4468  last_is_gap = false;
4469  prev_gap_linkage = -1;
4470  prev_gap_type = CSeq_gap::eType_unknown;
 // NOTE(review): gap_linkage is reset with a gap *type* constant here,
 // while everywhere else it is reset to -1 -- confirm intent.
4471  gap_linkage = CSeq_gap::eType_unknown;
4472  first = false;
4473  } break;
4474  case CDelta_seq::e_Literal: {
4475  // The C toolkit code checks for valid alphabet here
4476  // The C++ object serialization will not load if invalid alphabet
4477  // so no check needed here
4478  const CSeq_literal& lit = (*sg)->GetLiteral();
 // start_len is the running length before this literal; it is used to
 // report where the literal starts.
4479  TSeqPos start_len = len;
4480  len += lit.CanGetLength() ? lit.GetLength() : 0;
4481  if (lit.IsSetSeq_data() && ! lit.GetSeq_data().IsGap()
4482  && (! lit.IsSetLength() || lit.GetLength() == 0)) {
4484  "Seq-lit of length 0 in delta chain", seq);
4485  }
4486 
4487  // Check for invalid residues
 // Literal with real sequence data (not a gap).
4488  if (lit.IsSetSeq_data() && !lit.GetSeq_data().IsGap()) {
4489  if (! last_is_gap && ! first) {
4490  non_interspersed_gaps = true;
4491  }
4492  last_is_gap = false;
4493  prev_gap_linkage = -1;
4494  prev_gap_type = CSeq_gap::eType_unknown;
4495  const CSeq_data& data = lit.GetSeq_data();
 // badIdx receives the 0-based indices of invalid residues.
4496  vector<TSeqPos> badIdx;
4497  CSeqportUtil::Validate(data, &badIdx);
 // ss points at the text form of the data for the three text
 // alphabets; Ncbistdaa is reported directly from its byte vector.
4498  const string* ss = nullptr;
4499  switch (data.Which()) {
4500  case CSeq_data::e_Iupacaa:
4501  ss = &data.GetIupacaa().Get();
4502  break;
4503  case CSeq_data::e_Iupacna:
4504  ss = &data.GetIupacna().Get();
4505  break;
4506  case CSeq_data::e_Ncbieaa:
4507  ss = &data.GetNcbieaa().Get();
4508  break;
4509  case CSeq_data::e_Ncbistdaa: {
4510  const vector<char>& c = data.GetNcbistdaa().Get();
4511  ITERATE (vector<TSeqPos>, ci, badIdx) {
4513  "Invalid residue [" +
4514  NStr::IntToString((int)c[*ci]) + "] at position [" +
4515  NStr::IntToString((*ci) + 1) + "]", seq);
4516  }
4517  } break;
4518  default:
4519  break;
4520  }
4521 
4522  if (ss) {
4523  ITERATE (vector<TSeqPos>, it, badIdx) {
4525  "Invalid residue [" +
4526  ss->substr(*it, 1) + "] at position [" +
4527  NStr::IntToString((*it) + 1) + "]", seq);
4528  }
4529  }
4530 
 // The run-of-Ns check needs a MolInfo descriptor because the limit
 // depends on the technique.
4531  if (mi) {
4532  // Count adjacent Ns in Seq-lit
4533  int max_ns = s_MaxNsInSeqLitForTech(tech);
4534  size_t adjacent_ns = x_CountAdjacentNs(lit);
4535  if (max_ns >= 0 && adjacent_ns > unsigned(max_ns)) {
4537  "Run of " + NStr::NumericToString(adjacent_ns) +
4538  " Ns in delta component " + NStr::UIntToString(seg) +
4539  " that starts at base " + NStr::UIntToString(start_len + 1),
4540  seq);
4541  }
4542  }
4543  } else {
 // Literal without sequence data, or with gap data: treated as a gap.
4544  gap_linkage = -1;
4545  gap_type = CSeq_gap::eType_unknown;
4546  if (lit.IsSetSeq_data() && lit.GetSeq_data().IsGap()) {
4547  const CSeq_data& data = lit.GetSeq_data();
4548  if (data.Which() == CSeq_data::e_Gap) {
4549  const CSeq_gap& gap = data.GetGap();
4550 
 // Tally gaps that are "unknown type + unspecified evidence"
 // separately from all other typed gaps.
4551  if (gap.IsSetType()) {
4552  gap_type = gap.GetType();
4553  if (gap_type == CSeq_gap::eType_unknown && s_IsUnspecified(gap)) {
4554  num_gap_unknown_unspec++;
4555  } else {
4556  num_gap_known_or_spec++;
4557  }
4558  }
4559  if (gap.IsSetLinkage())
4560  gap_linkage = gap.GetLinkage();
4561  }
4562  }
 // A gap as the very first component: error for HTGS techniques,
 // warning otherwise.
4563  if (first && ! x_IgnoreEndGap(bsh, gap_type) && ! s_WillReportTerminalGap(seq, bsh)) {
4564  EDiagSev sev = eDiag_Error;
4565  if (tech != CMolInfo::eTech_htgs_0 && tech != CMolInfo::eTech_htgs_1
4566  && tech != CMolInfo::eTech_htgs_2 && tech != CMolInfo::eTech_htgs_3) {
4567  sev = eDiag_Warning;
4568  }
4570  "First delta seq component is a gap", seq);
4571  }
4572 
 // Count this gap as adjacent to the previous one when the previous
 // component was a gap and (same type, or different linkage, or the
 // linkage is not "unlinked"), unless either gap is contamination.
4573  if (last_is_gap &&
4574  (prev_gap_type == gap_type ||
4575  prev_gap_linkage != gap_linkage ||
4576  gap_linkage != CSeq_gap::eLinkage_unlinked)) {
4577  if (prev_gap_type != CSeq_gap::eType_contamination && gap_type != CSeq_gap::eType_contamination) {
4578  ++num_adjacent_gaps;
4579  }
4580  }
4581 
 // Explicit Seq-gap data is validated by ValidateSeqGap; literals
 // without data are checked for zero length, and for a length other
 // than 100 when fuzz is set.
4582  if (lit.IsSetSeq_data() && lit.GetSeq_data().IsGap()) {
4583  ValidateSeqGap(lit.GetSeq_data().GetGap(), seq);
4584  } else if (! lit.CanGetLength() || lit.GetLength() == 0) {
4585  if (! lit.IsSetFuzz() || ! lit.GetFuzz().IsLim() || lit.GetFuzz().GetLim() != CInt_fuzz::eLim_unk) {
4587  "Gap of length 0 in delta chain", seq);
4588  } else {
4590  "Gap of length 0 with unknown fuzz in delta chain", seq);
4591  }
4592  } else if (lit.CanGetLength() && lit.GetLength() != 100) {
4593  if (lit.IsSetFuzz()) {
4595  "Gap of unknown length should have length 100", seq);
4596  }
4597  }
4598  last_is_gap = true;
4599  prev_gap_type = gap_type;
4600  prev_gap_linkage = gap_linkage;
4601  ++num_gaps;
4602  }
4603  first = false;
4604  } break;
4605  default:
4607  "CDelta_seq::Which() is e_not_set", seq);
4608  }
4609  }
4610 
 // Reported only when every typed gap was "unknown type + unspecified".
4611  if (num_gap_unknown_unspec > 0 && num_gap_known_or_spec == 0) {
4612  if (num_gap_unknown_unspec > 1) {
4614  "All " + NStr::IntToString(num_gap_unknown_unspec) +
4615  " Seq-gaps have unknown type and unspecified linkage", seq);
4616  } else {
4618  "Single Seq-gap has unknown type and unspecified linkage", seq);
4619  }
4620  }
4621 
 // Compare the declared length with the sum of the component lengths.
4622  if (inst.GetLength() > len) {
4624  "Bioseq.seq_data too short [" + NStr::IntToString(len) +
4625  "] for given length [" + NStr::IntToString(inst.GetLength()) +
4626  "]", seq);
4627  } else if (inst.GetLength() < len) {
4629  "Bioseq.seq_data is larger [" + NStr::IntToString(len) +
4630  "] than given length [" + NStr::IntToString(inst.GetLength()) +
4631  "]", seq);
4632  }
 // HTGS 0/1/2 records without a GI should have a gap between every pair of
 // sequence runs; severity drops to info when a RefGeneTracking user object
 // is found (the declaration of desc_i is missing from this listing).
4633  if (non_interspersed_gaps && ! has_gi && mi &&
4634  (tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
4635  tech == CMolInfo::eTech_htgs_2)) {
4636  EDiagSev missing_gaps_sev = eDiag_Error;
4638  while (desc_i) {
4639  if (desc_i->GetUser().IsRefGeneTracking()) {
4640  missing_gaps_sev = eDiag_Info;
4641  break;
4642  }
4643  ++desc_i;
4644  }
4645 
4646  PostErr(missing_gaps_sev, eErr_SEQ_INST_MissingGaps,
4647  "HTGS delta seq should have gaps between all sequence runs", seq);
4648  }
4649  if (num_adjacent_gaps >= 1) {
4650  string msg = (num_adjacent_gaps == 1) ?
4651  "There is 1 adjacent gap in delta seq" :
4652  "There are " + NStr::SizetToString(num_adjacent_gaps) +
4653  " adjacent gaps in delta seq";
4655  }
 // gap_type here still holds the type of the last gap component processed.
4656  if (last_is_gap && ! x_IgnoreEndGap(bsh, gap_type) && ! s_WillReportTerminalGap(seq, bsh)) {
4657  EDiagSev sev = eDiag_Error;
4658  if (tech != CMolInfo::eTech_htgs_0 && tech != CMolInfo::eTech_htgs_1
4659  && tech != CMolInfo::eTech_htgs_2 && tech != CMolInfo::eTech_htgs_3) {
4660  sev = eDiag_Warning;
4661  }
4663  "Last delta seq component is a gap", seq);
4664  }
4665 
4666  // Validate technique
4667  if (num_gaps == 0 && mi) {
4668  if (tech == CMolInfo::eTech_htgs_2 &&
4669  ! GraphsOnBioseq() &&
4670  ! x_IsActiveFin()) {
4672  "HTGS 2 delta seq has no gaps and no graphs", seq);
4673  }
4674  }
4675 
4676  // look for multiple delta locs overlapping
 // The locations are sorted with s_LocSortCompare and only neighbouring
 // entries of the sorted list are compared.
4677  if (delta_locs.size() > 1) {
4678  stable_sort(delta_locs.begin(), delta_locs.end(), s_LocSortCompare);
4679  vector<CConstRef<CSeq_loc>>::iterator it1 = delta_locs.begin();
4680  vector<CConstRef<CSeq_loc>>::iterator it2 = it1;
4681  ++it2;
4682  while (it2 != delta_locs.end()) {
4683  if ((*it1)->GetId()->Compare(*(*it2)->GetId()) == CSeq_id::e_YES
4684  && Compare (**it1, **it2, m_Scope, fCompareOverlapping) != eNoOverlap) {
4685  string seq_label = (*it1)->GetId()->AsFastaString();
4687  "Overlapping delta range " + NStr::IntToString((*it2)->GetStart(eExtreme_Positional) + 1)
4688  + "-" + NStr::IntToString((*it2)->GetStop(eExtreme_Positional) + 1)
4689  + " and " + NStr::IntToString((*it1)->GetStart(eExtreme_Positional) + 1)
4690  + "-" + NStr::IntToString((*it1)->GetStop(eExtreme_Positional) + 1)
4691  + " on a Bioseq " + seq_label,
4692  seq);
4693  }
4694  ++it1;
4695  ++it2;
4696  }
4697  }
4698 
4699  if (IsSelfReferential(seq)) {
4701  "Self-referential delta sequence", seq);
4702  }
4703 
4704  // look for Ns next to gaps
 // pos is the 0-based offset of the current component; the residue on the
 // non-gap side of each gap boundary is tested for 'N'. Exceptions raised
 // while reading the sequence are swallowed. (The declaration of sv is
 // missing from this listing.)
4705  if (seq.IsNa() && seq.GetLength() > 1 && x_IsDeltaLitOnly(inst)) {
4706  try {
4707  TSeqPos pos = 0;
4709  ITERATE (CDelta_ext::Tdata, delta_i, seq.GetInst().GetExt().GetDelta().Get()) {
4710  if (delta_i->Empty()) {
4711  continue; // Ignore NULLs, reported separately above.
4712  }
4713  const CDelta_seq& seg2 = **delta_i;
4714  TSeqPos delta_len = s_GetDeltaLen(seg2, m_Scope);
 // This component starts in a gap: look at the residue just before it.
4715  if (pos > 0) {
4716  if (sv.IsInGap(pos)) {
4717  CSeqVector::TResidue res = sv[pos - 1];
4718  if (res == 'N' && ! sv.IsInGap(pos - 1)) {
4720  "Ambiguous residue N is adjacent to a gap around position " + NStr::SizetToString (pos + 1),
4721  seq);
4722  }
4723  }
4724  }
 // This component ends in a gap: look at the residue just after it.
4725  if (delta_len > 0 && pos + delta_len < len) {
4726  if (sv.IsInGap(pos + delta_len - 1)) {
4727  CSeqVector::TResidue res = sv[pos + delta_len];
4728  if (res == 'N' && ! sv.IsInGap(pos + delta_len)) {
4730  "Ambiguous residue N is adjacent to a gap around position " + NStr::SizetToString(pos + delta_len + 1),
4731  seq);
4732  }
4733  }
4734  }
4735  pos += delta_len;
4736  }
4737  } catch (const CException&) {
4738  } catch (const std::exception&) {
4739  }
4740  }
4741 
4742 }
4743 
4744 
4745 bool s_HasGI(const CBioseq& seq)
4746 {
4747  bool has_gi = false;
4748  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
4749  if ((*id_it)->IsGi()) {
4750  has_gi = true;
4751  break;
4752  }
4753  }
4754  return has_gi;
4755 }
4756 
4757 
4759 {
// Checks that a Seq-gap's type, linkage and linkage-evidence fields agree
// with each other. Two main cases: the gap has linkage evidence (first
// branch) or it has none (else branch).
// NOTE(review): in this listing the function signature, the head of the
// linkage-evidence loop and the first line of every error-reporting call are
// not visible; only the message text and trailing `seq` argument of each call
// remain. Confirm against the full source before editing.
4760  if (gap.IsSetLinkage_evidence()) {
4761  int linkcount = 0;
// Per-type tally of linkage-evidence entries: slots 0..10 are indexed
// directly by the evidence type value, slot 11 collects type 255, and
// slot 12 collects every other out-of-range value.
4762  int linkevarray[13];
4763  for (int i = 0; i < 13; i++) {
4764  linkevarray[i] = 0;
4765  }
4766  bool is_unspec = false;
4768  const CLinkage_evidence& evidence = **ev_itr;
// Entries without a readable type are ignored and not counted.
4769  if (! evidence.CanGetType())
4770  continue;
4771  int linktype = evidence.GetType();
// Type value 8 is tracked separately; the messages below refer to
// slot 8 as 'unspecified' (presumably the 'unspecified' enum value --
// TODO confirm against CLinkage_evidence).
4772  if (linktype == 8) {
4773  is_unspec = true;
4774  }
4775  linkcount++;
4776  if (linktype == 255) {
4777  (linkevarray[11])++;
4778  } else if (linktype < 0 || linktype > 10) {
4779  (linkevarray[12])++;
4780  } else {
4781  (linkevarray[linktype])++;
4782  }
4783  }
// Type-8 evidence mixed with any other counted evidence.
4784  if (linkevarray[8] > 0 && linkcount > linkevarray[8]) {
4786  "Seq-gap type has unspecified and additional linkage evidence", seq);
4787  }
// Any evidence slot that was counted more than once is reported, using
// the label from linkEvStrings (defined outside this block).
4788  for (int i = 0; i < 13; i++) {
4789  if (linkevarray[i] > 1) {
4791  "Linkage evidence '" + linkEvStrings[i] + "' appears " +
4792  NStr::IntToString(linkevarray[i]) + " times", seq);
4793  }
4794  }
// Having linkage evidence requires linkage to be set and equal to 'linked'.
4795  if (! gap.IsSetLinkage() || gap.GetLinkage() != CSeq_gap::eLinkage_linked) {
4797  "Seq-gap with linkage evidence must have linkage field set to linked", seq);
4798  }
4799  if (gap.IsSetType()) {
4800  int gaptype = gap.GetType();
// fragment, clone, repeat and scaffold gaps pass without further checks;
// every other gap type goes through the special cases below.
4801  if (gaptype != CSeq_gap::eType_fragment &&
4802  gaptype != CSeq_gap::eType_clone &&
4803  gaptype != CSeq_gap::eType_repeat &&
4804  gaptype != CSeq_gap::eType_scaffold) {
4805  if (gaptype == CSeq_gap::eType_unknown && is_unspec) {
4806  /* suppress for legacy records */
4807  } else if (gaptype == CSeq_gap::eType_contamination) {
// Accepted only when every counted evidence entry is of type 8.
4808  if (linkevarray[8] > 0 && linkcount == linkevarray[8]) {
4809  /* contamination can only have linked unspecified */
4810  } else {
4812  "Contamination gaps must have linkage evidence 'unspecified'", seq);
4813  }
4814  } else {
4816  "Seq-gap of type " + NStr::IntToString(gaptype) +
4817  " should not have linkage evidence", seq);
4818  }
4819  }
4820  }
4821  } else {
// No linkage evidence present: some gap types require it.
4822  if (gap.IsSetType()) {
4823  int gaptype = gap.GetType();
4824  if (gaptype == CSeq_gap::eType_scaffold) {
4826  "Seq-gap type == scaffold is missing required linkage evidence", seq);
4827  }
4828  if (gaptype == CSeq_gap::eType_repeat && gap.IsSetLinkage() && gap.GetLinkage() == CSeq_gap::eLinkage_linked) {
// The repeat/linked complaint is suppressed when the Bioseq has a GI and
// a create-date descriptor that compares as before 2012-10-01.
// NOTE(review): the loop header over the descriptors is not visible here.
4829  bool suppress_SEQ_INST_SeqGapProblem = false;
4830  if (seq.IsSetDescr() && s_HasGI(seq)) {
4832  {
4833  if ((**it).IsCreate_date())
4834  {
4835  CDate threshold_date(CTime(2012, 10, 1));
4836  if ((**it).GetCreate_date().Compare(threshold_date) == CDate::eCompare_before)
4837  suppress_SEQ_INST_SeqGapProblem = true;
// The unbraced `if` above guards only the assignment; this break runs for
// the first create-date found, whatever the comparison result.
4838  break;
4839  }
4840  }
4841  }
4842  if (! suppress_SEQ_INST_SeqGapProblem)
4844  "Seq-gap type == repeat and linkage == linked is missing required linkage evidence", seq);
4845 
4846  }
// A contamination gap without any linkage evidence is always reported.
4847  if (gaptype == CSeq_gap::eType_contamination) {
4849  "Contamination gap-types must be linked and have linkage-evidence of type 'unspecified'", seq);
4850  }
4851  }
4852  }
4853 }
4854 
4855 
4857  const CSeq_inst& inst,
4858  const CBioseq& seq)
4859 {
4860  bool rtn = true;
4862  string rpr = tv->FindName(inst.GetRepr(), true);
4863  if (NStr::Equal(rpr, "ref")) {
4864  rpr = "reference";
4865  } else if (NStr::Equal(rpr, "const")) {
4866  rpr = "constructed";
4867  }
4868  const string err0 = "Bioseq-ext not allowed on " + rpr + " Bioseq";
4869  const string err1 = "Missing or incorrect Bioseq-ext on " + rpr + " Bioseq";
4870  const string err2 = "Missing Seq-data on " + rpr + " Bioseq";
4871  const string err3 = "Seq-data not allowed on " + rpr + " Bioseq";
4872  switch (inst.GetRepr()) {
4874  if (inst.IsSetExt()) {
4876  rtn = false;
4877  }
4878  if (inst.IsSetSeq_data()) {
4880  rtn = false;
4881  }
4882  break;
4883  case CSeq_inst::eRepr_map:
4884  if (! inst.IsSetExt() || ! inst.GetExt().IsMap()) {
4886  rtn = false;
4887  }
4888  if (inst.IsSetSeq_data()) {
4890  rtn = false;
4891  }
4892  break;