NCBI C++ ToolKit
unit_test_validator.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: unit_test_validator.cpp 101299 2023-11-28 18:18:38Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Colleen Bollin, NCBI
27  *
28  * File Description:
29  * Unit tests for the validator.
30  *
31  * ===========================================================================
32  */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include "unit_test_validator.hpp"
37 
38 #include <corelib/ncbi_system.hpp>
39 
40 // This macro should be defined before including test_boost.hpp in every
41 // "*.cpp" file of the executable except one. This mirrors how main() is
42 // defined in only one *.cpp file of a non-Boost.Test executable - the other
43 // files must not define it. If NCBI_BOOST_NO_AUTO_TEST_MAIN is not defined,
44 // test_boost.hpp defines such a "main()" function for the tests.
45 //
46 // Usually if your unit tests contain only one *.cpp file you should not
47 // care about this macro at all.
48 //
49 //#define NCBI_BOOST_NO_AUTO_TEST_MAIN
50 
51 #define BAD_VALIDATOR
52 
53 // This header must be included before all Boost.Test headers if there are any
54 #include <corelib/test_boost.hpp>
55 
56 // for ignoring external config files
57 #include <util/util_misc.hpp>
58 
60 #include <objects/biblio/Title.hpp>
66 #include <objects/pub/Pub.hpp>
68 #include <objects/seq/GIBB_mol.hpp>
69 #include <objects/seq/Seq_ext.hpp>
73 #include <objects/seq/Ref_ext.hpp>
74 #include <objects/seq/Map_ext.hpp>
75 #include <objects/seq/Seg_ext.hpp>
76 #include <objects/seq/Seq_gap.hpp>
77 #include <objects/seq/Seq_data.hpp>
79 #include <objects/seq/Seqdesc.hpp>
80 #include <objects/seq/MolInfo.hpp>
81 #include <objects/seq/Pubdesc.hpp>
82 #include <objects/seq/Seq_hist.hpp>
100 #include <objmgr/object_manager.hpp>
101 #include <objmgr/scope.hpp>
102 #include <objmgr/bioseq_ci.hpp>
103 #include <objmgr/feat_ci.hpp>
104 #include <objmgr/seq_vector.hpp>
105 #include <objmgr/util/sequence.hpp>
106 #include <objmgr/seqdesc_ci.hpp>
107 #include <objmgr/util/sequence.hpp>
115 #include <corelib/ncbiapp.hpp>
116 #include <common/ncbi_export.h>
120 #include <objtools/edit/cds_fix.hpp>
122 
123 // for writing out tmp files
124 #include <serial/objostrasn.hpp>
125 #include <serial/objostrasnb.hpp>
126 
128 
131 
132 using namespace validator;
133 using namespace unit_test_util;
134 
135 
136 CExpectedError::CExpectedError(string accession, EDiagSev severity, string err_code, string err_msg)
137 : m_Accession (accession), m_Severity (severity), m_ErrCode(err_code), m_ErrMsg(err_msg)
138 {
139 }
140 
142 {
143 }
144 
145 
146 bool CExpectedError::Match(const CValidErrItem& err_item, bool ignore_severity)
147 {
148  if (!NStr::IsBlank(m_Accession) && !NStr::IsBlank(err_item.GetAccnver()) &&
149  !NStr::Equal(err_item.GetAccnver(), m_Accession)) {
150  return false;
151  }
152  if (!NStr::Equal(err_item.GetErrCode(), m_ErrCode)) {
153  return false;
154  }
155  string msg = err_item.GetMsg();
156  size_t pos = NStr::Find(msg, " EXCEPTION: NCBI C++ Exception:");
157  if (pos != string::npos) {
158  msg = msg.substr(0, pos);
159  }
160 
161  if (!NStr::Equal(msg, m_ErrMsg)) {
162  return false;
163  }
164  if (!ignore_severity && m_Severity != err_item.GetSeverity()) {
165  return false;
166  }
167  return true;
168 }
169 
170 
171 void CExpectedError::Test(const CValidErrItem& err_item)
172 {
173  if (!NStr::IsBlank (m_Accession) && !NStr::IsBlank (err_item.GetAccnver())) {
174  BOOST_CHECK_EQUAL(err_item.GetAccnver(), m_Accession);
175  }
176  BOOST_CHECK_EQUAL(err_item.GetSeverity(), m_Severity);
177  BOOST_CHECK_EQUAL(err_item.GetErrCode(), m_ErrCode);
178  string msg = err_item.GetMsg();
179  size_t pos = NStr::Find(msg, " EXCEPTION: NCBI C++ Exception:");
180  if (pos != string::npos) {
181  msg = msg.substr(0, pos);
182  }
183  BOOST_CHECK_EQUAL(msg, m_ErrMsg);
184 }
185 
186 
188 {
189  string description = err_item.GetAccnver() + ":"
190  + CValidErrItem::ConvertSeverity(err_item.GetSeverity()) + ":"
191  + err_item.GetErrCode() + ":"
192  + err_item.GetMsg();
193  printf("%s\n", description.c_str());
194 
195 }
196 
197 
199 {
200  string description = m_Accession + ":"
202  + m_ErrCode + ":"
203  + m_ErrMsg;
204  printf("%s\n", description.c_str());
205 
206 }
207 
208 
209 static bool s_debugMode = false;
210 
// Writes the validator errors held in eval to stdout.
// When debug_mode is true the output is framed by "\n-\n" before and an extra
// "\n\n" after; a trailing "\n\n" is printed in either mode.
// NOTE(review): the statement inside the loop is absent from this copy of the
// file - presumably it prints each error item; confirm against the repository.
211 void WriteErrors(const CValidError& eval, bool debug_mode)
212 {
213  if (debug_mode) {
214  printf ("\n-\n");
215  }
216  for ( CValidError_CI vit(eval); vit; ++vit) {
218  }
219  if (debug_mode) {
220  printf ("\n\n");
221  }
222  printf ("\n\n");
223 }
224 
225 
// Compares the errors produced by the validator (eval) with expected_errors
// and raises Boost check failures for every discrepancy.
//
// - When s_debugMode is set, the errors are only written out and no
//   comparison is made.
// - A null entry in expected_errors is treated as already satisfied, so
//   callers can blank out a slot without shrinking the vector.
// - Each validator error is matched first exactly, then ignoring severity;
//   a severity-only match is reported through CExpectedError::Test.
// - Validator errors with no match, and expected errors never matched, are
//   both failures. On any failure the seen and expected errors are printed.
226 void CheckErrors(const CValidError& eval,
227  vector< CExpectedError* >& expected_errors)
228 {
229  //static int count(1);
230  //if (count == 1367) {
231  // cerr << "";
232  //}
233  //cerr << count++ << "\n";
234 
235  bool problem_found = false;
236 
237  if (s_debugMode) {
238  WriteErrors (eval, true);
239  return;
240  }
241 
// expected_found[i] starts as true for null slots so they are never
// dereferenced or reported as missing below.
242  vector<bool> expected_found;
243  for (size_t i = 0; i < expected_errors.size(); i++) {
244  if (expected_errors[i]) {
245  expected_found.push_back(false);
246  } else {
247  expected_found.push_back(true);
248  }
249  }
250 
251  for (CValidError_CI vit(eval); vit; ++vit) {
252  bool found = false;
// First pass: exact match, severity included.
253  for (size_t i = 0; i < expected_errors.size(); i++) {
254  if (!expected_found[i] && expected_errors[i]->Match(*vit)) {
255  expected_found[i] = true;
256  found = true;
257  break;
258  }
259  }
// Second pass: accept a match that differs only in severity, but flag it.
260  if (!found) {
261  for (size_t i = 0; i < expected_errors.size(); i++) {
262  if (!expected_found[i] && expected_errors[i]->Match(*vit, true)) {
263  printf("Problem with ");
265  expected_errors[i]->Test(*vit);
266  expected_found[i] = true;
267  found = true;
268  problem_found = true;
269  break;
270  }
271  }
272  }
// Still unmatched: the validator produced an error nobody expected.
273  if (!found) {
274  BOOST_CHECK_EQUAL("Unexpected error", "Error not found");
276  problem_found = true;
277  }
278  }
279 
// Any expectation left unmatched is reported using its message text.
280  for (size_t i = 0; i < expected_errors.size(); i++) {
281  if (!expected_found[i]) {
282  BOOST_CHECK_EQUAL(expected_errors[i]->GetErrMsg(), "Expected error not found");
283  problem_found = true;
284  }
285  }
286 
287  if (problem_found) {
288  WriteErrors (eval, false);
289 
290  printf("Expected:\n");
291  for (auto it : expected_errors) {
292  if (it) {
293  it->Print();
294  }
295  }
296  }
297 }
298 
299 
300 void CheckStrings(const vector<string>& seen, const vector<string>& expected)
301 {
302  auto it1 = seen.begin();
303  auto it2 = expected.begin();
304  bool any = false;
305  while (it1 != seen.end() && it2 != expected.end()) {
306  BOOST_CHECK_EQUAL(*it1, *it2);
307  if (!NStr::Equal(*it1, *it2)) {
308  any = true;
309  }
310  it1++;
311  it2++;
312  }
313  while (it1 != seen.end()) {
314  BOOST_CHECK_EQUAL(*it1, "Unexpected string");
315  it1++;
316  any = true;
317  }
318  while (it2 != expected.end()) {
319  BOOST_CHECK_EQUAL("Missing string", *it2);
320  it2++;
321  any = true;
322  }
323 
324  if (any) {
325  printf("Seen:\n");
326  auto it1 = seen.begin();
327  while (it1 != seen.end()) {
328  printf("%s\n", (*it1).c_str());
329  it1++;
330  }
331  printf("Expected:\n");
332  auto it2 = expected.begin();
333  while (it2 != expected.end()) {
334  printf("%s\n", (*it2).c_str());
335  it2++;
336  }
337  }
338 }
339 
340 
341 // Not currently used, but I'll leave it here in case
342 // it's useful in the future.
343 
344 #if 0
// Disabled helper (compiled out by the surrounding #if 0).
// With a blank country, removes every subsource whose subtype is
// eSubtype_country from src; otherwise appends `sub` to the subtype list.
// NOTE(review): the line that creates `sub` is absent from this copy of the
// file - presumably a country CSubSource built from `country`; confirm.
345 static void SetCountryOnSrc(CBioSource& src, string country)
346 {
347  if (NStr::IsBlank(country)) {
348  if (src.IsSetSubtype()) {
349  auto& cont = src.SetSubtype();
350  cont.remove_if(
351  [](CSubSource* it) { return (it->IsSetSubtype() && it->GetSubtype() == CSubSource::eSubtype_country); });
352  }
353  } else {
355  src.SetSubtype().push_back(sub);
356  }
357 }
358 #endif
359 
361 static string ToAsn1(const CRef<CSeq_entry>& entry)
362 {
363  CNcbiOstrstream os;
364  os << MSerial_AsnText << entry;
365  return os.str();
366 }
367 
370 
373 
375 {
376  if ( !CNcbiApplication::Instance()->GetConfig().HasEntry("NCBI", "Data") ) {
377  NCBITEST_DISABLE(Test_Descr_BadStructuredCommentFormat);
378  NCBITEST_DISABLE(Test_Descr_MissingKeyword);
379  }
380 }
381 
382 
383 static void SetErrorsAccessions (vector< CExpectedError *> & expected_errors, string accession)
384 {
385  size_t i, len = expected_errors.size();
386  for (i = 0; i < len; i++) {
387  expected_errors[i]->SetAccession(accession);
388  }
389 }
390 
392 {
393  // Here we make descriptions of command line parameters that we are
394  // going to use.
395 
396  arg_desc->AddFlag( "debug_mode",
397  "Debugging mode writes errors seen for each test" );
398 }
399 
401 {
402  // initialization function body
403 
404  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
405  if (args["debug_mode"]) {
406  s_debugMode = true;
407  }
408  g_IgnoreDataFile("institution_codes.txt");
409 }
410 
411 void AddChromosomeNoLocation(vector< CExpectedError *>& expected_errors, const string& id)
412 {
413  expected_errors.push_back(new CExpectedError(id, eDiag_Error, "ChromosomeWithoutLocation",
414  "INDEXER_ONLY - source contains chromosome value '1' but the BioSource location is not set to chromosome"));
415 }
416 
// Adds the ChromosomeWithoutLocation expectation for the sequences in entry.
// - A single sequence: uses the FASTA string of its best Seq-id.
// - A nuc-prot set: recurses on `nuc_entry` only.
// - Any other set: recurses on every member of the set.
// NOTE(review): the line defining `nuc_entry` is absent from this copy of the
// file - presumably the nucleotide member of the nuc-prot set; confirm.
417 void AddChromosomeNoLocation(vector< CExpectedError *>& expected_errors, CRef<CSeq_entry> entry)
418 {
419  if (entry->IsSeq()) {
420  CConstRef<CSeq_id> seqid = sequence::GetId(entry->GetSeq(), sequence::eGetId_Best).GetSeqId();
421  AddChromosomeNoLocation(expected_errors, seqid->AsFastaString());
422  } else if (entry->IsSet()) {
423  if (entry->GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
425  AddChromosomeNoLocation(expected_errors, nuc_entry);
426  } else {
427  for (auto it : entry->SetSet().SetSeq_set()) {
428  AddChromosomeNoLocation(expected_errors, it);
429  }
430  }
431  }
432 }
433 
434 
435 // new case test ground
436 
// Exercises the relationship between a MIGS structured comment and the
// matching GenBank keyword "GSC:MIGS:2.1" in four steps:
//   1. non-compliant comment plus keyword,
//   2. non-compliant comment without keyword,
//   3. comment completed with finishing_strategy, still without keyword,
//   4. completed comment with the keyword restored.
// NOTE(review): the declarations of entry, seh, validator, options, eval and
// expected_errors, and whatever adjusts expected_errors between steps, are
// absent from this copy of the file; confirm against the repository.
437 BOOST_AUTO_TEST_CASE(Test_Descr_MissingKeyword)
438 {
439  // prepare entry
441  CRef<CSeqdesc> sdesc(new CSeqdesc());
442  sdesc->SetUser().SetType().SetStr("StructuredComment");
443  entry->SetSeq().SetDescr().Set().push_back(sdesc);
444 
// Every field below except investigation_type carries the placeholder "foo";
// finishing_strategy is deliberately left out until step 3.
445  sdesc->SetUser().AddField("StructuredCommentPrefix", "##MIGS-Data-START##", CUser_object::eParse_String);
446  sdesc->SetUser().AddField("alt_elev", "foo", CUser_object::eParse_String);
447  sdesc->SetUser().AddField("assembly", "foo", CUser_object::eParse_String);
448  sdesc->SetUser().AddField("collection_date", "foo", CUser_object::eParse_String);
449  sdesc->SetUser().AddField("country", "foo", CUser_object::eParse_String);
450  sdesc->SetUser().AddField("depth", "foo", CUser_object::eParse_String);
451  sdesc->SetUser().AddField("environment", "foo", CUser_object::eParse_String);
452  sdesc->SetUser().AddField("investigation_type", "eukaryote", CUser_object::eParse_String);
453  sdesc->SetUser().AddField("isol_growth_condt", "foo", CUser_object::eParse_String);
454  sdesc->SetUser().AddField("sequencing_meth", "foo", CUser_object::eParse_String);
455  sdesc->SetUser().AddField("project_name", "foo", CUser_object::eParse_String);
456  sdesc->SetUser().AddField("ploidy", "foo", CUser_object::eParse_String);
457  sdesc->SetUser().AddField("num_replicons", "foo", CUser_object::eParse_String);
458  sdesc->SetUser().AddField("estimated_size", "foo", CUser_object::eParse_String);
459  sdesc->SetUser().AddField("trophic_level", "foo", CUser_object::eParse_String);
460  sdesc->SetUser().AddField("propagation", "foo", CUser_object::eParse_String);
461  sdesc->SetUser().AddField("lat_lon", "foo", CUser_object::eParse_String);
462 
// The keyword descriptor is pushed last so that pop_back() can remove it.
463  CRef<CSeqdesc> gdesc(new CSeqdesc());
464  gdesc->SetGenbank().SetKeywords().push_back("GSC:MIGS:2.1");
465  entry->SetSeq().SetDescr().Set().push_back(gdesc);
466 
468 
469  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadKeywordForStrucComm",
470  "Structured Comment is non-compliant, keyword should be removed"));
471  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommMissingField",
472  "Required field finishing_strategy is missing when investigation_type has value 'eukaryote'"));
473  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue",
474  "Structured Comment invalid; the field value and/or name are incorrect"));
475  // AddChromosomeNoLocation(expected_errors, entry);
476  eval = validator.Validate(seh, options);
477  CheckErrors(*eval, expected_errors);
478 
479  // if no keyword, no badkeyword error
480  entry->SetSeq().SetDescr().Set().pop_back();
// Slot 0 is nulled rather than erased; CheckErrors ignores null slots.
481  delete expected_errors[0];
482  expected_errors[0] = nullptr;
483  eval = validator.Validate(seh, options);
484  CheckErrors(*eval, expected_errors);
485 
487 
488  // make the comment valid, should complain about missing keyword
489  sdesc->SetUser().AddField("finishing_strategy", "foo", CUser_object::eParse_String);
490  // AddChromosomeNoLocation(expected_errors, entry);
491  eval = validator.Validate(seh, options);
492  CheckErrors(*eval, expected_errors);
493 
495  // put keyword back, should have no errors
496  entry->SetSeq().SetDescr().Set().push_back(gdesc);
497  // AddChromosomeNoLocation(expected_errors, entry);
498  eval = validator.Validate(seh, options);
499  CheckErrors(*eval, expected_errors);
501 }
502 
503 
// Checks the LatLonValue warnings: a wrong longitude hemisphere, a wrong
// latitude hemisphere, and latitude/longitude values that appear swapped.
// NOTE(review): the lines that build the entry and set the lat_lon and
// country values before each Validate() call are absent from this copy of
// the file; confirm the inputs against the repository.
504 BOOST_AUTO_TEST_CASE(Test_Descr_LatLonValue)
505 {
506  // prepare entry
510 
512 
513  /*
514  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
515  "Latitude should be set to N (northern hemisphere)"));
516  eval = validator.Validate(seh, options);
517  CheckErrors(*eval, expected_errors);
518  */
519 
522  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
523  "Longitude should be set to W (western hemisphere)"));
524  eval = validator.Validate(seh, options);
525  CheckErrors(*eval, expected_errors);
526 
// Reuses the same expectation object with a different message.
531  expected_errors[0]->SetErrMsg("Latitude should be set to S (southern hemisphere)");
532  eval = validator.Validate(seh, options);
533  CheckErrors(*eval, expected_errors);
534 
535  /*
536  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "");
537  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "25 S 47 W");
538  expected_errors[0]->SetErrMsg("Longitude should be set to E (eastern hemisphere)");
539  eval = validator.Validate(seh, options);
540  CheckErrors(*eval, expected_errors);
541  */
542 
544 
549  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
550  "Latitude and longitude values appear to be exchanged"));
551  eval = validator.Validate(seh, options);
552  CheckErrors(*eval, expected_errors);
553 
555 }
556 
557 
// Validates one country / lat_lon combination and checks the outcome.
//   country, lat_lon - source values under test (the lines applying them to
//                      the entry are absent from this copy of the file;
//                      NOTE(review): confirm against the repository)
//   error            - expected message; an empty string means no error is
//                      expected
//   use_state        - enables an extra setup step whose body is not visible
//                      here
//   err_code         - expected error code, "LatLonCountry" by default
// When an error is expected, the complete submitter report is also compared
// with a three-line expectation (heading, "lcl|good:" + error, blank line).
// NOTE(review): the report heading is fixed to "LatLonCountry Errors" even
// when err_code is something else (a caller passes "LatLonWater"); confirm
// that the report really groups such errors under this heading.
558 void TestOneLatLonCountry(const string& country, const string& lat_lon, const string& error, bool use_state = false, const string& err_code = "LatLonCountry")
559 {
560  // prepare entry
564 
566 
567  if (use_state) {
569  }
570 
571  if (!error.empty()) {
572  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, err_code, error));
573  }
574  eval = validator.Validate(seh, options);
575  CheckErrors(*eval, expected_errors);
576 
577  if (!error.empty()) {
578  CValidErrorFormat format(*objmgr);
579  vector<string> expected;
580  expected.push_back("LatLonCountry Errors");
581  expected.push_back("lcl|good:" + error);
582  expected.push_back("");
583 
// Flatten the report: each category string is split into its lines.
584  vector<string> seen;
585  vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
586  for (const string& it : cat_list) {
587  vector<string> sublist;
588  NStr::Split(it, "\n", sublist);
589  for (const string& sit : sublist) {
590  seen.push_back(sit);
591  }
592  }
593 
594  CheckStrings(seen, expected);
595  }
596 
598 }
599 
600 
602 {
603  TestOneLatLonCountry("Portugal", "37.7715 N 25.3097 W", "", true);
604 }
605 
606 
607 
608 BOOST_AUTO_TEST_CASE(Test_Descr_LatLonCountry)
609 {
610  TestOneLatLonCountry("Romania", "46.5 N 20 E",
611  "Lat_lon '46.5 N 20 E' maps to 'Hungary' instead of 'Romania' - claimed region 'Romania' is at distance 45 km");
612  TestOneLatLonCountry("Romania", "34 N 65 E", "Lat_lon '34 N 65 E' maps to 'Afghanistan' instead of 'Romania'");
613  TestOneLatLonCountry("Romania", "48 N 15 E", "Lat_lon '48 N 15 E' maps to 'Austria' instead of 'Romania'");
614  TestOneLatLonCountry("Romania", "48 N 15 W", "Lat_lon '48 N 15 W' is in water 'Atlantic Ocean'", false, "LatLonWater");
615  // RW-1137 this had inconsistent behavior in production vs. development tests, possibly due to version skew in
616  // Puerto Rico cleanup code, so commenting out to avoid spurious error reports
617  /*
618  TestOneLatLonCountry("Puerto Rico: Rio Mameyes in Luquillo", "18.47 N 64.23000000000002 W",
619  "Lat_lon '18.47 N 64.23000000000002 W' is in water 'Caribbean Sea', 'Puerto Rico: Rio Mameyes in Luquillo' is 108 km away",
620  false, "LatLonWater");
621  */
622 
623 }
624 
625 
// Checks CValidErrorFormat output for an entry seeded with several kinds of
// problems: splice-consensus, EC number, institution code and lat-lon/country.
// The same validation result is inspected five ways: per-item submitter
// report lines, raw error codes, the list of distinct error codes, the report
// for one error code, the report for one category, and the complete report.
// NOTE(review): the lines declaring entry, nuc, cds, other_intron, gene, seh,
// scope and the institution-code / lat-lon setup are absent from this copy of
// the file; confirm against the repository.
626 BOOST_AUTO_TEST_CASE(Test_ValidError_Format)
627 {
629 
630  // Create consensus splice problems
633  cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc->SetSeq().SetId().front()));
634  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[44] = 'A';
635  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[45] = 'G';
636  CRef<CSeq_feat> intron = unit_test_util::MakeIntronForMixLoc(nuc->SetSeq().SetId().front());
637  unit_test_util::AddFeat(intron, nuc);
638 
640  other_intron->SetData().SetImp().SetKey("intron");
642  gene->SetData().SetGene().SetLocus_tag("fake_locustag");
643  AddFeat(gene, nuc);
644 
645  // create EC number problems
646  CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();
647  prot->SetData().SetProt().SetEc().push_back("1.2.3.10");
648  prot->SetData().SetProt().SetEc().push_back("1.1.3.22");
649  prot->SetData().SetProt().SetEc().push_back("1.1.99.n");
650  prot->SetData().SetProt().SetEc().push_back("1.1.1.17");
651  prot->SetData().SetProt().SetEc().push_back("11.22.33.44");
652  prot->SetData().SetProt().SetEc().push_back("11.22.n33.44");
653  prot->SetData().SetProt().SetEc().push_back("11.22.33.n44");
654 
655 
656  // create bad institution code errors
660 
661  // create lat-lon country error
664 
666 
667  eval = validator.Validate(seh, options);
668 
669  CValidErrorFormat format(*objmgr);
670 
// 1) One submitter-report line per error item, in validator order.
671  vector<string> expected;
672  expected.push_back("intron\tlcl|nuc\tGT at 17");
673  expected.push_back("intron\tlcl|nuc\tGT at 1");
674  expected.push_back("intron\tlcl|nuc\tAG at 11");
675  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
676  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
677  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
678  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
679  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
680  expected.push_back("CDS\tlcl|nuc\tGT at 16");
681  expected.push_back("lcl|nuc:Lat_lon '30 N 30 E' maps to 'Egypt' instead of 'Panama'");
682  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
683  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
684  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
685 
686  vector<string> seen;
687  for (CValidError_CI vit(*eval); vit; ++vit) {
688  string val = format.FormatForSubmitterReport(*vit, scope);
689  seen.push_back(val);
690  }
691  CheckStrings(seen, expected);
692 
// 2) Error code of each item, in the same order as the lines above.
693  expected.clear();
694  seen.clear();
695  for (CValidError_CI vit(*eval); vit; ++vit) {
696  seen.push_back(vit->GetErrCode());
697  }
698  expected.push_back("NotSpliceConsensusDonor");
699  expected.push_back("NotSpliceConsensusDonorTerminalIntron");
700  expected.push_back("NotSpliceConsensusAcceptor");
701  expected.push_back("DeletedEcNumber");
702  expected.push_back("ReplacedEcNumber");
703  expected.push_back("BadEcNumberValue");
704  expected.push_back("BadEcNumberFormat");
705  expected.push_back("BadEcNumberValue");
706  expected.push_back("NotSpliceConsensusDonor");
707  expected.push_back("LatLonCountry");
708  expected.push_back("BadInstitutionCode");
709  expected.push_back("BadInstitutionCode");
710  expected.push_back("BadInstitutionCode");
711  CheckStrings(seen, expected);
712 
// 3) Distinct error codes as returned by GetListOfErrorCodes.
713  seen.clear();
714  expected.clear();
715  vector<unsigned int> codes = format.GetListOfErrorCodes(*eval);
716  for (unsigned int it : codes) {
717  string val = CValidErrItem::ConvertErrCode(it);
718  seen.push_back(val);
719  }
720  expected.push_back("LatLonCountry");
721  expected.push_back("BadInstitutionCode");
722  expected.push_back("BadEcNumberFormat");
723  expected.push_back("BadEcNumberValue");
724  expected.push_back("NotSpliceConsensusDonor");
725  expected.push_back("NotSpliceConsensusAcceptor");
726  expected.push_back("DeletedEcNumber");
727  expected.push_back("ReplacedEcNumber");
728  expected.push_back("NotSpliceConsensusDonorTerminalIntron");
729  CheckStrings(seen, expected);
730 
// 4) Report restricted to a single error code.
731  string rval = format.FormatForSubmitterReport(*eval, scope, eErr_SEQ_FEAT_NotSpliceConsensusDonor);
732  expected.clear();
733  seen.clear();
734  NStr::Split(rval, "\n", seen);
735  expected.push_back("Not Splice Consensus");
736  expected.push_back("intron\tlcl|nuc\tGT at 17");
737  expected.push_back("CDS\tlcl|nuc\tGT at 16");
738  expected.push_back("");
739  CheckStrings(seen, expected);
740 
// 5) Report for the whole consensus-splice category.
741  rval = format.FormatCategoryForSubmitterReport(*eval, scope, eSubmitterFormatErrorGroup_ConsensusSplice);
742  expected.clear();
743  seen.clear();
744  NStr::Split(rval, "\n", seen);
745  expected.push_back("Not Splice Consensus");
746  expected.push_back("intron\tlcl|nuc\tGT at 17");
747  expected.push_back("intron\tlcl|nuc\tGT at 1");
748  expected.push_back("intron\tlcl|nuc\tAG at 11");
749  expected.push_back("CDS\tlcl|nuc\tGT at 16");
750  expected.push_back("");
751  CheckStrings(seen, expected);
752 
// 6) Complete report: every category string is split into its lines.
753  expected.clear();
754  seen.clear();
755  vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
756  for (const string& it : cat_list) {
757  vector<string> sublist;
758  NStr::Split(it, "\n", sublist);
759  for (const string& sit : sublist) {
760  seen.push_back(sit);
761  }
762  }
763  expected.push_back("Not Splice Consensus");
764  expected.push_back("intron\tlcl|nuc\tGT at 17");
765  expected.push_back("intron\tlcl|nuc\tGT at 1");
766  expected.push_back("intron\tlcl|nuc\tAG at 11");
767  expected.push_back("CDS\tlcl|nuc\tGT at 16");
768  expected.push_back("");
769  expected.push_back("EC Number Format");
770  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
771  expected.push_back("");
772  expected.push_back("EC Number Value");
773  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
774  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
775  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
776  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
777  expected.push_back("");
778  expected.push_back("Bad Institution Codes");
779  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
780  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
781  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
782  expected.push_back("");
783  expected.push_back("LatLonCountry Errors");
784  expected.push_back("lcl|nuc:Lat_lon '30 N 30 E' maps to 'Egypt' instead of 'Panama'");
785  expected.push_back("");
786  CheckStrings(seen, expected);
787 
788 }
789 
790 
// Sets the taxon on the entry to 0 and checks that the complete submitter
// report consists of a "NoTaxonID" section naming the organism.
// NOTE(review): the lines that create entry and the validator setup are
// absent from this copy of the file; confirm against the repository.
791 BOOST_AUTO_TEST_CASE(Test_GB_6395)
792 {
793  // prepare entry
795  unit_test_util::SetTaxon(entry, 0);
796 
798 
799  eval = validator.Validate(seh, options);
800 
801  CValidErrorFormat format(*objmgr);
802  vector<string> expected;
803  vector<string> seen;
804 
// Flatten the report: each category string is split into its lines.
805  vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
806  for (const string& it : cat_list) {
807  vector<string> sublist;
808  NStr::Split(it, "\n", sublist);
809  for (const string& sit : sublist) {
810  seen.push_back(sit);
811  }
812  }
813  expected.push_back("NoTaxonID");
814  expected.push_back("lcl|good:Sebaea microphylla");
815  expected.push_back("");
816 
817  CheckStrings(seen, expected);
818 }
819 
820 
// Sets the country to "USA: South Carolina" and expects an Info-level
// LatLonState error saying the coordinates map to North Carolina instead.
// NOTE(review): the lines that create entry, set the lat_lon value and
// perform the validator setup are absent from this copy of the file; confirm
// against the repository.
821 BOOST_AUTO_TEST_CASE(Test_Descr_LatLonState)
822 {
823  // prepare entry
825  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, "USA: South Carolina");
827 
829 
830  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "LatLonState",
831  "Lat_lon '36 N 80 W' maps to 'USA: North Carolina' instead of 'USA: South Carolina' - claimed region 'USA: South Carolina' is at distance 130 km"));
832  // AddChromosomeNoLocation(expected_errors, "lcl|good");
834  eval = validator.Validate(seh, options);
835  CheckErrors(*eval, expected_errors);
836 
838 }
839 
840 
842 {
844  CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();
845  prot->SetData().SetProt().SetEc().push_back("1.2.3.10");
846  prot->SetData().SetProt().SetEc().push_back("1.1.3.22");
847  prot->SetData().SetProt().SetEc().push_back("1.1.99.n");
848  prot->SetData().SetProt().SetEc().push_back("1.1.1.17");
849  prot->SetData().SetProt().SetEc().push_back("11.22.33.44");
850  prot->SetData().SetProt().SetEc().push_back("11.22.n33.44");
851  prot->SetData().SetProt().SetEc().push_back("11.22.33.n44");
852  return entry;
853 }
854 
855 
// Checks the EC-number errors twice: first with the EC numbers on the protein
// feature (reported on lcl|prot), then with the same values attached as
// EC_number qualifiers to an "exon" feature (reported on lcl|nuc).
// NOTE(review): the lines that create entry, the validator setup and the
// `misc` feature are absent from this copy of the file; confirm against the
// repository.
856 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadEcNumberValue)
857 {
859  CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();
860 
862 
863  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "DeletedEcNumber",
864  "EC_number 1.2.3.10 was deleted"));
865  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "ReplacedEcNumber",
866  "EC_number 1.1.3.22 was transferred and is no longer valid"));
867  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "BadEcNumberValue",
868  "11.22.33.44 is not a legal value for qualifier EC_number"));
869  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "BadEcNumberFormat",
870  "11.22.n33.44 is not in proper EC_number format"));
871  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Info, "BadEcNumberValue",
872  "11.22.33.n44 is not a legal preliminary value for qualifier EC_number"));
873  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
874  eval = validator.Validate(seh, options);
875  CheckErrors(*eval, expected_errors);
876 
// Second round: take the entry out of the scope, clear the protein's EC
// numbers and re-add the entry once the qualifiers are in place.
877  scope.RemoveTopLevelSeqEntry(seh);
878  prot->SetData().SetProt().ResetEc();
880  misc->SetData().SetImp().SetKey("exon");
881  misc->AddQualifier("EC_number", "1.2.3.10");
882  misc->AddQualifier("EC_number", "1.1.3.22");
883  misc->AddQualifier("EC_number", "1.1.99.n");
884  misc->AddQualifier("EC_number", "1.1.1.17");
885  misc->AddQualifier("EC_number", "11.22.33.44");
886  misc->AddQualifier("EC_number", "11.22.n33.44");
887  misc->AddQualifier("EC_number", "11.22.33.n44");
// Same expectations, now on the nucleotide; only the message of the
// ReplacedEcNumber expectation changes.
888  SetErrorsAccessions(expected_errors, "lcl|nuc");
889  expected_errors[1]->SetErrMsg("EC_number 1.1.3.22 was replaced");
890  seh = scope.AddTopLevelSeqEntry(*entry);
891  eval = validator.Validate(seh, options);
892  CheckErrors(*eval, expected_errors);
893 
895 }
896 
897 
// Checks two rpt_unit_seq problems on a repeat_region feature: a unit that
// does not match the underlying sequence (Warning, RepeatSeqDoNotMatch) and a
// unit longer than the feature (Info, InvalidRepeatUnitLength).
// NOTE(review): the lines that create entry and the first `misc` feature, the
// validator setup, and whatever rebuilds entry before the second round are
// absent from this copy of the file; confirm against the repository.
898 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_InvalidQualifierValue)
899 {
902  misc->SetData().SetImp().SetKey("repeat_region");
903  misc->AddQualifier("rpt_unit_seq", "ATA");
904 
906 
907  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "RepeatSeqDoNotMatch",
908  "repeat_region /rpt_unit and underlying sequence do not match"));
909  // AddChromosomeNoLocation(expected_errors, "lcl|good");
910  eval = validator.Validate(seh, options);
911  CheckErrors(*eval, expected_errors);
912 
913  scope.RemoveTopLevelSeqEntry(seh);
915  misc = unit_test_util::AddMiscFeature(entry);
916  misc->SetData().SetImp().SetKey("repeat_region");
917  misc->AddQualifier("rpt_unit_seq", "ATAGTGATAGTG");
918  seh = scope.AddTopLevelSeqEntry(*entry);
// The single expectation object is repurposed for the second problem.
919  expected_errors[0]->SetErrCode("InvalidRepeatUnitLength");
920  expected_errors[0]->SetErrMsg("Length of rpt_unit_seq is greater than feature length");
921  expected_errors[0]->SetSeverity(eDiag_Info);
922  eval = validator.Validate(seh, options);
923  CheckErrors(*eval, expected_errors);
924 
926 }
927 
928 
// Test_SEQ_INST_ExtNotAllowed: walks one Bioseq through the CSeq_inst
// representations (virtual, raw, const, map, ref, consen, not_set, other,
// delta). For each combination of Seq-data and Bioseq-ext set below it
// validates the entry and compares the result with a single expected error
// whose code, message and severity are re-targeted in place.
// NOTE(review): entry, seh, validator, options, eval and expected_errors are
// not declared in the visible lines; listing lines 931 and 933 are absent from
// this extract and presumably hold that setup -- confirm against the repository.
929 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ExtNotAllowed)
930 {
932 
934 
935  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "ExtNotAllowed", "Bioseq-ext not allowed on virtual Bioseq"));
936  // AddChromosomeNoLocation(expected_errors, "lcl|good");
937 
938  // repr = virtual
     // Seq-data is removed and a delta ext is attached, so the ext is the only thing expected to be reported.
939  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_virtual);
940  entry->SetSeq().SetInst().ResetSeq_data();
941  entry->SetSeq().SetInst().SetExt().SetDelta();
942  eval = validator.Validate(seh, options);
943  CheckErrors(*eval, expected_errors);
944 
945  // repr = raw
     // The delta ext from the previous step is still set; Seq-data is restored.
946  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_raw);
947  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
948  expected_errors[0]->SetErrMsg("Bioseq-ext not allowed on raw Bioseq");
949  eval = validator.Validate(seh, options);
950  CheckErrors(*eval, expected_errors);
951 
     // raw with neither ext nor Seq-data: the expected error becomes SeqDataNotFound.
     // The severity was already eDiag_Critical from construction; it is restated here.
952  entry->SetSeq().SetInst().ResetExt();
953  entry->SetSeq().SetInst().ResetSeq_data();
954  expected_errors[0]->SetErrCode("SeqDataNotFound");
955  expected_errors[0]->SetErrMsg("Missing Seq-data on raw Bioseq");
956  expected_errors[0]->SetSeverity(eDiag_Critical);
957  eval = validator.Validate(seh, options);
958  CheckErrors(*eval, expected_errors);
959 
     // Seq-data set to a gap: the same SeqDataNotFound error is still expected.
960  entry->SetSeq().SetInst().SetSeq_data().SetGap();
961  eval = validator.Validate(seh, options);
962  CheckErrors(*eval, expected_errors);
963 
964  // repr = const
965  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_const);
966  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
967  entry->SetSeq().SetInst().SetExt().SetDelta();
968  expected_errors[0]->SetErrCode("ExtNotAllowed");
969  expected_errors[0]->SetErrMsg("Bioseq-ext not allowed on constructed Bioseq");
970  eval = validator.Validate(seh, options);
971  CheckErrors(*eval, expected_errors);
972 
     // const with neither ext nor Seq-data: same pattern as the raw case above.
973  entry->SetSeq().SetInst().ResetExt();
974  entry->SetSeq().SetInst().ResetSeq_data();
975  expected_errors[0]->SetErrCode("SeqDataNotFound");
976  expected_errors[0]->SetErrMsg("Missing Seq-data on constructed Bioseq");
977  expected_errors[0]->SetSeverity(eDiag_Critical);
978  eval = validator.Validate(seh, options);
979  CheckErrors(*eval, expected_errors);
980 
     // const with gap Seq-data: the same SeqDataNotFound error is still expected.
981  entry->SetSeq().SetInst().SetSeq_data().SetGap();
982  eval = validator.Validate(seh, options);
983  CheckErrors(*eval, expected_errors);
984 
985  // repr = map
     // No ext, a delta ext and a ref ext are each expected to give the same ExtBadOrMissing error.
986  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_map);
987  entry->SetSeq().SetInst().ResetSeq_data();
988  expected_errors[0]->SetErrCode("ExtBadOrMissing");
989  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on map Bioseq");
990  expected_errors[0]->SetSeverity(eDiag_Error);
991  eval = validator.Validate(seh, options);
992  CheckErrors(*eval, expected_errors);
993 
994  entry->SetSeq().SetInst().SetExt().SetDelta();
995  eval = validator.Validate(seh, options);
996  CheckErrors(*eval, expected_errors);
997 
998  entry->SetSeq().SetInst().SetExt().SetRef();
999  eval = validator.Validate(seh, options);
1000  CheckErrors(*eval, expected_errors);
1001 
     // With a map ext in place, adding Seq-data is expected to be reported as SeqDataNotAllowed.
1002  entry->SetSeq().SetInst().SetExt().SetMap();
1003  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
1004  expected_errors[0]->SetErrCode("SeqDataNotAllowed");
1005  expected_errors[0]->SetErrMsg("Seq-data not allowed on map Bioseq");
1006  eval = validator.Validate(seh, options);
1007  CheckErrors(*eval, expected_errors);
1008 
1009 
1010  // repr = ref
     // Both ext and Seq-data are cleared; they stay cleared for the consen/not_set/other cases below.
1011  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_ref);
1012  entry->SetSeq().SetInst().ResetExt();
1013  entry->SetSeq().SetInst().ResetSeq_data();
1014  expected_errors[0]->SetErrCode("ExtBadOrMissing");
1015  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on reference Bioseq");
1016  eval = validator.Validate(seh, options);
1017  CheckErrors(*eval, expected_errors);
1018 
1019  /*
1020  // repr = seg
1021  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
1022  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on seg Bioseq");
1023  eval = validator.Validate(seh, options);
1024  CheckErrors(*eval, expected_errors);
1025  */
1026 
1027  // repr = consen
     // The expected messages for consen, not_set and other carry the numeric repr value (6, 0, 255).
1028  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_consen);
1029  expected_errors[0]->SetSeverity(eDiag_Critical);
1030  expected_errors[0]->SetErrCode("ReprInvalid");
1031  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 6");
1032  eval = validator.Validate(seh, options);
1033  CheckErrors(*eval, expected_errors);
1034 
1035  // repr = notset
1036  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_not_set);
1037  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 0");
1038  eval = validator.Validate(seh, options);
1039  CheckErrors(*eval, expected_errors);
1040 
1041  // repr = other
1042  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_other);
1043  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 255");
1044  eval = validator.Validate(seh, options);
1045  CheckErrors(*eval, expected_errors);
1046 
1047  // repr = delta
     // First with both a delta ext and Seq-data (data not allowed), then with neither (ext missing).
1048  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
1049  entry->SetSeq().SetInst().SetExt().SetDelta();
1050  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
1051  expected_errors[0]->SetSeverity(eDiag_Error);
1052  expected_errors[0]->SetErrCode("SeqDataNotAllowed");
1053  expected_errors[0]->SetErrMsg("Seq-data not allowed on delta Bioseq");
1054  eval = validator.Validate(seh, options);
1055  CheckErrors(*eval, expected_errors);
1056 
1057  entry->SetSeq().SetInst().ResetExt();
1058  entry->SetSeq().SetInst().ResetSeq_data();
1059  expected_errors[0]->SetSeverity(eDiag_Error);
1060  expected_errors[0]->SetErrCode("ExtBadOrMissing");
1061  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on delta Bioseq");
1062  eval = validator.Validate(seh, options);
1063  CheckErrors(*eval, expected_errors);
1064 
     // CLEAR_ERRORS is a macro defined outside this chunk; presumably it frees the expected_errors entries -- confirm.
1065  CLEAR_ERRORS
1066 }
1067 
1068 
// Test_SEQ_INST_ReprInvalid: sets the Bioseq repr to not_set, other and consen
// in turn and expects a Critical "ReprInvalid" error each time, with the
// numeric repr value (0, 255, 6) in the message.
// NOTE(review): entry, seh, validator, options, eval and expected_errors come
// from setup on listing lines 1071/1073, which are absent from this extract.
1069 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ReprInvalid)
1070 {
1072 
1074  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "ReprInvalid", "Invalid Bioseq->repr = 0"));
1075  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1076 
     // repr = not_set
1077  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_not_set);
1078  eval = validator.Validate(seh, options);
1079  CheckErrors(*eval, expected_errors);
1080 
     // repr = other
1081  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 255");
1082  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_other);
1083  eval = validator.Validate(seh, options);
1084  CheckErrors(*eval, expected_errors);
1085 
     // repr = consen
1086  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 6");
1087  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_consen);
1088  eval = validator.Validate(seh, options);
1089  CheckErrors(*eval, expected_errors);
1090 
1091  CLEAR_ERRORS
1092 }
1093 
1094 
// Test_CollidingLocusTags: deserializes an ASN.1 text Seq-entry from the
// stream `istr`, adds it to a fresh scope, validates it, and expects the nine
// diagnostics listed below, all reported against
// lcl|LocusCollidesWithLocusTag.
// NOTE(review): several listing lines are absent from this extract: 1099 (the
// declaration of `istr`, presumably built from sc_TestEntryCollidingLocusTags),
// 1103 (presumably the declaration of `objmgr`) and 1111-1113 (the rest of the
// `options` expression). Confirm against the repository.
1095 BOOST_AUTO_TEST_CASE(Test_CollidingLocusTags)
1096 {
1097  CRef<CSeq_entry> entry(new CSeq_entry());
1098  {
1100  istr >> MSerial_AsnText >> *entry;
1101  }
1102 
1104  CScope scope(*objmgr);
1105  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
1106 
1107  CValidator validator(*objmgr);
1108 
1109  // Set validator options
1110  unsigned int options = CValidator::eVal_need_isojta
1114 
1115  // list of expected errors
     // "CollidingLocusTags" is listed twice on purpose: two separate reports of that error are expected.
1116  vector< CExpectedError *> expected_errors;
1117  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Warning, "TerminalNs", "N at end of sequence"));
1118  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Warning, "GeneLocusCollidesWithLocusTag", "locus collides with locus_tag in another gene"));
1119  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "CollidingLocusTags", "Colliding locus_tags in gene features"));
1120  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "CollidingLocusTags", "Colliding locus_tags in gene features"));
1121  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "NoMolInfoFound", "No Mol-info applies to this Bioseq"));
1122  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "LocusTagGeneLocusMatch", "Gene locus and locus_tag 'foo' match"));
1123  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "NoPubFound", "No publications anywhere on this entire record."));
1124  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Info, "MissingPubRequirement", "No submission citation anywhere on this entire record."));
1125  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "NoSourceDescriptor", "No source information included on this record."));
1126 
1127  CConstRef<CValidError> eval = validator.Validate(seh, options);
1128  CheckErrors(*eval, expected_errors);
1129 
1130  CLEAR_ERRORS
1131 }
1132 
1133 
// ASN.1 text for a Seq-entry holding one raw DNA Bioseq of length 24 with
// local id "LocusCollidesWithLocusTag" and a feature table of four gene
// features on the plus strand:
//   locus "foo",  locus-tag "foo"  at 0-4
//   locus "bar",  locus-tag "foo"  at 5-9
//   locus "bar",  locus-tag "baz"  at 10-14
//   locus "quux", locus-tag "baz"  at 15-19
// The trailing backslashes continue the string literal across source lines,
// so no comments may be placed inside it.
1134 const string sc_TestEntryCollidingLocusTags ="Seq-entry ::= seq {\
1135  id {\
1136  local str \"LocusCollidesWithLocusTag\" } ,\
1137  inst {\
1138  repr raw ,\
1139  mol dna ,\
1140  length 24 ,\
1141  seq-data\
1142  iupacna \"AATTGGCCAANNAATTGGCCAANN\" } ,\
1143  annot {\
1144  {\
1145  data\
1146  ftable {\
1147  {\
1148  data\
1149  gene {\
1150  locus \"foo\" ,\
1151  locus-tag \"foo\" } ,\
1152  location\
1153  int {\
1154  from 0 ,\
1155  to 4 ,\
1156  strand plus ,\
1157  id\
1158  local str \"LocusCollidesWithLocusTag\" } } ,\
1159  {\
1160  data\
1161  gene {\
1162  locus \"bar\" ,\
1163  locus-tag \"foo\" } ,\
1164  location\
1165  int {\
1166  from 5 ,\
1167  to 9 ,\
1168  strand plus ,\
1169  id\
1170  local str \"LocusCollidesWithLocusTag\" } } ,\
1171  {\
1172  data\
1173  gene {\
1174  locus \"bar\" ,\
1175  locus-tag \"baz\" } ,\
1176  location\
1177  int {\
1178  from 10 ,\
1179  to 14 ,\
1180  strand plus ,\
1181  id\
1182  local str \"LocusCollidesWithLocusTag\" } } ,\
1183  {\
1184  data\
1185  gene {\
1186  locus \"quux\" ,\
1187  locus-tag \"baz\" } ,\
1188  location\
1189  int {\
1190  from 15 ,\
1191  to 19 ,\
1192  strand plus ,\
1193  id\
1194  local str \"LocusCollidesWithLocusTag\" } } } } } }\
1195 ";
1196 
1197 
// Test_SEQ_INST_CircularProtein: expects the "CircularProtein" error for the
// circular, tandem and other topologies, and no errors at all for the
// not_set and linear topologies.
// NOTE(review): listing lines 1200, 1202 and 1207 are absent from this extract;
// presumably they build the protein entry and the standard test setup
// (entry, seh, validator, options, eval, expected_errors) -- confirm.
1198 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_CircularProtein)
1199 {
1201 
1203 
1204  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "CircularProtein", "Non-linear topology set on protein"));
1205  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1206 
1208 
     // Each of the three topologies below is checked against the same single expected error.
1209  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
1210  eval = validator.Validate(seh, options);
1211  CheckErrors(*eval, expected_errors);
1212 
1213  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_tandem);
1214  eval = validator.Validate(seh, options);
1215  CheckErrors(*eval, expected_errors);
1216 
1217  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_other);
1218  eval = validator.Validate(seh, options);
1219  CheckErrors(*eval, expected_errors);
1220 
1221  // should be no error for not set or linear
1222  CLEAR_ERRORS
1223 
1224  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_not_set);
1225  eval = validator.Validate(seh, options);
1226  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1227  CheckErrors(*eval, expected_errors);
1228 
1229  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_linear);
1230  eval = validator.Validate(seh, options);
1231  CheckErrors(*eval, expected_errors);
1232 
1233  CLEAR_ERRORS
1234 }
1235 
1236 
// Test_BadProteinMoltype: expects the "BadProteinMoltype" error when the
// strand is set to ds, mixed or other, and no errors when it is not_set
// or ss.
// NOTE(review): listing lines 1239 and 1241 are absent from this extract;
// presumably they build the protein entry and the standard test setup -- confirm.
1237 BOOST_AUTO_TEST_CASE(Test_BadProteinMoltype)
1238 {
1240 
1242 
1243  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadProteinMoltype", "Protein not single stranded"));
1244  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1245 
     // Each of the three strand values below is checked against the same single expected error.
1246  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ds);
1247  eval = validator.Validate(seh, options);
1248  CheckErrors(*eval, expected_errors);
1249 
1250  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_mixed);
1251  eval = validator.Validate(seh, options);
1252  CheckErrors(*eval, expected_errors);
1253 
1254  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_other);
1255  eval = validator.Validate(seh, options);
1256  CheckErrors(*eval, expected_errors);
1257 
1258  // no errors expected for not set or single strand
1259  CLEAR_ERRORS
1260 
1261  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1262 
1263  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_not_set);
1264  eval = validator.Validate(seh, options);
1265  CheckErrors(*eval, expected_errors);
1266 
1267  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ss);
1268  eval = validator.Validate(seh, options);
1269  CheckErrors(*eval, expected_errors);
1270 
1271  CLEAR_ERRORS
1272 }
1273 
1274 
// Test_SEQ_INST_MolNotSet: sets Bioseq.mol to not_set, other and na in turn
// and expects, respectively, the "MolNotSet", "MolOther" and "MolNuclAcid"
// errors (all at eDiag_Error severity).
// NOTE(review): listing lines 1277 and 1279 are absent from this extract;
// presumably they hold the entry construction and standard test setup -- confirm.
1275 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_MolNotSet)
1276 {
1278 
1280 
1281  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MolNotSet", "Bioseq.mol is 0"));
1282  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1283 
     // mol = not_set
1284  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_not_set);
1285  eval = validator.Validate(seh, options);
1286  CheckErrors(*eval, expected_errors);
1287 
     // mol = other
1288  expected_errors[0]->SetErrCode("MolOther");
1289  expected_errors[0]->SetErrMsg("Bioseq.mol is type other");
1290  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_other);
1291  eval = validator.Validate(seh, options);
1292  CheckErrors(*eval, expected_errors);
1293 
     // mol = na (generic nucleic acid)
1294  expected_errors[0]->SetErrCode("MolNuclAcid");
1295  expected_errors[0]->SetErrMsg("Bioseq.mol is type nucleic acid");
1296  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_na);
1297  eval = validator.Validate(seh, options);
1298  CheckErrors(*eval, expected_errors);
1299 
1300  CLEAR_ERRORS
1301 
1302 }
1303 
1304 
// Test_SEQ_INST_FuzzyLen: attaches a length fuzz to the Bioseq and expects
// the "FuzzyLen" error for the initial repr (message says "raw") and for
// repr const. Once the Seq-data is replaced by a gap, only a Critical
// "SeqDataNotFound" error is expected.
// NOTE(review): listing lines 1307 and 1309 are absent from this extract;
// presumably they hold the entry construction and standard test setup -- confirm.
1305 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_FuzzyLen)
1306 {
1308 
1310 
1311  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "FuzzyLen", "Fuzzy length on raw Bioseq"));
1312  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1313 
     // SetFuzz() with no arguments creates an (empty) fuzz object on the Seq-inst.
1314  entry->SetSeq().SetInst().SetFuzz();
1315  eval = validator.Validate(seh, options);
1316  CheckErrors(*eval, expected_errors);
1317 
1318  expected_errors[0]->SetErrMsg("Fuzzy length on const Bioseq");
1319  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_const);
1320  eval = validator.Validate(seh, options);
1321  CheckErrors(*eval, expected_errors);
1322 
1323  // shouldn't get fuzzy length if gap
1324  expected_errors[0]->SetErrCode("SeqDataNotFound");
1325  expected_errors[0]->SetErrMsg("Missing Seq-data on constructed Bioseq");
1326  expected_errors[0]->SetSeverity(eDiag_Critical);
1327  entry->SetSeq().SetInst().SetSeq_data().SetGap();
1328  eval = validator.Validate(seh, options);
1329  CheckErrors(*eval, expected_errors);
1330 
1331  CLEAR_ERRORS
1332 }
1333 
1334 
// Test_SEQ_INST_InvalidAlphabet: checks the "InvalidAlphabet" error in both
// directions. First `prot_entry` is given each nucleotide Seq-data encoding
// (iupacna, ncbi2na, ncbi4na, ncbi8na, ncbipna). Then `entry`, held in a
// second scope, is given each protein encoding (iupacaa, ncbi8aa, ncbieaa,
// ncbipaa, ncbistdaa).
// NOTE(review): listing lines 1337, 1339, 1348-1350 and 1376 are absent from
// this extract; presumably they declare `prot_entry`, `objmgr`, the rest of
// the `options` expression, and `entry` -- confirm against the repository.
1335 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_InvalidAlphabet)
1336 {
1338 
1340  CScope scope(*objmgr);
1341  scope.AddDefaults();
1342  CSeq_entry_Handle prot_seh = scope.AddTopLevelSeqEntry(*prot_entry);
1343 
1344  CValidator validator(*objmgr);
1345 
1346  // Set validator options
1347  unsigned int options = CValidator::eVal_need_isojta
1351 
1352  // list of expected errors
1353  vector< CExpectedError *> expected_errors;
1354  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidAlphabet", "Using a nucleic acid alphabet on a protein sequence"));
1355  // AddChromosomeNoLocation(expected_errors, "lcl|good");
     // Each Set<encoding>() call below switches the Seq-data choice; no residues are supplied.
1356  prot_entry->SetSeq().SetInst().SetSeq_data().SetIupacna();
1357  CConstRef<CValidError> eval = validator.Validate(prot_seh, options);
1358  CheckErrors(*eval, expected_errors);
1359 
1360  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi2na();
1361  eval = validator.Validate(prot_seh, options);
1362  CheckErrors(*eval, expected_errors);
1363 
1364  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi4na();
1365  eval = validator.Validate(prot_seh, options);
1366  CheckErrors(*eval, expected_errors);
1367 
1368  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi8na();
1369  eval = validator.Validate(prot_seh, options);
1370  CheckErrors(*eval, expected_errors);
1371 
1372  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbipna();
1373  eval = validator.Validate(prot_seh, options);
1374  CheckErrors(*eval, expected_errors);
1375 
     // Second half: a separate scope holds `entry`, which now receives protein alphabets.
1377  CScope scope2(*objmgr);
1378  scope2.AddDefaults();
1379  CSeq_entry_Handle seh = scope2.AddTopLevelSeqEntry(*entry);
1380 
1381  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa();
1382  expected_errors[0]->SetErrMsg("Using a protein alphabet on a nucleic acid");
1383 
1384  eval = validator.Validate(seh, options);
1385  CheckErrors(*eval, expected_errors);
1386 
1387  entry->SetSeq().SetInst().SetSeq_data().SetNcbi8aa();
1388  eval = validator.Validate(seh, options);
1389  CheckErrors(*eval, expected_errors);
1390 
1391  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa();
1392  eval = validator.Validate(seh, options);
1393  CheckErrors(*eval, expected_errors);
1394 
1395  entry->SetSeq().SetInst().SetSeq_data().SetNcbipaa();
1396  eval = validator.Validate(seh, options);
1397  CheckErrors(*eval, expected_errors);
1398 
1399  entry->SetSeq().SetInst().SetSeq_data().SetNcbistdaa();
1400  eval = validator.Validate(seh, options);
1401  CheckErrors(*eval, expected_errors);
1402 
1403  CLEAR_ERRORS
1404 }
1405 
1406 
// Test_SEQ_INST_InvalidResidue: checks "InvalidResidue" reporting in several
// scenarios, run in sequence on the same scope:
//   1. a DNA sequence of the alphabet twice (52 letters) plus 13 non-ASCII bytes;
//   2. the same data with mol = rna (the two 'U' errors are dropped);
//   3. the same residues as a protein (all 22 letter errors are dropped);
//   4. lower-case residues, nucleotide and then IUPAC amino-acid data;
//   5. a nucleotide delta sequence with one literal segment;
//   6. a delta sequence whose literal holds the IUPAC amino-acid string "1234567".
// NOTE(review): listing lines 1409, 1411, 1530 and 1581 are absent from this
// extract; presumably 1409/1411 build the entry and the standard test setup
// and 1530/1581 rebuild `entry` as a protein -- confirm against the repository.
1407 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_InvalidResidue)
1408 {
1410 
1412 
     // 52 letters followed by 13 raw bytes (3x FB, 3x FC, 3x FD, 2x FE, 2x FF) give the length of 65 set below.
1413  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1414  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1415  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1416  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1417  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1418  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1419  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1420  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1421  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1422  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1423  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFE');
1424  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFE');
1425  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFF');
1426  entry->SetSeq().SetInst().SetLength(65);
     // Indices 0-21: one expected error per invalid letter (11 per copy of the alphabet).
     // The 'U' entries sit at indices 8 and 19, which the RNA pass below relies on.
1428  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'E' at position [5]"));
1429  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'F' at position [6]"));
1430  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'I' at position [9]"));
1431  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'J' at position [10]"));
1432  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'L' at position [12]"));
1433  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'O' at position [15]"));
1434  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'P' at position [16]"));
1435  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Q' at position [17]"));
1436  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'U' at position [21]"));
1437  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'X' at position [24]"));
1438  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Z' at position [26]"));
1439  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'E' at position [31]"));
1440  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'F' at position [32]"));
1441  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'I' at position [35]"));
1442  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'J' at position [36]"));
1443  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'L' at position [38]"));
1444  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'O' at position [41]"));
1445  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'P' at position [42]"));
1446  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Q' at position [43]"));
1447  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'U' at position [47]"));
1448  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'X' at position [50]"));
1449  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Z' at position [52]"));
     // Only ten of the thirteen appended bytes are expected to be reported individually
     // (through position [62]), followed by the "More than 10 invalid residues" message.
1450  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [53]"));
1451  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [54]"));
1452  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [55]"));
1453  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [56]"));
1454  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [57]"));
1455  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [58]"));
1456  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [59]"));
1457  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [60]"));
1458  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [61]"));
1459  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [254] at position [62]"));
1460  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "More than 10 invalid residues. Checking stopped"));
1461  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Fatal, "NonAsciiAsn", "Non-ASCII character '251' found in item"));
1462  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1463 
1464  eval = validator.Validate(seh, options);
1465  CheckErrors(*eval, expected_errors);
1466 
1467  // now repeat test, but with mRNA - this time Us should not be reported
     // Indices 8 and 19 are the two 'U' entries; they are freed and nulled rather than erased,
     // so the remaining indices stay stable. Presumably CheckErrors skips null entries -- confirm.
1468  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
1469  delete expected_errors[8];
1470  expected_errors[8] = nullptr;
1471  delete expected_errors[19];
1472  expected_errors[19] = nullptr;
1473  eval = validator.Validate(seh, options);
1474  CheckErrors(*eval, expected_errors);
1475 
1476  // now repeat test, but with protein
     // The Mol-info descriptor (if present) is switched to peptide to go with mol = aa.
1477  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_aa);
1478  for (auto& it : entry->SetSeq().SetDescr().Set()) {
1479  if (it->IsMolinfo()) {
1480  it->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
1481  }
1482  }
     // Same 52 letters and 13 bytes as above, now stored as IUPAC amino-acid data.
1483  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1484  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1485  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1486  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1487  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1488  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1489  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1490  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1491  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1492  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1493  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFE');
1494  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFE');
1495  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFF');
1496  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFF');
1497  entry->SetSeq().SetInst().SetLength(65);
     // A protein feature spanning the whole sequence (0-64) is attached.
1498  CRef<CSeq_feat> feat (new CSeq_feat());
1499  feat->SetData().SetProt().SetName().push_back("fake protein name");
1500  feat->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
1501  feat->SetLocation().SetInt().SetFrom(0);
1502  feat->SetLocation().SetInt().SetTo(64);
1503  unit_test_util::AddFeat(feat, entry);
     // The entry is removed from the scope and re-added after these edits, refreshing seh.
1504  scope.RemoveEntry (*entry);
1505  seh = scope.AddTopLevelSeqEntry(*entry);
1506 
     // Drop the remaining letter errors (indices 0-21); only the byte errors,
     // the "More than 10" message and NonAsciiAsn stay expected.
1507  for (int j = 0; j < 22; j++) {
1508  if (expected_errors[j]) {
1509  delete expected_errors[j];
1510  expected_errors[j] = nullptr;
1511  }
1512  }
1513  eval = validator.Validate(seh, options);
1514  CheckErrors(*eval, expected_errors);
1515 
1516  CLEAR_ERRORS
1517 
1518  // now look for lowercase characters
1519  scope.RemoveEntry (*entry);
1520  entry = unit_test_util::BuildGoodSeq();
1521  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("abcdefghijklmnopqrstuvwxyz");
1522  entry->SetSeq().SetInst().SetLength(26);
1523  seh = scope.AddTopLevelSeqEntry(*entry);
1524  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Sequence contains lower-case characters"));
1525  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1526  eval = validator.Validate(seh, options);
1527  CheckErrors(*eval, expected_errors);
1528 
     // Same lower-case expectation, this time with IUPAC amino-acid data.
     // NOTE(review): listing line 1530 (presumably the reassignment of `entry`) is missing here.
1529  scope.RemoveEntry (*entry);
1531  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("protein");
1532  seh = scope.AddTopLevelSeqEntry(*entry);
1533  eval = validator.Validate(seh, options);
1534  CheckErrors(*eval, expected_errors);
1535 
1536 
1537  CLEAR_ERRORS
1538 
1539  // now try delta sequence
     // One literal segment of 52 letters; the expected messages use the "[X]" form here.
1540  scope.RemoveEntry (*entry);
1541  entry = unit_test_util::BuildGoodSeq();
1542  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
1543  entry->SetSeq().SetInst().ResetSeq_data();
1544  CRef<CDelta_seq> seg(new CDelta_seq());
1545  seg->SetLiteral().SetSeq_data().SetIupacna().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1546  seg->SetLiteral().SetLength(52);
1547  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(seg);
1548  entry->SetSeq().SetInst().SetLength(52);
1549  seh = scope.AddTopLevelSeqEntry(*entry);
1550 
1551  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [E] at position [5]"));
1552  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [F] at position [6]"));
1553  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [I] at position [9]"));
1554  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [J] at position [10]"));
1555  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [L] at position [12]"));
1556  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [O] at position [15]"));
1557  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [P] at position [16]"));
1558  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Q] at position [17]"));
1559  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [U] at position [21]"));
1560  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [X] at position [24]"));
1561  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Z] at position [26]"));
1562  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [E] at position [31]"));
1563  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [F] at position [32]"));
1564  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [I] at position [35]"));
1565  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [J] at position [36]"));
1566  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [L] at position [38]"));
1567  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [O] at position [41]"));
1568  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [P] at position [42]"));
1569  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Q] at position [43]"));
1570  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [U] at position [47]"));
1571  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [X] at position [50]"));
1572  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Z] at position [52]"));
1573  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1574  eval = validator.Validate(seh, options);
1575  CheckErrors(*eval, expected_errors);
1576 
1577  CLEAR_ERRORS
1578 
1579  // try protein delta sequence
     // NOTE(review): listing line 1581 (presumably the reassignment of `entry`) is missing here.
     // The literal holds seven digit characters, each expected to be reported as invalid.
1580  scope.RemoveEntry (*entry);
1582  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
1583  entry->SetSeq().SetInst().ResetSeq_data();
1584  CRef<CDelta_seq> seg2(new CDelta_seq());
1585  seg2->SetLiteral().SetSeq_data().SetIupacaa().Set("1234567");
1586  seg2->SetLiteral().SetLength(7);
1587  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(seg2);
1588  entry->SetSeq().SetInst().SetLength(7);
1589  seh = scope.AddTopLevelSeqEntry(*entry);
1590 
1591  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [1] at position [1]"));
1592  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [2] at position [2]"));
1593  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [3] at position [3]"));
1594  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [4] at position [4]"));
1595  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [5] at position [5]"));
1596  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [6] at position [6]"));
1597  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [7] at position [7]"));
1598  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1599 
1600  eval = validator.Validate(seh, options);
1601  CheckErrors(*eval, expected_errors);
1602 
1603  CLEAR_ERRORS
1604 }
1605 
1606 
1607 /*
1608 static void WriteOutTemp (CRef<CSeq_entry> entry)
1609 {
1610  // construct a temp file name
1611  CNcbiOstrstream oss;
1612  oss << "test.asn";
1613  string filename = CNcbiOstrstreamToString(oss);
1614  string fullPath = CDirEntry::MakePath(".", filename);
1615 
1616  // initialize a text output stream (opened with IOS_BASE::out only, no binary flag)
1617  unique_ptr<CNcbiOstream> outStream;
1618  outStream.reset(new CNcbiOfstream(
1619  fullPath.c_str(),
1620  IOS_BASE::out));
1621  if (!(*outStream)) {
1622  return;
1623  }
1624 
1625  unique_ptr<CObjectOStream> outObject;
1626  // Associate ASN.1 text serialization methods with the output
1627  outObject.reset(new CObjectOStreamAsn(*outStream));
1628 
1629  // write the asn data
1630  try {
1631  *outObject << *entry;
1632  outStream->flush();
1633  } catch (exception& ) {
1634  }
1635 }
1636 */
1637 
1638 
// Checks the StopInProtein / InternalStop / StartCodon diagnostics for a
// coding region whose protein product contains three '*' residues.
// Three validation passes are made: with an exception text on the CDS,
// with the exception removed, and with the nucleotide changed to start
// with ATG.
// NOTE(review): the statements that create entry, cds, nuc, scope, seh and
// the validator are not visible in this listing.
1639 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_StopInProtein)
1640 {
1642 
1644 
      // Protein with three '*' residues; the nucleotide begins with GTG.
1645  entry->SetSet().SetSeq_set().back()->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MP*K*E*N");
1646  entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("GTGCCCTAAAAATAAGAGTAAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
      // Pass 1: flag the CDS with an exception text; the expectations below
      // include an ExceptionProblem error for this text.
1648  cds->SetExcept(true);
1649  cds->SetExcept_text("unclassified translation discrepancy");
1650 
1651  BOOST_CHECK_EQUAL(validator::HasStopInProtein(*cds, scope), true);
1652  BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
1653 
1654  // list of expected errors
1655  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein", "[3] termination symbols in protein sequence (gene? - fake protein name)"));
1656  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "ExceptionProblem", "unclassified translation discrepancy is not a legal exception explanation"));
1657  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InternalStop", "3 internal stops (and illegal start codon). Genetic code [0]"));
1658  expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
1659  "CDS has unnecessary translated product replaced exception"));
1660  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
1661 
1662  eval = validator.Validate(seh, options);
1663  CheckErrors(*eval, expected_errors);
1664  // WriteOutTemp(entry);
1665 
      // Pass 2: same sequences with the exception removed. InternalStop is
      // now expected at Error severity and a StartCodon error is expected.
1666  CLEAR_ERRORS
1667  cds->ResetExcept();
1668  cds->ResetExcept_text();
1669  BOOST_CHECK_EQUAL(validator::HasStopInProtein(*cds, scope), true);
1670  BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
1671  BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
1672 
1673  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein", "[3] termination symbols in protein sequence (gene? - fake protein name)"));
1674  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StartCodon", "Illegal start codon (and 3 internal stops). Probably wrong genetic code [0]"));
1675  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "InternalStop", "3 internal stops (and illegal start codon). Genetic code [0]"));
1676  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
1677 
1678  eval = validator.Validate(seh, options);
1679  CheckErrors(*eval, expected_errors);
1680  // WriteOutTemp(entry);
1681 
      // Pass 3: the nucleotide now begins with ATG instead of GTG.
1683  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCTAAAAATAAGAGTAAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
1684 
1685  // write out seq-entry
1686  // WriteOutTemp(entry);
1687 
      // Drop the StartCodon expectation (slot 1) and shorten the
      // InternalStop message (slot 2) to match.
      // NOTE(review): presumably CheckErrors skips null slots - confirm.
1688  delete expected_errors[1];
1689  expected_errors[1] = nullptr;
1690  expected_errors[2]->SetErrMsg("3 internal stops. Genetic code [0]");
1691  eval = validator.Validate(seh, options);
1692  CheckErrors(*eval, expected_errors);
1693 
1694  CLEAR_ERRORS
1695 }
1696 
1697 
// The whole body of this test is compiled out with #if 0, so the test case
// currently performs no checks. The disabled code builds a two-segment
// segmented sequence and validates it with different combinations of
// partial start/stop flags on the two segment locations, expecting a
// PartialInconsistent error each time.
// NOTE(review): the statements that change MolInfo completeness between the
// labelled sections ("unknown", "complete", ...) are not visible in this listing.
1698 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_PartialInconsistent)
1699 {
1700 #if 0
1701  //We don't care about segmented sets any more
1703 
1705 
      // Turn the sequence into a segmented one made of two locations.
1706  entry->SetSeq().SetInst().ResetSeq_data();
1707  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
1708  CRef<CSeq_id> id(new CSeq_id("gb|AY123456"));
1709  CRef<CSeq_loc> loc1(new CSeq_loc(*id, 0, 3));
1710  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(loc1);
1711  CRef<CSeq_id> id2(new CSeq_id("gb|AY123457"));
1712  CRef<CSeq_loc> loc2(new CSeq_loc(*id2, 0, 2));
1713  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(loc2);
1714 
1715  // list of expected errors
1716  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "PartialInconsistent", "Partial segmented sequence without MolInfo partial"));
1717 
1718  // not-set
1719  loc1->SetPartialStart(true, eExtreme_Biological);
1720  loc2->SetPartialStop(true, eExtreme_Biological);
1721  eval = validator.Validate(seh, options);
1722  CheckErrors(*eval, expected_errors);
1723  loc1->SetPartialStart(true, eExtreme_Biological);
1724  loc2->SetPartialStop(false, eExtreme_Biological);
1725  eval = validator.Validate(seh, options);
1726  CheckErrors(*eval, expected_errors);
1727  loc1->SetPartialStart(false, eExtreme_Biological);
1728  loc2->SetPartialStop(true, eExtreme_Biological);
1729  eval = validator.Validate(seh, options);
1730  CheckErrors(*eval, expected_errors);
1731 
1732  // unknown
1734 
1735  loc1->SetPartialStart(true, eExtreme_Biological);
1736  loc2->SetPartialStop(true, eExtreme_Biological);
1737  eval = validator.Validate(seh, options);
1738  CheckErrors(*eval, expected_errors);
1739  loc1->SetPartialStart(true, eExtreme_Biological);
1740  loc2->SetPartialStop(false, eExtreme_Biological);
1741  eval = validator.Validate(seh, options);
1742  CheckErrors(*eval, expected_errors);
1743  loc1->SetPartialStart(false, eExtreme_Biological);
1744  loc2->SetPartialStop(true, eExtreme_Biological);
1745  eval = validator.Validate(seh, options);
1746  CheckErrors(*eval, expected_errors);
1747 
1748  // complete
1750 
1751  loc1->SetPartialStart(true, eExtreme_Biological);
1752  loc2->SetPartialStop(true, eExtreme_Biological);
1753  eval = validator.Validate(seh, options);
1754  CheckErrors(*eval, expected_errors);
1755  loc1->SetPartialStart(true, eExtreme_Biological);
1756  loc2->SetPartialStop(false, eExtreme_Biological);
1757  eval = validator.Validate(seh, options);
1758  CheckErrors(*eval, expected_errors);
1759  loc1->SetPartialStart(false, eExtreme_Biological);
1760  loc2->SetPartialStop(true, eExtreme_Biological);
1761  eval = validator.Validate(seh, options);
1762  CheckErrors(*eval, expected_errors);
1763 
1764  // partial
1766 
      // Neither end is partial here, so the expected message changes.
1767  loc1->SetPartialStart(false, eExtreme_Biological);
1768  loc2->SetPartialStop(false, eExtreme_Biological);
1769  expected_errors[0]->SetErrMsg("Complete segmented sequence with MolInfo partial");
1770  eval = validator.Validate(seh, options);
1771  CheckErrors(*eval, expected_errors);
1772 
1773  // no-left
1775 
1776  loc1->SetPartialStart(true, eExtreme_Biological);
1777  loc2->SetPartialStop(true, eExtreme_Biological);
1778  expected_errors[0]->SetErrMsg("No-left inconsistent with segmented SeqLoc");
1779  eval = validator.Validate(seh, options);
1780  CheckErrors(*eval, expected_errors);
1781  loc1->SetPartialStart(false, eExtreme_Biological);
1782  loc2->SetPartialStop(true, eExtreme_Biological);
1783  eval = validator.Validate(seh, options);
1784  CheckErrors(*eval, expected_errors);
1785  loc1->SetPartialStart(false, eExtreme_Biological);
1786  loc2->SetPartialStop(false, eExtreme_Biological);
1787  eval = validator.Validate(seh, options);
1788  CheckErrors(*eval, expected_errors);
1789 
1790  // no-right
1792 
1793  loc1->SetPartialStart(true, eExtreme_Biological);
1794  loc2->SetPartialStop(true, eExtreme_Biological);
1795  expected_errors[0]->SetErrMsg("No-right inconsistent with segmented SeqLoc");
1796  eval = validator.Validate(seh, options);
1797  CheckErrors(*eval, expected_errors);
1798  loc1->SetPartialStart(true, eExtreme_Biological);
1799  loc2->SetPartialStop(false, eExtreme_Biological);
1800  eval = validator.Validate(seh, options);
1801  CheckErrors(*eval, expected_errors);
1802  loc1->SetPartialStart(false, eExtreme_Biological);
1803  loc2->SetPartialStop(false, eExtreme_Biological);
1804  eval = validator.Validate(seh, options);
1805  CheckErrors(*eval, expected_errors);
1806 
1807  // no-ends
1809 
1810  expected_errors[0]->SetErrMsg("No-ends inconsistent with segmented SeqLoc");
1811  loc1->SetPartialStart(true, eExtreme_Biological);
1812  loc2->SetPartialStop(false, eExtreme_Biological);
1813  eval = validator.Validate(seh, options);
1814  CheckErrors(*eval, expected_errors);
1815  loc1->SetPartialStart(false, eExtreme_Biological);
1816  loc2->SetPartialStop(true, eExtreme_Biological);
1817  eval = validator.Validate(seh, options);
1818  CheckErrors(*eval, expected_errors);
1819  loc1->SetPartialStart(false, eExtreme_Biological);
1820  loc2->SetPartialStop(false, eExtreme_Biological);
1821  eval = validator.Validate(seh, options);
1822  CheckErrors(*eval, expected_errors);
1823 
1824  CLEAR_ERRORS
1825 #endif
1826 }
1827 
1828 
// Exercises the ShortSeq warning with a 3-residue protein and a 9-base
// nucleotide, and checks that no ShortSeq report is expected when the
// sequence carries a PDB id.
// NOTE(review): the setup of entry, scope, seh and the validator, and the
// statements that change state between the repeated Validate/CheckErrors
// pairs below, are not visible in this listing.
1829 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ShortSeq)
1830 {
1832 
1834 
      // Shrink the protein to three residues and fit the feature location to it.
1835  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPR");
1836  entry->SetSeq().SetInst().SetLength(3);
1837  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetTo(2);
1838 
1839  // don't report if pdb
1840  CRef<CPDB_seq_id> pdb_id(new CPDB_seq_id());
1841  pdb_id->SetMol().Set("foo");
1842  entry->SetSeq().SetId().front()->SetPdb(*pdb_id);
1843  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().SetPdb(*pdb_id);
      // The entry is removed from and re-added to the scope after its id changes.
1844  scope.RemoveTopLevelSeqEntry(seh);
1845  seh = scope.AddTopLevelSeqEntry(*entry);
1846  eval = validator.Validate(seh, options);
1847  // AddChromosomeNoLocation(expected_errors, "pdb|foo| ");
      // expected_errors is still empty at this point.
1848  CheckErrors(*eval, expected_errors);
1849 
1850  // new test if no coding region
1851  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PartialsInconsistent", "Molinfo completeness and protein feature partials conflict"));
1852  expected_errors[0]->SetAccession("lcl|good");
      // Switch the sequence id and the feature location id back to the local id "good".
1853  entry->SetSeq().SetId().front()->SetLocal().SetStr("good");
1854  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
1855  scope.RemoveTopLevelSeqEntry(seh);
1856  seh = scope.AddTopLevelSeqEntry(*entry);
1858 
      // Four validations against the same PartialsInconsistent expectation.
1859  eval = validator.Validate(seh, options);
1860  CheckErrors(*eval, expected_errors);
1862  eval = validator.Validate(seh, options);
1863  CheckErrors(*eval, expected_errors);
1865  eval = validator.Validate(seh, options);
1866  CheckErrors(*eval, expected_errors);
1868  eval = validator.Validate(seh, options);
1869  CheckErrors(*eval, expected_errors);
1870 
1871  CLEAR_ERRORS
1872 
1873  // for all other completeness, report
1874  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortSeq", "Sequence only 3 residues"));
1875  // AddChromosomeNoLocation(expected_errors, "lcl|good");
      // Clear the completeness field on every MolInfo descriptor.
1876  for (auto& it : entry->SetSeq().SetDescr().Set()) {
1877  if (it->IsMolinfo()) {
1878  it->SetMolinfo().ResetCompleteness();
1879  }
1880  }
      // Four validations against the same ShortSeq expectation.
1881  eval = validator.Validate(seh, options);
1882  CheckErrors(*eval, expected_errors);
1884  eval = validator.Validate(seh, options);
1885  CheckErrors(*eval, expected_errors);
1887  eval = validator.Validate(seh, options);
1888  CheckErrors(*eval, expected_errors);
1890  eval = validator.Validate(seh, options);
1891  CheckErrors(*eval, expected_errors);
1892 
1893  // nucleotide
      // Replace the entry with a fresh one from BuildGoodSeq and give it nine bases.
1894  scope.RemoveTopLevelSeqEntry(seh);
1895  entry = unit_test_util::BuildGoodSeq();
1896  seh = scope.AddTopLevelSeqEntry(*entry);
1897  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCTTT");
1898  entry->SetSeq().SetInst().SetLength(9);
1899  expected_errors[0]->SetErrMsg("Sequence only 9 residues");
1900  eval = validator.Validate(seh, options);
1901  CheckErrors(*eval, expected_errors);
1902 
1903  CLEAR_ERRORS
1904 
1905  // don't report if pdb
1906  entry->SetSeq().SetId().front()->SetPdb(*pdb_id);
1907  scope.RemoveTopLevelSeqEntry(seh);
1908  seh = scope.AddTopLevelSeqEntry(*entry);
1909  eval = validator.Validate(seh, options);
1910  // AddChromosomeNoLocation(expected_errors, "pdb|foo| ");
1911  CheckErrors(*eval, expected_errors);
1912 
1913  CLEAR_ERRORS
1914 }
1915 
1916 
1918 {
      // Predicate over a MolInfo technique value: returns true when `tech`
      // matches one of the case labels below, false otherwise.
      // NOTE(review): the signature line and the other case labels are not
      // visible in this listing; only eTech_both can be confirmed here.
1919  bool rval = false;
1920 
1921  switch (tech) {
1924  case CMolInfo::eTech_both:
1928  rval = true;
1929  break;
1930  default:
      // Any value without a matching case leaves rval false.
1931  break;
1932  }
1933  return rval;
1934 }
1935 
1936 
1938 {
      // Creates a new descriptor and appends it to the descriptor list of the
      // entry, using the Bioseq list for a Seq entry and the Bioseq-set list
      // for a Set entry. An entry that is neither is left untouched.
      // NOTE(review): the signature and the statements that fill in `desc`
      // are not visible in this listing.
1939  CRef<CSeqdesc> desc(new CSeqdesc());
1942  if (entry->IsSeq()) {
1943  entry->SetSeq().SetDescr().Set().push_back(desc);
1944  } else if (entry->IsSet()) {
1945  entry->SetSet().SetDescr().Set().push_back(desc);
1946  }
1947 }
1948 
1949 
1950 static void SetRefGeneTrackingStatus(CRef<CSeq_entry> entry, string status)
1951 {
1952  if (entry->IsSeq()) {
1953  for (auto& it : entry->SetSeq().SetDescr().Set()) {
1954  if (it->IsUser() && it->GetUser().IsRefGeneTracking()) {
1955  it->SetUser().SetData().front()->SetData().SetStr(status);
1956  }
1957  }
1958  } else if (entry->IsSet()) {
1959  for (auto& it : entry->SetSet().SetDescr().Set()) {
1960  if (it->IsUser() && it->GetUser().IsRefGeneTracking()) {
1961  it->SetUser().SetData().front()->SetData().SetStr(status);
1962  }
1963  }
1964  }
1965 }
1966 
1967 
1968 static void SetTitle(CRef<CSeq_entry> entry, string title)
1969 {
1970  bool found = false;
1971 
1972  if (entry->IsSetDescr()) {
1973  auto& cont = entry->SetDescr().Set();
1974  for (auto it = cont.begin(); it != cont.end();) {
1975  if ((*it)->IsTitle()) {
1976  found = true;
1977  if (NStr::IsBlank((*it)->GetTitle())) {
1978  it = cont.erase(it);
1979  continue;
1980  } else {
1981  (*it)->SetTitle(title);
1982  }
1983  }
1984  ++it;
1985  }
1986  }
1987 
1988  if (!found && !NStr::IsBlank(title)) {
1989  CRef<CSeqdesc> desc(new CSeqdesc());
1990  desc->SetTitle(title);
1991  entry->SetSeq().SetDescr().Set().push_back(desc);
1992  }
1993 }
1994 
1995 
1996 static void AddGenbankKeyword (CRef<CSeq_entry> entry, string keyword)
1997 {
1998  bool found = false;
1999 
2000  for (auto& it : entry->SetSeq().SetDescr().Set()) {
2001  if (it->IsGenbank()) {
2002  it->SetGenbank().SetKeywords().push_back(keyword);
2003  found = true;
2004  }
2005  }
2006  if (!found) {
2007  CRef<CSeqdesc> desc(new CSeqdesc());
2008  desc->SetGenbank().SetKeywords().push_back(keyword);
2009  entry->SetSeq().SetDescr().Set().push_back(desc);
2010  }
2011 }
2012 
2013 
2015 {
      // Applies `tech` to the entry via SetTech, validates, and checks the
      // result against a tech-specific list of expected errors. For tech
      // values not handled below, no errors are expected.
      // NOTE(review): the signature and the setup of entry, seh and the
      // validator are not visible in this listing.
2018 
2019  SetTech(entry, tech);
2020  eval = validator.Validate(seh, options);
      // Expectations are built after validation and compared by CheckErrors below.
2021  if (tech == CMolInfo::eTech_barcode) {
2022  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NoKeywordHasTechnique", "Molinfo.tech barcode without BARCODE keyword"));
2023  } else if (tech == CMolInfo::eTech_tsa) {
2024  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TSAseqGapProblem", "TSA Seq_gap NULL"));
2025  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2026  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"genomic\" is not appropriate for sequences that use the TSA technique."));
2027  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAseqGapProblem", "TSA submission includes wrong gap type. Gaps for TSA should be Assembly Gaps with linkage evidence."));
2028  } else if (tech == CMolInfo::eTech_wgs) {
2029  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WGSseqGapProblem", "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
2030  }
      // WGS additionally gets the expectation(s) added by AddChromosomeNoLocation.
2031  if (tech == CMolInfo::eTech_wgs) {
2032  AddChromosomeNoLocation(expected_errors, "lcl|good");
2033  }
2034 
2035  CheckErrors(*eval, expected_errors);
2036 
2037  CLEAR_ERRORS
2038 }
2039 
2040 
2042 {
      // Applies `tech` to the entry via SetTech and checks that validation
      // reports a BadDeltaSeq error naming the numeric tech value, plus a
      // protein-technique or EST error where applicable.
      // NOTE(review): the signature and the setup of entry, seh and the
      // validator are not visible in this listing.
2045 
2046  SetTech(entry, tech);
      // The result of this first validation is overwritten by the second call below.
2047  eval = validator.Validate(seh, options);
2048  if (IsProteinTech(tech)) {
2049  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide", "Nucleic acid with protein sequence method"));
2050  } else if (tech == CMolInfo::eTech_est) {
2051  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ESTshouldBemRNA", "EST sequence should be mRNA"));
2052  }
      // Always expected, regardless of which branch above was taken.
2053  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDeltaSeq", "Delta seq technique should not be [" + NStr::UIntToString(tech) + "]"));
2054  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2055  eval = validator.Validate(seh, options);
2056  CheckErrors(*eval, expected_errors);
2057  CLEAR_ERRORS
2058 }
2059 
2060 
2062 {
      // Builds a delta sequence that begins and ends with a gap and contains
      // consecutive gap literals, applies `tech`, and checks for the
      // adjacent-gap BadDeltaSeq error and both TerminalGap warnings.
      // NOTE(review): the signature and the setup of entry, seh and the
      // validator are not visible in this listing.
2064 
2066 
      // Gap literal inserted at the very start of the delta.
2067  CRef<CDelta_seq> start_gap_seg(new CDelta_seq());
2068  start_gap_seg->SetLiteral().SetLength(10);
2069  start_gap_seg->SetLiteral().SetSeq_data().SetGap();
2070  entry->SetSeq().SetInst().SetExt().SetDelta().Set().insert(entry->SetSeq().SetInst().SetExt().SetDelta().Set().begin(), start_gap_seg);
      // Two consecutive length-only literals, followed by real sequence.
2071  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
2072  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
2073  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("AAATTTGGGC", CSeq_inst::eMol_dna);
      // Gap literal near the end, followed by one more length-only literal.
2074  CRef<CDelta_seq> end_gap_seg(new CDelta_seq());
2075  end_gap_seg->SetLiteral().SetLength(10);
2076  end_gap_seg->SetLiteral().SetSeq_data().SetGap();
2077  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(end_gap_seg);
2078  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
2079  entry->SetSeq().SetInst().SetLength(94);
2080  SetTech(entry, tech);
2081  if (tech == CMolInfo::eTech_wgs) {
2082  AddChromosomeNoLocation(expected_errors, "lcl|good");
2083  }
2084  // expected_errors.push_back(new CExpectedError("lcl|good", tech == CMolInfo::eTech_wgs ? eDiag_Warning : eDiag_Error, "BadDeltaSeq", "First delta seq component is a gap"));
2085  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDeltaSeq", "There is 1 adjacent gap in delta seq"));
2086  // expected_errors.push_back(new CExpectedError("lcl|good", tech == CMolInfo::eTech_wgs ? eDiag_Warning : eDiag_Error, "BadDeltaSeq", "Last delta seq component is a gap"));
2087  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at beginning of sequence"));
2088  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at end of sequence"));
2089  /*
2090  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
2091  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
2092  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
2093  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
2094  */
      // WGS additionally expects the wrong-gap-type error.
2095  if (tech == CMolInfo::eTech_wgs) {
2096  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WGSseqGapProblem", "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
2097  }
2098  eval = validator.Validate(seh, options);
2099  CheckErrors(*eval, expected_errors);
2100 
2101  CLEAR_ERRORS
2102 }
2103 
2104 
// Checks BadDeltaSeq reporting for delta sequences: first that nothing is
// expected for NC_ and NT_ accessions with tech "derived", then a loop that
// sorts MolInfo tech values into allowed and not-allowed.
// NOTE(review): the entry setup, the header of the loop over tech values,
// the calls made in its allowed / not-allowed branches, and the statements
// before the final CLEAR_ERRORS are not visible in this listing.
2105 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_BadDeltaSeq)
2106 {
2108 
2110 
      // Set the technique on every MolInfo descriptor to "derived".
2111  for (auto& it : entry->SetSeq().SetDescr().Set()) {
2112  if (it->IsMolinfo()) {
2113  it->SetMolinfo().SetTech(CMolInfo::eTech_derived);
2114  }
2115  }
2116 
2117  // don't report if NT or NC
2118  scope.RemoveTopLevelSeqEntry(seh);
2119  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
2120  seh = scope.AddTopLevelSeqEntry(*entry);
2121  eval = validator.Validate(seh, options);
2122  // AddChromosomeNoLocation(expected_errors, "ref|NC_123456|");
      // expected_errors is empty here: no diagnostics are expected.
2123  CheckErrors(*eval, expected_errors);
2124  CLEAR_ERRORS
2125 
2126  entry->SetSeq().SetId().front()->SetOther().SetAccession("NT_123456");
2127  scope.RemoveTopLevelSeqEntry(seh);
2128  seh = scope.AddTopLevelSeqEntry(*entry);
2129  eval = validator.Validate(seh, options);
2130  // AddChromosomeNoLocation(expected_errors, "ref|NT_123456|");
2131  CheckErrors(*eval, expected_errors);
2132  CLEAR_ERRORS
2133 
2134  // don't report if gen-prod-set
2135 
      // Switch back to the local id "good" for the tech loop below.
2136  entry->SetSeq().SetId().front()->SetLocal().SetStr("good");
2137  scope.RemoveTopLevelSeqEntry(seh);
2138  seh = scope.AddTopLevelSeqEntry(*entry);
2139 
2140  // allowed tech values
2141  vector<CMolInfo::TTech> allowed_list;
2142  allowed_list.push_back(CMolInfo::eTech_htgs_0);
2143  allowed_list.push_back(CMolInfo::eTech_htgs_1);
2144  allowed_list.push_back(CMolInfo::eTech_htgs_2);
2145  allowed_list.push_back(CMolInfo::eTech_htgs_3);
2146  allowed_list.push_back(CMolInfo::eTech_wgs);
2147  allowed_list.push_back(CMolInfo::eTech_composite_wgs_htgs);
2148  allowed_list.push_back(CMolInfo::eTech_unknown);
2149  allowed_list.push_back(CMolInfo::eTech_standard);
2150  allowed_list.push_back(CMolInfo::eTech_htc);
2151  allowed_list.push_back(CMolInfo::eTech_barcode);
2152  allowed_list.push_back(CMolInfo::eTech_tsa);
2153 
      // For the current tech value `i`, decide whether it is in allowed_list.
2155  bool allowed = false;
2156  for (CMolInfo::TTech it : allowed_list) {
2157  if (it == i) {
2158  allowed = true;
2159  break;
2160  }
2161  }
2162  if (allowed) {
2163  // don't report for htgs_0
2165  } else {
2167  }
2168  }
2169 
2170  CLEAR_ERRORS
2171 
2174 
2175  CLEAR_ERRORS
2176 }
2177 
2178 
// Resets a Seq-gap and re-populates it with the given type, linkage state
// and list of linkage-evidence types.
//   gap              - gap object to rewrite (fully reset first)
//   gap_type         - value stored via SetType
//   is_linked        - when false, the linkage field is reset
//   linkage_evidence - one evidence entry is appended per element, in order
// NOTE(review): the statement executed when is_linked is true and the
// declaration of `ev` inside the loop are not visible in this listing.
2179 void AdjustGap(CSeq_gap& gap, CSeq_gap::EType gap_type, bool is_linked, vector<CLinkage_evidence::EType> linkage_evidence)
2180 {
2181  gap.Reset();
2182  gap.SetType(gap_type);
2183  if (is_linked) {
2185  } else {
2186  gap.ResetLinkage();
2187  }
      // Start from an empty evidence list before appending the requested entries.
2188  gap.ResetLinkage_evidence();
2189  for (auto it : linkage_evidence) {
2191  ev->SetType(it);
2192  gap.SetLinkage_evidence().push_back(ev);
2193  }
2194 }
2195 
2196 
// Runs six SeqGapBadLinkage scenarios. Each scenario rewrites the gap
// literals of the delta sequence (mostly through AdjustGap), re-validates,
// and checks for one specific SeqGapBadLinkage message.
// NOTE(review): the entry setup and the statement before the first
// expectation are not visible in this listing.
2197 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_SeqGapBadLinkage)
2198 {
2200 
      // Scenario 1: short_arm gap, linked, with one piece of linkage evidence.
2201  vector<CLinkage_evidence::EType> evidence;
2202  evidence.push_back(CLinkage_evidence::eType_align_genus);
2203  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2204  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data() &&
2205  it->GetLiteral().GetSeq_data().IsGap()) {
2206  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2207  CSeq_gap::eType_short_arm, true, evidence);
2208  }
2209  }
2210 
2212 
2213  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2214  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
2215  "SeqGapBadLinkage", "Seq-gap of type 3 should not have linkage evidence"));
2216 
2217  eval = validator.Validate(seh, options);
2218  CheckErrors(*eval, expected_errors);
2219 
2220  CLEAR_ERRORS
2221 
      // Scenario 2: linkage and type are reset; the linkage evidence is not.
2222  scope.RemoveTopLevelSeqEntry(seh);
2223  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2224  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data() &&
2225  it->GetLiteral().GetSeq_data().IsGap()) {
2226  CSeq_gap& gap = it->SetLiteral().SetSeq_data().SetGap();
2227  gap.ResetLinkage();
2228  gap.ResetType();
2229  }
2230  }
2231  seh = scope.AddTopLevelSeqEntry(*entry);
2232 
2233  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2234  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
2235  "SeqGapBadLinkage", "Seq-gap with linkage evidence must have linkage field set to linked"));
2236 
2237  eval = validator.Validate(seh, options);
2238  CheckErrors(*eval, expected_errors);
2239 
2240  CLEAR_ERRORS
2241 
      // Scenario 3: the evidence list now holds align_genus twice.
2242  scope.RemoveTopLevelSeqEntry(seh);
2243  evidence.push_back(CLinkage_evidence::eType_align_genus);
2244  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2245  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data() &&
2246  it->GetLiteral().GetSeq_data().IsGap()) {
2247  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2248  CSeq_gap::eType_fragment, true, evidence);
2249  }
2250  }
2251  seh = scope.AddTopLevelSeqEntry(*entry);
2252 
2253  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2254  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
2255  "SeqGapBadLinkage", "Linkage evidence 'align genus' appears 2 times"));
2256 
2257  eval = validator.Validate(seh, options);
2258  CheckErrors(*eval, expected_errors);
2259 
2260  CLEAR_ERRORS
2261 
      // Scenario 4: evidence list becomes {align_genus, unspecified}.
2262  evidence.pop_back();
2263  evidence.push_back(CLinkage_evidence::eType_unspecified);
2264  scope.RemoveTopLevelSeqEntry(seh);
2265  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2266  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data() &&
2267  it->GetLiteral().GetSeq_data().IsGap()) {
2268  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2269  CSeq_gap::eType_fragment, true, evidence);
2270  }
2271  }
2272  seh = scope.AddTopLevelSeqEntry(*entry);
2273 
2274  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2275  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
2276  "SeqGapBadLinkage", "Seq-gap type has unspecified and additional linkage evidence"));
2277 
2278  eval = validator.Validate(seh, options);
2279  CheckErrors(*eval, expected_errors);
2280 
2281  CLEAR_ERRORS
2282 
      // Scenario 5: gap type unknown with only "unspecified" evidence.
2283  scope.RemoveTopLevelSeqEntry(seh);
2284  evidence.clear();
2285  evidence.push_back(CLinkage_evidence::eType_unspecified);
2286  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2287  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data() &&
2288  it->GetLiteral().GetSeq_data().IsGap()) {
2289  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2290  CSeq_gap::eType_unknown, true, evidence);
2291  }
2292  }
2293  seh = scope.AddTopLevelSeqEntry(*entry);
2294 
2295  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2296  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
2297  "SeqGapBadLinkage", "Single Seq-gap has unknown type and unspecified linkage"));
2298 
2299  eval = validator.Validate(seh, options);
2300  CheckErrors(*eval, expected_errors);
2301 
2302  CLEAR_ERRORS
2303 
      // Scenario 6: append a second gap of the same kind plus more sequence,
      // so the message refers to all gaps instead of a single one.
2304  scope.RemoveTopLevelSeqEntry(seh);
2305  CRef<CDelta_seq> gap_seg(new CDelta_seq());
2306  gap_seg->SetLiteral().SetLength(10);
2307  AdjustGap(gap_seg->SetLiteral().SetSeq_data().SetGap(),
2308  CSeq_gap::eType_unknown, true, evidence);
2309 
2310  // adjust delta to avoid errors about large number of Ns in first and last 50 bp
2311  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("CCCATGATGATGTACCGTACGTTTTCCCATGATGATGTACCGTACGTTTT");
2312  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetLength(50);
2313  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
2314  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGATGTACCGTACGTTTTCCCATGATGATGTACCGTACGTTTT", CSeq_inst::eMol_dna);
2315  entry->SetSeq().SetInst().SetLength(132);
2316 
2317  seh = scope.AddTopLevelSeqEntry(*entry);
2318 
2319  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2320  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
2321  "SeqGapBadLinkage", "All 2 Seq-gaps have unknown type and unspecified linkage"));
2322 
2323  eval = validator.Validate(seh, options);
2324  CheckErrors(*eval, expected_errors);
2325 
2326  CLEAR_ERRORS
2327 }
2328 
2329 
2330 void ChangeErrorAcc(vector<CExpectedError *> expected_errors, const string& acc)
2331 {
2332  for (auto it : expected_errors) {
2333  if (it) {
2334  it->SetAccession(acc);
2335  }
2336  }
2337 }
2338 
2339 
2340 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ConflictingIdsOnBioseq)
2341 {
2343 
2345 
2346  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (lcl|good - lcl|bad)"));
2347  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2348 
2349  // local IDs
2350  scope.RemoveTopLevelSeqEntry(seh);
2351  CRef<CSeq_id> id2(new CSeq_id());
2352  id2->SetLocal().SetStr("bad");
2353  entry->SetSeq().SetId().push_back(id2);
2354  seh = scope.AddTopLevelSeqEntry(*entry);
2355  eval = validator.Validate(seh, options);
2356  CheckErrors(*eval, expected_errors);
2357 
2358  // GIBBSQ
2359  scope.RemoveTopLevelSeqEntry(seh);
2360  CRef<CSeq_id> id1 = entry->SetSeq().SetId().front();
2361  id1->SetGibbsq(1);
2362  id2->SetGibbsq(2);
2363  seh = scope.AddTopLevelSeqEntry(*entry);
2364  ChangeErrorAcc(expected_errors, "bbs|1");
2365  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (bbs|1 - bbs|2)");
2366  eval = validator.Validate(seh, options);
2367  CheckErrors(*eval, expected_errors);
2368 
2369  // GIBBMT
2370  scope.RemoveTopLevelSeqEntry(seh);
2371  id1->SetGibbmt(1);
2372  id2->SetGibbmt(2);
2373  seh = scope.AddTopLevelSeqEntry(*entry);
2374  ChangeErrorAcc(expected_errors, "bbm|1");
2375  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (bbm|1 - bbm|2)");
2376  eval = validator.Validate(seh, options);
2377  CheckErrors(*eval, expected_errors);
2378 
2379  // GI
2380  scope.RemoveTopLevelSeqEntry(seh);
2381  id1->SetGi(GI_CONST(1));
2382  id2->SetGi(GI_CONST(2));
2383  CRef<CSeq_id> id3(new CSeq_id("gb|AY123456.1"));
2384  entry->SetSeq().SetId().push_back (id3);
2385  seh = scope.AddTopLevelSeqEntry(*entry);
2386  ChangeErrorAcc(expected_errors, "gb|AY123456.1|");
2387  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (gi|1 - gi|2)");
2388  eval = validator.Validate(seh, options);
2389  CheckErrors(*eval, expected_errors);
2390  entry->SetSeq().SetId().pop_back();
2391 
2392  // GIIM
2393  scope.RemoveTopLevelSeqEntry(seh);
2394  id1->SetGiim().SetId(1);
2395  id1->SetGiim().SetDb("foo");
2396  id2->SetGiim().SetId(2);
2397  id2->SetGiim().SetDb("foo");
2398  seh = scope.AddTopLevelSeqEntry(*entry);
2399  CLEAR_ERRORS
2400 
2401  expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "IdOnMultipleBioseqs", "BioseqFind (gim|1) unable to find itself - possible internal error"));
2402  expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gim|1 - gim|2)"));
2403  expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "IdOnMultipleBioseqs", "BioseqFind (gim|2) unable to find itself - possible internal error"));
2404  // AddChromosomeNoLocation(expected_errors, "gim|1");
2405  eval = validator.Validate(seh, options);
2406  CheckErrors(*eval, expected_errors);
2407  CLEAR_ERRORS
2408 
2409  // patent
2410  scope.RemoveTopLevelSeqEntry(seh);
2411  id1->SetPatent().SetSeqid(1);
2412  id1->SetPatent().SetCit().SetCountry("USA");
2413  id1->SetPatent().SetCit().SetId().SetNumber("1");
2414  id2->SetPatent().SetSeqid(2);
2415  id2->SetPatent().SetCit().SetCountry("USA");
2416  id2->SetPatent().SetCit().SetId().SetNumber("2");
2417  seh = scope.AddTopLevelSeqEntry(*entry);
2418  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (pat|USA|1|1 - pat|USA|2|2)"));
2419  // AddChromosomeNoLocation(expected_errors, "pat|USA|1|1");
2420  eval = validator.Validate(seh, options);
2421  CheckErrors(*eval, expected_errors);
2422 
2423  // pdb
2424  scope.RemoveTopLevelSeqEntry(seh);
2425  id1->SetPdb().SetMol().Set("good");
2426  id2->SetPdb().SetMol().Set("badd");
2427  seh = scope.AddTopLevelSeqEntry(*entry);
2428  ChangeErrorAcc(expected_errors, "pdb|good| ");
2429  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (pdb|good| - pdb|badd| )");
2430  eval = validator.Validate(seh, options);
2431  CheckErrors(*eval, expected_errors);
2432 
2433  // general
2434  scope.RemoveTopLevelSeqEntry(seh);
2435  id1->SetGeneral().SetDb("a");
2436  id1->SetGeneral().SetTag().SetStr("good");
2437  id2->SetGeneral().SetDb("a");
2438  id2->SetGeneral().SetTag().SetStr("bad");
2439  seh = scope.AddTopLevelSeqEntry(*entry);
2440  ChangeErrorAcc(expected_errors, "gnl|a|good");
2441  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (gnl|a|good - gnl|a|bad)");
2442  eval = validator.Validate(seh, options);
2443  CheckErrors(*eval, expected_errors);
2444 
2445  CLEAR_ERRORS
2446  // should get no error if db values are different
2447  scope.RemoveTopLevelSeqEntry(seh);
2448  id2->SetGeneral().SetDb("b");
2449  seh = scope.AddTopLevelSeqEntry(*entry);
2450  // AddChromosomeNoLocation(expected_errors, "gnl|a|good");
2451  eval = validator.Validate(seh, options);
2452  CheckErrors(*eval, expected_errors);
2453 
2454  // genbank
2455  scope.RemoveTopLevelSeqEntry(seh);
2456  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gb|AY222222|)"));
2457  id1->SetGenbank().SetAccession("AY123456");
2458  id2->SetGenbank().SetAccession("AY222222");
2459  seh = scope.AddTopLevelSeqEntry(*entry);
2460  eval = validator.Validate(seh, options);
2461  CheckErrors(*eval, expected_errors);
2462 
2463  // try genbank with accession same, versions different
2464  scope.RemoveTopLevelSeqEntry(seh);
2465  id2->SetGenbank().SetAccession("AY123456");
2466  id2->SetGenbank().SetVersion(2);
2467  seh = scope.AddTopLevelSeqEntry(*entry);
2468  CLEAR_ERRORS
2469  // AddChromosomeNoLocation(expected_errors, "gb|AY123456.2|");
2470  expected_errors.push_back(new CExpectedError("gb|AY123456.2|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gb|AY123456.2|)"));
2471  eval = validator.Validate(seh, options);
2472  CheckErrors(*eval, expected_errors);
2473 
2474  // try similar id type
2475  scope.RemoveTopLevelSeqEntry(seh);
2476  id2->SetGpipe().SetAccession("AY123456");
2477  seh = scope.AddTopLevelSeqEntry(*entry);
2478  CLEAR_ERRORS
2479  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gpp|AY123456|)"));
2480  // AddChromosomeNoLocation(expected_errors, "gb|AY123456|");
2481  eval = validator.Validate(seh, options);
2482  CheckErrors(*eval, expected_errors);
2483 
2484  // LRG
2485  scope.RemoveTopLevelSeqEntry(seh);
2486  id1->SetGeneral().SetDb("LRG");
2487  id1->SetGeneral().SetTag().SetStr("good");
2488  seh = scope.AddTopLevelSeqEntry(*entry);
2489  ChangeErrorAcc(expected_errors, "gpp|AY123456|");
2490  expected_errors[0]->SetErrMsg("LRG sequence needs NG_ accession");
2491  expected_errors[0]->SetSeverity(eDiag_Critical);
2492  eval = validator.Validate(seh, options);
2493  CheckErrors(*eval, expected_errors);
2494  // no error if has NG
2495  scope.RemoveTopLevelSeqEntry(seh);
2496  id2->SetOther().SetAccession("NG_123456");
2497  seh = scope.AddTopLevelSeqEntry(*entry);
2498  CLEAR_ERRORS
2499  // AddChromosomeNoLocation(expected_errors, "ref|NG_123456|");
2500  eval = validator.Validate(seh, options);
2501  CheckErrors(*eval, expected_errors);
2502 
2503  CLEAR_ERRORS
2504 }
2505 
2506 
// Sets Seq-inst.mol of the test Bioseq to eMol_na and checks that validation
// reports exactly one error: "MolNuclAcid" / "Bioseq.mol is type nucleic acid"
// on lcl|good.
// NOTE(review): embedded source lines 2509 and 2511 are absent from this
// listing; they presumably declare entry, seh, validator, options, eval and
// expected_errors (likely through a setup macro) -- confirm against the
// repository copy of the file.
2507 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_MolNuclAcid)
2508 {
2510 
2512 
2513  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2514  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MolNuclAcid", "Bioseq.mol is type nucleic acid"));
2515 
2516  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_na);
2517  eval = validator.Validate(seh, options);
2518  CheckErrors(*eval, expected_errors);
2519 
 // CLEAR_ERRORS is a macro defined outside this view; presumably it releases
 // the CExpectedError objects allocated with new above -- confirm.
2520  CLEAR_ERRORS
2521 }
2522 
2523 
// Exercises the validator diagnostics produced when the molecule type, the
// biomol and the MolInfo technique disagree.
//
// The visible part of the loop body works on a tech value `i`: it sets the
// Bioseq to DNA with tech `i`, builds the list of expected errors for that
// tech, validates, and (for the "genomic" techs listed in genomic_list)
// validates a second time with the Bioseq switched to RNA.
// After the loop, the TSA/DNA combination is checked once more through
// Validate() and through both overloads of GetTSAConflictingBiomolTechErrors().
//
// NOTE(review): embedded source lines 2526, 2528, 2541, 2551 and 2566 are
// absent from this listing. Line 2541 presumably holds the loop header that
// declares `i`; the others presumably hold the entry/scope setup and further
// statements -- confirm against the repository copy of the file.
2524 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ConflictingBiomolTech)
2525 {
2527 
2529 
2530  // allowed tech values
2531  vector<CMolInfo::TTech> genomic_list;
2532  genomic_list.push_back(CMolInfo::eTech_sts);
2533  genomic_list.push_back(CMolInfo::eTech_survey);
2534  genomic_list.push_back(CMolInfo::eTech_wgs);
2535  genomic_list.push_back(CMolInfo::eTech_htgs_0);
2536  genomic_list.push_back(CMolInfo::eTech_htgs_1);
2537  genomic_list.push_back(CMolInfo::eTech_htgs_2);
2538  genomic_list.push_back(CMolInfo::eTech_htgs_3);
2539  genomic_list.push_back(CMolInfo::eTech_composite_wgs_htgs);
2540 
 // genomic becomes true when the current tech value i is in genomic_list.
2542  bool genomic = false;
2543  for (CMolInfo::TTech it : genomic_list) {
2544  if (it == i) {
2545  genomic = true;
2546  break;
2547  }
2548  }
2549  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
2550  SetTech (entry, i);
 // Expected for every tech value: DNA molecule vs. RNA biomol mismatch.
 // This entry is always expected_errors[0]; the genomic branch relies on that.
2552  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolType", "Molecule type (DNA) does not match biomol (RNA)"));
 // Tech-specific additions.
2553  if (i == CMolInfo::eTech_wgs) {
2554  AddChromosomeNoLocation(expected_errors, "lcl|good");
2555  }
2556  if (i == CMolInfo::eTech_est) {
2557  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ESTshouldBemRNA", "EST sequence should be mRNA"));
2558  }
2559  if (i == CMolInfo::eTech_htgs_2) {
2560  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 raw seq has no gaps and no graphs"));
2561  }
2562  if (genomic) {
 // First pass (DNA): the genomic-tech error is appended last, so it is
 // reachable through expected_errors.back() below.
2563  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "HTGS_STS_GSS_WGSshouldBeGenomic", "HTGS/STS/GSS/WGS sequence should be genomic"));
2564  eval = validator.Validate(seh, options);
2565  CheckErrors(*eval, expected_errors);
 // Second pass (RNA): the InconsistentMolType entry in slot 0 is freed and
 // the slot is nulled rather than erased; presumably CheckErrors and
 // CLEAR_ERRORS skip null entries -- confirm. The last entry is rewritten
 // in place to the "should not be RNA" variant.
2567  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
2568  delete expected_errors[0];
2569  expected_errors[0] = nullptr;
2570  expected_errors.back()->SetErrCode("HTGS_STS_GSS_WGSshouldNotBeRNA");
2571  expected_errors.back()->SetErrMsg("HTGS/STS/GSS/WGS sequence should not be RNA");
2572  eval = validator.Validate(seh, options);
2573  CheckErrors(*eval, expected_errors);
2574  } else {
 // Non-genomic techs: a single validation pass with tech-specific errors.
2575  if (IsProteinTech(i)) {
2576  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide", "Nucleic acid with protein sequence method"));
2577  }
2578  if (i == CMolInfo::eTech_barcode) {
2579  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NoKeywordHasTechnique", "Molinfo.tech barcode without BARCODE keyword"));
2580  } else if (i == CMolInfo::eTech_tsa) {
2581  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2582  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"cRNA\" is not appropriate for sequences that use the TSA technique."));
2583  }
2584  eval = validator.Validate(seh, options);
2585  CheckErrors(*eval, expected_errors);
2586  }
2587  CLEAR_ERRORS
2588  }
2589 
 // Explicit TSA-on-DNA check through the full validation entry point.
2590  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
2591  SetTech (entry, CMolInfo::eTech_tsa);
2592  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2593  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolType", "Molecule type (DNA) does not match biomol (RNA)"));
2594  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2595  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"cRNA\" is not appropriate for sequences that use the TSA technique."));
2596  eval = validator.Validate(seh, options);
2597  CheckErrors(*eval, expected_errors);
2598 
2599  CLEAR_ERRORS
2600 
 // The dedicated TSA entry point is expected to report only the
 // TSAshouldBNotBeDNA error, both for the Seq-entry handle and for the
 // complete Bioseq object.
2601  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2602  eval = validator.GetTSAConflictingBiomolTechErrors(seh);
2603  CheckErrors(*eval, expected_errors);
2604  eval = validator.GetTSAConflictingBiomolTechErrors(*(seh.GetSeq().GetCompleteBioseq()));
2605  CheckErrors(*eval, expected_errors);
2606  CLEAR_ERRORS
2607 }
2608 
2609 
// Gives the first Seq-id of the test entry a RefSeq ("other") accession
// NC_123456 with the name "good one" and checks that validation reports a
// critical "SeqIdNameHasSpace" error for the embedded space.
// NOTE(review): embedded source lines 2612 and 2616 are absent from this
// listing; they presumably create `entry` and perform the standard
// scope/validator setup -- confirm against the repository copy of the file.
2610 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_SeqIdNameHasSpace)
2611 {
2613  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
2614  entry->SetSeq().SetId().front()->SetOther().SetName("good one");
2615 
2617 
2618  expected_errors.push_back(new CExpectedError("ref|NC_123456|good one", eDiag_Critical, "SeqIdNameHasSpace", "Seq-id.name 'good one' should be a single word without any spaces"));
2619  // AddChromosomeNoLocation(expected_errors, "ref|NC_123456|good one");
2620 
2621  eval = validator.Validate(seh, options);
2622  CheckErrors(*eval, expected_errors);
2623 
2624  CLEAR_ERRORS
2625 }
2626 
2627 
// Placeholder test case: the entire body is compiled out with #if 0 (the
// in-body note says "removed per VR-779"), so this test currently executes
// no statements. The disabled code built a segmented Bioseq with two
// segments pointing at gb|AY123456 and expected SeqLocOrder and
// DuplicateSegmentReferences errors.
// NOTE(review): several embedded source lines inside the disabled region
// (e.g. 2632, 2643, 2648, 2657-2659, 2665) are absent from this listing, so
// the disabled code shown here is not complete as displayed.
2628 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_DuplicateSegmentReferences)
2629 {
2630 #if 0
2631  // removed per VR-779
2633  entry->SetSeq().SetInst().ResetSeq_data();
2634  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
2635  CRef<CSeq_loc> seg1 (new CSeq_loc());
2636  seg1->SetWhole().SetGenbank().SetAccession("AY123456");
2637  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(seg1);
2638  CRef<CSeq_loc> seg2 (new CSeq_loc());
2639  seg2->SetWhole().SetGenbank().SetAccession("AY123456");
2640  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(seg2);
2641  entry->SetSeq().SetInst().SetLength(970);
2642 
2644  // need to call this statement before calling AddDefaults
2645  // to make sure that we can fetch the sequence referenced by the
2646  // delta sequence so that we can detect that the loc in the
2647  // delta sequence is longer than the referenced sequence
2649  CScope scope(*objmgr);
2650  scope.AddDefaults();
2651  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
2652 
2653  CValidator validator(*objmgr);
2654 
2655  // Set validator options
2656  unsigned int options = CValidator::eVal_need_isojta
2660 
2661  // list of expected errors
2662  vector< CExpectedError *> expected_errors;
2663  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLocOrder", "Segmented BioseqIntervals out of order in SeqLoc [[gb|AY123456|, gb|AY123456|]]"));
2664  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "DuplicateSegmentReferences", "Segmented sequence has multiple references to gb|AY123456"));
2666 
2667  eval = validator.Validate(seh, options);
2668  CheckErrors(*eval, expected_errors);
2669 
2670  seg2->SetInt().SetId().SetGenbank().SetAccession("AY123456");
2671  seg2->SetInt().SetFrom(0);
2672  seg2->SetInt().SetTo(484);
2673  expected_errors[0]->SetErrMsg("Segmented BioseqIntervals out of order in SeqLoc [[gb|AY123456|, 1-485]]");
2674  expected_errors[1]->SetSeverity(eDiag_Warning);
2675  expected_errors[1]->SetErrMsg("Segmented sequence has multiple references to gb|AY123456 that are not SEQLOC_WHOLE");
2676  eval = validator.Validate(seh, options);
2677  CheckErrors(*eval, expected_errors);
2678 
2679  CLEAR_ERRORS
2680 #endif
2681 }
2682 
2683 
// Builds a nuc-prot set whose nucleotide ends in a run of Ns and whose
// protein ends in two Xs, marks the protein feature and the coding region as
// 3'-partial, and checks that validation reports TerminalNs and
// HighNpercent3Prime on lcl|nuc and TrailingX on lcl|prot.
// NOTE(review): embedded source lines 2686, 2690, 2695 and 2702 are absent
// from this listing; they presumably create `entry`, declare `cds_feat`
// (used below but not declared in the visible text) and perform the standard
// setup -- confirm against the repository copy of the file.
2684 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_TrailingX)
2685 {
2687  CRef<CSeq_entry> nuc = entry->SetSet().SetSeq_set().front();
2688  CRef<CSeq_entry> prot = entry->SetSet().SetSeq_set().back();
2689  CRef<CSeq_feat> prot_feat = prot->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
 // 27-base nucleotide ending in six Ns; 9-residue protein ending in "XX".
2691  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATANNNNNN");
2692  nuc->SetSeq().SetInst().SetLength(27);
2693  prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPRKTEIXX");
2694  prot->SetSeq().SetInst().SetLength(9);
 // Protein feature is extended to the last residue (index 8) and both
 // features are flagged partial at the biological stop.
2696  prot_feat->SetLocation().SetInt().SetTo(8);
2697  prot_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
2698  prot_feat->SetPartial(true);
2699  cds_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
2700  cds_feat->SetPartial(true);
2701 
2703 
2704  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TerminalNs", "N at end of sequence"));
2705  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "TrailingX", "Sequence ends in 2 trailing Xs"));
2706  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "HighNpercent3Prime",
2707  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
2708  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
2709 
2710  eval = validator.Validate(seh, options);
2711  CheckErrors(*eval, expected_errors);
2712 
2713  CLEAR_ERRORS
2714 
2715 }
2716 
2717 
// Helper: checks that a GenBank accession is rejected when used as a protein id.
//
// id_str - accession text; it is placed on the protein of the test entry as
//          a GenBank id while the nucleotide gets the local id "nuc".
// Expects a single "BadSeqIdFormat" error ("Bad accession <id_str>") reported
// on gb|<id_str>|.
// NOTE(review): embedded source lines 2721 and 2732 are absent from this
// listing; they presumably create `entry` and perform the standard setup --
// confirm. good_prot_id is constructed but not used in the visible code.
2718 void TestBadProtId(const string& id_str)
2719 {
2720  // bad for just prots
2722  CRef<CSeq_id> bad_id(new CSeq_id());
2723  bad_id->SetGenbank().SetAccession(id_str);
2724  CRef<CSeq_id> good_nuc_id(new CSeq_id());
2725  good_nuc_id->SetLocal().SetStr("nuc");
2726  CRef<CSeq_id> good_prot_id(new CSeq_id());
2727  good_prot_id->SetLocal().SetStr("prot");
2728 
2729  unit_test_util::ChangeNucId(entry, good_nuc_id);
2730  unit_test_util::ChangeProtId(entry, bad_id);
2731 
2733 
2734  expected_errors.push_back(new CExpectedError("gb|" + id_str + "|", eDiag_Error, "BadSeqIdFormat", "Bad accession " + id_str));
2735  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
2736 
2737  eval = validator.Validate(seh, options);
2738  CheckErrors(*eval, expected_errors);
2739  CLEAR_ERRORS
2740 }
2741 
2742 
// Helper: checks that a GenBank accession is accepted when used as a protein id.
//
// id_str - accession text; it is placed on the protein of the test entry as
//          a GenBank id while the nucleotide gets the local id "nuc".
// No expected error is added in the visible code, so validation is expected
// to produce no diagnostics (assuming the setup leaves expected_errors empty).
// NOTE(review): embedded source lines 2745 and 2756 are absent from this
// listing; they presumably create `entry` and perform the standard setup --
// confirm. The variable is named bad_id although it holds the id under test;
// good_prot_id is constructed but not used in the visible code.
2743 void TestGoodProtId(const string& id_str)
2744 {
2746  CRef<CSeq_id> bad_id(new CSeq_id());
2747  bad_id->SetGenbank().SetAccession(id_str);
2748  CRef<CSeq_id> good_nuc_id(new CSeq_id());
2749  good_nuc_id->SetLocal().SetStr("nuc");
2750  CRef<CSeq_id> good_prot_id(new CSeq_id());
2751  good_prot_id->SetLocal().SetStr("prot");
2752 
2753  unit_test_util::ChangeNucId(entry, good_nuc_id);
2754  unit_test_util::ChangeProtId(entry, bad_id);
2755 
2757 
2758  eval = validator.Validate(seh, options);
2759  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
2760 
2761  CheckErrors(*eval, expected_errors);
2762  CLEAR_ERRORS
2763 }
2764 
2765 
// Helper: checks that a GenBank accession is accepted when used as a
// nucleotide id.
//
// id_str - accession text; it is placed on the nucleotide of the test entry
//          as a GenBank id while the protein gets the local id "prot".
// Accessions of length 12 to 15 are handled as WGS here: the nucleotide's
// tech is set to eTech_wgs and AddChromosomeNoLocation() contributes the
// only expected diagnostic. For other lengths no diagnostic is expected
// (assuming the setup leaves expected_errors empty).
// NOTE(review): embedded source lines 2768 and 2781 are absent from this
// listing; they presumably create `entry` and perform the standard setup --
// confirm. The variable is named bad_id although it holds the id under test.
2766 void TestGoodNucId(const string& id_str)
2767 {
2769  CRef<CSeq_id> bad_id(new CSeq_id());
2770  bad_id->SetGenbank().SetAccession(id_str);
2771  CRef<CSeq_id> good_prot_id(new CSeq_id());
2772  good_prot_id->SetLocal().SetStr("prot");
2773  unit_test_util::ChangeNucId(entry, bad_id);
2774  unit_test_util::ChangeProtId(entry, good_prot_id);
2775  bool is_wgs = false;
2776  if (id_str.length() == 12 || id_str.length() == 13 || id_str.length() == 14 || id_str.length() == 15) {
2777  SetTech(entry->SetSet().SetSeq_set().front(), CMolInfo::eTech_wgs);
2778  is_wgs = true;
2779  }
2780 
2782 
2783  if (is_wgs) {
2784  AddChromosomeNoLocation(expected_errors, "gb|" + id_str + "|");
2785  }
2786  eval = validator.Validate(seh, options);
2787  CheckErrors(*eval, expected_errors);
2788  CLEAR_ERRORS
2789 }
2790 
2791 
// Checks the validator's "BadSeqIdFormat" handling for accession-style ids
// on a nuc-prot set. The test runs in these stages:
//   1. accessions that are bad on both nucleotide and protein, tried as
//      GenBank, EMBL (first letter replaced by 'B') and DDBJ (first letter
//      replaced by 'C') ids;
//   2. accessions that are bad only on nucleotides / only on proteins;
//   3. accessions that are good on both, only on nucleotides, only on proteins;
//   4. a GenBank accession with version 0 on a sequence that also has a gi;
//   5. a long local id and a long gnl|NCBIFILE id (no diagnostics expected);
//   6. a general id whose database name is longer than 20 characters;
//   7. a local id containing a forward slash (no diagnostics expected).
// The same scope, entry and bad_id object are reused throughout, so the order
// of RemoveTopLevelSeqEntry / Change*Id / AddTopLevelSeqEntry calls matters.
// NOTE(review): embedded source lines 2794, 2798 and 2800 are absent from
// this listing; they presumably create `entry` and perform the standard
// setup (scope, seh, validator, options, eval, expected_errors) -- confirm
// against the repository copy of the file.
2792 BOOST_FIXTURE_TEST_CASE(Test_SEQ_INST_BadSeqIdFormat, CGenBankFixture)
2793 {
2795  CRef<CSeq_entry> nuc_entry = entry->SetSet().SetSeq_set().front();
2796  CRef<CSeq_entry> prot_entry = entry->SetSet().SetSeq_set().back();
2797  CRef<CSeq_feat> prot_feat = prot_entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
2799 
2801 
 // Template error; accession and message are rewritten per id in the loops below.
2802  expected_errors.push_back(new CExpectedError("",eDiag_Error, "BadSeqIdFormat", "Bad accession"));
2803 
2804  vector<string> bad_ids;
2805  bad_ids.push_back("AY123456ABC"); // can't have letters after digits
2806  bad_ids.push_back("A1234"); // for a single letter, only acceptable number of digits is 5
2807  bad_ids.push_back("A123456");
2808  bad_ids.push_back("AY12345"); // for two letters, only acceptable number of digits is 6
2809  bad_ids.push_back("AY1234567");
2810  bad_ids.push_back("ABC1234"); // three letters bad unless prot and 5 digits
2811  bad_ids.push_back("ABC123456");
2812  bad_ids.push_back("ABCD1234567"); // four letters
2813  bad_ids.push_back("ABCDE123456"); // five letters
2814  bad_ids.push_back("ABCDE12345678");
2815 
2816  vector<string> bad_nuc_ids;
2817  bad_nuc_ids.push_back("ABC12345");
2818 
2819  vector<string> bad_prot_ids;
2820  bad_prot_ids.push_back("AY123456");
2821  bad_prot_ids.push_back("A12345");
2822 
 // Intentionally left empty in the visible code, so the "good for both"
 // loop further down does not execute any iteration.
2823  vector<string> good_ids;
2824 
2825  vector<string> good_nuc_ids;
2826  good_nuc_ids.push_back("AY123456");
2827  good_nuc_ids.push_back("A12345");
2828  good_nuc_ids.push_back("ABCD123456789");
2829  good_nuc_ids.push_back("ABCD1234567890");
2830 
2831  vector<string> good_prot_ids;
2832  good_prot_ids.push_back("ABC12345");
2833 
2834 
2835  CRef<CSeq_id> good_nuc_id(new CSeq_id());
2836  good_nuc_id->SetLocal().SetStr("nuc");
2837  CRef<CSeq_id> good_prot_id(new CSeq_id());
2838  good_prot_id->SetLocal().SetStr("prot");
2839 
 // Single id object that is re-populated for each case below.
2840  CRef<CSeq_id> bad_id(new CSeq_id());
2841 
2842  // bad for both
2843  for (const string& id_str : bad_ids) {
2844  const string acc_str = "gb|" + id_str + "|";
2845  ChangeErrorAcc(expected_errors, acc_str);
2846  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2847 
2848  // GenBank
 // First with the bad id on the nucleotide, then on the protein.
2849  scope.RemoveTopLevelSeqEntry(seh);
2850  scope.ResetDataAndHistory();
2851  bad_id->SetGenbank().SetAccession(id_str);
2852  unit_test_util::ChangeNucId(entry, bad_id);
2853  unit_test_util::ChangeProtId(entry, good_prot_id);
2854  seh = scope.AddTopLevelSeqEntry(*entry);
2855  eval = validator.Validate(seh, options);
2856  CheckErrors(*eval, expected_errors);
2857  scope.RemoveTopLevelSeqEntry(seh);
2858  scope.ResetDataAndHistory();
2859  unit_test_util::ChangeNucId(entry, good_nuc_id);
2860  unit_test_util::ChangeProtId(entry, bad_id);
2861  seh = scope.AddTopLevelSeqEntry(*entry);
2862  eval = validator.Validate(seh, options);
2863  CheckErrors(*eval, expected_errors);
2864  }
2865 
2866  for (const string& id_it : bad_ids) {
2867  const string id_str = "B" + id_it.substr(1);
 // The "embl|" accession set here is overwritten with the "emb|" form
 // before the first CheckErrors call of this iteration, so only "emb|"
 // is actually compared.
2868  expected_errors[0]->SetAccession("embl|" + id_str + "|");
2869  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2870 
2871  // EMBL
2872  scope.RemoveTopLevelSeqEntry(seh);
2873  scope.ResetDataAndHistory();
2874  bad_id->SetEmbl().SetAccession(id_str);
2875  unit_test_util::ChangeNucId(entry, bad_id);
2876  unit_test_util::ChangeProtId(entry, good_prot_id);
2877  seh = scope.AddTopLevelSeqEntry(*entry);
2878  eval = validator.Validate(seh, options);
2879  expected_errors[0]->SetAccession("emb|" + id_str + "|");
2880  CheckErrors(*eval, expected_errors);
2881  scope.RemoveTopLevelSeqEntry(seh);
2882  scope.ResetDataAndHistory();
2883  unit_test_util::ChangeNucId(entry, good_nuc_id);
2884  unit_test_util::ChangeProtId(entry, bad_id);
2885  seh = scope.AddTopLevelSeqEntry(*entry);
2886  eval = validator.Validate(seh, options);
2887  CheckErrors(*eval, expected_errors);
2888  }
2889 
2890  for (const string& id_it : bad_ids) {
2891  const string id_str = "C" + id_it.substr(1);
2892  expected_errors[0]->SetAccession("dbj|" + id_str + "|");
2893  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2894 
2895  // DDBJ
2896  scope.RemoveTopLevelSeqEntry(seh);
2897  scope.ResetDataAndHistory();
2898  bad_id->SetDdbj().SetAccession(id_str);
2899  unit_test_util::ChangeNucId(entry, bad_id);
2900  unit_test_util::ChangeProtId(entry, good_prot_id);
2901  seh = scope.AddTopLevelSeqEntry(*entry);
2902  eval = validator.Validate(seh, options);
 // Repeats the SetAccession call made at the top of this iteration with
 // the same value.
2903  expected_errors[0]->SetAccession("dbj|" + id_str + "|");
2904  CheckErrors(*eval, expected_errors);
2905  scope.RemoveTopLevelSeqEntry(seh);
2906  scope.ResetDataAndHistory();
2907  unit_test_util::ChangeNucId(entry, good_nuc_id);
2908  unit_test_util::ChangeProtId(entry, bad_id);
2909  seh = scope.AddTopLevelSeqEntry(*entry);
2910  eval = validator.Validate(seh, options);
2911  CheckErrors(*eval, expected_errors);
2912  }
2913 
2914  // bad for just nucs
2915  for (const string& id_str : bad_nuc_ids) {
2916  bad_id->SetGenbank().SetAccession(id_str);
2917  scope.RemoveTopLevelSeqEntry(seh);
2918  unit_test_util::ChangeNucId(entry, bad_id);
2919  unit_test_util::ChangeProtId(entry, good_prot_id);
2920  expected_errors[0]->SetAccession("gb|" + id_str + "|");
2921  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2922  seh = scope.AddTopLevelSeqEntry(*entry);
2923  eval = validator.Validate(seh, options);
2924  CheckErrors(*eval, expected_errors);
2925  }
2926 
2927  // bad for just prots
 // Delegated to TestBadProtId, which works on its own entry and setup.
2928  for (auto id_it : bad_prot_ids) {
2929  TestBadProtId(id_it);
2930  }
2931 
2932  CLEAR_ERRORS
2933 
2934  // good for both
2935  for (const string& id_str : good_ids) {
2936  bad_id->SetGenbank().SetAccession(id_str);
2937  scope.RemoveTopLevelSeqEntry(seh);
2938  unit_test_util::ChangeNucId(entry, bad_id);
2939  unit_test_util::ChangeProtId(entry, good_prot_id);
2940  seh = scope.AddTopLevelSeqEntry(*entry);
2941  eval = validator.Validate(seh, options);
2942  // AddChromosomeNoLocation(expected_errors, "gb|" + *id_it + "|");
2943  CheckErrors(*eval, expected_errors);
2944  scope.RemoveTopLevelSeqEntry(seh);
2945  unit_test_util::ChangeNucId(entry, good_nuc_id);
2946  unit_test_util::ChangeProtId(entry, bad_id);
2947  seh = scope.AddTopLevelSeqEntry(*entry);
2948  eval = validator.Validate(seh, options);
2949  CheckErrors(*eval, expected_errors);
2950  CLEAR_ERRORS
2951  }
2952 
2953  // good for nucs
2954  for (const string& id_it : good_nuc_ids) {
2955  TestGoodNucId(id_it);
2956  }
2957 
2958  // good for just prots
2959  for (const string& id_it : good_prot_ids) {
2960  TestGoodProtId(id_it);
2961  }
2962 
2963  // if GI, needs version
 // A gi is appended to the nucleotide's id list and the GenBank accession
 // is given version 0; two diagnostics are expected.
2964  scope.RemoveTopLevelSeqEntry(seh);
2965  bad_id->SetGenbank().SetAccession("AY123456");
2966  bad_id->SetGenbank().SetVersion(0);
2967  unit_test_util::ChangeNucId(entry, bad_id);
2968  unit_test_util::ChangeProtId(entry, good_prot_id);
2969  CRef<CSeq_id> gi_id(new CSeq_id("gi|21914627"));
2970  nuc_entry->SetSeq().SetId().push_back(gi_id);
2971  seh = scope.AddTopLevelSeqEntry(*entry);
2972  eval = validator.Validate(seh, options);
2973  expected_errors.push_back (new CExpectedError ("gb|AY123456|", eDiag_Critical, "BadSeqIdFormat",
2974  "Accession AY123456 has 0 version"));
2975  expected_errors.push_back (new CExpectedError ("gb|AY123456|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123456|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
2976  // AddChromosomeNoLocation(expected_errors, "gb|AY123456|");
2977  CheckErrors(*eval, expected_errors);
2978 
2979  CLEAR_ERRORS
2980 
 // Remove the gi id that was appended above.
2981  nuc_entry->SetSeq().SetId().pop_back();
2982 
2983  // id that is too long
 // No expected error is pushed for this case, so the check expects no
 // diagnostics (assuming CLEAR_ERRORS leaves expected_errors empty).
2984  scope.RemoveTopLevelSeqEntry(seh);
2985  bad_id->SetLocal().SetStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234");
2986  unit_test_util::ChangeNucId(entry, bad_id);
2987  seh = scope.AddTopLevelSeqEntry(*entry);
2988  eval = validator.Validate(seh, options);
2989  // AddChromosomeNoLocation(expected_errors, "lcl|ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234");
2990  CheckErrors(*eval, expected_errors);
2991 
2992  CLEAR_ERRORS
2993 
2994  // shouldn't report if ncbifile ID
2995  scope.RemoveTopLevelSeqEntry(seh);
2996  CRef<CSeq_id> ncbifile(new CSeq_id("gnl|NCBIFILE|ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234"));
2997  unit_test_util::ChangeNucId(entry, good_nuc_id);
2998  nuc_entry->SetSeq().SetId().push_back(ncbifile);
2999  seh = scope.AddTopLevelSeqEntry(*entry);
3000  eval = validator.Validate(seh, options);
3001  // AddChromosomeNoLocation(expected_errors, entry);
3002  CheckErrors(*eval, expected_errors);
3003  nuc_entry->SetSeq().SetId().pop_back();
3004  CLEAR_ERRORS
3005 
3006  // report if database name len too long
 // From here on `entry` is replaced by a fresh single-sequence entry;
 // nuc_entry / prot_entry still refer to the old nuc-prot set.
3007  scope.RemoveTopLevelSeqEntry(seh);
3008  entry = unit_test_util::BuildGoodSeq();
3009  CRef<CSeq_id> general(new CSeq_id());
3010  general->SetGeneral().SetDb("thisdatabasevalueislong");
3011  general->SetGeneral().SetTag().SetStr("b");
3012  entry->SetSeq().ResetId();
3013  entry->SetSeq().SetId().push_back(general);
3014  seh = scope.AddTopLevelSeqEntry(*entry);
3015  expected_errors.push_back (new CExpectedError ("gnl|thisdatabasevalueislong|b", eDiag_Critical, "BadSeqIdFormat",
3016  "General database longer than 20 characters"));
3017 
3018  // AddChromosomeNoLocation(expected_errors, "gnl|thisdatabasevalueislong|b");
3019  eval = validator.Validate(seh, options);
3020  CheckErrors(*eval, expected_errors);
3021 
3022  CLEAR_ERRORS
3023 
3024  // do not report forward slash
3025  scope.RemoveTopLevelSeqEntry(seh);
3026  entry = unit_test_util::BuildGoodSeq();
3027  entry->SetSeq().SetId().front()->SetLocal().SetStr("a/b");
3028  seh = scope.AddTopLevelSeqEntry(*entry);
3029  eval = validator.Validate(seh, options);
3030  // AddChromosomeNoLocation(expected_errors, "lcl|a/b");
3031  CheckErrors(*eval, expected_errors);
3032 
3033  CLEAR_ERRORS
3034 }
3035 
3036 
// Helper: appends a general Seq-id (db / tag) to the test sequence and
// checks the validator's "BadSeqIdCharacter" reporting.
//
// db     - database name for the general id
// tag    - string tag for the general id
// errmsg - expected warning text; when empty, no diagnostic is expected
//          (assuming the setup leaves expected_errors empty)
// The expected warning is attributed to lcl|good, not to the added id.
// NOTE(review): embedded source lines 3039 and 3045 are absent from this
// listing; they presumably create `entry` and perform the standard setup --
// confirm against the repository copy of the file.
3037 void TestOneGeneralSeqId(const string& db, const string& tag, const string& errmsg)
3038 {
3040  CRef<CSeq_id> id(new CSeq_id());
3041  id->SetGeneral().SetDb(db);
3042  id->SetGeneral().SetTag().SetStr(tag);
3043  entry->SetSeq().SetId().push_back(id);
3044 
3046 
3047  string acc_str = "lcl|good";
3048  if (!errmsg.empty()) {
3049  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Warning, "BadSeqIdCharacter",
3050  errmsg));
3051  }
3052  // AddChromosomeNoLocation(expected_errors, entry);
3053  eval = validator.Validate(seh, options);
3054  CheckErrors(*eval, expected_errors);
3055 
3056  CLEAR_ERRORS
3057 }
3058 
3059 
// NOTE(review): the header line of this definition (embedded source line
// 3060, presumably a BOOST_AUTO_TEST_CASE declaration) is absent from this
// listing -- confirm against the repository copy of the file.
// The body checks that a space inside a general Seq-id is reported, once
// with the space in the tag and once with the space in the database name.
3061 {
3062  TestOneGeneralSeqId("PRJNA318798", " CpPA02_0001", "Bad character ' ' in sequence ID 'gnl|PRJNA318798| CpPA02_0001'");
3063  TestOneGeneralSeqId("PRJNA3 18798", "CpPA02_0001", "Bad character ' ' in sequence ID 'gnl|PRJNA3 18798|CpPA02_0001'");
3064 }
3065 
3066 
// Helper: appends a general Seq-id (db "lgsi") with a tag longer than 50
// characters to the test sequence and checks the "BadSeqIdLength" reporting.
//
// emb - when true, an EMBL id (AY123457, version 1) is also appended
// err - when true, a critical "BadSeqIdLength" error on lcl|good is
//       expected; when false, no diagnostic is expected (assuming the setup
//       leaves expected_errors empty)
// NOTE(review): embedded source lines 3069 and 3082 are absent from this
// listing; they presumably create `entry` and perform the standard setup --
// confirm against the repository copy of the file.
3067 void TestOneLongGeneral(bool emb, bool err)
3068 {
3070  CRef<CSeq_id> id(new CSeq_id());
3071  id->SetGeneral().SetDb("lgsi");
3072  id->SetGeneral().SetTag().SetStr("thisidentifierismorethanfiftycharactersinlengthsoitshouldberejected");
3073  entry->SetSeq().SetId().push_back(id);
3074 
3075  if (emb) {
 // The local CRef below shadows the bool parameter `emb` inside this block.
3076  CRef<CSeq_id> emb(new CSeq_id());
3077  emb->SetEmbl().SetAccession("AY123457");
3078  emb->SetEmbl().SetVersion(1);
3079  entry->SetSeq().SetId().push_back(emb);
3080  }
3081 
3083 
3084  if (err) {
3085  string acc_str = "lcl|good";
3086  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Critical, "BadSeqIdLength",
3087  "General identifier longer than 50 characters"));
3088  }
3089 
3090  eval = validator.Validate(seh, options);
3091  CheckErrors(*eval, expected_errors);
3092 
3093  CLEAR_ERRORS
3094 }
3095 
// Runs TestOneLongGeneral in two configurations: without an EMBL id the
// over-long general id is expected to be reported; with an EMBL id present
// no diagnostic is expected.
3096 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_LongGeneralSeqId)
3097 {
3098  TestOneLongGeneral(false, true);
3099  TestOneLongGeneral(true, false);
3100 }
3101 
3102 
// Gives the test sequence the GenBank accession AY123456 and lists the same
// accession as a secondary accession in a descriptor; expects a
// "BadSecondaryAccn" error. The check is run first with a GenBank block
// descriptor and then again after the same descriptor object is switched to
// an EMBL block carrying the accession in extra-acc.
// NOTE(review): embedded source lines 3105 and 3108 are absent from this
// listing; they presumably create `entry` and perform the standard setup --
// confirm against the repository copy of the file.
3103 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_BadSecondaryAccn)
3104 {
3106  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3107 
3109 
3110  CRef<CSeqdesc> gbdesc (new CSeqdesc());
3111  gbdesc->SetGenbank().SetExtra_accessions().push_back("AY123456");
3112  entry->SetSeq().SetDescr().Set().push_back(gbdesc);
3113 
3114  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Error, "BadSecondaryAccn", "AY123456 used for both primary and secondary accession"));
3115  // AddChromosomeNoLocation(expected_errors, "gb|AY123456|");
3116  eval = validator.Validate(seh, options);
3117  CheckErrors(*eval, expected_errors);
3118 
 // Same expectation with the secondary accession stored in an EMBL block.
3119  gbdesc->SetEmbl().SetExtra_acc().push_back("AY123456");
3120  eval = validator.Validate(seh, options);
3121  CheckErrors(*eval, expected_errors);
3122 
3123  CLEAR_ERRORS
3124 }
3125 
3126 
// Sets the first Seq-id of the test sequence to a gi with value ZERO_GI and
// expects two diagnostics on gi|0: a critical "ZeroGiNumber" and an
// error-level "GiWithoutAccession".
// NOTE(review): embedded source lines 3129 and 3132 are absent from this
// listing; they presumably create `entry` and perform the standard setup --
// confirm against the repository copy of the file.
3127 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ZeroGiNumber)
3128 {
3130  entry->SetSeq().SetId().front()->SetGi(ZERO_GI);
3131 
3133 
3134  expected_errors.push_back(new CExpectedError("gi|0", eDiag_Critical, "ZeroGiNumber", "Invalid GI number"));
3135  expected_errors.push_back(new CExpectedError("gi|0", eDiag_Error, "GiWithoutAccession", "No accession on sequence with gi number"));
3136  // AddChromosomeNoLocation(expected_errors, "gi|0");
3137  eval = validator.Validate(seh, options);
3138  CheckErrors(*eval, expected_errors);
3139 
3140  CLEAR_ERRORS
3141 }
3142 
3143 
// Checks the "HistoryGiCollision" error: the sequence carries gi 21914627
// and its Seq-hist names the same gi in replaced-by or replaces.
// With a date set on the history element the error is expected (once for
// replaced-by, once for replaces); without a date no diagnostic is expected.
// NOTE(review): embedded source lines 3146 and 3153 are absent from this
// listing; they presumably create `entry` and perform the standard setup --
// confirm against the repository copy of the file.
3144 BOOST_AUTO_TEST_CASE(Test_HistoryGiCollision)
3145 {
3147  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3148  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3149  CRef<CSeq_id> gi_id(new CSeq_id());
3150  gi_id->SetGi(GI_CONST(21914627));
3151  entry->SetSeq().SetId().push_back(gi_id);
3152 
3154 
 // History entry that points at the sequence's own gi, with a date.
3155  CRef<CSeq_id> hist_id(new CSeq_id());
3156  hist_id->SetGi(GI_CONST(21914627));
3157  entry->SetSeq().SetInst().SetHist().SetReplaced_by().SetIds().push_back(hist_id);
3158  entry->SetSeq().SetInst().SetHist().SetReplaced_by().SetDate().SetStd().SetYear(2008);
3159 
3160  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error, "HistoryGiCollision", "Replaced by gi (21914627) is same as current Bioseq"));
3161  // AddChromosomeNoLocation(expected_errors, "gb|AY123456.1|");
3162  eval = validator.Validate(seh, options);
3163  CheckErrors(*eval, expected_errors);
3164 
 // Same collision, now through the "replaces" element.
3165  entry->SetSeq().SetInst().SetHist().ResetReplaced_by();
3166  entry->SetSeq().SetInst().SetHist().SetReplaces().SetIds().push_back(hist_id);
3167  entry->SetSeq().SetInst().SetHist().SetReplaces().SetDate().SetStd().SetYear(2008);
3168  expected_errors[0]->SetErrMsg("Replaces gi (21914627) is same as current Bioseq");
3169  eval = validator.Validate(seh, options);
3170  CheckErrors(*eval, expected_errors);
3171 
3172  CLEAR_ERRORS
3173 
3174  // should not generate errors if date has not been set
3175  entry->SetSeq().SetInst().SetHist().ResetReplaces();
3176  entry->SetSeq().SetInst().SetHist().SetReplaced_by().SetIds().push_back(hist_id);
3177  eval = validator.Validate(seh, options);
3178  // AddChromosomeNoLocation(expected_errors, entry);
3179  CheckErrors(*eval, expected_errors);
3180 
3181  entry->SetSeq().SetInst().SetHist().ResetReplaced_by();
3182  entry->SetSeq().SetInst().SetHist().SetReplaces().SetIds().push_back(hist_id);
3183  eval = validator.Validate(seh, options);
3184  CheckErrors(*eval, expected_errors);
3185 
3186  CLEAR_ERRORS
3187 }
3188 
3189 
// Sets the first Seq-id of the test sequence to gi 123456 and expects a
// single "GiWithoutAccession" error on gi|123456.
// NOTE(review): embedded source lines 3192 and 3195 are absent from this
// listing; they presumably create `entry` and perform the standard setup --
// confirm against the repository copy of the file.
3190 BOOST_AUTO_TEST_CASE(Test_GiWithoutAccession)
3191 {
3193  entry->SetSeq().SetId().front()->SetGi(GI_CONST(123456));
3194 
3196 
3197  expected_errors.push_back(new CExpectedError("gi|123456", eDiag_Error, "GiWithoutAccession", "No accession on sequence with gi number"));
3198  // AddChromosomeNoLocation(expected_errors, entry);
3199  eval = validator.Validate(seh, options);
3200  CheckErrors(*eval, expected_errors);
3201 
3202  CLEAR_ERRORS
3203 }
3204 
3205 
// Helper: builds a sequence with ids gb|AY123456.1|, gi 21914627 and the
// supplied additional accession, then checks the expected diagnostics.
//
// other_acc - additional Seq-id appended to the sequence's id list
// id_change - when true, an "UnexpectedIdentifierChange" warning is expected
// conflict  - when true, a "ConflictingIdsOnBioseq" error is expected; its
//             message embeds other_acc->AsFastaString()
// need_hist - when true, a "HistAssemblyMissing" info message is expected
// A "MultipleAccessions" error is always expected. The expected errors are
// pushed in the order: conflict, MultipleAccessions, id change, hist.
// NOTE(review): embedded source lines 3208 and 3217 are absent from this
// listing; they presumably create `entry` and perform the standard setup --
// confirm against the repository copy of the file.
3206 void TestOneOtherAcc(CRef<CSeq_id> other_acc, bool id_change, bool conflict, bool need_hist = false)
3207 {
3209  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3210  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3211  CRef<CSeq_id> gi_id(new CSeq_id());
3212  gi_id->SetGi(GI_CONST(21914627));
3213  entry->SetSeq().SetId().push_back(gi_id);
3214  entry->SetSeq().SetId().push_back(other_acc);
3215  string acc_str = "gb|AY123456.1|";
3216 
3218 
3219  if (conflict) {
3220  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error, "ConflictingIdsOnBioseq",
3221  "Conflicting ids on a Bioseq: (gb|AY123456.1| - " + other_acc->AsFastaString() + ")"));
3222  }
3223  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "MultipleAccessions", "Multiple accessions on sequence with gi number"));
3224  if (id_change) {
3225  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123457.1|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
3226  }
3227  if (need_hist) {
3228  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Info, "HistAssemblyMissing",
3229  "TPA record gb|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
3230  }
3231  // AddChromosomeNoLocation(expected_errors, acc_str);
3232  eval = validator.Validate(seh, options);
3233  CheckErrors(*eval, expected_errors);
3234 
3235  CLEAR_ERRORS
3236 }
3237 
3238 
// Checks the "MultipleAccessions" error for a sequence that has a gi, the
// GenBank accession AY123456.1 and one additional accession of each
// accession-like Seq-id type. All types except "other" go through
// TestOneOtherAcc; the flags passed there encode which extra diagnostics are
// expected for that type. The same other_acc object is re-populated for each
// type in turn.
// The "other" (RefSeq) case is handled inline because it expects an
// "INSDRefSeqPackaging" error instead of a conflicting-ids error.
// NOTE(review): embedded source lines 3290 and 3300 are absent from this
// listing; they presumably create `entry` and perform the standard setup for
// the inline case -- confirm against the repository copy of the file.
3239 BOOST_FIXTURE_TEST_CASE(Test_MultipleAccessions, CGenBankFixture)
3240 {
3241  CRef<CSeq_id> other_acc(new CSeq_id());
3242 
3243  // genbank, ddbj, embl, tpg, tpe, tpd, other, pir, swissprot, and prf all count as accessions
3244  // genbank
3245  other_acc->SetGenbank().SetAccession("AY123457");
3246  other_acc->SetGenbank().SetVersion(1);
3247  TestOneOtherAcc(other_acc, true, true);
3248 
3249  // ddbj
3250  other_acc->SetDdbj().SetAccession("AY123457");
3251  other_acc->SetDdbj().SetVersion(1);
3252  TestOneOtherAcc(other_acc, false, true);
3253 
3254  // embl
3255  other_acc->SetEmbl().SetAccession("AY123457");
3256  other_acc->SetEmbl().SetVersion(1);
3257  TestOneOtherAcc(other_acc, false, true);
3258 
3259  // pir
3260  other_acc->SetPir().SetAccession("AY123457");
3261  other_acc->SetPir().SetVersion(1);
3262  TestOneOtherAcc(other_acc, false, false);
3263 
3264  // swissprot
3265  other_acc->SetSwissprot().SetAccession("AY123457");
3266  other_acc->SetSwissprot().SetVersion(1);
3267  TestOneOtherAcc(other_acc, false, false);
3268 
3269  // prf
3270  other_acc->SetPrf().SetAccession("AY123457");
3271  other_acc->SetPrf().SetVersion(1);
3272  TestOneOtherAcc(other_acc, false, false);
3273 
3274  // tpg
3275  other_acc->SetTpg().SetAccession("AY123457");
3276  other_acc->SetTpg().SetVersion(1);
3277  TestOneOtherAcc(other_acc, false, true, true);
3278 
3279  // tpe
3280  other_acc->SetTpe().SetAccession("AY123457");
3281  other_acc->SetTpe().SetVersion(1);
3282  TestOneOtherAcc(other_acc, false, true, true);
3283 
3284  // tpd
3285  other_acc->SetTpd().SetAccession("AY123457");
3286  other_acc->SetTpd().SetVersion(1);
3287  TestOneOtherAcc(other_acc, false, true, true);
3288 
3289  // other
3291  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3292  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3293  CRef<CSeq_id> gi_id(new CSeq_id());
3294  gi_id->SetGi(GI_CONST(21914627));
3295  entry->SetSeq().SetId().push_back(gi_id);
 // other_acc is appended first and populated afterwards; the id list holds
 // a CRef to the same object, so the later Set calls apply to the listed id.
3296  entry->SetSeq().SetId().push_back(other_acc);
3297  other_acc->SetOther().SetAccession("NC_123457");
3298  other_acc->SetOther().SetVersion(1);
3299 
3301 
3302  string acc_str = "gb|AY123456.1|";
3303  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "INSDRefSeqPackaging", "INSD and RefSeq records should not be present in the same set"));
3304  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "MultipleAccessions", "Multiple accessions on sequence with gi number"));
3305  // AddChromosomeNoLocation(expected_errors, acc_str);
3306  eval = validator.Validate(seh, options);
3307  CheckErrors(*eval, expected_errors);
3308 
3309  CLEAR_ERRORS
3310 }
3311 
3312 
// Expects an Info-level HistAssemblyMissing message for tpg, tpe and tpd
// accessions, then expects no message once a "TPA:reassembly" keyword is
// present in a GenBank or EMBL block descriptor.
// NOTE(review): the embedded numbering skips 3315, 3319 and 3323; the
// declarations of tpg_entry/tpe_entry/tpd_entry are not visible in this extract.
3313 BOOST_AUTO_TEST_CASE(Test_HistAssemblyMissing)
3314 {
3316  tpg_entry->SetSeq().SetId().front()->SetTpg().SetAccession("AY123456");
3317  tpg_entry->SetSeq().SetId().front()->SetTpg().SetVersion(1);
3318
3320  tpe_entry->SetSeq().SetId().front()->SetTpe().SetAccession("AY123456");
3321  tpe_entry->SetSeq().SetId().front()->SetTpe().SetVersion(1);
3322
3324  tpd_entry->SetSeq().SetId().front()->SetTpd().SetAccession("AY123456");
3325  tpd_entry->SetSeq().SetId().front()->SetTpd().SetVersion(1);
3326
3327  STANDARD_SETUP_NAME(tpg_entry)
3328
3329  // tpg
3330  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Info, "HistAssemblyMissing", "TPA record tpg|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
3331  // AddChromosomeNoLocation(expected_errors, tpg_entry);
3332  eval = validator.Validate(seh, options);
3333  CheckErrors(*eval, expected_errors);
3334
3335  // tpe
 // The single expected error is reused; only its accession and message change.
3336  scope.RemoveTopLevelSeqEntry(seh);
3337  seh = scope.AddTopLevelSeqEntry(*tpe_entry);
3338  ChangeErrorAcc(expected_errors, "tpe|AY123456.1|");
3339  expected_errors[0]->SetErrMsg("TPA record tpe|AY123456.1| should have Seq-hist.assembly for PRIMARY block");
3340  eval = validator.Validate(seh, options);
3341  CheckErrors(*eval, expected_errors);
3342
3343
3344  // tpd
3345  scope.RemoveTopLevelSeqEntry(seh);
3346  seh = scope.AddTopLevelSeqEntry(*tpd_entry);
3347  ChangeErrorAcc(expected_errors, "tpd|AY123456.1|");
3348  expected_errors[0]->SetErrMsg("TPA record tpd|AY123456.1| should have Seq-hist.assembly for PRIMARY block");
3349  eval = validator.Validate(seh, options);
3350  CheckErrors(*eval, expected_errors);
3351
3352  CLEAR_ERRORS
3353
3354  // error suppressed if keyword present
3355  CRef<CSeqdesc> block(new CSeqdesc());
3356  block->SetGenbank().SetKeywords().push_back("TPA:reassembly");
3357  tpg_entry->SetSeq().SetDescr().Set().push_back(block);
3358  scope.RemoveTopLevelSeqEntry(seh);
3359  seh = scope.AddTopLevelSeqEntry(*tpg_entry);
3360  eval = validator.Validate(seh, options);
3361  // AddChromosomeNoLocation(expected_errors, tpg_entry);
3362
3363  CheckErrors(*eval, expected_errors);
 // Same descriptor object switched to an EMBL block carrying the keyword.
3364  block->SetEmbl().SetKeywords().push_back("TPA:reassembly");
3365  eval = validator.Validate(seh, options);
3366  CheckErrors(*eval, expected_errors);
3367  CLEAR_ERRORS
3368 }
3369 
// Checks TerminalNs (and the accompanying HighNpercent / HighNContentPercent /
// ContigsTooShort) reporting for a raw sequence and for delta sequences, how
// the TerminalNs severity changes with the Seq-id type, and that the
// TerminalNs messages are not expected for circular topology.
// NOTE(review): the embedded numbering skips 3372, 3376, 3402, 3416 and 3459;
// the statements on those lines (entry construction / setup, presumably) are
// not visible in this extract.
3370 BOOST_AUTO_TEST_CASE(Test_TerminalNs)
3371 {
 // 10 Ns + 42 bases + 10 Ns = 62 residues, matching SetLength(62).
3373  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("NNNNNNNNNNAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCCAANNNNNNNNNN");
3374  entry->SetSeq().SetInst().SetLength(62);
3375
3377
3378  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at beginning of sequence"));
3379  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at end of sequence"));
3380  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
3381  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3382  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
3383  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3384  // AddChromosomeNoLocation(expected_errors, entry);
3385  eval = validator.Validate(seh, options);
3386  CheckErrors(*eval, expected_errors);
3387
3388  // warning level changes if not local only
3389  scope.RemoveTopLevelSeqEntry(seh);
3390  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3391  seh = scope.AddTopLevelSeqEntry(*entry);
3392  ChangeErrorAcc(expected_errors, "gb|AY123456|");
 // Indices 0 and 1 are the two TerminalNs entries pushed above.
3393  expected_errors[0]->SetSeverity(eDiag_Error);
3394  expected_errors[1]->SetSeverity(eDiag_Error);
3395  eval = validator.Validate(seh, options);
3396  CheckErrors(*eval, expected_errors);
3397
3398  CLEAR_ERRORS
3399
3400  // also try delta sequence
3401  scope.RemoveTopLevelSeqEntry(seh);
3403  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNNNNNNNCCC");
3404  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("CCCNNNNNNNNN");
3405  seh = scope.AddTopLevelSeqEntry(*entry);
3406
3407  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 3 bases"));
3408  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at beginning of sequence"));
3409  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at end of sequence"));
3410  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 52 percent Ns"));
3411  eval = validator.Validate(seh, options);
3412  CheckErrors(*eval, expected_errors);
3413
3414  // 10 Ns but just local stays at warning
3415  scope.RemoveTopLevelSeqEntry(seh);
3417  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNNNNNNNNCC");
3418  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("CCNNNNNNNNNN");
3419  seh = scope.AddTopLevelSeqEntry(*entry);
3420  expected_errors[0]->SetErrMsg("Maximum contig length is 2 bases");
3421  expected_errors.back()->SetErrMsg ("Sequence contains 58 percent Ns");
3422  eval = validator.Validate(seh, options);
3423  CheckErrors(*eval, expected_errors);
3424
3425  // 10 Ns but now has non-local ID, error
3426  scope.RemoveTopLevelSeqEntry(seh);
3427  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3428  seh = scope.AddTopLevelSeqEntry(*entry);
3429  ChangeErrorAcc(expected_errors, "gb|AY123456|");
 // In this list index 0 is ContigsTooShort, so TerminalNs sits at 1 and 2.
3430  expected_errors[1]->SetSeverity(eDiag_Error);
3431  expected_errors[2]->SetSeverity(eDiag_Error);
3432  eval = validator.Validate(seh, options);
3433  CheckErrors(*eval, expected_errors);
3434
3435  // NC and patent IDs back to warning
3436  scope.RemoveTopLevelSeqEntry(seh);
3437  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
3438  seh = scope.AddTopLevelSeqEntry(*entry);
3439  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
3440  expected_errors[1]->SetSeverity(eDiag_Warning);
3441  expected_errors[2]->SetSeverity(eDiag_Warning);
3442  eval = validator.Validate(seh, options);
3443  CheckErrors(*eval, expected_errors);
3444
3445  scope.RemoveTopLevelSeqEntry(seh);
3446  entry->SetSeq().SetId().front()->SetPatent().SetSeqid(1);
3447  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetCountry("USA");
3448  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetId().SetNumber("1");
3449  seh = scope.AddTopLevelSeqEntry(*entry);
3450  ChangeErrorAcc(expected_errors, "pat|USA|1|1");
 // Drop the last expectation (HighNContentPercent) for the patent id case.
3451  delete expected_errors.back();
3452  expected_errors.pop_back();
3453  eval = validator.Validate(seh, options);
3454  CheckErrors(*eval, expected_errors);
3455  CLEAR_ERRORS
3456
3457  // no more TerminalNs warnings if circular
3458  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
3460  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Error, "ContigsTooShort",
3461  "Maximum contig length is 2 bases"));
3462  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Warning, "UnwantedCompleteFlag",
3463  "Suspicious use of complete"));
3464  // AddChromosomeNoLocation(expected_errors, entry);
3465
3466  eval = validator.Validate(seh, options);
3467  CheckErrors(*eval, expected_errors);
3468
3469  CLEAR_ERRORS
3470 }
3471 
3472 
// Expects UnexpectedIdentifierChange warnings when the accession attached to
// gi 21914627 differs from, or is missing relative to, the accession
// gb|AY123456.1| named in the expected messages.
// NOTE(review): the embedded numbering skips 3475 and 3482; the statements
// that create `entry` and set up seh/validator/expected_errors are not visible
// in this extract.
3473 BOOST_FIXTURE_TEST_CASE(Test_UnexpectedIdentifierChange, CGenBankFixture)
3474 {
3476  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123457");
3477  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3478  CRef<CSeq_id> gi_id(new CSeq_id());
3479  gi_id->SetGi(GI_CONST(21914627));
3480  entry->SetSeq().SetId().push_back(gi_id);
3481
3483
3484  expected_errors.push_back(new CExpectedError("gb|AY123457.1|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123457.1|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
3485  // AddChromosomeNoLocation(expected_errors, entry);
3486  eval = validator.Validate(seh, options);
3487  CheckErrors(*eval, expected_errors);
3488
3489  CLEAR_ERRORS
 // Second scenario: replace the GenBank id with a TPG id on the same gi.
3490  scope.RemoveTopLevelSeqEntry(seh);
3491  entry->SetSeq().SetId().front()->SetTpg().SetAccession("AY123456");
3492  entry->SetSeq().SetId().front()->SetTpg().SetVersion(1);
3493  seh = scope.AddTopLevelSeqEntry(*entry);
3494  // AddChromosomeNoLocation(expected_errors, entry);
3495  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Info, "HistAssemblyMissing", "TPA record tpg|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
3496  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Warning, "UnexpectedIdentifierChange", "Loss of accession (gb|AY123456.1|) on gi (21914627) compared to the NCBI sequence repository"));
3497  eval = validator.Validate(seh, options);
3498  CheckErrors(*eval, expected_errors);
3499
3500  // TODO - try to instigate other errors
3501
3502  CLEAR_ERRORS
3503 }
3504 
3505 
// Expects InternalNsInSeqLit warnings for runs of Ns inside delta-sequence
// literals appended with AddToDeltaSeq (runs of 20, 81 and 101 Ns).
// NOTE(review): the embedded numbering skips 3508, 3512, 3530, 3544, 3548 and
// 3553; the statements on those lines (entry construction, setup and whatever
// changes between the repeated Validate calls) are not visible in this extract.
3506 BOOST_AUTO_TEST_CASE(Test_InternalNsInSeqLit)
3507 {
3509  unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNGG");
3510  SetTech(entry, CMolInfo::eTech_wgs);
3511
3513
3514  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqLit", "Run of 20 Ns in delta component 5 that starts at base 45"));
3515  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WGSseqGapProblem", "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
3516  /*
3517  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
3518  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3519  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
3520  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3521  */
3522  AddChromosomeNoLocation(expected_errors, entry);
3523
3524  eval = validator.Validate(seh, options);
3525  CheckErrors(*eval, expected_errors);
3526
3527  CLEAR_ERRORS
3528
 // Append a literal with a longer run of Ns (81 per the expected message).
3529  unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGG");
3531  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqLit",
3532  "Run of 81 Ns in delta component 7 that starts at base 79"));
3533  /*
3534  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
3535  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3536  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
3537  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3538  */
3539  // AddChromosomeNoLocation(expected_errors, entry);
3540
3541  eval = validator.Validate(seh, options);
3542  CheckErrors(*eval, expected_errors);
3543
3545  eval = validator.Validate(seh, options);
3546  CheckErrors(*eval, expected_errors);
3547
3549  eval = validator.Validate(seh, options);
3550  CheckErrors(*eval, expected_errors);
3551
 // Append a still longer run (101 per the updated expected message).
3552  unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGG");
3554  expected_errors[0]->SetErrMsg("Run of 101 Ns in delta component 9 that starts at base 174");
3555  eval = validator.Validate(seh, options);
3556  CheckErrors(*eval, expected_errors);
3557
3558  CLEAR_ERRORS
3559 }
3560 
3561 
// Expects SeqLitGapLength0 when a zero-length literal is appended to a delta
// sequence, and checks how the message varies with the literal's fuzz and how
// the severity drops to Warning for a Swiss-Prot id.
// NOTE(review): the embedded numbering skips 3564 and 3569; the statements
// that create `entry` and set up seh/validator/expected_errors are not visible
// in this extract.
3562 BOOST_AUTO_TEST_CASE(Test_SeqLitGapLength0)
3563 {
3565  CRef<CDelta_seq> delta_seq(new CDelta_seq());
3566  delta_seq->SetLiteral().SetLength(0);
3567  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(delta_seq);
3568
3570
3571  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLitGapLength0", "Gap of length 0 in delta chain"));
3572  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "Last delta seq component is a gap"));
3573  // AddChromosomeNoLocation(expected_errors, entry);
3574  eval = validator.Validate(seh, options);
3575  CheckErrors(*eval, expected_errors);
3576
3577  // some kinds of fuzz don't trigger other kind of error
3578  delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_gt);
3579  eval = validator.Validate(seh, options);
3580  CheckErrors(*eval, expected_errors);
3581
3582  delta_seq->SetLiteral().SetFuzz().Reset();
3583  delta_seq->SetLiteral().SetFuzz().SetP_m(10);
3584  eval = validator.Validate(seh, options);
3585  CheckErrors(*eval, expected_errors);
3586
3587  // others will
3588  delta_seq->SetLiteral().SetFuzz().Reset();
3589  delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
3590  expected_errors[0]->SetErrMsg("Gap of length 0 with unknown fuzz in delta chain");
3591  eval = validator.Validate(seh, options);
3592  CheckErrors(*eval, expected_errors);
3593
3594  // try again with swissprot, error goes to warning
3595  scope.RemoveTopLevelSeqEntry(seh);
3596  entry->SetSeq().SetId().front()->SetSwissprot().SetAccession("AY123456");
3597  seh = scope.AddTopLevelSeqEntry(*entry);
3598  expected_errors[0]->SetSeverity(eDiag_Warning);
3599  ChangeErrorAcc(expected_errors, "sp|AY123456|");
3600  eval = validator.Validate(seh, options);
3601  CheckErrors(*eval, expected_errors);
3602
 // Remaining cases repeat the fuzz variations with the plain message restored.
3603  delta_seq->SetLiteral().SetFuzz().SetP_m(10);
3604  expected_errors[0]->SetErrMsg("Gap of length 0 in delta chain");
3605  eval = validator.Validate(seh, options);
3606  CheckErrors(*eval, expected_errors);
3607
3608  delta_seq->SetLiteral().SetFuzz().Reset();
3609  delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_gt);
3610  eval = validator.Validate(seh, options);
3611  CheckErrors(*eval, expected_errors);
3612
3613  delta_seq->SetLiteral().ResetFuzz();
3614  eval = validator.Validate(seh, options);
3615  CheckErrors(*eval, expected_errors);
3616
3617  CLEAR_ERRORS
3618 }
3619 
3620 
3622 {
 // NOTE(review): the signature line (3621) is missing from this extract; the
 // body is presumably AddTpaAssemblyUserObject, which is called below with a
 // CRef<CSeq_entry> -- confirm against the repository copy.
 // Appends a user-object descriptor of type "TpaAssembly" to entry's Bioseq.
3623  CRef<CSeqdesc> desc(new CSeqdesc());
3624  desc->SetUser().SetType().SetStr("TpaAssembly");
3625  entry->SetSeq().SetDescr().Set().push_back(desc);
3626
 // Give the user object a single placeholder field ("Label" = "Data").
3627  CRef<CUser_field> field(new CUser_field());
3628  field->SetLabel().SetStr("Label");
3629  field->SetData().SetStr("Data");
3630  desc->SetUser().SetData().push_back(field);
3631 }
3632 
3633 
// Expects TpaAssemblyProblem when the members of a set that carry a
// TpaAssembly user object disagree about having Seq-hist assembly, and an
// additional warning once one of them has a gi.
// NOTE(review): the embedded numbering skips 3637, 3638, 3642 and 3647; the
// declarations of member1/member2, any set-level setup and the standard setup
// are not visible in this extract.
3634 BOOST_FIXTURE_TEST_CASE(Test_TpaAssemblyProblem, CGenBankFixture)
3635 {
3636  CRef<CSeq_entry> entry(new CSeq_entry());
3639  member1->SetSeq().SetId().front()->SetLocal().SetStr("good");
3640  AddTpaAssemblyUserObject(member1);
3641  entry->SetSet().SetSeq_set().push_back(member1);
3643  member2->SetSeq().SetId().front()->SetLocal().SetStr("good2");
3644  AddTpaAssemblyUserObject(member2);
3645  entry->SetSet().SetSeq_set().push_back(member2);
3646
3648
3649  // two Tpa sequences, but neither has assembly and neither has GI, so no errors expected
3650  // AddChromosomeNoLocation(expected_errors, "lcl|good");
3651  // AddChromosomeNoLocation(expected_errors, "lcl|good2");
3652  eval = validator.Validate(seh, options);
3653  CheckErrors(*eval, expected_errors);
3654
3655  // now one has hist, other does not
3656  member1->SetSeq().SetInst().SetHist().SetAssembly().push_back(unit_test_util::BuildGoodAlign());
3657  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TpaAssemblyProblem", "There are 1 TPAs with history and 1 without history in this record."));
3658  eval = validator.Validate(seh, options);
3659  CheckErrors(*eval, expected_errors);
3660
3661  // now one has gi
3662  scope.RemoveTopLevelSeqEntry(seh);
3663  member1->SetSeq().SetId().front()->SetTpg().SetAccession("AY123456");
3664  member1->SetSeq().SetId().front()->SetTpg().SetVersion(1);
3665  CRef<CSeq_id> gi_id(new CSeq_id());
3666  gi_id->SetGi(GI_CONST(21914627));
3667  member1->SetSeq().SetId().push_back(gi_id);
3668  seh = scope.AddTopLevelSeqEntry(*entry);
3669
3670  CLEAR_ERRORS
3671
3672  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Warning, "UnexpectedIdentifierChange", "Loss of accession (gb|AY123456.1|) on gi (21914627) compared to the NCBI sequence repository"));
3673  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Error, "TpaAssemblyProblem", "There are 1 TPAs with history and 1 without history in this record."));
3674  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Warning, "TpaAssemblyProblem", "There are 1 TPAs without history in this record, but the record has a gi number assignment."));
3675  // AddChromosomeNoLocation(expected_errors, "tpg|AY123456.1|");
3676  // AddChromosomeNoLocation(expected_errors, "lcl|good2");
3677  eval = validator.Validate(seh, options);
3678  CheckErrors(*eval, expected_errors);
3679
3680  CLEAR_ERRORS
3681 }
3682 
3683 
3685 {
 // NOTE(review): the test-case header line (3684) and lines 3687, 3693 and
 // 3702 are missing from this extract; judging by the commented-out
 // expectation below this is presumably the SeqLocLength test -- confirm.
 // As written, no errors are expected for either the 10-base (0..9) or the
 // 11-base (0..10) delta location.
3686  // prepare entry
3688  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetGenbank().SetAccession("AY123456");
3689  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
3690  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(9);
3691  entry->SetSeq().SetInst().SetLength(32);
3692
3694
3695  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SeqLocLength", "Short length (10) on seq-loc (gb|AY123456|:1-10) of delta seq_ext"));
3696  // AddChromosomeNoLocation(expected_errors, entry);
3697  eval = validator.Validate(seh, options);
3698  CheckErrors(*eval, expected_errors);
3699
3700  scope.RemoveTopLevelSeqEntry(seh);
3701  // if length 11, should not be a problem
3703  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetGenbank().SetAccession("AY123456");
3704  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
3705  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(10);
3706  entry->SetSeq().SetInst().SetLength(33);
3707  seh = scope.AddTopLevelSeqEntry(*entry);
3708  eval = validator.Validate(seh, options);
3709  CheckErrors(*eval, expected_errors);
3710
3711  CLEAR_ERRORS
3712 }
3713 
3714 
// Expects MissingGaps (and, in one step, BadHTGSeq) for a delta sequence whose
// gaps were removed, and an Info-level MissingGaps for a RefSeq id.
// NOTE(review): the embedded numbering skips 3718, 3720, 3722, 3729, 3733,
// 3738, 3742, 3750, 3759 and 3763. The statements that build the entry, remove
// the gaps and change state between the successive Validate calls (presumably
// molinfo tech changes, per the comments) are not visible in this extract --
// confirm against the repository copy before relying on the step descriptions.
3715 BOOST_AUTO_TEST_CASE(Test_MissingGaps)
3716 {
3717  // prepare entry
3719  // remove gaps
3721
3723
3724  // AddChromosomeNoLocation(expected_errors, entry);
3725  // only report errors for specific molinfo tech values
3726  eval = validator.Validate(seh, options);
3727  CheckErrors(*eval, expected_errors);
3728  // htgs_3 should not report
3730  eval = validator.Validate(seh, options);
3731  CheckErrors(*eval, expected_errors);
3732
3734  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingGaps", "HTGS delta seq should have gaps between all sequence runs"));
3735  eval = validator.Validate(seh, options);
3736  CheckErrors(*eval, expected_errors);
3737
3739  eval = validator.Validate(seh, options);
3740  CheckErrors(*eval, expected_errors);
3741
3743  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 delta seq has no gaps and no graphs"));
3744  eval = validator.Validate(seh, options);
3745  CheckErrors(*eval, expected_errors);
3746
3747  // RefGeneTracking changes severity
3748  scope.RemoveTopLevelSeqEntry(seh);
3749  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
3751  seh = scope.AddTopLevelSeqEntry(*entry);
3752  expected_errors[0]->SetSeverity(eDiag_Info);
3753  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
3754  eval = validator.Validate(seh, options);
3755  CheckErrors(*eval, expected_errors);
 // Remove the BadHTGSeq expectation (index 1, the last element) for the
 // remaining checks.
3756  delete expected_errors[1];
3757  expected_errors.pop_back();
3758
3760  eval = validator.Validate(seh, options);
3761  CheckErrors(*eval, expected_errors);
3762
3764  eval = validator.Validate(seh, options);
3765  CheckErrors(*eval, expected_errors);
3766
3767  CLEAR_ERRORS
3768 }
3769 
3770 
// Expects CompleteTitleProblem for a viral record titled "complete genome",
// and CompleteGenomeHasGaps when the same title is put on a delta sequence.
// NOTE(review): the embedded numbering skips 3779, 3790 and 3802; the standard
// setup and the statements that presumably set completeness are not visible in
// this extract.
3771 BOOST_AUTO_TEST_CASE(Test_CompleteTitleProblem)
3772 {
3773  // prepare entry
3774  CRef<CSeq_entry> entry = BuildGoodSeq();
3775  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3776  SetLineage (entry, "Viruses; foo");
3777  SetTitle(entry, "Foo complete genome");
3778
3780
3781  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning, "CompleteTitleProblem", "Complete genome in title without complete flag set"));
3782  // AddChromosomeNoLocation(expected_errors, entry);
3783
3784  eval = validator.Validate(seh, options);
3785  CheckErrors(*eval, expected_errors);
3786
3787  CLEAR_ERRORS
3788
3789  // should be no error if complete
3791
3792  eval = validator.Validate(seh, options);
3793  // AddChromosomeNoLocation(expected_errors, entry);
3794  CheckErrors(*eval, expected_errors);
3795
3796  // different message and code if gaps
3797  scope.RemoveTopLevelSeqEntry(seh);
3798  entry = BuildGoodDeltaSeq();
3799  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3800  unit_test_util::SetLineage (entry, "Viruses; foo");
3801  SetTitle(entry, "Foo complete genome");
3803  seh = scope.AddTopLevelSeqEntry(*entry);
3804
3805  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning,
3806  "CompleteGenomeHasGaps", "Title contains 'complete genome' but sequence has gaps"));
3807
3808  eval = validator.Validate(seh, options);
3809  CheckErrors(*eval, expected_errors);
3810
3811  CLEAR_ERRORS
3812
3813 }
3814 
3815 
// Expects CompleteCircleProblem warnings for a circular sequence: first for
// circular topology without the complete flag, then (with a GenBank id and a
// plain title) for a title lacking "complete sequence"/"complete genome".
// NOTE(review): the embedded numbering skips 3819, 3822 and 3837; entry
// construction, the standard setup and the statement that presumably sets
// completeness are not visible in this extract.
3816 BOOST_AUTO_TEST_CASE(Test_CompleteCircleProblem)
3817 {
3818  // prepare entry
3820  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
3821
3823
3824  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
3825  "CompleteCircleProblem",
3826  "Circular topology without complete flag set"));
3827  // AddChromosomeNoLocation(expected_errors, entry);
3828
3829  eval = validator.Validate(seh, options);
3830  CheckErrors(*eval, expected_errors);
3831
3832  CLEAR_ERRORS
3833
3834  scope.RemoveTopLevelSeqEntry(seh);
3835  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3836  SetTitle(entry, "This is just a title");
3838  seh = scope.AddTopLevelSeqEntry(*entry);
3839  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning,
3840  "CompleteCircleProblem",
3841  "Circular topology has complete flag set, but title should say complete sequence or complete genome"));
3842  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning,
3843  "UnwantedCompleteFlag",
3844  "Suspicious use of complete"));
3845  // AddChromosomeNoLocation(expected_errors, entry);
3846
3847  eval = validator.Validate(seh, options);
3848  CheckErrors(*eval, expected_errors);
3849
3850  CLEAR_ERRORS
3851 }
3852 
3853 
// Expects BadHTGSeq for HTGS 2 delta and raw sequences without gaps or graphs,
// checks that the HTGS_ACTIVEFIN keyword suppresses it, and expects Error-level
// BadHTGSeq for HTGS 3 sequences carrying draft/prefin/activefin/fulltop
// keywords.
// NOTE(review): the embedded numbering skips 3857 and 3881; the declarations
// of delta_entry and raw_entry are not visible in this extract.
3854 BOOST_AUTO_TEST_CASE(Test_BadHTGSeq)
3855 {
3856  // prepare entry
3858  // remove gaps
3859  unit_test_util::RemoveDeltaSeqGaps (delta_entry);
3860
3861  STANDARD_SETUP_NAME(delta_entry)
3862
3863  SetTech(delta_entry, CMolInfo::eTech_htgs_2);
3864  // AddChromosomeNoLocation(expected_errors, delta_entry);
3865  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingGaps", "HTGS delta seq should have gaps between all sequence runs"));
3866  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 delta seq has no gaps and no graphs"));
3867  eval = validator.Validate(seh, options);
3868  CheckErrors(*eval, expected_errors);
3869
 // Drop the BadHTGSeq expectation (index 1, the last element); MissingGaps stays.
3870  delete expected_errors[1];
3871  expected_errors.pop_back();
3872
3873  // HTGS_ACTIVEFIN keyword disables BadHTGSeq error
3874  AddGenbankKeyword(delta_entry, "HTGS_ACTIVEFIN");
3875  eval = validator.Validate(seh, options);
3876  CheckErrors(*eval, expected_errors);
3877
3878  CLEAR_ERRORS
3879
3880  scope.RemoveTopLevelSeqEntry(seh);
3882  SetTech(raw_entry, CMolInfo::eTech_htgs_2);
3883  seh = scope.AddTopLevelSeqEntry(*raw_entry);
3884  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 raw seq has no gaps and no graphs"));
3885  // AddChromosomeNoLocation(expected_errors, raw_entry);
3886  eval = validator.Validate(seh, options);
3887  CheckErrors(*eval, expected_errors);
3888
3889  CLEAR_ERRORS
3890
3891  // HTGS_ACTIVEFIN keyword disables error
3892  AddGenbankKeyword(raw_entry, "HTGS_ACTIVEFIN");
3893  // AddChromosomeNoLocation(expected_errors, raw_entry);
3894  eval = validator.Validate(seh, options);
3895  CheckErrors(*eval, expected_errors);
3896
3897
3898  // htg3 errors
 // HTGS_ACTIVEFIN was already added above, so four keyword errors are expected.
3899  SetTech(raw_entry, CMolInfo::eTech_htgs_3);
3900  AddGenbankKeyword(raw_entry, "HTGS_DRAFT");
3901  AddGenbankKeyword(raw_entry, "HTGS_PREFIN");
3902  AddGenbankKeyword(raw_entry, "HTGS_FULLTOP");
3903  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_DRAFT keyword"));
3904  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_PREFIN keyword"));
3905  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_ACTIVEFIN keyword"));
3906  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_FULLTOP keyword"));
3907  eval = validator.Validate(seh, options);
3908  CheckErrors(*eval, expected_errors);
3909
 // Same four expectations are reused for the delta entry.
3910  scope.RemoveTopLevelSeqEntry(seh);
3911  seh = scope.AddTopLevelSeqEntry(*delta_entry);
3912  SetTech(delta_entry, CMolInfo::eTech_htgs_3);
3913  AddGenbankKeyword(delta_entry, "HTGS_DRAFT");
3914  AddGenbankKeyword(delta_entry, "HTGS_PREFIN");
3915  AddGenbankKeyword(delta_entry, "HTGS_FULLTOP");
3916  eval = validator.Validate(seh, options);
3917  CheckErrors(*eval, expected_errors);
3918
3919  CLEAR_ERRORS
3920 }
3921 
3922 
// Expects GapInProtein for an internal '-' in a protein sequence,
// BadProteinStart for a leading '-', and both when both are present.
// NOTE(review): the embedded numbering skips 3926 and 3929; the statements
// that create `entry` and set up seh/validator/expected_errors are not visible
// in this extract.
3923 BOOST_AUTO_TEST_CASE(Test_GapInProtein_and_BadProteinStart)
3924 {
3925  // prepare entry
3927  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa().Set("PRK-EIN");
3928
3930
3931  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GapInProtein", "[1] internal gap symbols in protein sequence (gene? - fake protein name)"));
3932  // AddChromosomeNoLocation(expected_errors, entry);
3933  eval = validator.Validate(seh, options);
3934  CheckErrors(*eval, expected_errors);
3935
3936  CLEAR_ERRORS
3937
3938  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa().Set("-RKTEIN");
3939  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadProteinStart", "gap symbol at start of protein sequence (gene? - fake protein name)"));
3940  // AddChromosomeNoLocation(expected_errors, entry);
3941  eval = validator.Validate(seh, options);
3942  CheckErrors(*eval, expected_errors);
3943
 // BadProteinStart is kept from the previous step; GapInProtein is added to it.
3944  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa().Set("-RK-EIN");
3945  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GapInProtein", "[1] internal gap symbols in protein sequence (gene? - fake protein name)"));
3946  eval = validator.Validate(seh, options);
3947  CheckErrors(*eval, expected_errors);
3948
3949  CLEAR_ERRORS
3950 }
3951 
3952 
// Expects TerminalGap warnings when gap literals are placed at both ends of a
// delta sequence, across local, RefSeq and patent ids, and expects only
// UnwantedCompleteFlag in the final circular-topology check.
// NOTE(review): the embedded numbering skips 3956, 3965 and 4016; entry
// construction, the standard setup and one statement in the circular case are
// not visible in this extract.
3953 BOOST_AUTO_TEST_CASE(Test_TerminalGap)
3954 {
3955  // prepare entry
 // A literal with only a length (no seq-data) is added at each end; the
 // Bioseq length is increased by the 2 x 9 added positions.
3957  CRef<CDelta_seq> first_seg(new CDelta_seq());
3958  first_seg->SetLiteral().SetLength(9);
3959  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_front(first_seg);
3960  CRef<CDelta_seq> last_seg(new CDelta_seq());
3961  last_seg->SetLiteral().SetLength(9);
3962  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(last_seg);
3963  entry->SetSeq().SetInst().SetLength(entry->SetSeq().SetInst().GetLength() + 18);
3964
3966
3967  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "First delta seq component is a gap"));
3968  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "Last delta seq component is a gap"));
3969  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at beginning of sequence"));
3970  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at end of sequence"));
3971  /*
3972  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
3973  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3974  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
3975  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3976  */
3977  // AddChromosomeNoLocation(expected_errors, entry);
3978
3979  eval = validator.Validate(seh, options);
3980  CheckErrors(*eval, expected_errors);
3981
3982  // if gap length is 10, severity is still warning because still all local IDS
3983  scope.RemoveTopLevelSeqEntry(seh);
3984  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetLength(10);
3985  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetLength(10);
3986  entry->SetSeq().SetInst().SetLength(entry->SetSeq().SetInst().GetLength() + 2);
3987  seh = scope.AddTopLevelSeqEntry(*entry);
3988  eval = validator.Validate(seh, options);
3989  CheckErrors(*eval, expected_errors);
3990
3991
 // RefSeq id: same two warnings are expected, only the accession label changes.
3992  scope.RemoveTopLevelSeqEntry(seh);
3993  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
3994  seh = scope.AddTopLevelSeqEntry(*entry);
3995  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
3996  /*
3997  expected_errors[2]->SetSeverity(eDiag_Warning);
3998  expected_errors[3]->SetSeverity(eDiag_Warning);
3999  */
4000  eval = validator.Validate(seh, options);
4001  CheckErrors(*eval, expected_errors);
4002
 // Patent id: again the same two warnings with a new accession label.
4003  scope.RemoveTopLevelSeqEntry(seh);
4004  entry->SetSeq().SetId().front()->SetPatent().SetSeqid(1);
4005  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetCountry("USA");
4006  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetId().SetNumber("1");
4007  seh = scope.AddTopLevelSeqEntry(*entry);
4008  ChangeErrorAcc(expected_errors, "pat|USA|1|1");
4009  eval = validator.Validate(seh, options);
4010  CheckErrors(*eval, expected_errors);
4011
4012  CLEAR_ERRORS
4013
4014  // no more terminal gap warnings if circular - changed to still show first/last delta component
4015  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
4017  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Warning, "UnwantedCompleteFlag",
4018  "Suspicious use of complete"));
4019  // AddChromosomeNoLocation(expected_errors, entry);
4020
4021  eval = validator.Validate(seh, options);
4022  CheckErrors(*eval, expected_errors);
4023  CLEAR_ERRORS
4024 }
4025 
4026 
// Verifies that delta components referring to overlapping ranges of the same
// sequence (gb|AY123456|) are reported as OverlappingDeltaRange warnings.
// NOTE(review): listing lines 4030 and 4040 are absent from this rendering;
// `entry`, `seh`, `validator`, `options`, `eval` and `expected_errors` are
// presumably introduced there -- confirm against the repository copy.
4027 BOOST_FIXTURE_TEST_CASE(Test_OverlappingDeltaRange, CGenBankFixture)
4028 {
4029  // prepare entry
      // Discard the existing extension and rebuild it from four ranges on one
      // accession, arranged as two overlapping pairs: (0-10, 5-15) and (20-30, 25-35).
4031  entry->SetSeq().SetInst().ResetExt();
4032  CRef<CSeq_id> seqid(new CSeq_id());
4033  seqid->SetGenbank().SetAccession("AY123456");
4034  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 0, 10);
4035  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 5, 15);
4036  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 20, 30);
4037  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 25, 35);
      // 44 = four ranges of 11 bases each, treating from/to as inclusive.
4038  entry->SetSeq().SetInst().SetLength(44);
4039 
4041 
      // One warning per overlapping pair; the expected messages show each range
      // shifted up by one relative to the AddSeqRange arguments above.
4042  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OverlappingDeltaRange", "Overlapping delta range 6-16 and 1-11 on a Bioseq gb|AY123456|"));
4043  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OverlappingDeltaRange", "Overlapping delta range 26-36 and 21-31 on a Bioseq gb|AY123456|"));
4044  // AddChromosomeNoLocation(expected_errors, entry);
4045  eval = validator.Validate(seh, options);
4046  CheckErrors(*eval, expected_errors);
4047 
4048  CLEAR_ERRORS
4049 }
4050 
4051 
// Verifies that amino-acid data beginning with 'X' yields a LeadingX warning.
// NOTE(review): listing lines 4055 and 4058 are absent from this rendering;
// `entry` and the validation objects used below are presumably introduced
// there -- confirm against the repository copy.
4052 BOOST_AUTO_TEST_CASE(Test_LeadingX)
4053 {
4054  // prepare entry
      // Replace the residues with IUPAC amino-acid data whose first residue is 'X'.
4056  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("XROTEIN");
4057 
4059 
4060  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LeadingX", "Sequence starts with leading X"));
4061  // AddChromosomeNoLocation(expected_errors, entry);
4062  eval = validator.Validate(seh, options);
4063  CheckErrors(*eval, expected_errors);
4064 
4065  CLEAR_ERRORS
4066 }
4067 
// Exercises the InternalNsInSeqRaw warning on raw nucleotide data in three
// scenarios: a run of 100 Ns (reported), a run of 20 Ns (not reported), and
// the same 20-N run after the tech is set to WGS (reported).
// NOTE(review): listing lines 4071 and 4075 are absent from this rendering;
// `entry` and the validation objects used below are presumably introduced
// there -- confirm against the repository copy.
4068 BOOST_AUTO_TEST_CASE(Test_InternalNsInSeqRaw)
4069 {
4070  // prepare entry
      // Scenario 1: 5 bases, a long run of Ns, 5 bases; declared length 110.
4072  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTTT");
4073  entry->SetSeq().SetInst().SetLength(110);
4074 
4076 
      // Besides the run itself, the N-heavy sequence is expected to trigger the
      // short-contig, N-percentage and 5'/3' N-content diagnostics.
4077  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqRaw", "Run of 100 Ns in raw sequence starting at base 6"));
4078  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 5 bases"));
4079  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 90 percent Ns"));
4080  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4081  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4082  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4083  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4084  // AddChromosomeNoLocation(expected_errors, entry);
4085  eval = validator.Validate(seh, options);
4086  CheckErrors(*eval, expected_errors);
4087 
4088  CLEAR_ERRORS
4089 
4090  // expect no InternalNsInSeqRaw error
      // Scenario 2: same layout with a shorter N run; declared length 30. The entry
      // is removed from the scope before editing and re-added afterwards.
4091  scope.RemoveTopLevelSeqEntry(seh);
4092  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAANNNNNNNNNNNNNNNNNNNNTTTTT");
4093  entry->SetSeq().SetInst().SetLength(30);
4094  seh = scope.AddTopLevelSeqEntry(*entry);
4095  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 5 bases"));
4096  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 66 percent Ns"));
4097  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4098  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4099  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4100  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4101  // AddChromosomeNoLocation(expected_errors, entry);
4102  eval = validator.Validate(seh, options);
4103  CheckErrors(*eval, expected_errors);
4104 
4105  CLEAR_ERRORS
4106 
4107  // WGS has lower threshold
      // Scenario 3: same sequence data as scenario 2, only the tech changes; the
      // 20-N run is now expected to be reported.
4108  SetTech (entry, CMolInfo::eTech_wgs);
4109  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqRaw", "Run of 20 Ns in raw sequence starting at base 6"));
4110  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 5 bases"));
4111  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 66 percent Ns"));
4112  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4113  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4114  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4115  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
      // Unlike the two scenarios above, this AddChromosomeNoLocation call is active.
4116  AddChromosomeNoLocation(expected_errors, entry);
4117  eval = validator.Validate(seh, options);
4118  CheckErrors(*eval, expected_errors);
4119 
4120  CLEAR_ERRORS
4121 }
4122 
4123 
// Verifies that Ns placed at the inner edges of the first and last delta
// literals are reported as InternalNsAdjacentToGap errors.
// NOTE(review): listing lines 4127 and 4131 are absent from this rendering;
// `entry` and the validation objects used below are presumably introduced
// there -- confirm against the repository copy.
4124 BOOST_AUTO_TEST_CASE(Test_InternalNsAdjacentToGap)
4125 {
4126  // prepare entry
      // First literal ends in NNN and last literal starts with NNN.
4128  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("ATGATGATGNNN");
4129  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNATGATGATG");
4130 
4132 
      // One InternalNsAdjacentToGap error is expected per edited literal, plus a
      // short-contig error for the 9 remaining non-N bases in each literal.
4133  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 9 bases"));
4134  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InternalNsAdjacentToGap", "Ambiguous residue N is adjacent to a gap around position 13"));
4135  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InternalNsAdjacentToGap", "Ambiguous residue N is adjacent to a gap around position 23"));
      // The 5'/3' N-percentage expectations below are disabled.
4136 // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
4137 // "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4138 // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
4139 // "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4140  // AddChromosomeNoLocation(expected_errors, entry);
4141 
4142  eval = validator.Validate(seh, options);
4143  CheckErrors(*eval, expected_errors);
4144 
4145  CLEAR_ERRORS
4146 }
4147 
// Verifies that a delta component located on gi 0 is reported as a critical
// DeltaComponentIsGi0 error, accompanied by a DeltaSeqError.
// NOTE(review): listing lines 4151 and 4156 are absent from this rendering;
// `entry` and the validation objects used below are presumably introduced
// there -- confirm against the repository copy.
4148 BOOST_AUTO_TEST_CASE(Test_DeltaComponentIsGi0)
4149 {
4150  // prepare entry
      // Turn the first delta component into an interval location (0..11) whose
      // Seq-id is the zero GI.
4152  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
4153  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(11);
4154  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetGi(ZERO_GI);
4155 
4157 
4158  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "DeltaComponentIsGi0", "Delta component is gi|0"));
4159  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "DeltaSeqError", "Unable to find far delta sequence component"));
4160  // AddChromosomeNoLocation(expected_errors, entry);
4161 
4162  eval = validator.Validate(seh, options);
4163  CheckErrors(*eval, expected_errors);
4164 
4165  CLEAR_ERRORS
4166 }
4167 
4168 
// Verifies that a '-' character inside raw nucleotide data is reported both
// as an invalid residue and as an InternalGapsInSeqRaw warning.
// NOTE(review): listing lines 4172 and 4175 are absent from this rendering;
// `entry` and the validation objects used below are presumably introduced
// there -- confirm against the repository copy.
4169 BOOST_AUTO_TEST_CASE(Test_InternalGapsInSeqRaw)
4170 {
4171  // prepare entry
      // The '-' is the 27th character of the string, matching position [27] in the
      // expected InvalidResidue message.
4173  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGGCCAAAATTGGCCAAAATTGG-CAAAATTGGCCAAAATTGGCCAAAATTGGCCAA");
4174 
4176 
4177  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue '-' at position [27]"));
4178  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalGapsInSeqRaw", "Raw nucleotide should not contain gap characters"));
4179  // AddChromosomeNoLocation(expected_errors, entry);
4180 
4181  eval = validator.Validate(seh, options);
4182  CheckErrors(*eval, expected_errors);
4183 
4184  CLEAR_ERRORS
4185 }
4186 
4187 
// Verifies that a delta component pointing at local id "good" -- the id the
// expected errors are reported against (lcl|good) -- yields a critical
// SelfReferentialSequence error.
// NOTE(review): listing lines 4191 and 4196 are absent from this rendering;
// `entry` and the validation objects used below are presumably introduced
// there -- confirm against the repository copy.
4188 BOOST_AUTO_TEST_CASE(Test_SelfReferentialSequence)
4189 {
4190  // prepare entry
      // Turn the first delta component into an interval location (0..11) on the
      // local id "good".
4192  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
4193  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(11);
4194  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetLocal().SetStr("good");
4195 
4197 
      // The self-reference is also expected to surface as an InstantiatedGapMismatch
      // warning carrying an exception message.
4198  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "SelfReferentialSequence", "Self-referential delta sequence"));
4199  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InstantiatedGapMismatch", "Exception 4 in GapByGapInst"));
4200  // AddChromosomeNoLocation(expected_errors, entry);
4201 
4202  eval = validator.Validate(seh, options);
4203  CheckErrors(*eval, expected_errors);
4204 
4205  CLEAR_ERRORS
4206 }
4207 
4208 
// Test body checking that a delta component of location type "whole" is
// reported as a WholeComponent error.
// NOTE(review): the test-case header (listing line 4209) and listing lines
// 4212 and 4216 are absent from this rendering; the header is presumably the
// WholeComponent test case, and `entry` plus the validation objects used
// below are presumably introduced on the other two lines -- confirm against
// the repository copy.
4210 {
4211  // prepare entry
      // Replace the first delta component with a whole-sequence location on
      // gb AY123456 and set the declared length to 507.
4213  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetWhole().SetGenbank().SetAccession("AY123456");
4214  entry->SetSeq().SetInst().SetLength(507);
4215 
4217 
4218  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WholeComponent", "Delta seq component should not be of type whole"));
4219  // AddChromosomeNoLocation(expected_errors, entry);
4220 
4221  eval = validator.Validate(seh, options);
4222  CheckErrors(*eval, expected_errors);
4223 
4224  CLEAR_ERRORS
4225 }
4226 
4227 
// Helper body: gives `seq` a general id (db "a", tag "b") as its first id,
// appends a local id "x", and re-targets the first feature's location to the
// general id.
// NOTE(review): the signature line (listing line 4228) is absent from this
// rendering; presumably this is s_AddGeneralAndLocal, which is called with a
// Bioseq further down -- confirm against the repository copy.
4229 {
      // Build the general id and overwrite the first existing id with it.
4230  CRef<CSeq_id> gnl(new CSeq_id());
4231  gnl->SetGeneral().SetDb("a");
4232  gnl->SetGeneral().SetTag().SetStr("b");
4233  seq.SetId().front()->Assign(*gnl);
      // Append an additional local id.
4234  CRef<CSeq_id> lcl(new CSeq_id());
4235  lcl->SetLocal().SetStr("x");
4236  seq.SetId().push_back(lcl);
      // Point the first feature of the first annotation at the general id.
4237  seq.SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().Assign(*gnl);
4238 }
4239 
4240 
// Checks the ProteinsHaveGeneralID diagnostic: nothing is expected for the
// first entry validated, and an Info-level message is expected (reported on
// lcl|nuc) once the general-id protein sits in the second entry.
// NOTE(review): listing lines 4244, 4247, 4256, 4257 and 4260 are absent from
// this rendering; `entry`, `prot`, `cds` and the validation objects used
// below are presumably introduced there -- confirm against the repository copy.
4241 BOOST_AUTO_TEST_CASE(Test_ProteinsHaveGeneralID)
4242 {
4243  // prepare entry
4245  s_AddGeneralAndLocal(entry->SetSeq());
4246 
4248 
4249  // no error unless part of nuc-prot set
      // No expectations are added before this first validation.
4250  // AddChromosomeNoLocation(expected_errors,entry);
4251  eval = validator.Validate(seh, options);
4252  CheckErrors(*eval, expected_errors);
4253  CLEAR_ERRORS
4254 
4255  scope.RemoveTopLevelSeqEntry(seh);
4258  s_AddGeneralAndLocal(prot->SetSeq());
4259 
      // Make the CDS product refer to the same general id (db "a", tag "b").
4261  cds->SetProduct().SetWhole().SetGeneral().SetDb("a");
4262  cds->SetProduct().SetWhole().SetGeneral().SetTag().SetStr("b");
4263  seh = scope.AddTopLevelSeqEntry(*entry);
4264 
4265  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "ProteinsHaveGeneralID", "INDEXER_ONLY - Protein bioseqs have general seq-id."));
4266  // AddChromosomeNoLocation(expected_errors, entry);
4267 
4268  eval = validator.Validate(seh, options);
4269  CheckErrors(*eval, expected_errors);
4270 
4271  CLEAR_ERRORS
4272 }
4273 
4274 
// Exercises the N-content diagnostics on a TSA RNA sequence of length 100:
// overall N percentage, an internal N stretch, N stretches near both ends,
// and finally a delta sequence containing a gap. Also checks
// GetTSANStretchErrors, called with both a handle and a Bioseq.
// NOTE(review): listing lines 4278, 4282, 4285 and 4339 are absent from this
// rendering; `entry` and the validation objects used below are presumably
// introduced or rebuilt there -- confirm against the repository copy.
4275 BOOST_AUTO_TEST_CASE(Test_HighNContentPercent_and_HighNContentStretch)
4276 {
4277  // prepare entry
      // Scenario 1: a single run of 11 Ns in the middle of the sequence.
4279  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCNNNNNNNNNNNAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTT");
4280  entry->SetSeq().SetInst().SetLength(100);
4281  SetTech (entry, CMolInfo::eTech_tsa);
4283  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
4284 
4286 
4287  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 11 percent Ns"));
4288  // AddChromosomeNoLocation(expected_errors, entry);
4289  eval = validator.Validate(seh, options);
4290  CheckErrors(*eval, expected_errors);
4291 
      // Scenario 2: the run grows to 16 Ns. The expectation list is not cleared
      // here; entry [0] is updated in place and a stretch warning is appended.
4292  scope.RemoveTopLevelSeqEntry(seh);
4293  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCNNNNNNNNNNNNNNNNTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTT");
4294  seh = scope.AddTopLevelSeqEntry(*entry);
4295  expected_errors[0]->SetErrMsg("Sequence contains 16 percent Ns");
4296  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentStretch", "Sequence has a stretch of 16 Ns"));
4297  eval = validator.Validate(seh, options);
4298  CheckErrors(*eval, expected_errors);
4299 
4300  CLEAR_ERRORS
4301 
      // GetTSANStretchErrors is expected to return only the stretch warning, for
      // both the handle overload and the Bioseq overload.
4302  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentStretch", "Sequence has a stretch of 16 Ns"));
4303  eval = validator.GetTSANStretchErrors(seh);
4304  CheckErrors(*eval, expected_errors);
4305  eval = validator.GetTSANStretchErrors(entry->GetSeq());
4306  CheckErrors(*eval, expected_errors);
4307 
4308  CLEAR_ERRORS
4309 
      // Scenario 3: a run of 10 Ns close to each end of the sequence.
4310  scope.RemoveTopLevelSeqEntry(seh);
4311  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AANNNNNNNNNNGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGTTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCNNNNNNNNNNAAA");
4312  seh = scope.AddTopLevelSeqEntry(*entry);
4313  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4314  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4315  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4316  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4317  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent",
4318  "Sequence contains 20 percent Ns"));
4319  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent5Prime",
4320  "Sequence has a stretch of at least 10 Ns within the first 20 bases"));
4321  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent3Prime",
4322  "Sequence has a stretch of at least 10 Ns within the last 20 bases"));
4323  // AddChromosomeNoLocation(expected_errors, entry);
4324  eval = validator.Validate(seh, options);
4325  CheckErrors(*eval, expected_errors);
4326 
4327  CLEAR_ERRORS
4328 
      // For the same data, GetTSANStretchErrors is expected to return only the two
      // terminal-stretch warnings.
4329  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent5Prime", "Sequence has a stretch of at least 10 Ns within the first 20 bases"));
4330  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent3Prime", "Sequence has a stretch of at least 10 Ns within the last 20 bases"));
4331  eval = validator.GetTSANStretchErrors(seh);
4332  CheckErrors(*eval, expected_errors);
4333  eval = validator.GetTSANStretchErrors(entry->GetSeq());
4334  CheckErrors(*eval, expected_errors);
4335 
4336  CLEAR_ERRORS
4337 
      // Scenario 4: append a 10-base gap segment and a 10-base literal to the delta
      // extension, growing the declared length by 20 to match.
4338  scope.RemoveTopLevelSeqEntry(seh);
4340  CRef<CDelta_seq> gap_seg(new CDelta_seq());
4341  gap_seg->SetLiteral().SetSeq_data().SetGap();
4342  gap_seg->SetLiteral().SetLength(10);
4343  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
4344  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGA", CSeq_inst::eMol_dna);
4345  entry->SetSeq().SetInst().SetLength(entry->GetSeq().GetInst().GetLength() + 20);
4346  seh = scope.AddTopLevelSeqEntry(*entry);
4347 
      // The 5'/3' N-percentage expectations are disabled, so no expectations are
      // added before this final validation.
4348  /*
4349  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
4350  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4351  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
4352  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4353  */
4354  // AddChromosomeNoLocation(expected_errors, entry);
4355 
4356  eval = validator.Validate(seh, options);
4357  CheckErrors(*eval, expected_errors);
4358 
4359  CLEAR_ERRORS
4360 }
4361 
4362 
// Verifies that a zero-length Seq-literal inside a delta chain is reported as
// a SeqLitDataLength0 error.
// NOTE(review): listing lines 4366 and 4375 are absent from this rendering;
// `entry` and the validation objects used below are presumably introduced
// there -- confirm against the repository copy.
4363 BOOST_AUTO_TEST_CASE(Test_SeqLitDataLength0)
4364 {
4365  // prepare entry
4367 
      // Advance to the second delta segment and give it empty IUPAC nucleotide
      // data with a declared length of 0.
4368  CDelta_ext::Tdata::iterator seg_it = entry->SetSeq().SetInst().SetExt().SetDelta().Set().begin();
4369  ++seg_it;
4370  (*seg_it)->SetLiteral().SetSeq_data().SetIupacna().Set();
4371  (*seg_it)->SetLiteral().SetLength(0);
4372 
      // Set the overall declared length to 24.
4373  entry->SetSeq().SetInst().SetLength(24);
4374 
4376 
4377  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLitDataLength0", "Seq-lit of length 0 in delta chain"));
4378  // AddChromosomeNoLocation(expected_errors, entry);
4379  eval = validator.Validate(seh, options);
4380  CheckErrors(*eval, expected_errors);
4381 
4382  CLEAR_ERRORS
4383 }
4384 
4385 
// Helper body: converts `entry` into a delta sequence made of a 12-base
// literal, a 101-base segment flagged with unknown-length fuzz, and another
// 12-base literal, then returns the entry.
// NOTE(review): the signature line (listing line 4386) and listing line 4388
// are absent from this rendering; the function name, return type and the
// construction of `entry` are not visible here -- confirm against the
// repository copy.
4387 {
4389 
      // Drop the raw sequence data and switch the representation to delta.
4390  entry->SetSeq().SetInst().ResetSeq_data();
4391  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
4392  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("ATGATGATGCCC", CSeq_inst::eMol_dna);
      // Middle segment: a literal with no sequence data, length 101, and the
      // "unknown" limit fuzz.
4393  CRef<CDelta_seq> gap_seg(new CDelta_seq());
4394  gap_seg->SetLiteral().SetLength(101);
4395  gap_seg->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
4396  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
4397  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGATG", CSeq_inst::eMol_dna);
      // 125 = 12 + 101 + 12.
4398  entry->SetSeq().SetInst().SetLength(125);
4399 
4400  return entry;
4401 }
4402 
4403 
// Expects an UnknownLengthGapNot100 warning from validation.
// NOTE(review): listing lines 4406 and 4408 are absent from this rendering;
// the construction of the validated entry and the validation objects used
// below are presumably on those lines -- confirm against the repository copy.
4404 BOOST_AUTO_TEST_CASE(Test_UnknownLengthGapNot100)
4405 {
4407 
4409 
4410  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnknownLengthGapNot100", "Gap of unknown length should have length 100"));
      // The 5'/3' N-percentage expectations below are disabled.
4411  /*
4412  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
4413  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4414  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
4415  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4416  */
4417  // AddChromosomeNoLocation(expected_errors, entry);
4418  eval = validator.Validate(seh, options);
4419  CheckErrors(*eval, expected_errors);
4420 
4421  CLEAR_ERRORS
4422 }
4423 
4424 
// Test body checking the mRNAshouldBeSingleStranded error: the ds, mixed and
// other strand values are each expected to produce it, while not_set, a
// reset strand, and ss are expected to produce no errors.
// NOTE(review): the test-case header (listing line 4425) and listing lines
// 4428, 4430 and 4433 are absent from this rendering; `entry`, any biomol
// setup and the validation objects used below are presumably on those lines
// -- confirm against the repository copy.
4426 {
4427  // prepare entry
4429  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
4431  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ds);
4432 
4434 
4435  // double strand
      // The same single expectation is reused for the three strand values below;
      // the list is not cleared between validations.
4436  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "mRNAshouldBeSingleStranded", "mRNA should be single stranded not double stranded"));
4437  // AddChromosomeNoLocation(expected_errors, entry);
4438  eval = validator.Validate(seh, options);
4439  CheckErrors(*eval, expected_errors);
4440 
4441  // mixed strand
4442  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_mixed);
4443  eval = validator.Validate(seh, options);
4444  CheckErrors(*eval, expected_errors);
4445 
4446  // other strand
4447  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_other);
4448  eval = validator.Validate(seh, options);
4449  CheckErrors(*eval, expected_errors);
4450 
4451  CLEAR_ERRORS
4452 
4453  // these should not produce errors
4454 
4455  // strand not set
4456  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_not_set);
4457  eval = validator.Validate(seh, options);
4458  // AddChromosomeNoLocation(expected_errors, entry);
4459 
4460  CheckErrors(*eval, expected_errors);
4461 
      // strand field reset entirely
4462  entry->SetSeq().SetInst().ResetStrand();
4463  eval = validator.Validate(seh, options);
4464  CheckErrors(*eval, expected_errors);
4465 
4466  // single strand
4467  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ss);
4468  eval = validator.Validate(seh, options);
4469  CheckErrors(*eval, expected_errors);
4470 
4471  CLEAR_ERRORS
4472 }
4473 
4474 
// Expects a BioSourceMissing warning on lcl|nuc and a fatal NoOrgFound on
// lcl|prot when a source is added only to the first member of the set.
// NOTE(review): listing lines 4478, 4479 and 4482 are absent from this
// rendering; `entry`, any removal of the set-level source and the validation
// objects used below are presumably on those lines -- confirm against the
// repository copy.
4475 BOOST_AUTO_TEST_CASE(Test_BioSourceMissing)
4476 {
4477  // prepare entry
      // Attach a source to the first member of the set only.
4480  unit_test_util::AddGoodSource (entry->SetSet().SetSeq_set().front());
4481 
4483 
4484  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BioSourceMissing", "Nuc-prot set does not contain expected BioSource descriptor"));
4485  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Fatal, "NoOrgFound", "No organism name included in the source. Other qualifiers may exist."));
4486  // AddChromosomeNoLocation(expected_errors, entry);
4487 
4488  eval = validator.Validate(seh, options);
4489  CheckErrors(*eval, expected_errors);
4490 
4491  CLEAR_ERRORS
4492 }
4493 
4494 
4495 BOOST_AUTO_TEST_CASE(Test_Descr_InvalidForType)
4496 {
4497  // prepare entry
4499  CRef<CSeqdesc> desc;
4500  desc.Reset(new CSeqdesc());
4502  entry->SetDescr().Set().push_back(desc);
4503  desc.Reset(new CSeqdesc());
4504  desc->SetModif().push_back(eGIBB_mod_dna);
4505  entry->SetDescr().Set().push_back(desc);
4506  desc.Reset(new CSeqdesc());
4508  entry->SetDescr().Set().push_back(desc);
4509  desc.Reset(new CSeqdesc());
4510  desc->SetOrg().SetTaxname("Sebaea microphylla");
4511  entry->SetDescr().Set().push_back(desc);
4512  AddTpaAssemblyUserObject (entry);
4513 
4515 
4516  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide",
4517  "Nucleic acid with protein sequence method"));
4518  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4519  "MolType descriptor is obsolete"));
4520  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4521  "Modif descriptor is obsolete"));
4522  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4523  "Method descriptor is obsolete"));
4524  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4525  "OrgRef descriptor is obsolete"));
4526  // AddChromosomeNoLocation(expected_errors, entry);
4527 
4528  // won't complain about TPA assembly if only local ID
4529  eval = validator.Validate(seh, options);
4530  CheckErrors(*eval, expected_errors);
4531 
4532  CLEAR_ERRORS
4533 
4534  scope.RemoveTopLevelSeqEntry(seh);
4535  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
4540  seh = scope.AddTopLevelSeqEntry(*entry);
4541  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TPAassemblyWithoutTPAKeyword",
4542  "Non-TPA record gb|AY123456| should not have TpaAssembly object"));
4543  // AddChromosomeNoLocation(expected_errors, entry);
4544  SetErrorsAccessions(expected_errors, "gb|AY123456|");
4545  eval = validator.Validate(seh, options);
4546  CheckErrors(*eval, expected_errors);
4547 
4548  scope.RemoveTopLevelSeqEntry(seh);
4549  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
4550  seh = scope.AddTopLevelSeqEntry(*entry);
4551  SetErrorsAccessions(expected_errors, "ref|NC_123456|");
4552  expected_errors[0]->SetErrMsg("Non-TPA record ref|NC_123456| should not have TpaAssembly object");
4553  eval = validator.Validate(seh, options);
4554  CheckErrors(*eval, expected_errors);
4555 
4556  desc.Reset(new CSeqdesc());
4558  entry->SetDescr().Set().push_back(desc);
4559  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "InvalidForTypeGIBB",
4560  "Nucleic acid with GIBB-mol = peptide"));
4561  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "InvalidForType",
4562  "MolType descriptor is obsolete"));
4563  eval = validator.Validate(seh, options);
4564  CheckErrors(*eval, expected_errors);
4565 
4567  expected_errors[1]->SetErrMsg("GIBB-mol unknown or other used");
4568  eval = validator.Validate(seh, options);
4569  CheckErrors(*eval, expected_errors);
4570 
4572  eval = validator.Validate(seh, options);
4573  CheckErrors(*eval, expected_errors);
4574 
4575  CLEAR_ERRORS
4576 
4577  scope.RemoveTopLevelSeqEntry(seh);
4579  desc.Reset(new CSeqdesc());
4581  entry->SetDescr().Set().push_back(desc);
4582  seh = scope.AddTopLevelSeqEntry(*entry);
4583  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4584  "GIBB-mol [1] used on protein"));
4585  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4586  "MolType descriptor is obsolete"));
4587  // AddChromosomeNoLocation(expected_errors, entry);
4588  eval = validator.Validate(seh, options);
4589  CheckErrors(*eval, expected_errors);
4590 
4592  expected_errors[0]->SetErrMsg("GIBB-mol [2] used on protein");
4593  eval = validator.Validate(seh, options);
4594  CheckErrors(*eval, expected_errors);
4595 
4596  desc->SetMol_type(eGIBB_mol_mRNA);
4597  expected_errors[0]->SetErrMsg("GIBB-mol [3] used on protein");
4598  eval = validator.Validate(seh, options);
4599  CheckErrors(*eval, expected_errors);
4600 
4601  desc->SetMol_type(eGIBB_mol_rRNA);
4602  expected_errors[0]->SetErrMsg("GIBB-mol [4] used on protein");
4603  eval = validator.Validate(seh, options);
4604  CheckErrors(*eval, expected_errors);
4605 
4606  desc->SetMol_type(eGIBB_mol_tRNA);
4607  expected_errors[0]->SetErrMsg("GIBB-mol [5] used on protein");
4608  eval = validator.Validate(seh, options);
4609  CheckErrors(*eval, expected_errors);
4610 
4612  expected_errors[0]->SetErrMsg("GIBB-mol [6] used on protein");
4613  eval = validator.Validate(seh, options);
4614  CheckErrors(*eval, expected_errors);
4615 
4617  expected_errors[0]->SetErrMsg("GIBB-mol [7] used on protein");
4618  eval = validator.Validate(seh, options);
4619  CheckErrors(*eval, expected_errors);
4620 
4622  expected_errors[0]->SetErrMsg("GIBB-mol [9] used on protein");
4623  eval = validator.Validate(seh, options);
4624  CheckErrors(*eval, expected_errors);
4625 
4627  expected_errors[0]->SetErrMsg("GIBB-mol [10] used on protein");
4628  eval = validator.Validate(seh, options);
4629  CheckErrors(*eval, expected_errors);
4630 
4631  CLEAR_ERRORS
4632 
4633  // invalid modif
4634  desc->SetModif().push_back(eGIBB_mod_dna);
4635  desc->SetModif().push_back(eGIBB_mod_rna);
4636  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4637  "Nucleic acid GIBB-mod [0] on protein"));
4638  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4639  "Nucleic acid GIBB-mod [1] on protein"));
4640  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4641  "Modif descriptor is obsolete"));
4642  // AddChromosomeNoLocation(expected_errors, entry);
4643  eval = validator.Validate(seh, options);
4644  CheckErrors(*eval, expected_errors);
4645 
4646  CLEAR_ERRORS
4647 
4648  scope.RemoveTopLevelSeqEntry(seh);
4649  entry = unit_test_util::BuildGoodSeq();
4650  for (auto& it : entry->SetSeq().SetDescr().Set()) {
4651  if (it->IsSource()) {
4652  it->SetSource().SetOrigin(CBioSource::eOrigin_synthetic);
4653  }
4654  }
4655  seh = scope.AddTopLevelSeqEntry(*entry);
4656  // if biomol not other, should generate error
4657  expected_errors.push_back(new CExpectedError ("lcl|good", eDiag_Warning, "InvalidForType",
4658  "Molinfo-biomol other should be used if Biosource-location is synthetic"));
4659  // AddChromosomeNoLocation(expected_errors, entry);
4660  eval = validator.Validate(seh, options);
4661  CheckErrors(*eval, expected_errors);
4662 
4663  CLEAR_ERRORS
4664 
4665  for (auto& it : entry->SetSeq().SetDescr().Set()) {
4666  if (it->IsSource()) {
4667  it->SetSource().ResetOrigin();
4668  }
4669  }
4670 
4672  expected_errors.push_back(new CExpectedError ("lcl|good", eDiag_Error, "InvalidMolInfo",
4673  "Nucleic acid with Molinfo = peptide"));
4674  // AddChromosomeNoLocation(expected_errors, entry);
4675  eval = validator.Validate(seh, options);
4676  CheckErrors(*eval, expected_errors);
4677  CLEAR_ERRORS
4678 
4680  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
4681  "MoltypeOtherGenetic", "Molinfo-biomol = other genetic"));
4682  // AddChromosomeNoLocation(expected_errors, entry);
4683  eval = validator.Validate(seh, options);
4684  CheckErrors(*eval, expected_errors);
4685  CLEAR_ERRORS
4686 
4688  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
4689  "MoltypeUnknown", "Molinfo-biomol unknown used"));
4690  // AddChromosomeNoLocation(expected_errors, entry);
4691  eval = validator.Validate(seh, options);
4692  CheckErrors(*eval, expected_errors);
4693  CLEAR_ERRORS
4694 
4696  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
4697  "MoltypeOther", "Molinfo-biomol other used"));
4698  // AddChromosomeNoLocation(expected_errors, entry);
4699  eval = validator.Validate(seh, options);
4700  CheckErrors(*eval, expected_errors);
4701  CLEAR_ERRORS
4702 
4703  scope.RemoveTopLevelSeqEntry(seh);
4705  seh = scope.AddTopLevelSeqEntry(*entry);
4706 
4707  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
4708  "InvalidForType", "Molinfo-biomol [1] used on protein"));
4709  // AddChromosomeNoLocation(expected_errors, entry);
4711  expected_errors[0]->SetErrMsg("Molinfo-biomol [1] used on protein");
4712  eval = validator.Validate(seh, options);
4713  CheckErrors(*eval, expected_errors);
4714 
4716  expected_errors[0]->SetErrMsg("Molinfo-biomol [2] used on protein");
4717  eval = validator.Validate(seh, options);
4718  CheckErrors(*eval, expected_errors);
4719 
4721  expected_errors[0]->SetErrMsg("Molinfo-biomol [3] used on protein");
4722  eval = validator.Validate(seh, options);
4723  CheckErrors(*eval, expected_errors);
4724 
4726  expected_errors[0]->SetErrMsg("Molinfo-biomol [4] used on protein");
4727  eval = validator.Validate(seh, options);
4728  CheckErrors(*eval, expected_errors);
4729 
4731  expected_errors[0]->SetErrMsg("Molinfo-biomol [5] used on protein");
4732  eval = validator.Validate(seh, options);
4733  CheckErrors(*eval, expected_errors);
4734 
4736  expected_errors[0]->SetErrMsg("Molinfo-biomol [6] used on protein");
4737  eval = validator.Validate(seh, options);
4738  CheckErrors(*eval, expected_errors);
4739 
4741  expected_errors[0]->SetErrMsg("Molinfo-biomol [7] used on protein");
4742  eval = validator.Validate(seh, options);
4743  CheckErrors(*eval, expected_errors);
4744 
4746  expected_errors[0]->SetErrMsg("Molinfo-biomol [10] used on protein");
4747  eval = validator.Validate(seh, options);
4748  CheckErrors(*eval, expected_errors);
4749 
4751  expected_errors[0]->SetErrMsg("Molinfo-biomol [11] used on protein");
4752  eval = validator.Validate(seh, options);
4753  CheckErrors(*eval, expected_errors);
4754 
4756  expected_errors[0]->SetErrMsg("Molinfo-biomol [12] used on protein");
4757  eval = validator.Validate(seh, options);
4758  CheckErrors(*eval, expected_errors);
4759 
4761  expected_errors[0]->SetErrMsg("Molinfo-biomol [13] used on protein");
4762  eval = validator.Validate(seh, options);
4763  CheckErrors(*eval, expected_errors);
4764 
4766  expected_errors[0]->SetErrMsg("Molinfo-biomol [14] used on protein");
4767  eval = validator.Validate(seh, options);
4768  CheckErrors(*eval, expected_errors);
4769 
4771  expected_errors[0]->SetErrMsg("Molinfo-biomol [15] used on protein");
4772  eval = validator.Validate(seh, options);
4773  CheckErrors(*eval, expected_errors);
4774 
4775  CLEAR_ERRORS
4776 
4777  scope.RemoveTopLevelSeqEntry(seh);
4778  entry = unit_test_util::BuildGoodSeq();
4779  seh = scope.AddTopLevelSeqEntry(*entry);
4781  expected_errors.push_back(new CExpectedError ("lcl|good", eDiag_Warning, "SyntheticConstructWrongMolType",
4782  "synthetic construct should have other-genetic"));
4783  expected_errors.push_back(new CExpectedError ("lcl|good", eDiag_Warning, "SyntheticConstructNeedsArtificial",
4784  "synthetic construct should have artificial origin"));
4785  // AddChromosomeNoLocation(expected_errors, entry);
4786  eval = validator.Validate(seh, options);
4787  CheckErrors(*eval, expected_errors);
4788 
4789  CLEAR_ERRORS
4790 
4792 
4793  SetTech(entry,