NCBI C++ ToolKit
unit_test_validator.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: unit_test_validator.cpp 102767 2024-07-09 17:10:10Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Colleen Bollin, NCBI
27  *
28  * File Description:
29  * Unit tests for the validator.
30  *
31  * ===========================================================================
32  */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include "unit_test_validator.hpp"
37 
38 #include <corelib/ncbi_system.hpp>
39 
40 // This macro should be defined before including test_boost.hpp in every
41 // "*.cpp" file of an executable except one. This mirrors how main() in a
42 // non-Boost.Test executable is defined in only one *.cpp file; other files
43 // must not define it. If NCBI_BOOST_NO_AUTO_TEST_MAIN is not defined,
44 // test_boost.hpp will define a "main()" function for the tests.
45 //
46 // Usually if your unit tests contain only one *.cpp file you should not
47 // care about this macro at all.
48 //
49 //#define NCBI_BOOST_NO_AUTO_TEST_MAIN
50 
51 #define BAD_VALIDATOR
52 
53 // This header must be included before all Boost.Test headers if there are any
54 #include <corelib/test_boost.hpp>
55 
56 // for ignoring external config files
57 #include <util/util_misc.hpp>
58 
60 #include <objects/biblio/Title.hpp>
66 #include <objects/pub/Pub.hpp>
68 #include <objects/seq/GIBB_mol.hpp>
69 #include <objects/seq/Seq_ext.hpp>
73 #include <objects/seq/Ref_ext.hpp>
74 #include <objects/seq/Map_ext.hpp>
75 #include <objects/seq/Seg_ext.hpp>
76 #include <objects/seq/Seq_gap.hpp>
77 #include <objects/seq/Seq_data.hpp>
79 #include <objects/seq/Seqdesc.hpp>
80 #include <objects/seq/MolInfo.hpp>
81 #include <objects/seq/Pubdesc.hpp>
82 #include <objects/seq/Seq_hist.hpp>
100 #include <objmgr/object_manager.hpp>
101 #include <objmgr/scope.hpp>
102 #include <objmgr/bioseq_ci.hpp>
103 #include <objmgr/feat_ci.hpp>
104 #include <objmgr/seq_vector.hpp>
105 #include <objmgr/util/sequence.hpp>
106 #include <objmgr/seqdesc_ci.hpp>
107 #include <objmgr/util/sequence.hpp>
115 #include <corelib/ncbiapp.hpp>
116 #include <common/ncbi_export.h>
120 #include <objtools/edit/cds_fix.hpp>
122 
123 // for writing out tmp files
124 #include <serial/objostrasn.hpp>
125 #include <serial/objostrasnb.hpp>
126 
128 
131 
132 using namespace validator;
133 using namespace unit_test_util;
134 
135 
136 CExpectedError::CExpectedError(string accession, EDiagSev severity, string err_code, string err_msg)
137  : m_Accession(accession), m_Severity(severity), m_ErrCode(err_code), m_ErrMsg(err_msg)
138 {
139 }
140 
142 {
143 }
144 
145 
146 bool CExpectedError::Match(const CValidErrItem& err_item, bool ignore_severity)
147 {
148  if (!NStr::IsBlank(m_Accession) && !NStr::IsBlank(err_item.GetAccnver())
149  && !NStr::Equal(err_item.GetAccnver(), m_Accession)) {
150  return false;
151  }
152  if (!NStr::Equal(err_item.GetErrCode(), m_ErrCode)) {
153  return false;
154  }
155  string msg = err_item.GetMsg();
156  size_t pos = NStr::Find(msg, " EXCEPTION: NCBI C++ Exception:");
157  if (pos != string::npos) {
158  msg = msg.substr(0, pos);
159  }
160 
161  if (!NStr::Equal(msg, m_ErrMsg)) {
162  return false;
163  }
164  if (!ignore_severity && m_Severity != err_item.GetSeverity()) {
165  return false;
166  }
167  return true;
168 }
169 
170 
171 void CExpectedError::Test(const CValidErrItem& err_item)
172 {
173  if (!NStr::IsBlank(m_Accession) && !NStr::IsBlank(err_item.GetAccnver())) {
174  BOOST_CHECK_EQUAL(err_item.GetAccnver(), m_Accession);
175  }
176  BOOST_CHECK_EQUAL(err_item.GetSeverity(), m_Severity);
177  BOOST_CHECK_EQUAL(err_item.GetErrCode(), m_ErrCode);
178  string msg = err_item.GetMsg();
179  size_t pos = NStr::Find(msg, " EXCEPTION: NCBI C++ Exception:");
180  if (pos != string::npos) {
181  msg = msg.substr(0, pos);
182  }
183  BOOST_CHECK_EQUAL(msg, m_ErrMsg);
184 }
185 
186 
188 {
189  string description = err_item.GetAccnver() + ":"
190  + CValidErrItem::ConvertSeverity(err_item.GetSeverity()) + ":"
191  + err_item.GetErrCode() + ":"
192  + err_item.GetMsg();
193  printf("%s\n", description.c_str());
194 }
195 
196 
198 {
199  string description = m_Accession + ":"
201  + m_ErrCode + ":"
202  + m_ErrMsg;
203  printf("%s\n", description.c_str());
204 }
205 
206 
207 static bool s_debugMode = false;
208 
209 void WriteErrors(const CValidError& eval, bool debug_mode)
210 {
211  if (debug_mode) {
212  printf("\n-\n");
213  }
214  for (CValidError_CI vit(eval); vit; ++vit) {
216  }
217  if (debug_mode) {
218  printf("\n\n");
219  }
220  printf("\n\n");
221 }
222 
223 
224 void CheckErrors(const CValidError& eval,
225  vector<CExpectedError*>& expected_errors)
226 {
227  //static int count(1);
228  //if (count == 1367) {
229  // cerr << "";
230  //}
231  //cerr << count++ << "\n";
232 
233  bool problem_found = false;
234 
235  if (s_debugMode) {
236  WriteErrors(eval, true);
237  return;
238  }
239 
240  vector<bool> expected_found;
241  for (size_t i = 0; i < expected_errors.size(); i++) {
242  if (expected_errors[i]) {
243  expected_found.push_back(false);
244  } else {
245  expected_found.push_back(true);
246  }
247  }
248 
249  for (CValidError_CI vit(eval); vit; ++vit) {
250  bool found = false;
251  for (size_t i = 0; i < expected_errors.size(); i++) {
252  if (!expected_found[i] && expected_errors[i]->Match(*vit)) {
253  expected_found[i] = true;
254  found = true;
255  break;
256  }
257  }
258  if (!found) {
259  for (size_t i = 0; i < expected_errors.size(); i++) {
260  if (!expected_found[i] && expected_errors[i]->Match(*vit, true)) {
261  printf("Problem with ");
263  expected_errors[i]->Test(*vit);
264  expected_found[i] = true;
265  found = true;
266  problem_found = true;
267  break;
268  }
269  }
270  }
271  if (!found) {
272  BOOST_CHECK_EQUAL("Unexpected error", "Error not found");
274  problem_found = true;
275  }
276  }
277 
278  for (size_t i = 0; i < expected_errors.size(); i++) {
279  if (!expected_found[i]) {
280  BOOST_CHECK_EQUAL(expected_errors[i]->GetErrMsg(), "Expected error not found");
281  problem_found = true;
282  }
283  }
284 
285  if (problem_found) {
286  WriteErrors(eval, false);
287 
288  printf("Expected:\n");
289  for (auto it : expected_errors) {
290  if (it) {
291  it->Print();
292  }
293  }
294  }
295 }
296 
297 
298 void CheckStrings(const vector<string>& seen, const vector<string>& expected)
299 {
300  auto it1 = seen.begin();
301  auto it2 = expected.begin();
302  bool any = false;
303  while (it1 != seen.end() && it2 != expected.end()) {
304  BOOST_CHECK_EQUAL(*it1, *it2);
305  if (!NStr::Equal(*it1, *it2)) {
306  any = true;
307  }
308  it1++;
309  it2++;
310  }
311  while (it1 != seen.end()) {
312  BOOST_CHECK_EQUAL(*it1, "Unexpected string");
313  it1++;
314  any = true;
315  }
316  while (it2 != expected.end()) {
317  BOOST_CHECK_EQUAL("Missing string", *it2);
318  it2++;
319  any = true;
320  }
321 
322  if (any) {
323  printf("Seen:\n");
324  auto it1 = seen.begin();
325  while (it1 != seen.end()) {
326  printf("%s\n", (*it1).c_str());
327  it1++;
328  }
329  printf("Expected:\n");
330  auto it2 = expected.begin();
331  while (it2 != expected.end()) {
332  printf("%s\n", (*it2).c_str());
333  it2++;
334  }
335  }
336 }
337 
338 
339 // Not currently used, but I'll leave it here in case
340 // it's useful in the future.
341 
342 #if 0
343 static void SetCountryOnSrc(CBioSource& src, string country)
344 {
345  if (NStr::IsBlank(country)) {
346  if (src.IsSetSubtype()) {
347  auto& cont = src.SetSubtype();
348  cont.remove_if([](CSubSource* it) {
349  return (it->IsSetSubtype() && it->GetSubtype() == CSubSource::eSubtype_country);
350  });
351  }
352  } else {
354  src.SetSubtype().push_back(sub);
355  }
356 }
357 #endif
358 
360 static string ToAsn1(const CRef<CSeq_entry>& entry)
361 {
362  CNcbiOstrstream os;
363  os << MSerial_AsnText << entry;
364  return os.str();
365 }
366 
369 
372 
374 {
375  if (!CNcbiApplication::Instance()->GetConfig().HasEntry("NCBI", "Data")) {
376  NCBITEST_DISABLE(Test_Descr_BadStructuredCommentFormat);
377  NCBITEST_DISABLE(Test_Descr_MissingKeyword);
378  }
379 }
380 
381 
382 static void SetErrorsAccessions(vector<CExpectedError*>& expected_errors, string accession)
383 {
384  size_t i, len = expected_errors.size();
385  for (i = 0; i < len; i++) {
386  expected_errors[i]->SetAccession(accession);
387  }
388 }
389 
391 {
392  // Here we make descriptions of command line parameters that we are
393  // going to use.
394 
395  arg_desc->AddFlag(
396  "debug_mode", "Debugging mode writes errors seen for each test");
397 }
398 
400 {
401  // initialization function body
402 
403  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
404  if (args["debug_mode"]) {
405  s_debugMode = true;
406  }
407  g_IgnoreDataFile("institution_codes.txt");
408 }
409 
410 void AddChromosomeNoLocation(vector<CExpectedError*>& expected_errors, const string& id)
411 {
412  expected_errors.push_back(new CExpectedError(id, eDiag_Error,
413  "ChromosomeWithoutLocation",
414  "INDEXER_ONLY - source contains chromosome value '1' but the BioSource location is not set to chromosome"));
415 }
416 
417 void AddChromosomeNoLocation(vector<CExpectedError*>& expected_errors, CRef<CSeq_entry> entry)
418 {
419  if (entry->IsSeq()) {
420  CConstRef<CSeq_id> seqid = sequence::GetId(entry->GetSeq(), sequence::eGetId_Best).GetSeqId();
421  AddChromosomeNoLocation(expected_errors, seqid->AsFastaString());
422  } else if (entry->IsSet()) {
423  if (entry->GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
425  AddChromosomeNoLocation(expected_errors, nuc_entry);
426  } else {
427  for (auto it : entry->SetSet().SetSeq_set()) {
428  AddChromosomeNoLocation(expected_errors, it);
429  }
430  }
431  }
432 }
433 
434 
435 // new case test ground
436 
437 BOOST_AUTO_TEST_CASE(Test_Descr_MissingKeyword)
438 {
439  // prepare entry
441  CRef<CSeqdesc> sdesc(new CSeqdesc());
442  sdesc->SetUser().SetType().SetStr("StructuredComment");
443  entry->SetSeq().SetDescr().Set().push_back(sdesc);
444 
445  sdesc->SetUser().AddField("StructuredCommentPrefix", "##MIGS-Data-START##", CUser_object::eParse_String);
446  sdesc->SetUser().AddField("alt_elev", "foo", CUser_object::eParse_String);
447  sdesc->SetUser().AddField("assembly", "foo", CUser_object::eParse_String);
448  sdesc->SetUser().AddField("collection_date", "foo", CUser_object::eParse_String);
449  sdesc->SetUser().AddField("country", "foo", CUser_object::eParse_String);
450  sdesc->SetUser().AddField("depth", "foo", CUser_object::eParse_String);
451  sdesc->SetUser().AddField("environment", "foo", CUser_object::eParse_String);
452  sdesc->SetUser().AddField("investigation_type", "eukaryote", CUser_object::eParse_String);
453  sdesc->SetUser().AddField("isol_growth_condt", "foo", CUser_object::eParse_String);
454  sdesc->SetUser().AddField("sequencing_meth", "foo", CUser_object::eParse_String);
455  sdesc->SetUser().AddField("project_name", "foo", CUser_object::eParse_String);
456  sdesc->SetUser().AddField("ploidy", "foo", CUser_object::eParse_String);
457  sdesc->SetUser().AddField("num_replicons", "foo", CUser_object::eParse_String);
458  sdesc->SetUser().AddField("estimated_size", "foo", CUser_object::eParse_String);
459  sdesc->SetUser().AddField("trophic_level", "foo", CUser_object::eParse_String);
460  sdesc->SetUser().AddField("propagation", "foo", CUser_object::eParse_String);
461  sdesc->SetUser().AddField("lat_lon", "foo", CUser_object::eParse_String);
462 
463  CRef<CSeqdesc> gdesc(new CSeqdesc());
464  gdesc->SetGenbank().SetKeywords().push_back("GSC:MIGS:2.1");
465  entry->SetSeq().SetDescr().Set().push_back(gdesc);
466 
468 
469  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadKeywordForStrucComm",
470  "Structured Comment is non-compliant, keyword should be removed"));
471  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommMissingField",
472  "Required field finishing_strategy is missing when investigation_type has value 'eukaryote'"));
473  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue",
474  "Structured Comment invalid; the field value and/or name are incorrect"));
475  // AddChromosomeNoLocation(expected_errors, entry);
476  eval = validator.Validate(seh, options);
477  CheckErrors(*eval, expected_errors);
478 
479  // if no keyword, no badkeyword error
480  entry->SetSeq().SetDescr().Set().pop_back();
481  delete expected_errors[0];
482  expected_errors[0] = nullptr;
483  eval = validator.Validate(seh, options);
484  CheckErrors(*eval, expected_errors);
485 
487 
488  // make the comment valid, should complain about missing keyword
489  sdesc->SetUser().AddField("finishing_strategy", "foo", CUser_object::eParse_String);
490  // AddChromosomeNoLocation(expected_errors, entry);
491  eval = validator.Validate(seh, options);
492  CheckErrors(*eval, expected_errors);
493 
495  // put keyword back, should have no errors
496  entry->SetSeq().SetDescr().Set().push_back(gdesc);
497  // AddChromosomeNoLocation(expected_errors, entry);
498  eval = validator.Validate(seh, options);
499  CheckErrors(*eval, expected_errors);
501 }
502 
503 
504 BOOST_AUTO_TEST_CASE(Test_Descr_LatLonValue)
505 {
506  // prepare entry
510 
512 
513  /*
514  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
515  "Latitude should be set to N (northern hemisphere)"));
516  eval = validator.Validate(seh, options);
517  CheckErrors(*eval, expected_errors);
518  */
519 
522  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
523  "Longitude should be set to W (western hemisphere)"));
524  eval = validator.Validate(seh, options);
525  CheckErrors(*eval, expected_errors);
526 
531  expected_errors[0]->SetErrMsg("Latitude should be set to S (southern hemisphere)");
532  eval = validator.Validate(seh, options);
533  CheckErrors(*eval, expected_errors);
534 
535  /*
536  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "");
537  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "25 S 47 W");
538  expected_errors[0]->SetErrMsg("Longitude should be set to E (eastern hemisphere)");
539  eval = validator.Validate(seh, options);
540  CheckErrors(*eval, expected_errors);
541  */
542 
544 
549  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
550  "Latitude and longitude values appear to be exchanged"));
551  eval = validator.Validate(seh, options);
552  CheckErrors(*eval, expected_errors);
553 
555 }
556 
557 
558 void TestOneLatLonCountry(const string& country, const string& lat_lon, const string& error, bool use_state = false, const string& err_code = "LatLonCountry")
559 {
560  // prepare entry
564 
566 
567  if (use_state) {
569  }
570 
571  string err_cd = err_code;
572  bool use_geo_loc_name = CSubSource::NCBI_UseGeoLocNameForCountry();
573  if (use_geo_loc_name && err_code == "LatLonCountry") {
574  err_cd = "LatLonGeoLocName";
575  }
576 
577  if (!error.empty()) {
578  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, err_cd, error));
579  }
580  eval = validator.Validate(seh, options);
581  CheckErrors(*eval, expected_errors);
582 
583  if (!error.empty()) {
584  CValidErrorFormat format(*objmgr);
585  vector<string> expected;
586  if (use_geo_loc_name) {
587  expected.push_back("LatLonGeoLocName Errors");
588  } else {
589  expected.push_back("LatLonCountry Errors");
590  }
591  expected.push_back("lcl|good:" + error);
592  expected.push_back("");
593 
594  vector<string> seen;
595  vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
596  for (const string& it : cat_list) {
597  vector<string> sublist;
598  NStr::Split(it, "\n", sublist);
599  for (const string& sit : sublist) {
600  seen.push_back(sit);
601  }
602  }
603 
604  CheckStrings(seen, expected);
605  }
606 
608 }
609 
610 
612 {
613  TestOneLatLonCountry("Portugal", "37.7715 N 25.3097 W", "", true);
614 }
615 
616 
617 BOOST_AUTO_TEST_CASE(Test_Descr_LatLonCountry)
618 {
619  TestOneLatLonCountry("Romania", "46.5 N 20 E",
620  "Lat_lon '46.5 N 20 E' maps to 'Hungary' instead of 'Romania' - claimed region 'Romania' is at distance 45 km");
621  TestOneLatLonCountry("Romania", "34 N 65 E", "Lat_lon '34 N 65 E' maps to 'Afghanistan' instead of 'Romania'");
622  TestOneLatLonCountry("Romania", "48 N 15 E", "Lat_lon '48 N 15 E' maps to 'Austria' instead of 'Romania'");
623  TestOneLatLonCountry("Romania", "48 N 15 W", "Lat_lon '48 N 15 W' is in water 'Atlantic Ocean'", false, "LatLonWater");
624  // RW-1137 this had inconsistent behavior in production vs. development tests, possibly due to version skew in
625  // Puerto Rico cleanup code, so commenting out to avoid spurious error reports
626  /*
627  TestOneLatLonCountry("Puerto Rico: Rio Mameyes in Luquillo", "18.47 N 64.23000000000002 W",
628  "Lat_lon '18.47 N 64.23000000000002 W' is in water 'Caribbean Sea', 'Puerto Rico: Rio Mameyes in Luquillo' is 108 km away",
629  false, "LatLonWater");
630  */
631 
632 }
633 
634 
635 BOOST_AUTO_TEST_CASE(Test_ValidError_Format)
636 {
637  bool use_geo_loc_name = CSubSource::NCBI_UseGeoLocNameForCountry();
639 
640  // Create consensus splice problems
643  cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc->SetSeq().SetId().front()));
644  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[44] = 'A';
645  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[45] = 'G';
646  CRef<CSeq_feat> intron = unit_test_util::MakeIntronForMixLoc(nuc->SetSeq().SetId().front());
647  unit_test_util::AddFeat(intron, nuc);
648 
650  other_intron->SetData().SetImp().SetKey("intron");
652  gene->SetData().SetGene().SetLocus_tag("fake_locustag");
653  AddFeat(gene, nuc);
654 
655  // create EC number problems
656  CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();
657  prot->SetData().SetProt().SetEc().push_back("1.2.3.10");
658  prot->SetData().SetProt().SetEc().push_back("1.1.3.22");
659  prot->SetData().SetProt().SetEc().push_back("1.1.99.n");
660  prot->SetData().SetProt().SetEc().push_back("1.1.1.17");
661  prot->SetData().SetProt().SetEc().push_back("11.22.33.44");
662  prot->SetData().SetProt().SetEc().push_back("11.22.n33.44");
663  prot->SetData().SetProt().SetEc().push_back("11.22.33.n44");
664 
665 
666  // create bad institution code errors
670 
671  // create lat-lon country error
674 
676 
677  eval = validator.Validate(seh, options);
678 
679  CValidErrorFormat format(*objmgr);
680 
681  vector<string> expected;
682  expected.push_back("intron\tlcl|nuc\tGT at 17");
683  expected.push_back("intron\tlcl|nuc\tGT at 1");
684  expected.push_back("intron\tlcl|nuc\tAG at 11");
685  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
686  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
687  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
688  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
689  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
690  expected.push_back("CDS\tlcl|nuc\tGT at 16");
691  expected.push_back("lcl|nuc:Lat_lon '30 N 30 E' maps to 'Egypt' instead of 'Panama'");
692  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
693  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
694  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
695 
696  vector<string> seen;
697  for (CValidError_CI vit(*eval); vit; ++vit) {
698  string val = format.FormatForSubmitterReport(*vit, scope);
699  seen.push_back(val);
700  }
701  CheckStrings(seen, expected);
702 
703  expected.clear();
704  seen.clear();
705  for (CValidError_CI vit(*eval); vit; ++vit) {
706  seen.push_back(vit->GetErrCode());
707  }
708  expected.push_back("NotSpliceConsensusDonor");
709  expected.push_back("NotSpliceConsensusDonorTerminalIntron");
710  expected.push_back("NotSpliceConsensusAcceptor");
711  expected.push_back("DeletedEcNumber");
712  expected.push_back("ReplacedEcNumber");
713  expected.push_back("BadEcNumberValue");
714  expected.push_back("BadEcNumberFormat");
715  expected.push_back("BadEcNumberValue");
716  expected.push_back("NotSpliceConsensusDonor");
717  if (use_geo_loc_name) {
718  expected.push_back("LatLonGeoLocName");
719  } else {
720  expected.push_back("LatLonCountry");
721  }
722  expected.push_back("BadInstitutionCode");
723  expected.push_back("BadInstitutionCode");
724  expected.push_back("BadInstitutionCode");
725  CheckStrings(seen, expected);
726 
727  seen.clear();
728  expected.clear();
729  vector<CValidErrItem::TErrIndex> codes = format.GetListOfErrorCodes(*eval);
730  for (CValidErrItem::TErrIndex it : codes) {
731  string val = CValidErrItem::ConvertErrCode(it);
732  seen.push_back(val);
733  }
734  if (use_geo_loc_name) {
735  expected.push_back("BadInstitutionCode");
736  expected.push_back("LatLonGeoLocName");
737  } else {
738  expected.push_back("LatLonCountry");
739  expected.push_back("BadInstitutionCode");
740  }
741  expected.push_back("BadEcNumberFormat");
742  expected.push_back("BadEcNumberValue");
743  expected.push_back("NotSpliceConsensusDonor");
744  expected.push_back("NotSpliceConsensusAcceptor");
745  expected.push_back("DeletedEcNumber");
746  expected.push_back("ReplacedEcNumber");
747  expected.push_back("NotSpliceConsensusDonorTerminalIntron");
748  CheckStrings(seen, expected);
749 
750  string rval = format.FormatForSubmitterReport(*eval, scope, eErr_SEQ_FEAT_NotSpliceConsensusDonor);
751  expected.clear();
752  seen.clear();
753  NStr::Split(rval, "\n", seen);
754  expected.push_back("Not Splice Consensus");
755  expected.push_back("intron\tlcl|nuc\tGT at 17");
756  expected.push_back("CDS\tlcl|nuc\tGT at 16");
757  expected.push_back("");
758  CheckStrings(seen, expected);
759 
760  rval = format.FormatCategoryForSubmitterReport(*eval, scope, eSubmitterFormatErrorGroup_ConsensusSplice);
761  expected.clear();
762  seen.clear();
763  NStr::Split(rval, "\n", seen);
764  expected.push_back("Not Splice Consensus");
765  expected.push_back("intron\tlcl|nuc\tGT at 17");
766  expected.push_back("intron\tlcl|nuc\tGT at 1");
767  expected.push_back("intron\tlcl|nuc\tAG at 11");
768  expected.push_back("CDS\tlcl|nuc\tGT at 16");
769  expected.push_back("");
770  CheckStrings(seen, expected);
771 
772  expected.clear();
773  seen.clear();
774  vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
775  for (const string& it : cat_list) {
776  vector<string> sublist;
777  NStr::Split(it, "\n", sublist);
778  for (const string& sit : sublist) {
779  seen.push_back(sit);
780  }
781  }
782  expected.push_back("Not Splice Consensus");
783  expected.push_back("intron\tlcl|nuc\tGT at 17");
784  expected.push_back("intron\tlcl|nuc\tGT at 1");
785  expected.push_back("intron\tlcl|nuc\tAG at 11");
786  expected.push_back("CDS\tlcl|nuc\tGT at 16");
787  expected.push_back("");
788  expected.push_back("EC Number Format");
789  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
790  expected.push_back("");
791  expected.push_back("EC Number Value");
792  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
793  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
794  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
795  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
796  expected.push_back("");
797  expected.push_back("Bad Institution Codes");
798  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
799  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
800  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
801  expected.push_back("");
802  if (use_geo_loc_name) {
803  expected.push_back("LatLonGeoLocName Errors");
804  } else {
805  expected.push_back("LatLonCountry Errors");
806  }
807  expected.push_back("lcl|nuc:Lat_lon '30 N 30 E' maps to 'Egypt' instead of 'Panama'");
808  expected.push_back("");
809  CheckStrings(seen, expected);
810 }
811 
812 
813 BOOST_AUTO_TEST_CASE(Test_GB_6395)
814 {
815  // prepare entry
817  unit_test_util::SetTaxon(entry, 0);
818 
820 
821  eval = validator.Validate(seh, options);
822 
823  CValidErrorFormat format(*objmgr);
824  vector<string> expected;
825  vector<string> seen;
826 
827  vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
828  for (const string& it : cat_list) {
829  vector<string> sublist;
830  NStr::Split(it, "\n", sublist);
831  for (const string& sit : sublist) {
832  seen.push_back(sit);
833  }
834  }
835  expected.push_back("NoTaxonID");
836  expected.push_back("lcl|good:Sebaea microphylla");
837  expected.push_back("");
838 
839  CheckStrings(seen, expected);
840 }
841 
842 
843 BOOST_AUTO_TEST_CASE(Test_Descr_LatLonState)
844 {
845  // prepare entry
847  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, "USA: South Carolina");
849 
851 
852  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "LatLonState",
853  "Lat_lon '36 N 80 W' maps to 'USA: North Carolina' instead of 'USA: South Carolina' - claimed region 'USA: South Carolina' is at distance 130 km"));
854  // AddChromosomeNoLocation(expected_errors, "lcl|good");
856  eval = validator.Validate(seh, options);
857  CheckErrors(*eval, expected_errors);
858 
860 }
861 
862 
864 {
866  CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();
867  prot->SetData().SetProt().SetEc().push_back("1.2.3.10");
868  prot->SetData().SetProt().SetEc().push_back("1.1.3.22");
869  prot->SetData().SetProt().SetEc().push_back("1.1.99.n");
870  prot->SetData().SetProt().SetEc().push_back("1.1.1.17");
871  prot->SetData().SetProt().SetEc().push_back("11.22.33.44");
872  prot->SetData().SetProt().SetEc().push_back("11.22.n33.44");
873  prot->SetData().SetProt().SetEc().push_back("11.22.33.n44");
874  return entry;
875 }
876 
877 
878 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadEcNumberValue)
879 {
881  CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();
882 
884 
885  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "DeletedEcNumber",
886  "EC_number 1.2.3.10 was deleted"));
887  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "ReplacedEcNumber",
888  "EC_number 1.1.3.22 was transferred and is no longer valid"));
889  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "BadEcNumberValue",
890  "11.22.33.44 is not a legal value for qualifier EC_number"));
891  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "BadEcNumberFormat",
892  "11.22.n33.44 is not in proper EC_number format"));
893  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Info, "BadEcNumberValue",
894  "11.22.33.n44 is not a legal preliminary value for qualifier EC_number"));
895  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
896  eval = validator.Validate(seh, options);
897  CheckErrors(*eval, expected_errors);
898 
899  scope.RemoveTopLevelSeqEntry(seh);
900  prot->SetData().SetProt().ResetEc();
902  misc->SetData().SetImp().SetKey("exon");
903  misc->AddQualifier("EC_number", "1.2.3.10");
904  misc->AddQualifier("EC_number", "1.1.3.22");
905  misc->AddQualifier("EC_number", "1.1.99.n");
906  misc->AddQualifier("EC_number", "1.1.1.17");
907  misc->AddQualifier("EC_number", "11.22.33.44");
908  misc->AddQualifier("EC_number", "11.22.n33.44");
909  misc->AddQualifier("EC_number", "11.22.33.n44");
910  SetErrorsAccessions(expected_errors, "lcl|nuc");
911  expected_errors[1]->SetErrMsg("EC_number 1.1.3.22 was replaced");
912  seh = scope.AddTopLevelSeqEntry(*entry);
913  eval = validator.Validate(seh, options);
914  CheckErrors(*eval, expected_errors);
915 
917 }
918 
919 
920 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_InvalidQualifierValue)
921 {
924  misc->SetData().SetImp().SetKey("repeat_region");
925  misc->AddQualifier("rpt_unit_seq", "ATA");
926 
928 
929  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "RepeatSeqDoNotMatch",
930  "repeat_region /rpt_unit and underlying sequence do not match"));
931  // AddChromosomeNoLocation(expected_errors, "lcl|good");
932  eval = validator.Validate(seh, options);
933  CheckErrors(*eval, expected_errors);
934 
935  scope.RemoveTopLevelSeqEntry(seh);
937  misc = unit_test_util::AddMiscFeature(entry);
938  misc->SetData().SetImp().SetKey("repeat_region");
939  misc->AddQualifier("rpt_unit_seq", "ATAGTGATAGTG");
940  seh = scope.AddTopLevelSeqEntry(*entry);
941  expected_errors[0]->SetErrCode("InvalidRepeatUnitLength");
942  expected_errors[0]->SetErrMsg("Length of rpt_unit_seq is greater than feature length");
943  expected_errors[0]->SetSeverity(eDiag_Info);
944  eval = validator.Validate(seh, options);
945  CheckErrors(*eval, expected_errors);
946 
948 }
949 
950 
// Walks one Bioseq through the CSeq_inst repr values (virtual, raw, const,
// map, ref, consen, not_set, other, delta), adding or removing Seq-data and
// Bioseq-ext along the way. After every change the entry is validated and
// the result is compared against a single expected error, whose code,
// message and severity are updated in place on expected_errors[0].
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines (listing lines 953
// and 955 are absent) -- presumably they come from the usual setup
// statements/macro; confirm against the real file.
951 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ExtNotAllowed)
952 {
954 
956 
957  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "ExtNotAllowed", "Bioseq-ext not allowed on virtual Bioseq"));
958  // AddChromosomeNoLocation(expected_errors, "lcl|good");
959 
960  // repr = virtual
     // No Seq-data, but a delta ext is attached: the ext is what gets reported.
961  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_virtual);
962  entry->SetSeq().SetInst().ResetSeq_data();
963  entry->SetSeq().SetInst().SetExt().SetDelta();
964  eval = validator.Validate(seh, options);
965  CheckErrors(*eval, expected_errors);
966 
967  // repr = raw
     // The delta ext from the previous step is still set; Seq-data is restored.
968  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_raw);
969  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
970  expected_errors[0]->SetErrMsg("Bioseq-ext not allowed on raw Bioseq");
971  eval = validator.Validate(seh, options);
972  CheckErrors(*eval, expected_errors);
973 
     // Raw Bioseq with neither ext nor Seq-data: the missing data is reported.
974  entry->SetSeq().SetInst().ResetExt();
975  entry->SetSeq().SetInst().ResetSeq_data();
976  expected_errors[0]->SetErrCode("SeqDataNotFound");
977  expected_errors[0]->SetErrMsg("Missing Seq-data on raw Bioseq");
978  expected_errors[0]->SetSeverity(eDiag_Critical);
979  eval = validator.Validate(seh, options);
980  CheckErrors(*eval, expected_errors);
981 
     // Seq-data that is only a gap is expected to give the same error.
982  entry->SetSeq().SetInst().SetSeq_data().SetGap();
983  eval = validator.Validate(seh, options);
984  CheckErrors(*eval, expected_errors);
985 
986  // repr = const
987  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_const);
988  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
989  entry->SetSeq().SetInst().SetExt().SetDelta();
990  expected_errors[0]->SetErrCode("ExtNotAllowed");
991  expected_errors[0]->SetErrMsg("Bioseq-ext not allowed on constructed Bioseq");
992  eval = validator.Validate(seh, options);
993  CheckErrors(*eval, expected_errors);
994 
     // Same missing-data checks as for raw, now with the "constructed" wording.
995  entry->SetSeq().SetInst().ResetExt();
996  entry->SetSeq().SetInst().ResetSeq_data();
997  expected_errors[0]->SetErrCode("SeqDataNotFound");
998  expected_errors[0]->SetErrMsg("Missing Seq-data on constructed Bioseq");
999  expected_errors[0]->SetSeverity(eDiag_Critical);
1000  eval = validator.Validate(seh, options);
1001  CheckErrors(*eval, expected_errors);
1002 
1003  entry->SetSeq().SetInst().SetSeq_data().SetGap();
1004  eval = validator.Validate(seh, options);
1005  CheckErrors(*eval, expected_errors);
1006 
1007  // repr = map
      // No ext at all: expected severity drops to eDiag_Error from here on.
1008  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_map);
1009  entry->SetSeq().SetInst().ResetSeq_data();
1010  expected_errors[0]->SetErrCode("ExtBadOrMissing");
1011  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on map Bioseq");
1012  expected_errors[0]->SetSeverity(eDiag_Error);
1013  eval = validator.Validate(seh, options);
1014  CheckErrors(*eval, expected_errors);
1015 
      // A delta ext and a ref ext are both expected to give the same
      // ExtBadOrMissing error on a map Bioseq.
1016  entry->SetSeq().SetInst().SetExt().SetDelta();
1017  eval = validator.Validate(seh, options);
1018  CheckErrors(*eval, expected_errors);
1019 
1020  entry->SetSeq().SetInst().SetExt().SetRef();
1021  eval = validator.Validate(seh, options);
1022  CheckErrors(*eval, expected_errors);
1023 
      // With a map ext in place, the presence of Seq-data becomes the error.
1024  entry->SetSeq().SetInst().SetExt().SetMap();
1025  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
1026  expected_errors[0]->SetErrCode("SeqDataNotAllowed");
1027  expected_errors[0]->SetErrMsg("Seq-data not allowed on map Bioseq");
1028  eval = validator.Validate(seh, options);
1029  CheckErrors(*eval, expected_errors);
1030 
1031 
1032  // repr = ref
1033  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_ref);
1034  entry->SetSeq().SetInst().ResetExt();
1035  entry->SetSeq().SetInst().ResetSeq_data();
1036  expected_errors[0]->SetErrCode("ExtBadOrMissing");
1037  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on reference Bioseq");
1038  eval = validator.Validate(seh, options);
1039  CheckErrors(*eval, expected_errors);
1040 
1041  /*
1042  // repr = seg
1043  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
1044  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on seg Bioseq");
1045  eval = validator.Validate(seh, options);
1046  CheckErrors(*eval, expected_errors);
1047  */
1048 
1049  // repr = consen
      // consen, not_set and other are all expected to be rejected as
      // ReprInvalid; the message carries the numeric repr value (6, 0, 255).
1050  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_consen);
1051  expected_errors[0]->SetSeverity(eDiag_Critical);
1052  expected_errors[0]->SetErrCode("ReprInvalid");
1053  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 6");
1054  eval = validator.Validate(seh, options);
1055  CheckErrors(*eval, expected_errors);
1056 
1057  // repr = notset
1058  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_not_set);
1059  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 0");
1060  eval = validator.Validate(seh, options);
1061  CheckErrors(*eval, expected_errors);
1062 
1063  // repr = other
1064  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_other);
1065  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 255");
1066  eval = validator.Validate(seh, options);
1067  CheckErrors(*eval, expected_errors);
1068 
1069  // repr = delta
      // Delta ext plus Seq-data: the Seq-data is what is expected to be reported.
1070  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
1071  entry->SetSeq().SetInst().SetExt().SetDelta();
1072  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
1073  expected_errors[0]->SetSeverity(eDiag_Error);
1074  expected_errors[0]->SetErrCode("SeqDataNotAllowed");
1075  expected_errors[0]->SetErrMsg("Seq-data not allowed on delta Bioseq");
1076  eval = validator.Validate(seh, options);
1077  CheckErrors(*eval, expected_errors);
1078 
      // Delta Bioseq with neither ext nor data: the missing ext is reported.
1079  entry->SetSeq().SetInst().ResetExt();
1080  entry->SetSeq().SetInst().ResetSeq_data();
1081  expected_errors[0]->SetSeverity(eDiag_Error);
1082  expected_errors[0]->SetErrCode("ExtBadOrMissing");
1083  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on delta Bioseq");
1084  eval = validator.Validate(seh, options);
1085  CheckErrors(*eval, expected_errors);
1086 
1087  CLEAR_ERRORS
1088 }
1089 
1090 
// Sets the Bioseq repr to not_set, other and consen in turn and checks that
// each validation yields one Critical ReprInvalid error whose message
// carries the numeric repr value (0, 255 and 6 respectively).
// NOTE(review): the declarations of `entry`, `seh`, `validator`, `options`,
// `eval` and `expected_errors` are not in the visible lines (listing lines
// 1093 and 1095 are absent) -- presumably the standard setup; confirm.
1091 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ReprInvalid)
1092 {
1094 
1096  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "ReprInvalid", "Invalid Bioseq->repr = 0"));
1097  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1098 
1099  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_not_set);
1100  eval = validator.Validate(seh, options);
1101  CheckErrors(*eval, expected_errors);
1102 
      // Only the message changes between cases; code and severity stay the same.
1103  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 255");
1104  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_other);
1105  eval = validator.Validate(seh, options);
1106  CheckErrors(*eval, expected_errors);
1107 
1108  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 6");
1109  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_consen);
1110  eval = validator.Validate(seh, options);
1111  CheckErrors(*eval, expected_errors);
1112 
1113  CLEAR_ERRORS
1114 }
1115 
1116 
// Reads a Seq-entry from ASN.1 text (via MSerial_AsnText), adds it to a
// fresh scope, validates it once, and checks the result against the nine
// expected errors listed below, all reported on
// "lcl|LocusCollidesWithLocusTag".
// NOTE(review): `istr` and `objmgr` are not declared in the visible lines
// (listing lines 1121 and 1125 are absent), and the `options` initializer is
// cut off after its first flag (lines 1133-1135 are absent) -- confirm
// against the real file.
1117 BOOST_AUTO_TEST_CASE(Test_CollidingLocusTags)
1118 {
1119  CRef<CSeq_entry> entry(new CSeq_entry());
1120  {
1122  istr >> MSerial_AsnText >> *entry;
1123  }
1124 
1126  CScope scope(*objmgr);
1127  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
1128 
1129  CValidator validator(*objmgr);
1130 
1131  // Set validator options
1132  unsigned int options = CValidator::eVal_need_isojta
1136 
1137  // list of expected errors
      // Besides the locus/locus_tag collisions under test, the list also
      // includes general errors expected for this record (TerminalNs,
      // NoMolInfoFound, NoPubFound, MissingPubRequirement, NoSourceDescriptor).
1138  vector<CExpectedError*> expected_errors;
1139  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Warning, "TerminalNs", "N at end of sequence"));
1140  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Warning, "GeneLocusCollidesWithLocusTag", "locus collides with locus_tag in another gene"));
1141  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "CollidingLocusTags", "Colliding locus_tags in gene features"));
1142  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "CollidingLocusTags", "Colliding locus_tags in gene features"));
1143  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "NoMolInfoFound", "No Mol-info applies to this Bioseq"));
1144  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "LocusTagGeneLocusMatch", "Gene locus and locus_tag 'foo' match"));
1145  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "NoPubFound", "No publications anywhere on this entire record."));
1146  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Info, "MissingPubRequirement", "No submission citation anywhere on this entire record."));
1147  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "NoSourceDescriptor", "No source information included on this record."));
1148 
1149  CConstRef<CValidError> eval = validator.Validate(seh, options);
1150  CheckErrors(*eval, expected_errors);
1151 
1152  CLEAR_ERRORS
1153 }
1154 
1155 
// ASN.1 text for a test Seq-entry: one raw DNA Bioseq (local id
// "LocusCollidesWithLocusTag", length 24, ending in "NN") annotated with
// four plus-strand gene features:
//   locus "foo"  / locus-tag "foo"  at 0-4
//   locus "bar"  / locus-tag "foo"  at 5-9
//   locus "bar"  / locus-tag "baz"  at 10-14
//   locus "quux" / locus-tag "baz"  at 15-19
// so locus-tags "foo" and "baz" each occur twice, and the first gene's locus
// equals its own locus-tag.
// NOTE(review): presumably the input parsed by Test_CollidingLocusTags; the
// statement that opens the stream is not visible in this listing -- confirm.
1156 const string sc_TestEntryCollidingLocusTags = "Seq-entry ::= seq {\
1157  id {\
1158  local str \"LocusCollidesWithLocusTag\" } ,\
1159  inst {\
1160  repr raw ,\
1161  mol dna ,\
1162  length 24 ,\
1163  seq-data\
1164  iupacna \"AATTGGCCAANNAATTGGCCAANN\" } ,\
1165  annot {\
1166  {\
1167  data\
1168  ftable {\
1169  {\
1170  data\
1171  gene {\
1172  locus \"foo\" ,\
1173  locus-tag \"foo\" } ,\
1174  location\
1175  int {\
1176  from 0 ,\
1177  to 4 ,\
1178  strand plus ,\
1179  id\
1180  local str \"LocusCollidesWithLocusTag\" } } ,\
1181  {\
1182  data\
1183  gene {\
1184  locus \"bar\" ,\
1185  locus-tag \"foo\" } ,\
1186  location\
1187  int {\
1188  from 5 ,\
1189  to 9 ,\
1190  strand plus ,\
1191  id\
1192  local str \"LocusCollidesWithLocusTag\" } } ,\
1193  {\
1194  data\
1195  gene {\
1196  locus \"bar\" ,\
1197  locus-tag \"baz\" } ,\
1198  location\
1199  int {\
1200  from 10 ,\
1201  to 14 ,\
1202  strand plus ,\
1203  id\
1204  local str \"LocusCollidesWithLocusTag\" } } ,\
1205  {\
1206  data\
1207  gene {\
1208  locus \"quux\" ,\
1209  locus-tag \"baz\" } ,\
1210  location\
1211  int {\
1212  from 15 ,\
1213  to 19 ,\
1214  strand plus ,\
1215  id\
1216  local str \"LocusCollidesWithLocusTag\" } } } } } }\
1217 ";
1218 
1219 
// Checks that the circular, tandem and other topologies each yield the
// CircularProtein error ("Non-linear topology set on protein"), and that
// not_set and linear are validated against the expectation list as it
// stands after CLEAR_ERRORS.
// NOTE(review): the setup that builds `entry` and declares `seh`,
// `validator`, `options`, `eval` and `expected_errors` is not in the visible
// lines (listing lines 1222, 1224 and 1229 are absent); CLEAR_ERRORS is a
// macro defined elsewhere -- presumably it empties expected_errors. Confirm.
1220 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_CircularProtein)
1221 {
1223 
1225 
1226  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "CircularProtein", "Non-linear topology set on protein"));
1227  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1228 
1230 
      // The same single expected error is reused for all three topologies.
1231  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
1232  eval = validator.Validate(seh, options);
1233  CheckErrors(*eval, expected_errors);
1234 
1235  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_tandem);
1236  eval = validator.Validate(seh, options);
1237  CheckErrors(*eval, expected_errors);
1238 
1239  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_other);
1240  eval = validator.Validate(seh, options);
1241  CheckErrors(*eval, expected_errors);
1242 
1243  // should be no error for not set or linear
1244  CLEAR_ERRORS
1245 
1246  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_not_set);
1247  eval = validator.Validate(seh, options);
1248  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1249  CheckErrors(*eval, expected_errors);
1250 
1251  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_linear);
1252  eval = validator.Validate(seh, options);
1253  CheckErrors(*eval, expected_errors);
1254 
1255  CLEAR_ERRORS
1256 }
1257 
1258 
// Checks that the ds, mixed and other strand settings each yield the
// BadProteinMoltype error ("Protein not single stranded"), and that not_set
// and ss are validated against the expectation list as it stands after
// CLEAR_ERRORS.
// NOTE(review): the setup that builds `entry` and declares `seh`,
// `validator`, `options`, `eval` and `expected_errors` is not in the visible
// lines (listing lines 1261 and 1263 are absent); CLEAR_ERRORS is a macro
// defined elsewhere -- presumably it empties expected_errors. Confirm.
1259 BOOST_AUTO_TEST_CASE(Test_BadProteinMoltype)
1260 {
1262 
1264 
1265  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadProteinMoltype", "Protein not single stranded"));
1266  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1267 
      // The same single expected error is reused for all three strand values.
1268  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ds);
1269  eval = validator.Validate(seh, options);
1270  CheckErrors(*eval, expected_errors);
1271 
1272  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_mixed);
1273  eval = validator.Validate(seh, options);
1274  CheckErrors(*eval, expected_errors);
1275 
1276  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_other);
1277  eval = validator.Validate(seh, options);
1278  CheckErrors(*eval, expected_errors);
1279 
1280  // no errors expected for not set or single strand
1281  CLEAR_ERRORS
1282 
1283  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1284 
1285  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_not_set);
1286  eval = validator.Validate(seh, options);
1287  CheckErrors(*eval, expected_errors);
1288 
1289  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ss);
1290  eval = validator.Validate(seh, options);
1291  CheckErrors(*eval, expected_errors);
1292 
1293  CLEAR_ERRORS
1294 }
1295 
1296 
// Sets Bioseq.mol to not_set, other and na in turn and checks that each
// validation yields one Error-level result: MolNotSet, MolOther and
// MolNuclAcid respectively.
// NOTE(review): the declarations of `entry`, `seh`, `validator`, `options`,
// `eval` and `expected_errors` are not in the visible lines (listing lines
// 1299 and 1301 are absent) -- presumably the standard setup; confirm.
1297 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_MolNotSet)
1298 {
1300 
1302 
1303  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MolNotSet", "Bioseq.mol is 0"));
1304  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1305 
1306  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_not_set);
1307  eval = validator.Validate(seh, options);
1308  CheckErrors(*eval, expected_errors);
1309 
      // Code and message are swapped on the same expectation; severity is kept.
1310  expected_errors[0]->SetErrCode("MolOther");
1311  expected_errors[0]->SetErrMsg("Bioseq.mol is type other");
1312  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_other);
1313  eval = validator.Validate(seh, options);
1314  CheckErrors(*eval, expected_errors);
1315 
1316  expected_errors[0]->SetErrCode("MolNuclAcid");
1317  expected_errors[0]->SetErrMsg("Bioseq.mol is type nucleic acid");
1318  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_na);
1319  eval = validator.Validate(seh, options);
1320  CheckErrors(*eval, expected_errors);
1321 
1322  CLEAR_ERRORS
1323 }
1324 
1325 
// Attaches a length fuzz to the Bioseq and checks for the FuzzyLen error,
// first with the initial repr ("raw" wording) and then with repr const.
// Finally the Seq-data is replaced by a gap, where SeqDataNotFound is
// expected instead of FuzzyLen.
// NOTE(review): the declarations of `entry`, `seh`, `validator`, `options`,
// `eval` and `expected_errors` are not in the visible lines (listing lines
// 1328 and 1330 are absent) -- presumably the standard setup; confirm.
1326 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_FuzzyLen)
1327 {
1329 
1331 
1332  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "FuzzyLen", "Fuzzy length on raw Bioseq"));
1333  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1334 
1335  entry->SetSeq().SetInst().SetFuzz();
1336  eval = validator.Validate(seh, options);
1337  CheckErrors(*eval, expected_errors);
1338 
1339  expected_errors[0]->SetErrMsg("Fuzzy length on const Bioseq");
1340  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_const);
1341  eval = validator.Validate(seh, options);
1342  CheckErrors(*eval, expected_errors);
1343 
1344  // shouldn't get fuzzy length if gap
      // The single expectation is repurposed: Critical SeqDataNotFound
      // replaces FuzzyLen once the data is only a gap.
1345  expected_errors[0]->SetErrCode("SeqDataNotFound");
1346  expected_errors[0]->SetErrMsg("Missing Seq-data on constructed Bioseq");
1347  expected_errors[0]->SetSeverity(eDiag_Critical);
1348  entry->SetSeq().SetInst().SetSeq_data().SetGap();
1349  eval = validator.Validate(seh, options);
1350  CheckErrors(*eval, expected_errors);
1351 
1352  CLEAR_ERRORS
1353 }
1354 
1355 
// Checks the InvalidAlphabet error in both directions:
//  * `prot_entry` is given each nucleotide encoding (IUPACna, NCBI2na,
//    NCBI4na, NCBI8na, NCBIpna) and is expected to report
//    "Using a nucleic acid alphabet on a protein sequence";
//  * `entry` is given each protein encoding (IUPACaa, NCBI8aa, NCBIeaa,
//    NCBIpaa, NCBIstdaa) and is expected to report
//    "Using a protein alphabet on a nucleic acid".
// Each entry lives in its own scope (scope / scope2).
// NOTE(review): `prot_entry`, `objmgr` and `entry` are not declared in the
// visible lines (listing lines 1358, 1360 and 1397 are absent), and the
// `options` initializer is cut off after its first flag (lines 1369-1371 are
// absent) -- confirm against the real file.
1356 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_InvalidAlphabet)
1357 {
1359 
1361  CScope scope(*objmgr);
1362  scope.AddDefaults();
1363  CSeq_entry_Handle prot_seh = scope.AddTopLevelSeqEntry(*prot_entry);
1364 
1365  CValidator validator(*objmgr);
1366 
1367  // Set validator options
1368  unsigned int options = CValidator::eVal_need_isojta
1372 
1373  // list of expected errors
1374  vector<CExpectedError*> expected_errors;
1375  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidAlphabet", "Using a nucleic acid alphabet on a protein sequence"));
1376  // AddChromosomeNoLocation(expected_errors, "lcl|good");
      // Each Set<Alphabet>() call below switches the Seq-data choice; the
      // same expected error is checked after every switch.
1377  prot_entry->SetSeq().SetInst().SetSeq_data().SetIupacna();
1378  CConstRef<CValidError> eval = validator.Validate(prot_seh, options);
1379  CheckErrors(*eval, expected_errors);
1380 
1381  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi2na();
1382  eval = validator.Validate(prot_seh, options);
1383  CheckErrors(*eval, expected_errors);
1384 
1385  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi4na();
1386  eval = validator.Validate(prot_seh, options);
1387  CheckErrors(*eval, expected_errors);
1388 
1389  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi8na();
1390  eval = validator.Validate(prot_seh, options);
1391  CheckErrors(*eval, expected_errors);
1392 
1393  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbipna();
1394  eval = validator.Validate(prot_seh, options);
1395  CheckErrors(*eval, expected_errors);
1396 
      // Second half: protein alphabets on `entry`, validated via a second scope.
1398  CScope scope2(*objmgr);
1399  scope2.AddDefaults();
1400  CSeq_entry_Handle seh = scope2.AddTopLevelSeqEntry(*entry);
1401 
1402  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa();
1403  expected_errors[0]->SetErrMsg("Using a protein alphabet on a nucleic acid");
1404 
1405  eval = validator.Validate(seh, options);
1406  CheckErrors(*eval, expected_errors);
1407 
1408  entry->SetSeq().SetInst().SetSeq_data().SetNcbi8aa();
1409  eval = validator.Validate(seh, options);
1410  CheckErrors(*eval, expected_errors);
1411 
1412  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa();
1413  eval = validator.Validate(seh, options);
1414  CheckErrors(*eval, expected_errors);
1415 
1416  entry->SetSeq().SetInst().SetSeq_data().SetNcbipaa();
1417  eval = validator.Validate(seh, options);
1418  CheckErrors(*eval, expected_errors);
1419 
1420  entry->SetSeq().SetInst().SetSeq_data().SetNcbistdaa();
1421  eval = validator.Validate(seh, options);
1422  CheckErrors(*eval, expected_errors);
1423 
1424  CLEAR_ERRORS
1425 }
1426 
1427 
// Exercises InvalidResidue reporting in several stages:
//  1. IUPACna data made of the alphabet twice plus 13 non-ASCII bytes;
//  2. the same data with mol = rna, where the two 'U' expectations are dropped;
//  3. the same bytes as IUPACaa on a protein, where only the non-ASCII
//     expectations remain;
//  4. all-lower-case sequence data (nucleotide, then "protein" as IUPACaa);
//  5. a delta Bioseq with one nucleotide literal;
//  6. a delta Bioseq with one IUPACaa literal "1234567".
// NOTE(review): the declarations of `entry`, `scope`, `seh`, `validator`,
// `options`, `eval` and `expected_errors` are not in the visible lines
// (listing lines 1430 and 1432 are absent), and lines 1551 and 1601 are also
// absent -- presumably those rebuild `entry` as a protein before the
// IUPACaa data is set. Confirm against the real file.
1428 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_InvalidResidue)
1429 {
1431 
1433 
      // 52 letters (A-Z twice) followed by 13 raw bytes 0xFB..0xFF = 65
      // residues, matching the SetLength(65) below.
1434  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1435  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1436  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1437  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1438  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1439  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1440  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1441  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1442  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1443  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1444  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFE');
1445  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFE');
1446  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFF');
1447  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFF');
1448  entry->SetSeq().SetInst().SetLength(65);
      // Layout of expected_errors, relied on by the index-based deletes below:
      //   [0..10]  letters E F I J L O P Q U X Z, first copy of the alphabet
      //   [11..21] the same letters, second copy ([8] and [19] are the 'U's)
      //   [22..31] ten non-ASCII residues (251 x3, 252 x3, 253 x3, 254 x1)
      //   [32]     "More than 10 invalid residues. Checking stopped"
      //   [33]     NonAsciiAsn (Fatal)
1449  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'E' at position [5]"));
1450  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'F' at position [6]"));
1451  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'I' at position [9]"));
1452  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'J' at position [10]"));
1453  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'L' at position [12]"));
1454  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'O' at position [15]"));
1455  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'P' at position [16]"));
1456  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Q' at position [17]"));
1457  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'U' at position [21]"));
1458  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'X' at position [24]"));
1459  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Z' at position [26]"));
1460  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'E' at position [31]"));
1461  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'F' at position [32]"));
1462  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'I' at position [35]"));
1463  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'J' at position [36]"));
1464  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'L' at position [38]"));
1465  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'O' at position [41]"));
1466  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'P' at position [42]"));
1467  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Q' at position [43]"));
1468  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'U' at position [47]"));
1469  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'X' at position [50]"));
1470  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Z' at position [52]"));
1471  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [53]"));
1472  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [54]"));
1473  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [55]"));
1474  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [56]"));
1475  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [57]"));
1476  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [58]"));
1477  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [59]"));
1478  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [60]"));
1479  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [61]"));
1480  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [254] at position [62]"));
1481  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "More than 10 invalid residues. Checking stopped"));
1482  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Fatal, "NonAsciiAsn", "Non-ASCII character '251' found in item"));
1483  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1484 
1485  eval = validator.Validate(seh, options);
1486  CheckErrors(*eval, expected_errors);
1487 
1488  // now repeat test, but with mRNA - this time Us should not be reported
      // Slots 8 and 19 hold the two 'U' expectations (positions 21 and 47).
      // They are freed and nulled rather than erased, so the other indices
      // stay put. NOTE(review): presumably CheckErrors skips null entries --
      // confirm.
1489  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
1490  delete expected_errors[8];
1491  expected_errors[8] = nullptr;
1492  delete expected_errors[19];
1493  expected_errors[19] = nullptr;
1494  eval = validator.Validate(seh, options);
1495  CheckErrors(*eval, expected_errors);
1496 
1497  // now repeat test, but with protein
      // Switch the Bioseq to a protein: mol = aa, and any Mol-info
      // descriptor gets biomol = peptide.
1498  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_aa);
1499  for (auto& it : entry->SetSeq().SetDescr().Set()) {
1500  if (it->IsMolinfo()) {
1501  it->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
1502  }
1503  }
      // Same 65 bytes as before, now stored as IUPACaa.
1504  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1505  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1506  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1507  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1508  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1509  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1510  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1511  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1512  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1513  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1514  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFE');
1515  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFE');
1516  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFF');
1517  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFF');
1518  entry->SetSeq().SetInst().SetLength(65);
      // Add a protein feature covering the whole sequence (0..64).
1519  CRef<CSeq_feat> feat(new CSeq_feat());
1520  feat->SetData().SetProt().SetName().push_back("fake protein name");
1521  feat->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
1522  feat->SetLocation().SetInt().SetFrom(0);
1523  feat->SetLocation().SetInt().SetTo(64);
1524  unit_test_util::AddFeat(feat, entry);
      // Remove and re-add the modified entry to obtain a fresh handle.
1525  scope.RemoveEntry(*entry);
1526  seh = scope.AddTopLevelSeqEntry(*entry);
1527 
      // Drop all 22 letter expectations; slots 8 and 19 are already null,
      // hence the guard. Only the non-ASCII entries [22..33] remain.
1528  for (int j = 0; j < 22; j++) {
1529  if (expected_errors[j]) {
1530  delete expected_errors[j];
1531  expected_errors[j] = nullptr;
1532  }
1533  }
1534  eval = validator.Validate(seh, options);
1535  CheckErrors(*eval, expected_errors);
1536 
1537  CLEAR_ERRORS
1538 
1539  // now look for lowercase characters
1540  scope.RemoveEntry(*entry);
1541  entry = unit_test_util::BuildGoodSeq();
1542  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("abcdefghijklmnopqrstuvwxyz");
1543  entry->SetSeq().SetInst().SetLength(26);
1544  seh = scope.AddTopLevelSeqEntry(*entry);
1545  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Sequence contains lower-case characters"));
1546  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1547  eval = validator.Validate(seh, options);
1548  CheckErrors(*eval, expected_errors);
1549 
      // Same lower-case expectation, now with IUPACaa data "protein".
1550  scope.RemoveEntry(*entry);
1552  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("protein");
1553  seh = scope.AddTopLevelSeqEntry(*entry);
1554  eval = validator.Validate(seh, options);
1555  CheckErrors(*eval, expected_errors);
1556 
1557  CLEAR_ERRORS
1558 
1559  // now try delta sequence
      // One literal segment of 52 residues (A-Z twice) replaces the raw data.
1560  scope.RemoveEntry(*entry);
1561  entry = unit_test_util::BuildGoodSeq();
1562  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
1563  entry->SetSeq().SetInst().ResetSeq_data();
1564  CRef<CDelta_seq> seg(new CDelta_seq());
1565  seg->SetLiteral().SetSeq_data().SetIupacna().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1566  seg->SetLiteral().SetLength(52);
1567  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(seg);
1568  entry->SetSeq().SetInst().SetLength(52);
1569  seh = scope.AddTopLevelSeqEntry(*entry);
1570 
      // Same letters and positions as the raw case, but the expected message
      // uses the "Invalid residue [X]" form and there is no "Checking
      // stopped" entry.
1571  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [E] at position [5]"));
1572  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [F] at position [6]"));
1573  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [I] at position [9]"));
1574  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [J] at position [10]"));
1575  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [L] at position [12]"));
1576  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [O] at position [15]"));
1577  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [P] at position [16]"));
1578  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Q] at position [17]"));
1579  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [U] at position [21]"));
1580  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [X] at position [24]"));
1581  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Z] at position [26]"));
1582  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [E] at position [31]"));
1583  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [F] at position [32]"));
1584  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [I] at position [35]"));
1585  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [J] at position [36]"));
1586  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [L] at position [38]"));
1587  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [O] at position [41]"));
1588  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [P] at position [42]"));
1589  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Q] at position [43]"));
1590  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [U] at position [47]"));
1591  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [X] at position [50]"));
1592  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Z] at position [52]"));
1593  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1594  eval = validator.Validate(seh, options);
1595  CheckErrors(*eval, expected_errors);
1596 
1597  CLEAR_ERRORS
1598 
1599  // try protein delta sequence
      // One IUPACaa literal "1234567": each of the 7 digits is expected to
      // be reported as an invalid residue.
1600  scope.RemoveEntry(*entry);
1602  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
1603  entry->SetSeq().SetInst().ResetSeq_data();
1604  CRef<CDelta_seq> seg2(new CDelta_seq());
1605  seg2->SetLiteral().SetSeq_data().SetIupacaa().Set("1234567");
1606  seg2->SetLiteral().SetLength(7);
1607  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(seg2);
1608  entry->SetSeq().SetInst().SetLength(7);
1609  seh = scope.AddTopLevelSeqEntry(*entry);
1610 
1611  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [1] at position [1]"));
1612  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [2] at position [2]"));
1613  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [3] at position [3]"));
1614  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [4] at position [4]"));
1615  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [5] at position [5]"));
1616  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [6] at position [6]"));
1617  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [7] at position [7]"));
1618  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1619 
1620  eval = validator.Validate(seh, options);
1621  CheckErrors(*eval, expected_errors);
1622 
1623  CLEAR_ERRORS
1624 }
1625 
1626 
1627 /*
1628 static void WriteOutTemp(CRef<CSeq_entry> entry)
1629 {
1630  // construct a temp file name
1631  CNcbiOstrstream oss;
1632  oss << "test.asn";
1633  string filename = CNcbiOstrstreamToString(oss);
1634  string fullPath = CDirEntry::MakePath(".", filename);
1635 
1636  // open the output file stream (only IOS_BASE::out is passed, no binary flag)
1637  unique_ptr<CNcbiOstream> outStream;
1638  outStream.reset(new CNcbiOfstream(
1639  fullPath.c_str(),
1640  IOS_BASE::out));
1641  if (!(*outStream)) {
1642  return;
1643  }
1644 
1645  unique_ptr<CObjectOStream> outObject;
1646  // Associate ASN.1 text serialization methods with the output
1647  outObject.reset(new CObjectOStreamAsn(*outStream));
1648 
1649  // write the asn data
1650  try {
1651  *outObject << *entry;
1652  outStream->flush();
1653  } catch (exception&) {
1654  }
1655 }
1656 */
1657 
1658 
// Validator test for a coding region whose protein product contains '*'
// (termination) residues. Three validation passes are run: with an exception
// flag/text on the CDS, with the exception removed, and after the nucleotide
// sequence is replaced by one that begins with ATG instead of GTG.
// NOTE(review): the embedded numbering skips original lines 1661, 1663, 1667
// and 1702, so the statements that declare entry, cds, nuc, scope, seh,
// validator, options, eval and expected_errors are not visible here.
1659 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_StopInProtein)
1660 {
1662 
1664 
      // Last member of the set: protein with three '*' symbols.
      // First member of the set: nucleotide sequence starting with GTG.
1665  entry->SetSet().SetSeq_set().back()->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MP*K*E*N");
1666  entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("GTGCCCTAAAAATAAGAGTAAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
      // Pass 1: the CDS carries an exception with a free-text explanation.
1668  cds->SetExcept(true);
1669  cds->SetExcept_text("unclassified translation discrepancy");
1670 
      // The helper predicates are checked directly before running the validator.
1671  BOOST_CHECK_EQUAL(validator::HasStopInProtein(*cds, scope), true);
1672  BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
1673 
1674  // list of expected errors
      // Note: InternalStop is expected at Warning severity in this pass and at
      // Error severity in the pass without the exception below.
1675  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein", "[3] termination symbols in protein sequence (gene? - fake protein name)"));
1676  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "ExceptionProblem", "unclassified translation discrepancy is not a legal exception explanation"));
1677  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InternalStop", "3 internal stops (and illegal start codon). Genetic code [0]"));
1678  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
1679  "CDS has unnecessary translated product replaced exception"));
1680  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
1681 
1682  eval = validator.Validate(seh, options);
1683  CheckErrors(*eval, expected_errors);
1684  // WriteOutTemp(entry);
1685 
      // Pass 2: same data with the exception flag and text removed.
1686  CLEAR_ERRORS
1687  cds->ResetExcept();
1688  cds->ResetExcept_text();
1689  BOOST_CHECK_EQUAL(validator::HasStopInProtein(*cds, scope), true);
1690  BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
1691  BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
1692 
1693  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein", "[3] termination symbols in protein sequence (gene? - fake protein name)"));
1694  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StartCodon", "Illegal start codon (and 3 internal stops). Probably wrong genetic code [0]"));
1695  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "InternalStop", "3 internal stops (and illegal start codon). Genetic code [0]"));
1696  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
1697 
1698  eval = validator.Validate(seh, options);
1699  CheckErrors(*eval, expected_errors);
1700  // WriteOutTemp(entry);
1701 
      // Pass 3: the nucleotide sequence now begins with ATG.
      // NOTE(review): the line that obtains `nuc` (original 1702) is not visible.
1703  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCTAAAAATAAGAGTAAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
1704 
1705  // write out seq-entry
1706  // WriteOutTemp(entry);
1707 
      // Drop the StartCodon expectation (index 1, pushed second above) and leave
      // a null slot so index 2 still addresses the InternalStop expectation.
      // Presumably CheckErrors and CLEAR_ERRORS tolerate null entries - confirm.
1708  delete expected_errors[1];
1709  expected_errors[1] = nullptr;
1710  expected_errors[2]->SetErrMsg("3 internal stops. Genetic code [0]");
1711  eval = validator.Validate(seh, options);
1712  CheckErrors(*eval, expected_errors);
1713 
1714  CLEAR_ERRORS
1715 }
1716 
1717 
// Test case for PartialInconsistent on segmented sequences. The whole body is
// wrapped in #if 0 ... #endif, so the registered test case is currently empty.
// The disabled code turns the sequence into a two-segment segmented sequence,
// toggles the partial-start flag of the first segment location and the
// partial-stop flag of the second one, and validates after each combination.
// NOTE(review): the embedded numbering skips original lines 1722, 1724, 1753,
// 1769, 1785, 1794, 1811 and 1828. The setup statements and whatever followed
// each section label ("unknown", "complete", ...) are not visible here;
// presumably those lines set the MolInfo completeness - confirm.
1718 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_PartialInconsistent)
1719 {
1720 #if 0
1721  //We don't care about segmented sets any more
1723 
1725 
      // Replace the raw sequence data with a segmented representation made of
      // two locations on gb|AY123456 and gb|AY123457.
1726  entry->SetSeq().SetInst().ResetSeq_data();
1727  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
1728  CRef<CSeq_id> id(new CSeq_id("gb|AY123456"));
1729  CRef<CSeq_loc> loc1(new CSeq_loc(*id, 0, 3));
1730  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(loc1);
1731  CRef<CSeq_id> id2(new CSeq_id("gb|AY123457"));
1732  CRef<CSeq_loc> loc2(new CSeq_loc(*id2, 0, 2));
1733  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(loc2);
1734 
1735  // list of expected errors
1736  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "PartialInconsistent", "Partial segmented sequence without MolInfo partial"));
1737 
      // Each section below reuses the single expected error and re-validates for
      // several start/stop partial combinations; only the message is changed
      // (via SetErrMsg) from the "partial" section onwards.
1738  // not-set
1739  loc1->SetPartialStart(true, eExtreme_Biological);
1740  loc2->SetPartialStop(true, eExtreme_Biological);
1741  eval = validator.Validate(seh, options);
1742  CheckErrors(*eval, expected_errors);
1743  loc1->SetPartialStart(true, eExtreme_Biological);
1744  loc2->SetPartialStop(false, eExtreme_Biological);
1745  eval = validator.Validate(seh, options);
1746  CheckErrors(*eval, expected_errors);
1747  loc1->SetPartialStart(false, eExtreme_Biological);
1748  loc2->SetPartialStop(true, eExtreme_Biological);
1749  eval = validator.Validate(seh, options);
1750  CheckErrors(*eval, expected_errors);
1751 
1752  // unknown
1754 
1755  loc1->SetPartialStart(true, eExtreme_Biological);
1756  loc2->SetPartialStop(true, eExtreme_Biological);
1757  eval = validator.Validate(seh, options);
1758  CheckErrors(*eval, expected_errors);
1759  loc1->SetPartialStart(true, eExtreme_Biological);
1760  loc2->SetPartialStop(false, eExtreme_Biological);
1761  eval = validator.Validate(seh, options);
1762  CheckErrors(*eval, expected_errors);
1763  loc1->SetPartialStart(false, eExtreme_Biological);
1764  loc2->SetPartialStop(true, eExtreme_Biological);
1765  eval = validator.Validate(seh, options);
1766  CheckErrors(*eval, expected_errors);
1767 
1768  // complete
1770 
1771  loc1->SetPartialStart(true, eExtreme_Biological);
1772  loc2->SetPartialStop(true, eExtreme_Biological);
1773  eval = validator.Validate(seh, options);
1774  CheckErrors(*eval, expected_errors);
1775  loc1->SetPartialStart(true, eExtreme_Biological);
1776  loc2->SetPartialStop(false, eExtreme_Biological);
1777  eval = validator.Validate(seh, options);
1778  CheckErrors(*eval, expected_errors);
1779  loc1->SetPartialStart(false, eExtreme_Biological);
1780  loc2->SetPartialStop(true, eExtreme_Biological);
1781  eval = validator.Validate(seh, options);
1782  CheckErrors(*eval, expected_errors);
1783 
1784  // partial
1786 
      // Neither end is partial here, so the expected message is flipped.
1787  loc1->SetPartialStart(false, eExtreme_Biological);
1788  loc2->SetPartialStop(false, eExtreme_Biological);
1789  expected_errors[0]->SetErrMsg("Complete segmented sequence with MolInfo partial");
1790  eval = validator.Validate(seh, options);
1791  CheckErrors(*eval, expected_errors);
1792 
1793  // no-left
1795 
1796  loc1->SetPartialStart(true, eExtreme_Biological);
1797  loc2->SetPartialStop(true, eExtreme_Biological);
1798  expected_errors[0]->SetErrMsg("No-left inconsistent with segmented SeqLoc");
1799  eval = validator.Validate(seh, options);
1800  CheckErrors(*eval, expected_errors);
1801  loc1->SetPartialStart(false, eExtreme_Biological);
1802  loc2->SetPartialStop(true, eExtreme_Biological);
1803  eval = validator.Validate(seh, options);
1804  CheckErrors(*eval, expected_errors);
1805  loc1->SetPartialStart(false, eExtreme_Biological);
1806  loc2->SetPartialStop(false, eExtreme_Biological);
1807  eval = validator.Validate(seh, options);
1808  CheckErrors(*eval, expected_errors);
1809 
1810  // no-right
1812 
1813  loc1->SetPartialStart(true, eExtreme_Biological);
1814  loc2->SetPartialStop(true, eExtreme_Biological);
1815  expected_errors[0]->SetErrMsg("No-right inconsistent with segmented SeqLoc");
1816  eval = validator.Validate(seh, options);
1817  CheckErrors(*eval, expected_errors);
1818  loc1->SetPartialStart(true, eExtreme_Biological);
1819  loc2->SetPartialStop(false, eExtreme_Biological);
1820  eval = validator.Validate(seh, options);
1821  CheckErrors(*eval, expected_errors);
1822  loc1->SetPartialStart(false, eExtreme_Biological);
1823  loc2->SetPartialStop(false, eExtreme_Biological);
1824  eval = validator.Validate(seh, options);
1825  CheckErrors(*eval, expected_errors);
1826 
1827  // no-ends
1829 
1830  expected_errors[0]->SetErrMsg("No-ends inconsistent with segmented SeqLoc");
1831  loc1->SetPartialStart(true, eExtreme_Biological);
1832  loc2->SetPartialStop(false, eExtreme_Biological);
1833  eval = validator.Validate(seh, options);
1834  CheckErrors(*eval, expected_errors);
1835  loc1->SetPartialStart(false, eExtreme_Biological);
1836  loc2->SetPartialStop(true, eExtreme_Biological);
1837  eval = validator.Validate(seh, options);
1838  CheckErrors(*eval, expected_errors);
1839  loc1->SetPartialStart(false, eExtreme_Biological);
1840  loc2->SetPartialStop(false, eExtreme_Biological);
1841  eval = validator.Validate(seh, options);
1842  CheckErrors(*eval, expected_errors);
1843 
1844  CLEAR_ERRORS
1845 #endif
1846 }
1847 
1848 
// Validator test for the ShortSeq warning ("Sequence only N residues"). It
// checks a 3-residue protein and a 9-base nucleotide sequence, and checks
// that nothing is expected when the sequence id is a PDB id.
// NOTE(review): the embedded numbering skips original lines 1851, 1853, 1877,
// 1881, 1884, 1887, 1903, 1906 and 1909. The setup statements and the lines
// that originally separated the repeated Validate/CheckErrors pairs are not
// visible here; presumably they change the MolInfo completeness - confirm.
1849 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ShortSeq)
1850 {
1852 
1854 
      // Shrink the protein to three residues and trim the first feature's
      // location to end at position 2.
1855  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPR");
1856  entry->SetSeq().SetInst().SetLength(3);
1857  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetTo(2);
1858 
1859  // don't report if pdb
      // Both the Bioseq id and the feature location id are switched to the PDB
      // id; expected_errors is still empty at this point.
1860  CRef<CPDB_seq_id> pdb_id(new CPDB_seq_id());
1861  pdb_id->SetMol().Set("foo");
1862  entry->SetSeq().SetId().front()->SetPdb(*pdb_id);
1863  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().SetPdb(*pdb_id);
1864  scope.RemoveTopLevelSeqEntry(seh);
1865  seh = scope.AddTopLevelSeqEntry(*entry);
1866  eval = validator.Validate(seh, options);
1867  // AddChromosomeNoLocation(expected_errors, "pdb|foo| ");
1868  CheckErrors(*eval, expected_errors);
1869 
1870  // new test if no coding region
      // Back to a local id "good"; only PartialsInconsistent is expected for the
      // four validations that follow.
1871  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PartialsInconsistent", "Molinfo completeness and protein feature partials conflict"));
      // The accession was already passed to the constructor above; this sets the
      // same value again.
1872  expected_errors[0]->SetAccession("lcl|good");
1873  entry->SetSeq().SetId().front()->SetLocal().SetStr("good");
1874  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
1875  scope.RemoveTopLevelSeqEntry(seh);
1876  seh = scope.AddTopLevelSeqEntry(*entry);
1878 
1879  eval = validator.Validate(seh, options);
1880  CheckErrors(*eval, expected_errors);
1882  eval = validator.Validate(seh, options);
1883  CheckErrors(*eval, expected_errors);
1885  eval = validator.Validate(seh, options);
1886  CheckErrors(*eval, expected_errors);
1888  eval = validator.Validate(seh, options);
1889  CheckErrors(*eval, expected_errors);
1890 
1891  CLEAR_ERRORS
1892 
1893  // for all other completeness, report
1894  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortSeq", "Sequence only 3 residues"));
1895  // AddChromosomeNoLocation(expected_errors, "lcl|good");
      // Clear the completeness field on every MolInfo descriptor of the Bioseq.
1896  for (auto& it : entry->SetSeq().SetDescr().Set()) {
1897  if (it->IsMolinfo()) {
1898  it->SetMolinfo().ResetCompleteness();
1899  }
1900  }
1901  eval = validator.Validate(seh, options);
1902  CheckErrors(*eval, expected_errors);
1904  eval = validator.Validate(seh, options);
1905  CheckErrors(*eval, expected_errors);
1907  eval = validator.Validate(seh, options);
1908  CheckErrors(*eval, expected_errors);
1910  eval = validator.Validate(seh, options);
1911  CheckErrors(*eval, expected_errors);
1912 
1913  // nucleotide
      // A fresh entry from BuildGoodSeq() is added to the scope first and then
      // shortened to nine bases; the existing ShortSeq expectation is reused
      // with an updated message.
1914  scope.RemoveTopLevelSeqEntry(seh);
1915  entry = unit_test_util::BuildGoodSeq();
1916  seh = scope.AddTopLevelSeqEntry(*entry);
1917  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCTTT");
1918  entry->SetSeq().SetInst().SetLength(9);
1919  expected_errors[0]->SetErrMsg("Sequence only 9 residues");
1920  eval = validator.Validate(seh, options);
1921  CheckErrors(*eval, expected_errors);
1922 
1923  CLEAR_ERRORS
1924 
1925  // don't report if pdb
      // Same nine-base sequence under the PDB id; no errors are expected.
1926  entry->SetSeq().SetId().front()->SetPdb(*pdb_id);
1927  scope.RemoveTopLevelSeqEntry(seh);
1928  seh = scope.AddTopLevelSeqEntry(*entry);
1929  eval = validator.Validate(seh, options);
1930  // AddChromosomeNoLocation(expected_errors, "pdb|foo| ");
1931  CheckErrors(*eval, expected_errors);
1932 
1933  CLEAR_ERRORS
1934 }
1935 
1936 
1938 {
1939  bool rval = false;
1940 
1941  switch (tech) {
1944  case CMolInfo::eTech_both:
1948  rval = true;
1949  break;
1950  default:
1951  break;
1952  }
1953  return rval;
1954 }
1955 
1956 
1958 {
1959  CRef<CSeqdesc> desc(new CSeqdesc());
1962  if (entry->IsSeq()) {
1963  entry->SetSeq().SetDescr().Set().push_back(desc);
1964  } else if (entry->IsSet()) {
1965  entry->SetSet().SetDescr().Set().push_back(desc);
1966  }
1967 }
1968 
1969 
1970 static void SetRefGeneTrackingStatus(CRef<CSeq_entry> entry, string status)
1971 {
1972  if (entry->IsSeq()) {
1973  for (auto& it : entry->SetSeq().SetDescr().Set()) {
1974  if (it->IsUser() && it->GetUser().IsRefGeneTracking()) {
1975  it->SetUser().SetData().front()->SetData().SetStr(status);
1976  }
1977  }
1978  } else if (entry->IsSet()) {
1979  for (auto& it : entry->SetSet().SetDescr().Set()) {
1980  if (it->IsUser() && it->GetUser().IsRefGeneTracking()) {
1981  it->SetUser().SetData().front()->SetData().SetStr(status);
1982  }
1983  }
1984  }
1985 }
1986 
1987 
1988 static void SetTitle(CRef<CSeq_entry> entry, string title)
1989 {
1990  bool found = false;
1991 
1992  if (entry->IsSetDescr()) {
1993  auto& cont = entry->SetDescr().Set();
1994  for (auto it = cont.begin(); it != cont.end();) {
1995  if ((*it)->IsTitle()) {
1996  found = true;
1997  if (NStr::IsBlank((*it)->GetTitle())) {
1998  it = cont.erase(it);
1999  continue;
2000  } else {
2001  (*it)->SetTitle(title);
2002  }
2003  }
2004  ++it;
2005  }
2006  }
2007 
2008  if (!found && !NStr::IsBlank(title)) {
2009  CRef<CSeqdesc> desc(new CSeqdesc());
2010  desc->SetTitle(title);
2011  entry->SetSeq().SetDescr().Set().push_back(desc);
2012  }
2013 }
2014 
2015 
2016 static void AddGenbankKeyword(CRef<CSeq_entry> entry, string keyword)
2017 {
2018  bool found = false;
2019 
2020  for (auto& it : entry->SetSeq().SetDescr().Set()) {
2021  if (it->IsGenbank()) {
2022  it->SetGenbank().SetKeywords().push_back(keyword);
2023  found = true;
2024  }
2025  }
2026  if (!found) {
2027  CRef<CSeqdesc> desc(new CSeqdesc());
2028  desc->SetGenbank().SetKeywords().push_back(keyword);
2029  entry->SetSeq().SetDescr().Set().push_back(desc);
2030  }
2031 }
2032 
2033 
2035 {
2038 
2039  SetTech(entry, tech);
2040  eval = validator.Validate(seh, options);
2041  if (tech == CMolInfo::eTech_barcode) {
2042  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NoKeywordHasTechnique", "Molinfo.tech barcode without BARCODE keyword"));
2043  } else if (tech == CMolInfo::eTech_tsa) {
2044  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TSAseqGapProblem", "TSA Seq_gap NULL"));
2045  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2046  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"genomic\" is not appropriate for sequences that use the TSA technique."));
2047  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAseqGapProblem", "TSA submission includes wrong gap type. Gaps for TSA should be Assembly Gaps with linkage evidence."));
2048  } else if (tech == CMolInfo::eTech_wgs) {
2049  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WGSseqGapProblem", "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
2050  }
2051  if (tech == CMolInfo::eTech_wgs) {
2052  AddChromosomeNoLocation(expected_errors, "lcl|good");
2053  }
2054 
2055  CheckErrors(*eval, expected_errors);
2056 
2057  CLEAR_ERRORS
2058 }
2059 
2060 
2062 {
2065 
2066  SetTech(entry, tech);
2067  eval = validator.Validate(seh, options);
2068  if (IsProteinTech(tech)) {
2069  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide", "Nucleic acid with protein sequence method"));
2070  } else if (tech == CMolInfo::eTech_est) {
2071  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ESTshouldBemRNA", "EST sequence should be mRNA"));
2072  }
2073  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDeltaSeq", "Delta seq technique should not be [" + NStr::UIntToString(tech) + "]"));
2074  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2075  eval = validator.Validate(seh, options);
2076  CheckErrors(*eval, expected_errors);
2077  CLEAR_ERRORS
2078 }
2079 
2080 
2082 {
2084 
2086 
2087  CRef<CDelta_seq> start_gap_seg(new CDelta_seq());
2088  start_gap_seg->SetLiteral().SetLength(10);
2089  start_gap_seg->SetLiteral().SetSeq_data().SetGap();
2090  entry->SetSeq().SetInst().SetExt().SetDelta().Set().insert(entry->SetSeq().SetInst().SetExt().SetDelta().Set().begin(), start_gap_seg);
2091  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
2092  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
2093  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("AAATTTGGGC", CSeq_inst::eMol_dna);
2094  CRef<CDelta_seq> end_gap_seg(new CDelta_seq());
2095  end_gap_seg->SetLiteral().SetLength(10);
2096  end_gap_seg->SetLiteral().SetSeq_data().SetGap();
2097  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(end_gap_seg);
2098  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
2099  entry->SetSeq().SetInst().SetLength(94);
2100  SetTech(entry, tech);
2101  if (tech == CMolInfo::eTech_wgs) {
2102  AddChromosomeNoLocation(expected_errors, "lcl|good");
2103  }
2104  // expected_errors.push_back(new CExpectedError("lcl|good", tech == CMolInfo::eTech_wgs ? eDiag_Warning : eDiag_Error, "BadDeltaSeq", "First delta seq component is a gap"));
2105  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDeltaSeq", "There is 1 adjacent gap in delta seq"));
2106  // expected_errors.push_back(new CExpectedError("lcl|good", tech == CMolInfo::eTech_wgs ? eDiag_Warning : eDiag_Error, "BadDeltaSeq", "Last delta seq component is a gap"));
2107  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at beginning of sequence"));
2108  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at end of sequence"));
2109  /*
2110  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
2111  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
2112  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
2113  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
2114  */
2115  if (tech == CMolInfo::eTech_wgs) {
2116  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WGSseqGapProblem", "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
2117  }
2118  eval = validator.Validate(seh, options);
2119  CheckErrors(*eval, expected_errors);
2120 
2121  CLEAR_ERRORS
2122 }
2123 
2124 
// Validator test for BadDeltaSeq. It first checks that nothing is expected
// for NC_ and NT_ accessions, then walks over technique values and treats
// those in allowed_list differently from the rest.
// NOTE(review): the embedded numbering skips original lines 2127, 2129, 2174,
// 2184, 2186, 2192 and 2193. The setup statements, the header of the loop that
// defines `i`, and the calls made in the allowed / not-allowed branches are
// not visible in this excerpt.
2125 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_BadDeltaSeq)
2126 {
2128 
2130 
      // Set the technique of every MolInfo descriptor to "derived".
2131  for (auto& it : entry->SetSeq().SetDescr().Set()) {
2132  if (it->IsMolinfo()) {
2133  it->SetMolinfo().SetTech(CMolInfo::eTech_derived);
2134  }
2135  }
2136 
2137  // don't report if NT or NC
      // expected_errors is empty for both accession checks below.
2138  scope.RemoveTopLevelSeqEntry(seh);
2139  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
2140  seh = scope.AddTopLevelSeqEntry(*entry);
2141  eval = validator.Validate(seh, options);
2142  // AddChromosomeNoLocation(expected_errors, "ref|NC_123456|");
2143  CheckErrors(*eval, expected_errors);
2144  CLEAR_ERRORS
2145 
2146  entry->SetSeq().SetId().front()->SetOther().SetAccession("NT_123456");
2147  scope.RemoveTopLevelSeqEntry(seh);
2148  seh = scope.AddTopLevelSeqEntry(*entry);
2149  eval = validator.Validate(seh, options);
2150  // AddChromosomeNoLocation(expected_errors, "ref|NT_123456|");
2151  CheckErrors(*eval, expected_errors);
2152  CLEAR_ERRORS
2153 
2154  // don't report if gen-prod-set
2155 
      // Restore the local id "good" before the technique checks.
2156  entry->SetSeq().SetId().front()->SetLocal().SetStr("good");
2157  scope.RemoveTopLevelSeqEntry(seh);
2158  seh = scope.AddTopLevelSeqEntry(*entry);
2159 
2160  // allowed tech values
2161  vector<CMolInfo::TTech> allowed_list;
2162  allowed_list.push_back(CMolInfo::eTech_htgs_0);
2163  allowed_list.push_back(CMolInfo::eTech_htgs_1);
2164  allowed_list.push_back(CMolInfo::eTech_htgs_2);
2165  allowed_list.push_back(CMolInfo::eTech_htgs_3);
2166  allowed_list.push_back(CMolInfo::eTech_wgs);
2167  allowed_list.push_back(CMolInfo::eTech_composite_wgs_htgs);
2168  allowed_list.push_back(CMolInfo::eTech_unknown);
2169  allowed_list.push_back(CMolInfo::eTech_standard);
2170  allowed_list.push_back(CMolInfo::eTech_htc);
2171  allowed_list.push_back(CMolInfo::eTech_barcode);
2172  allowed_list.push_back(CMolInfo::eTech_tsa);
2173 
      // Linear membership test of the current technique `i` in allowed_list.
      // The enclosing loop header (original line 2174) is not visible here.
2175  bool allowed = false;
2176  for (CMolInfo::TTech it : allowed_list) {
2177  if (it == i) {
2178  allowed = true;
2179  break;
2180  }
2181  }
      // The statements inside both branches are missing from this excerpt.
2182  if (allowed) {
2183  // don't report for htgs_0
2185  } else {
2187  }
2188  }
2189 
2190  CLEAR_ERRORS
2191 
2194 
2195  CLEAR_ERRORS
2196 }
2197 
2198 
// Reconfigures a Seq-gap in place: resets it, sets its type, and rebuilds its
// linkage-evidence list with one entry per element of `linkage_evidence`.
//
// gap              - gap object to modify.
// gap_type         - type stored via SetType().
// is_linked        - when false, the linkage field is reset; the statement
//                    executed when true is not visible in this excerpt.
// linkage_evidence - evidence types to add, in order (duplicates are kept).
// NOTE(review): the embedded numbering skips original lines 2204 and 2210,
// i.e. the body of the is_linked branch and the declaration of `ev`.
2199 void AdjustGap(CSeq_gap& gap, CSeq_gap::EType gap_type, bool is_linked, vector<CLinkage_evidence::EType> linkage_evidence)
2200 {
2201  gap.Reset();
2202  gap.SetType(gap_type);
2203  if (is_linked) {
2205  } else {
2206  gap.ResetLinkage();
2207  }
      // Start from an empty evidence list, then append each requested type.
2208  gap.ResetLinkage_evidence();
2209  for (auto it : linkage_evidence) {
2211  ev->SetType(it);
2212  gap.SetLinkage_evidence().push_back(ev);
2213  }
2214 }
2215 
2216 
// Validator test for SeqGapBadLinkage. Each scenario rewrites every gap
// literal of the delta sequence (mostly through AdjustGap), re-adds the entry
// to the scope, and checks a single expected SeqGapBadLinkage message.
// NOTE(review): the embedded numbering skips original lines 2219 and 2231, so
// the setup statements (entry, scope, seh, validator, options, eval,
// expected_errors) are not visible in this excerpt.
2217 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_SeqGapBadLinkage)
2218 {
2220 
      // Scenario 1: gap type short_arm with one linkage-evidence entry.
2221  vector<CLinkage_evidence::EType> evidence;
2222  evidence.push_back(CLinkage_evidence::eType_align_genus);
      // `it` is taken by value; presumably it is a CRef, so the modification
      // still reaches the shared Delta-seq object - confirm.
2223  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2224  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2225  && it->GetLiteral().GetSeq_data().IsGap()) {
2226  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2227  CSeq_gap::eType_short_arm, true, evidence);
2228  }
2229  }
2230 
2232 
2233  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2234  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
2235  "SeqGapBadLinkage", "Seq-gap of type 3 should not have linkage evidence"));
2236 
2237  eval = validator.Validate(seh, options);
2238  CheckErrors(*eval, expected_errors);
2239 
2240  CLEAR_ERRORS
2241 
      // Scenario 2: linkage and type are reset directly; the evidence list set
      // in scenario 1 is not touched here.
2242  scope.RemoveTopLevelSeqEntry(seh);
2243  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2244  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2245  && it->GetLiteral().GetSeq_data().IsGap()) {
2246  CSeq_gap& gap = it->SetLiteral().SetSeq_data().SetGap();
2247  gap.ResetLinkage();
2248  gap.ResetType();
2249  }
2250  }
2251  seh = scope.AddTopLevelSeqEntry(*entry);
2252 
2253  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2254  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
2255  "SeqGapBadLinkage", "Seq-gap with linkage evidence must have linkage field set to linked"));
2256 
2257  eval = validator.Validate(seh, options);
2258  CheckErrors(*eval, expected_errors);
2259 
2260  CLEAR_ERRORS
2261 
      // Scenario 3: a second align_genus entry is pushed, so the evidence vector
      // now holds the same type twice.
2262  scope.RemoveTopLevelSeqEntry(seh);
2263  evidence.push_back(CLinkage_evidence::eType_align_genus);
2264  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2265  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2266  && it->GetLiteral().GetSeq_data().IsGap()) {
2267  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2268  CSeq_gap::eType_fragment, true, evidence);
2269  }
2270  }
2271  seh = scope.AddTopLevelSeqEntry(*entry);
2272 
2273  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2274  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
2275  "SeqGapBadLinkage", "Linkage evidence 'align genus' appears 2 times"));
2276 
2277  eval = validator.Validate(seh, options);
2278  CheckErrors(*eval, expected_errors);
2279 
2280  CLEAR_ERRORS
2281 
      // Scenario 4: replace the duplicate with "unspecified", leaving the vector
      // as { align_genus, unspecified }.
2282  evidence.pop_back();
2283  evidence.push_back(CLinkage_evidence::eType_unspecified);
2284  scope.RemoveTopLevelSeqEntry(seh);
2285  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2286  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2287  && it->GetLiteral().GetSeq_data().IsGap()) {
2288  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2289  CSeq_gap::eType_fragment, true, evidence);
2290  }
2291  }
2292  seh = scope.AddTopLevelSeqEntry(*entry);
2293 
2294  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2295  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
2296  "SeqGapBadLinkage", "Seq-gap type has unspecified and additional linkage evidence"));
2297 
2298  eval = validator.Validate(seh, options);
2299  CheckErrors(*eval, expected_errors);
2300 
2301  CLEAR_ERRORS
2302 
      // Scenario 5: only "unspecified" evidence, combined with gap type unknown.
2303  scope.RemoveTopLevelSeqEntry(seh);
2304  evidence.clear();
2305  evidence.push_back(CLinkage_evidence::eType_unspecified);
2306  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2307  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2308  && it->GetLiteral().GetSeq_data().IsGap()) {
2309  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2310  CSeq_gap::eType_unknown, true, evidence);
2311  }
2312  }
2313  seh = scope.AddTopLevelSeqEntry(*entry);
2314 
2315  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2316  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
2317  "SeqGapBadLinkage", "Single Seq-gap has unknown type and unspecified linkage"));
2318 
2319  eval = validator.Validate(seh, options);
2320  CheckErrors(*eval, expected_errors);
2321 
2322  CLEAR_ERRORS
2323 
      // Scenario 6: append a second gap configured the same way, followed by
      // another 50-base literal, and expect the "All 2 Seq-gaps" message.
2324  scope.RemoveTopLevelSeqEntry(seh);
2325  CRef<CDelta_seq> gap_seg(new CDelta_seq());
2326  gap_seg->SetLiteral().SetLength(10);
2327  AdjustGap(gap_seg->SetLiteral().SetSeq_data().SetGap(),
2328  CSeq_gap::eType_unknown, true, evidence);
2329 
2330  // adjust delta to avoid errors about large number of Ns in first and last 50 bp
2331  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("CCCATGATGATGTACCGTACGTTTTCCCATGATGATGTACCGTACGTTTT");
2332  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetLength(50);
2333  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
2334  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGATGTACCGTACGTTTTCCCATGATGATGTACCGTACGTTTT", CSeq_inst::eMol_dna);
2335  entry->SetSeq().SetInst().SetLength(132);
2336 
2337  seh = scope.AddTopLevelSeqEntry(*entry);
2338 
2339  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2340  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
2341  "SeqGapBadLinkage", "All 2 Seq-gaps have unknown type and unspecified linkage"));
2342 
2343  eval = validator.Validate(seh, options);
2344  CheckErrors(*eval, expected_errors);
2345 
2346  CLEAR_ERRORS
2347 }
2348 
2349 
2350 void ChangeErrorAcc(vector<CExpectedError*> expected_errors, const string& acc)
2351 {
2352  for (auto it : expected_errors) {
2353  if (it) {
2354  it->SetAccession(acc);
2355  }
2356  }
2357 }
2358 
2359 
2360 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ConflictingIdsOnBioseq)
2361 {
2363 
2365 
2366  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (lcl|good - lcl|bad)"));
2367  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2368 
2369  // local IDs
2370  scope.RemoveTopLevelSeqEntry(seh);
2371  CRef<CSeq_id> id2(new CSeq_id());
2372  id2->SetLocal().SetStr("bad");
2373  entry->SetSeq().SetId().push_back(id2);
2374  seh = scope.AddTopLevelSeqEntry(*entry);
2375  eval = validator.Validate(seh, options);
2376  CheckErrors(*eval, expected_errors);
2377 
2378  // GIBBSQ
2379  scope.RemoveTopLevelSeqEntry(seh);
2380  CRef<CSeq_id> id1 = entry->SetSeq().SetId().front();
2381  id1->SetGibbsq(1);
2382  id2->SetGibbsq(2);
2383  seh = scope.AddTopLevelSeqEntry(*entry);
2384  ChangeErrorAcc(expected_errors, "bbs|1");
2385  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (bbs|1 - bbs|2)");
2386  eval = validator.Validate(seh, options);
2387  CheckErrors(*eval, expected_errors);
2388 
2389  // GIBBMT
2390  scope.RemoveTopLevelSeqEntry(seh);
2391  id1->SetGibbmt(1);
2392  id2->SetGibbmt(2);
2393  seh = scope.AddTopLevelSeqEntry(*entry);
2394  ChangeErrorAcc(expected_errors, "bbm|1");
2395  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (bbm|1 - bbm|2)");
2396  eval = validator.Validate(seh, options);
2397  CheckErrors(*eval, expected_errors);
2398 
2399  // GI
2400  scope.RemoveTopLevelSeqEntry(seh);
2401  id1->SetGi(GI_CONST(1));
2402  id2->SetGi(GI_CONST(2));
2403  CRef<CSeq_id> id3(new CSeq_id("gb|AY123456.1"));
2404  entry->SetSeq().SetId().push_back(id3);
2405  seh = scope.AddTopLevelSeqEntry(*entry);
2406  ChangeErrorAcc(expected_errors, "gb|AY123456.1|");
2407  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (gi|1 - gi|2)");
2408  eval = validator.Validate(seh, options);
2409  CheckErrors(*eval, expected_errors);
2410  entry->SetSeq().SetId().pop_back();
2411 
2412  // GIIM
2413  scope.RemoveTopLevelSeqEntry(seh);
2414  id1->SetGiim().SetId(1);
2415  id1->SetGiim().SetDb("foo");
2416  id2->SetGiim().SetId(2);
2417  id2->SetGiim().SetDb("foo");
2418  seh = scope.AddTopLevelSeqEntry(*entry);
2419  CLEAR_ERRORS
2420 
2421  expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "IdOnMultipleBioseqs", "BioseqFind (gim|1) unable to find itself - possible internal error"));
2422  expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gim|1 - gim|2)"));
2423  expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "IdOnMultipleBioseqs", "BioseqFind (gim|2) unable to find itself - possible internal error"));
2424  // AddChromosomeNoLocation(expected_errors, "gim|1");
2425  eval = validator.Validate(seh, options);
2426  CheckErrors(*eval, expected_errors);
2427  CLEAR_ERRORS
2428 
2429  // patent
2430  scope.RemoveTopLevelSeqEntry(seh);
2431  id1->SetPatent().SetSeqid(1);
2432  id1->SetPatent().SetCit().SetCountry("USA");
2433  id1->SetPatent().SetCit().SetId().SetNumber("1");
2434  id2->SetPatent().SetSeqid(2);
2435  id2->SetPatent().SetCit().SetCountry("USA");
2436  id2->SetPatent().SetCit().SetId().SetNumber("2");
2437  seh = scope.AddTopLevelSeqEntry(*entry);
2438  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (pat|USA|1|1 - pat|USA|2|2)"));
2439  // AddChromosomeNoLocation(expected_errors, "pat|USA|1|1");
2440  eval = validator.Validate(seh, options);
2441  CheckErrors(*eval, expected_errors);
2442 
2443  // pdb
2444  scope.RemoveTopLevelSeqEntry(seh);
2445  id1->SetPdb().SetMol().Set("good");
2446  id2->SetPdb().SetMol().Set("badd");
2447  seh = scope.AddTopLevelSeqEntry(*entry);
2448  ChangeErrorAcc(expected_errors, "pdb|good| ");
2449  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (pdb|good| - pdb|badd| )");
2450  eval = validator.Validate(seh, options);
2451  CheckErrors(*eval, expected_errors);
2452 
2453  // general
2454  scope.RemoveTopLevelSeqEntry(seh);
2455  id1->SetGeneral().SetDb("a");
2456  id1->SetGeneral().SetTag().SetStr("good");
2457  id2->SetGeneral().SetDb("a");
2458  id2->SetGeneral().SetTag().SetStr("bad");
2459  seh = scope.AddTopLevelSeqEntry(*entry);
2460  ChangeErrorAcc(expected_errors, "gnl|a|good");
2461  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (gnl|a|good - gnl|a|bad)");
2462  eval = validator.Validate(seh, options);
2463  CheckErrors(*eval, expected_errors);
2464 
2465  CLEAR_ERRORS
2466  // should get no error if db values are different
2467  scope.RemoveTopLevelSeqEntry(seh);
2468  id2->SetGeneral().SetDb("b");
2469  seh = scope.AddTopLevelSeqEntry(*entry);
2470  // AddChromosomeNoLocation(expected_errors, "gnl|a|good");
2471  eval = validator.Validate(seh, options);
2472  CheckErrors(*eval, expected_errors);
2473 
2474  // genbank
2475  scope.RemoveTopLevelSeqEntry(seh);
2476  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gb|AY222222|)"));
2477  id1->SetGenbank().SetAccession("AY123456");
2478  id2->SetGenbank().SetAccession("AY222222");
2479  seh = scope.AddTopLevelSeqEntry(*entry);
2480  eval = validator.Validate(seh, options);
2481  CheckErrors(*eval, expected_errors);
2482 
2483  // try genbank with accession same, versions different
2484  scope.RemoveTopLevelSeqEntry(seh);
2485  id2->SetGenbank().SetAccession("AY123456");
2486  id2->SetGenbank().SetVersion(2);
2487  seh = scope.AddTopLevelSeqEntry(*entry);
2488  CLEAR_ERRORS
2489  // AddChromosomeNoLocation(expected_errors, "gb|AY123456.2|");
2490  expected_errors.push_back(new CExpectedError("gb|AY123456.2|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gb|AY123456.2|)"));
2491  eval = validator.Validate(seh, options);
2492  CheckErrors(*eval, expected_errors);
2493 
2494  // try similar id type
2495  scope.RemoveTopLevelSeqEntry(seh);
2496  id2->SetGpipe().SetAccession("AY123456");
2497  seh = scope.AddTopLevelSeqEntry(*entry);
2498  CLEAR_ERRORS
2499  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gpp|AY123456|)"));
2500  // AddChromosomeNoLocation(expected_errors, "gb|AY123456|");
2501  eval = validator.Validate(seh, options);
2502  CheckErrors(*eval, expected_errors);
2503 
2504  // LRG
2505  scope.RemoveTopLevelSeqEntry(seh);
2506  id1->SetGeneral().SetDb("LRG");
2507  id1->SetGeneral().SetTag().SetStr("good");
2508  seh = scope.AddTopLevelSeqEntry(*entry);
2509  ChangeErrorAcc(expected_errors, "gpp|AY123456|");
2510  expected_errors[0]->SetErrMsg("LRG sequence needs NG_ accession");
2511  expected_errors[0]->SetSeverity(eDiag_Critical);
2512  eval = validator.Validate(seh, options);
2513  CheckErrors(*eval, expected_errors);
2514  // no error if has NG
2515  scope.RemoveTopLevelSeqEntry(seh);
2516  id2->SetOther().SetAccession("NG_123456");
2517  seh = scope.AddTopLevelSeqEntry(*entry);
2518  CLEAR_ERRORS
2519  // AddChromosomeNoLocation(expected_errors, "ref|NG_123456|");
2520  eval = validator.Validate(seh, options);
2521  CheckErrors(*eval, expected_errors);
2522 
2523  CLEAR_ERRORS
2524 }
2525 
2526 
// Test case: after Seq-inst.mol is set to the generic CSeq_inst::eMol_na value, the
// validator is expected to report exactly one Error-level "MolNuclAcid" diagnostic
// on lcl|good.
// NOTE(review): this listing skips source lines 2529 and 2531, so the declarations of
// entry, seh, validator, options, eval and expected_errors are not visible here;
// presumably they come from a test-entry builder and a setup macro -- confirm.
2527 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_MolNuclAcid)
2528 {
2530 
2532 
2533  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2534  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MolNuclAcid", "Bioseq.mol is type nucleic acid"));
2535 
 // Switch the molecule type to the non-specific "na" value, then validate.
2536  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_na);
2537  eval = validator.Validate(seh, options);
2538  CheckErrors(*eval, expected_errors);
2539 
2540  CLEAR_ERRORS
2541 }
2542 
2543 
// Test case: checks the diagnostics produced for combinations of Seq-inst.mol and
// MolInfo.tech. For each tech value `i` the molecule is set to DNA and the expected
// error list is assembled from the tech-specific conditions below; for the techs in
// genomic_list the check is repeated with the molecule set to RNA.
// NOTE(review): this listing skips source lines 2546, 2548, 2561, 2571 and 2586.
// The header of the loop that declares `i` (and whose body closes at source line
// 2608) is therefore not visible, nor are the declarations of entry, seh, validator,
// options, eval and expected_errors, nor whatever sets the biomol that the expected
// messages refer to ("RNA" / "cRNA") -- confirm against the real source.
2544 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ConflictingBiomolTech)
2545 {
2547 
2549 
2550  // allowed tech values
 // Techs for which the "should be genomic" / "should not be RNA" diagnostics are expected.
2551  vector<CMolInfo::TTech> genomic_list;
2552  genomic_list.push_back(CMolInfo::eTech_sts);
2553  genomic_list.push_back(CMolInfo::eTech_survey);
2554  genomic_list.push_back(CMolInfo::eTech_wgs);
2555  genomic_list.push_back(CMolInfo::eTech_htgs_0);
2556  genomic_list.push_back(CMolInfo::eTech_htgs_1);
2557  genomic_list.push_back(CMolInfo::eTech_htgs_2);
2558  genomic_list.push_back(CMolInfo::eTech_htgs_3);
2559  genomic_list.push_back(CMolInfo::eTech_composite_wgs_htgs);
2560 
 // Linear membership test: is the current tech `i` one of the genomic techs?
2562  bool genomic = false;
2563  for (CMolInfo::TTech it : genomic_list) {
2564  if (it == i) {
2565  genomic = true;
2566  break;
2567  }
2568  }
2569  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
2570  SetTech(entry, i);
 // Expected for every tech value while the molecule is DNA; this is the first
 // entry pushed in the iteration, which the genomic branch later deletes.
2572  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolType", "Molecule type (DNA) does not match biomol (RNA)"));
2573  if (i == CMolInfo::eTech_wgs) {
2574  AddChromosomeNoLocation(expected_errors, "lcl|good");
2575  }
2576  if (i == CMolInfo::eTech_est) {
2577  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ESTshouldBemRNA", "EST sequence should be mRNA"));
2578  }
2579  if (i == CMolInfo::eTech_htgs_2) {
2580  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 raw seq has no gaps and no graphs"));
2581  }
2582  if (genomic) {
2583  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "HTGS_STS_GSS_WGSshouldBeGenomic", "HTGS/STS/GSS/WGS sequence should be genomic"));
2584  eval = validator.Validate(seh, options);
2585  CheckErrors(*eval, expected_errors);
 // Second pass with the molecule switched to RNA: the element at index 0 is
 // deleted and its slot nulled, and the last entry (pushed just above) is
 // rewritten to the "should not be RNA" diagnostic.
 // NOTE(review): presumably CheckErrors tolerates null entries -- confirm.
2587  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
2588  delete expected_errors[0];
2589  expected_errors[0] = nullptr;
2590  expected_errors.back()->SetErrCode("HTGS_STS_GSS_WGSshouldNotBeRNA");
2591  expected_errors.back()->SetErrMsg("HTGS/STS/GSS/WGS sequence should not be RNA");
2592  eval = validator.Validate(seh, options);
2593  CheckErrors(*eval, expected_errors);
2594  } else {
2595  if (IsProteinTech(i)) {
2596  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide", "Nucleic acid with protein sequence method"));
2597  }
2598  if (i == CMolInfo::eTech_barcode) {
2599  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NoKeywordHasTechnique", "Molinfo.tech barcode without BARCODE keyword"));
2600  } else if (i == CMolInfo::eTech_tsa) {
2601  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2602  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"cRNA\" is not appropriate for sequences that use the TSA technique."));
2603  }
2604  eval = validator.Validate(seh, options);
2605  CheckErrors(*eval, expected_errors);
2606  }
2607  CLEAR_ERRORS
2608  }
2609 
 // Stand-alone TSA-on-DNA case through the full Validate() entry point.
2610  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
2611  SetTech(entry, CMolInfo::eTech_tsa);
2612  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2613  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolType", "Molecule type (DNA) does not match biomol (RNA)"));
2614  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2615  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"cRNA\" is not appropriate for sequences that use the TSA technique."));
2616  eval = validator.Validate(seh, options);
2617  CheckErrors(*eval, expected_errors);
2618 
2619  CLEAR_ERRORS
2620 
 // The dedicated GetTSAConflictingBiomolTechErrors() entry point is called twice,
 // once with the Seq-entry handle and once with the complete Bioseq; both calls
 // are expected to yield only the TSAshouldBNotBeDNA diagnostic.
2621  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2622  eval = validator.GetTSAConflictingBiomolTechErrors(seh);
2623  CheckErrors(*eval, expected_errors);
2624  eval = validator.GetTSAConflictingBiomolTechErrors(*(seh.GetSeq().GetCompleteBioseq()));
2625  CheckErrors(*eval, expected_errors);
2626  CLEAR_ERRORS
2627 }
2628 
2629 
// Test case: an "other" Seq-id (accession NC_123456) whose name contains a space
// ("good one") is expected to produce a Critical "SeqIdNameHasSpace" diagnostic.
// NOTE(review): this listing skips source lines 2632 and 2636, so the declarations of
// entry, seh, validator, options, eval and expected_errors are not visible here -- confirm.
2630 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_SeqIdNameHasSpace)
2631 {
2633  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
2634  entry->SetSeq().SetId().front()->SetOther().SetName("good one");
2635 
2637 
2638  expected_errors.push_back(new CExpectedError("ref|NC_123456|good one", eDiag_Critical, "SeqIdNameHasSpace", "Seq-id.name 'good one' should be a single word without any spaces"));
2639  // AddChromosomeNoLocation(expected_errors, "ref|NC_123456|good one");
2640 
2641  eval = validator.Validate(seh, options);
2642  CheckErrors(*eval, expected_errors);
2643 
2644  CLEAR_ERRORS
2645 }
2646 
2647 
// Test case kept as an empty shell: the whole body sits inside "#if 0" (marked
// "removed per VR-779"), so nothing is compiled or executed. The disabled code
// built a segmented Bioseq whose two segments both reference gb|AY123456 and
// expected SeqLocOrder and DuplicateSegmentReferences diagnostics.
// NOTE(review): several lines of the disabled body are missing from this listing
// (gaps in the embedded numbering, e.g. 2652, 2663, 2668, 2677-2679, 2685), so the
// disabled code is not complete as shown.
2648 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_DuplicateSegmentReferences)
2649 {
2650 #if 0
2651  // removed per VR-779
2653  entry->SetSeq().SetInst().ResetSeq_data();
2654  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
2655  CRef<CSeq_loc> seg1(new CSeq_loc());
2656  seg1->SetWhole().SetGenbank().SetAccession("AY123456");
2657  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(seg1);
2658  CRef<CSeq_loc> seg2(new CSeq_loc());
2659  seg2->SetWhole().SetGenbank().SetAccession("AY123456");
2660  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(seg2);
2661  entry->SetSeq().SetInst().SetLength(970);
2662 
2664  // need to call this statement before calling AddDefaults
2665  // to make sure that we can fetch the sequence referenced by the
2666  // delta sequence so that we can detect that the loc in the
2667  // delta sequence is longer than the referenced sequence
2669  CScope scope(*objmgr);
2670  scope.AddDefaults();
2671  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
2672 
2673  CValidator validator(*objmgr);
2674 
2675  // Set validator options
2676  unsigned int options = CValidator::eVal_need_isojta
2680 
2681  // list of expected errors
2682  vector<CExpectedError*> expected_errors;
2683  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLocOrder", "Segmented BioseqIntervals out of order in SeqLoc [[gb|AY123456|, gb|AY123456|]]"));
2684  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "DuplicateSegmentReferences", "Segmented sequence has multiple references to gb|AY123456"));
2686 
2687  eval = validator.Validate(seh, options);
2688  CheckErrors(*eval, expected_errors);
2689 
 // Second scenario: the second segment becomes an interval (0..484) on the same
 // accession; the duplicate-reference expectation is downgraded to a Warning.
2690  seg2->SetInt().SetId().SetGenbank().SetAccession("AY123456");
2691  seg2->SetInt().SetFrom(0);
2692  seg2->SetInt().SetTo(484);
2693  expected_errors[0]->SetErrMsg("Segmented BioseqIntervals out of order in SeqLoc [[gb|AY123456|, 1-485]]");
2694  expected_errors[1]->SetSeverity(eDiag_Warning);
2695  expected_errors[1]->SetErrMsg("Segmented sequence has multiple references to gb|AY123456 that are not SEQLOC_WHOLE");
2696  eval = validator.Validate(seh, options);
2697  CheckErrors(*eval, expected_errors);
2698 
2699  CLEAR_ERRORS
2700 #endif
2701 }
2702 
2703 
// Test case: a nucleotide sequence ending in six Ns paired with a protein ending in
// two Xs. Expected diagnostics: TerminalNs (Warning) and HighNpercent3Prime (Info) on
// lcl|nuc, and TrailingX (Warning) on lcl|prot.
// NOTE(review): this listing skips source lines 2706, 2710, 2715 and 2722, so the
// declarations of entry, cds_feat, seh, validator, options, eval and expected_errors
// are not visible here -- confirm against the real source.
2704 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_TrailingX)
2705 {
 // First member of the set is used as the nucleotide, last member as the protein.
2707  CRef<CSeq_entry> nuc = entry->SetSet().SetSeq_set().front();
2708  CRef<CSeq_entry> prot = entry->SetSet().SetSeq_set().back();
2709  CRef<CSeq_feat> prot_feat = prot->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
 // 27-base nucleotide ending in NNNNNN and 9-residue protein ending in XX.
2711  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATANNNNNN");
2712  nuc->SetSeq().SetInst().SetLength(27);
2713  prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPRKTEIXX");
2714  prot->SetSeq().SetInst().SetLength(9);
 // Both the protein feature and the CDS are flagged as partial at the biological stop.
2716  prot_feat->SetLocation().SetInt().SetTo(8);
2717  prot_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
2718  prot_feat->SetPartial(true);
2719  cds_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
2720  cds_feat->SetPartial(true);
2721 
2723 
2724  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TerminalNs", "N at end of sequence"));
2725  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "TrailingX", "Sequence ends in 2 trailing Xs"));
2726  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "HighNpercent3Prime",
2727  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
2728  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
2729 
2730  eval = validator.Validate(seh, options);
2731  CheckErrors(*eval, expected_errors);
2732 
2733  CLEAR_ERRORS
2734 }
2735 
2736 
// Helper: puts the GenBank accession `id_str` on the protein (the nucleotide gets the
// local id "nuc") and expects a single Error-level BadSeqIdFormat diagnostic
// "Bad accession <id_str>" reported against gb|<id_str>|.
// @param id_str  accession string that should be rejected when used on a protein.
// NOTE(review): this listing skips source lines 2740 and 2751, so the declarations of
// entry, seh, validator, options, eval and expected_errors are not visible here.
// good_prot_id is built but not used in the visible lines.
2737 void TestBadProtId(const string& id_str)
2738 {
2739  // bad for just prots
2741  CRef<CSeq_id> bad_id(new CSeq_id());
2742  bad_id->SetGenbank().SetAccession(id_str);
2743  CRef<CSeq_id> good_nuc_id(new CSeq_id());
2744  good_nuc_id->SetLocal().SetStr("nuc");
2745  CRef<CSeq_id> good_prot_id(new CSeq_id());
2746  good_prot_id->SetLocal().SetStr("prot");
2747 
2748  unit_test_util::ChangeNucId(entry, good_nuc_id);
2749  unit_test_util::ChangeProtId(entry, bad_id);
2750 
2752 
2753  expected_errors.push_back(new CExpectedError("gb|" + id_str + "|", eDiag_Error, "BadSeqIdFormat", "Bad accession " + id_str));
2754  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
2755 
2756  eval = validator.Validate(seh, options);
2757  CheckErrors(*eval, expected_errors);
2758  CLEAR_ERRORS
2759 }
2760 
2761 
// Helper: puts the GenBank accession `id_str` on the protein (the nucleotide gets the
// local id "nuc") and validates. No expectation is pushed in the visible lines, so
// the check is against whatever expected_errors holds after the setup that is not
// visible here (presumably empty, i.e. no diagnostics expected -- confirm).
// @param id_str  accession string that should be accepted when used on a protein.
// NOTE(review): the local named bad_id actually carries the accession expected to be
// valid, and good_prot_id is built but not used in the visible lines. This listing
// skips source lines 2764 and 2775, so the declarations of entry, seh, validator,
// options, eval and expected_errors are not visible here.
2762 void TestGoodProtId(const string& id_str)
2763 {
2765  CRef<CSeq_id> bad_id(new CSeq_id());
2766  bad_id->SetGenbank().SetAccession(id_str);
2767  CRef<CSeq_id> good_nuc_id(new CSeq_id());
2768  good_nuc_id->SetLocal().SetStr("nuc");
2769  CRef<CSeq_id> good_prot_id(new CSeq_id());
2770  good_prot_id->SetLocal().SetStr("prot");
2771 
2772  unit_test_util::ChangeNucId(entry, good_nuc_id);
2773  unit_test_util::ChangeProtId(entry, bad_id);
2774 
2776 
2777  eval = validator.Validate(seh, options);
2778  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
2779 
2780  CheckErrors(*eval, expected_errors);
2781  CLEAR_ERRORS
2782 }
2783 
2784 
// Helper: puts the GenBank accession `id_str` on the nucleotide (the protein gets the
// local id "prot") and validates. When the accession is 12 to 15 characters long the
// nucleotide's tech is set to WGS and the expectation produced by
// AddChromosomeNoLocation() for gb|<id_str>| is added; otherwise nothing is pushed.
// @param id_str  accession string that should be accepted when used on a nucleotide.
// NOTE(review): the local named bad_id actually carries the accession expected to be
// valid. This listing skips source lines 2787 and 2800, so the declarations of entry,
// seh, validator, options, eval and expected_errors are not visible here.
2785 void TestGoodNucId(const string& id_str)
2786 {
2788  CRef<CSeq_id> bad_id(new CSeq_id());
2789  bad_id->SetGenbank().SetAccession(id_str);
2790  CRef<CSeq_id> good_prot_id(new CSeq_id());
2791  good_prot_id->SetLocal().SetStr("prot");
2792  unit_test_util::ChangeNucId(entry, bad_id);
2793  unit_test_util::ChangeProtId(entry, good_prot_id);
2794  bool is_wgs = false;
 // Length 12..15 is the only criterion this helper uses to treat the id as WGS.
2795  if (id_str.length() == 12 || id_str.length() == 13 || id_str.length() == 14 || id_str.length() == 15) {
2796  SetTech(entry->SetSet().SetSeq_set().front(), CMolInfo::eTech_wgs);
2797  is_wgs = true;
2798  }
2799 
2801 
2802  if (is_wgs) {
2803  AddChromosomeNoLocation(expected_errors, "gb|" + id_str + "|");
2804  }
2805  eval = validator.Validate(seh, options);
2806  CheckErrors(*eval, expected_errors);
2807  CLEAR_ERRORS
2808 }
2809 
2810 
// Test case (run with CGenBankFixture): exercises the BadSeqIdFormat diagnostic.
// Sections, in order:
//   1. accessions rejected on both nucleotides and proteins, tried as GenBank,
//      EMBL (first character replaced by "B") and DDBJ (first character replaced by "C");
//   2. accessions rejected only on nucleotides / only on proteins;
//   3. accessions accepted on both, on nucleotides only, and on proteins only;
//   4. a GenBank accession with version 0 on a sequence that also carries a gi;
//   5. a long local id, a long gnl|NCBIFILE id, an over-long general db name, and a
//      local id containing a forward slash.
// NOTE(review): this listing skips source lines 2813, 2817 and 2819, so the
// declarations of entry, scope, seh, validator, options, eval and expected_errors
// are not visible here -- confirm against the real source.
2811 BOOST_FIXTURE_TEST_CASE(Test_SEQ_INST_BadSeqIdFormat, CGenBankFixture)
2812 {
2814  CRef<CSeq_entry> nuc_entry = entry->SetSet().SetSeq_set().front();
2815  CRef<CSeq_entry> prot_entry = entry->SetSet().SetSeq_set().back();
2816  CRef<CSeq_feat> prot_feat = prot_entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
2818 
2820 
 // Placeholder expectation; its accession label and message are rewritten for each
 // id inside the loops below.
2821  expected_errors.push_back(new CExpectedError("",eDiag_Error, "BadSeqIdFormat", "Bad accession"));
2822 
2823  vector<string> bad_ids;
2824  bad_ids.push_back("AY123456ABC"); // can't have letters after digits
2825  bad_ids.push_back("A1234"); // for a single letter, only acceptable number of digits is 5
2826  bad_ids.push_back("A123456");
2827  bad_ids.push_back("AY12345"); // for two letters, only acceptable number of digits is 6
2828  bad_ids.push_back("AY1234567");
2829  bad_ids.push_back("ABC1234"); // three letters bad unless prot and 5 digits
2830  bad_ids.push_back("ABC123456");
2831  bad_ids.push_back("ABCD1234567"); // four letters
2832  bad_ids.push_back("ABCDE123456"); // five letters
2833  bad_ids.push_back("ABCDE12345678");
2834 
2835  vector<string> bad_nuc_ids;
2836  bad_nuc_ids.push_back("ABC12345");
2837 
2838  vector<string> bad_prot_ids;
2839  bad_prot_ids.push_back("AY123456");
2840  bad_prot_ids.push_back("A12345");
2841 
 // Intentionally left empty in this listing, so the "good for both" loop below
 // performs no iterations.
2842  vector<string> good_ids;
2843 
2844  vector<string> good_nuc_ids;
2845  good_nuc_ids.push_back("AY123456");
2846  good_nuc_ids.push_back("A12345");
2847  good_nuc_ids.push_back("ABCD123456789");
2848  good_nuc_ids.push_back("ABCD1234567890");
2849 
2850  vector<string> good_prot_ids;
2851  good_prot_ids.push_back("ABC12345");
2852 
2853 
2854  CRef<CSeq_id> good_nuc_id(new CSeq_id());
2855  good_nuc_id->SetLocal().SetStr("nuc");
2856  CRef<CSeq_id> good_prot_id(new CSeq_id());
2857  good_prot_id->SetLocal().SetStr("prot");
2858 
 // Single CSeq_id object that is re-populated for every id under test.
2859  CRef<CSeq_id> bad_id(new CSeq_id());
2860 
2861  // bad for both
2862  for (const string& id_str : bad_ids) {
2863  const string acc_str = "gb|" + id_str + "|";
2864  ChangeErrorAcc(expected_errors, acc_str);
2865  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2866 
2867  // GenBank
 // First with the bad id on the nucleotide, then with it on the protein; the
 // entry is removed from the scope and the scope reset before each change.
2868  scope.RemoveTopLevelSeqEntry(seh);
2869  scope.ResetDataAndHistory();
2870  bad_id->SetGenbank().SetAccession(id_str);
2871  unit_test_util::ChangeNucId(entry, bad_id);
2872  unit_test_util::ChangeProtId(entry, good_prot_id);
2873  seh = scope.AddTopLevelSeqEntry(*entry);
2874  eval = validator.Validate(seh, options);
2875  CheckErrors(*eval, expected_errors);
2876  scope.RemoveTopLevelSeqEntry(seh);
2877  scope.ResetDataAndHistory();
2878  unit_test_util::ChangeNucId(entry, good_nuc_id);
2879  unit_test_util::ChangeProtId(entry, bad_id);
2880  seh = scope.AddTopLevelSeqEntry(*entry);
2881  eval = validator.Validate(seh, options);
2882  CheckErrors(*eval, expected_errors);
2883  }
2884 
2885  for (const string& id_it : bad_ids) {
2886  const string id_str = "B" + id_it.substr(1);
 // NOTE(review): this "embl|" label is overwritten with "emb|" further down
 // before the first CheckErrors call in this iteration, so it looks redundant.
2887  expected_errors[0]->SetAccession("embl|" + id_str + "|");
2888  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2889 
2890  // EMBL
2891  scope.RemoveTopLevelSeqEntry(seh);
2892  scope.ResetDataAndHistory();
2893  bad_id->SetEmbl().SetAccession(id_str);
2894  unit_test_util::ChangeNucId(entry, bad_id);
2895  unit_test_util::ChangeProtId(entry, good_prot_id);
2896  seh = scope.AddTopLevelSeqEntry(*entry);
2897  eval = validator.Validate(seh, options);
2898  expected_errors[0]->SetAccession("emb|" + id_str + "|");
2899  CheckErrors(*eval, expected_errors);
2900  scope.RemoveTopLevelSeqEntry(seh);
2901  scope.ResetDataAndHistory();
2902  unit_test_util::ChangeNucId(entry, good_nuc_id);
2903  unit_test_util::ChangeProtId(entry, bad_id);
2904  seh = scope.AddTopLevelSeqEntry(*entry);
2905  eval = validator.Validate(seh, options);
2906  CheckErrors(*eval, expected_errors);
2907  }
2908 
2909  for (const string& id_it : bad_ids) {
2910  const string id_str = "C" + id_it.substr(1);
2911  expected_errors[0]->SetAccession("dbj|" + id_str + "|");
2912  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2913 
2914  // DDBJ
2915  scope.RemoveTopLevelSeqEntry(seh);
2916  scope.ResetDataAndHistory();
2917  bad_id->SetDdbj().SetAccession(id_str);
2918  unit_test_util::ChangeNucId(entry, bad_id);
2919  unit_test_util::ChangeProtId(entry, good_prot_id);
2920  seh = scope.AddTopLevelSeqEntry(*entry);
2921  eval = validator.Validate(seh, options);
 // NOTE(review): repeats the identical "dbj|" label already set at the top of
 // this iteration.
2922  expected_errors[0]->SetAccession("dbj|" + id_str + "|");
2923  CheckErrors(*eval, expected_errors);
2924  scope.RemoveTopLevelSeqEntry(seh);
2925  scope.ResetDataAndHistory();
2926  unit_test_util::ChangeNucId(entry, good_nuc_id);
2927  unit_test_util::ChangeProtId(entry, bad_id);
2928  seh = scope.AddTopLevelSeqEntry(*entry);
2929  eval = validator.Validate(seh, options);
2930  CheckErrors(*eval, expected_errors);
2931  }
2932 
2933  // bad for just nucs
2934  for (const string& id_str : bad_nuc_ids) {
2935  bad_id->SetGenbank().SetAccession(id_str);
2936  scope.RemoveTopLevelSeqEntry(seh);
2937  unit_test_util::ChangeNucId(entry, bad_id);
2938  unit_test_util::ChangeProtId(entry, good_prot_id);
2939  expected_errors[0]->SetAccession("gb|" + id_str + "|");
2940  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2941  seh = scope.AddTopLevelSeqEntry(*entry);
2942  eval = validator.Validate(seh, options);
2943  CheckErrors(*eval, expected_errors);
2944  }
2945 
2946  // bad for just prots
 // Delegated to the TestBadProtId helper rather than reusing this test's entry.
2947  for (auto id_it : bad_prot_ids) {
2948  TestBadProtId(id_it);
2949  }
2950 
2951  CLEAR_ERRORS
2952 
2953  // good for both
2954  for (const string& id_str : good_ids) {
2955  bad_id->SetGenbank().SetAccession(id_str);
2956  scope.RemoveTopLevelSeqEntry(seh);
2957  unit_test_util::ChangeNucId(entry, bad_id);
2958  unit_test_util::ChangeProtId(entry, good_prot_id);
2959  seh = scope.AddTopLevelSeqEntry(*entry);
2960  eval = validator.Validate(seh, options);
2961  // AddChromosomeNoLocation(expected_errors, "gb|" + *id_it + "|");
2962  CheckErrors(*eval, expected_errors);
2963  scope.RemoveTopLevelSeqEntry(seh);
2964  unit_test_util::ChangeNucId(entry, good_nuc_id);
2965  unit_test_util::ChangeProtId(entry, bad_id);
2966  seh = scope.AddTopLevelSeqEntry(*entry);
2967  eval = validator.Validate(seh, options);
2968  CheckErrors(*eval, expected_errors);
2969  CLEAR_ERRORS
2970  }
2971 
2972  // good for nucs
2973  for (const string& id_it : good_nuc_ids) {
2974  TestGoodNucId(id_it);
2975  }
2976 
2977  // good for just prots
2978  for (const string& id_it : good_prot_ids) {
2979  TestGoodProtId(id_it);
2980  }
2981 
2982  // if GI, needs version
 // The nucleotide gets gb|AY123456 with version 0 plus gi|21914627; the expected
 // diagnostics are pushed after Validate() but before CheckErrors().
 // NOTE(review): the UnexpectedIdentifierChange message refers to the NCBI
 // sequence repository, so this step presumably needs remote data access -- confirm.
2983  scope.RemoveTopLevelSeqEntry(seh);
2984  bad_id->SetGenbank().SetAccession("AY123456");
2985  bad_id->SetGenbank().SetVersion(0);
2986  unit_test_util::ChangeNucId(entry, bad_id);
2987  unit_test_util::ChangeProtId(entry, good_prot_id);
2988  CRef<CSeq_id> gi_id(new CSeq_id("gi|21914627"));
2989  nuc_entry->SetSeq().SetId().push_back(gi_id);
2990  seh = scope.AddTopLevelSeqEntry(*entry);
2991  eval = validator.Validate(seh, options);
2992  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Critical, "BadSeqIdFormat",
2993  "Accession AY123456 has 0 version"));
2994  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123456|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
2995  // AddChromosomeNoLocation(expected_errors, "gb|AY123456|");
2996  CheckErrors(*eval, expected_errors);
2997 
2998  CLEAR_ERRORS
2999 
 // Remove the gi id that was appended for the previous step.
3000  nuc_entry->SetSeq().SetId().pop_back();
3001 
3002  // id that is too long
 // No expectation is pushed here, so the check runs against whatever
 // expected_errors holds after CLEAR_ERRORS (presumably empty -- confirm).
3003  scope.RemoveTopLevelSeqEntry(seh);
3004  bad_id->SetLocal().SetStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234");
3005  unit_test_util::ChangeNucId(entry, bad_id);
3006  seh = scope.AddTopLevelSeqEntry(*entry);
3007  eval = validator.Validate(seh, options);
3008  // AddChromosomeNoLocation(expected_errors, "lcl|ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234");
3009  CheckErrors(*eval, expected_errors);
3010 
3011  CLEAR_ERRORS
3012 
3013  // shouldn't report if ncbifile ID
3014  scope.RemoveTopLevelSeqEntry(seh);
3015  CRef<CSeq_id> ncbifile(new CSeq_id("gnl|NCBIFILE|ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234"));
3016  unit_test_util::ChangeNucId(entry, good_nuc_id);
3017  nuc_entry->SetSeq().SetId().push_back(ncbifile);
3018  seh = scope.AddTopLevelSeqEntry(*entry);
3019  eval = validator.Validate(seh, options);
3020  // AddChromosomeNoLocation(expected_errors, entry);
3021  CheckErrors(*eval, expected_errors);
3022  nuc_entry->SetSeq().SetId().pop_back();
3023  CLEAR_ERRORS
3024 
3025  // report if database name len too long
 // From here on `entry` is replaced by a fresh single-sequence entry from
 // BuildGoodSeq(); nuc_entry / prot_entry still refer to the earlier entry.
3026  scope.RemoveTopLevelSeqEntry(seh);
3027  entry = unit_test_util::BuildGoodSeq();
3028  CRef<CSeq_id> general(new CSeq_id());
3029  general->SetGeneral().SetDb("thisdatabasevalueislong");
3030  general->SetGeneral().SetTag().SetStr("b");
3031  entry->SetSeq().ResetId();
3032  entry->SetSeq().SetId().push_back(general);
3033  seh = scope.AddTopLevelSeqEntry(*entry);
3034  expected_errors.push_back(new CExpectedError("gnl|thisdatabasevalueislong|b", eDiag_Critical, "BadSeqIdFormat",
3035  "General database longer than 20 characters"));
3036 
3037  // AddChromosomeNoLocation(expected_errors, "gnl|thisdatabasevalueislong|b");
3038  eval = validator.Validate(seh, options);
3039  CheckErrors(*eval, expected_errors);
3040 
3041  CLEAR_ERRORS
3042 
3043  // do not report forward slash
3044  scope.RemoveTopLevelSeqEntry(seh);
3045  entry = unit_test_util::BuildGoodSeq();
3046  entry->SetSeq().SetId().front()->SetLocal().SetStr("a/b");
3047  seh = scope.AddTopLevelSeqEntry(*entry);
3048  eval = validator.Validate(seh, options);
3049  // AddChromosomeNoLocation(expected_errors, "lcl|a/b");
3050  CheckErrors(*eval, expected_errors);
3051 
3052  CLEAR_ERRORS
3053 }
3054 
3055 
// Helper: appends a general Seq-id built from `db` and `tag` to the test sequence and
// validates. When `errmsg` is non-empty, a Warning-level BadSeqIdCharacter diagnostic
// with that message is expected on lcl|good; when it is empty no expectation is pushed.
// @param db      value for the general id's db field.
// @param tag     string value for the general id's tag field.
// @param errmsg  expected diagnostic text, or an empty string to push no expectation.
// NOTE(review): this listing skips source lines 3058 and 3064, so the declarations of
// entry, seh, validator, options, eval and expected_errors are not visible here.
3056 void TestOneGeneralSeqId(const string& db, const string& tag, const string& errmsg)
3057 {
3059  CRef<CSeq_id> id(new CSeq_id());
3060  id->SetGeneral().SetDb(db);
3061  id->SetGeneral().SetTag().SetStr(tag);
3062  entry->SetSeq().SetId().push_back(id);
3063 
3065 
 // The diagnostic is expected against the sequence's local id, not the general id.
3066  string acc_str = "lcl|good";
3067  if (!errmsg.empty()) {
3068  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Warning, "BadSeqIdCharacter",
3069  errmsg));
3070  }
3071  // AddChromosomeNoLocation(expected_errors, entry);
3072  eval = validator.Validate(seh, options);
3073  CheckErrors(*eval, expected_errors);
3074 
3075  CLEAR_ERRORS
3076 }
3077 
3078 
// NOTE(review): the line that opens this definition (source line 3079) is missing
// from this listing; presumably it is a Boost test-case header -- confirm.
// The body checks that a space in either the tag or the db of a general Seq-id is
// reported, by passing the expected "Bad character ' '" message to TestOneGeneralSeqId.
3080 {
 // Space at the start of the tag.
3081  TestOneGeneralSeqId("PRJNA318798", " CpPA02_0001", "Bad character ' ' in sequence ID 'gnl|PRJNA318798| CpPA02_0001'");
 // Space inside the db name.
3082  TestOneGeneralSeqId("PRJNA3 18798", "CpPA02_0001", "Bad character ' ' in sequence ID 'gnl|PRJNA3 18798|CpPA02_0001'");
3083 }
3084 
3085 
// Helper: appends a general Seq-id (db "lgsi") with a very long tag to the test
// sequence, optionally appends an EMBL accession as well, and validates.
// @param emb  when true, an EMBL id (AY123457, version 1) is also added to the sequence.
// @param err  when true, a Critical BadSeqIdLength diagnostic is expected on lcl|good;
//             when false no expectation is pushed.
// NOTE(review): inside the `if (emb)` block the local CRef named emb shadows the
// bool parameter of the same name; legal, but worth renaming. This listing skips
// source lines 3088 and 3101, so the declarations of entry, seh, validator, options,
// eval and expected_errors are not visible here.
3086 void TestOneLongGeneral(bool emb, bool err)
3087 {
3089  CRef<CSeq_id> id(new CSeq_id());
3090  id->SetGeneral().SetDb("lgsi");
3091  id->SetGeneral().SetTag().SetStr("thisidentifierismorethanfiftycharactersinlengthsoitshouldberejected");
3092  entry->SetSeq().SetId().push_back(id);
3093 
3094  if (emb) {
3095  CRef<CSeq_id> emb(new CSeq_id());
3096  emb->SetEmbl().SetAccession("AY123457");
3097  emb->SetEmbl().SetVersion(1);
3098  entry->SetSeq().SetId().push_back(emb);
3099  }
3100 
3102 
3103  if (err) {
3104  string acc_str = "lcl|good";
3105  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Critical, "BadSeqIdLength",
3106  "General identifier longer than 50 characters"));
3107  }
3108 
3109  eval = validator.Validate(seh, options);
3110  CheckErrors(*eval, expected_errors);
3111 
3112  CLEAR_ERRORS
3113 }
3114 
// Test case: runs TestOneLongGeneral in its two configurations -- without an EMBL
// accession (BadSeqIdLength expected) and with one (no expectation pushed).
3115 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_LongGeneralSeqId)
3116 {
3117  TestOneLongGeneral(false, true);
3118  TestOneLongGeneral(true, false);
3119 }
3120 
3121 
// Test case: the sequence's primary GenBank accession (AY123456) is also listed as an
// extra/secondary accession in a descriptor; an Error-level BadSecondaryAccn
// diagnostic is expected, first with a GenBank descriptor and then with an EMBL one.
// NOTE(review): this listing skips source lines 3124 and 3127, so the declarations of
// entry, seh, validator, options, eval and expected_errors are not visible here.
3122 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_BadSecondaryAccn)
3123 {
3125  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3126 
3128 
3129  CRef<CSeqdesc> gbdesc(new CSeqdesc());
3130  gbdesc->SetGenbank().SetExtra_accessions().push_back("AY123456");
3131  entry->SetSeq().SetDescr().Set().push_back(gbdesc);
3132 
3133  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Error, "BadSecondaryAccn", "AY123456 used for both primary and secondary accession"));
3134  // AddChromosomeNoLocation(expected_errors, "gb|AY123456|");
3135  eval = validator.Validate(seh, options);
3136  CheckErrors(*eval, expected_errors);
3137 
 // The same descriptor object is now addressed through SetEmbl(); the same
 // expectation list is reused unchanged for the second validation.
3138  gbdesc->SetEmbl().SetExtra_acc().push_back("AY123456");
3139  eval = validator.Validate(seh, options);
3140  CheckErrors(*eval, expected_errors);
3141 
3142  CLEAR_ERRORS
3143 }
3144 
3145 
// Test case: the sequence's first id is set to a gi of ZERO_GI. Two diagnostics are
// expected on gi|0: Critical ZeroGiNumber and Error GiWithoutAccession.
// NOTE(review): this listing skips source lines 3148 and 3151, so the declarations of
// entry, seh, validator, options, eval and expected_errors are not visible here.
3146 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ZeroGiNumber)
3147 {
3149  entry->SetSeq().SetId().front()->SetGi(ZERO_GI);
3150 
3152 
3153  expected_errors.push_back(new CExpectedError("gi|0", eDiag_Critical, "ZeroGiNumber", "Invalid GI number"));
3154  expected_errors.push_back(new CExpectedError("gi|0", eDiag_Error, "GiWithoutAccession", "No accession on sequence with gi number"));
3155  // AddChromosomeNoLocation(expected_errors, "gi|0");
3156  eval = validator.Validate(seh, options);
3157  CheckErrors(*eval, expected_errors);
3158 
3159  CLEAR_ERRORS
3160 }
3161 
3162 
// Test case: the sequence carries gb|AY123456.1 plus gi 21914627, and its Seq-hist
// lists that same gi under replaced-by, then under replaces. With a history date set
// an Error-level HistoryGiCollision diagnostic is expected each time; without a date
// no expectation is pushed.
// NOTE(review): this listing skips source lines 3165 and 3172, so the declarations of
// entry, seh, validator, options, eval and expected_errors are not visible here.
3163 BOOST_AUTO_TEST_CASE(Test_HistoryGiCollision)
3164 {
3166  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3167  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3168  CRef<CSeq_id> gi_id(new CSeq_id());
3169  gi_id->SetGi(GI_CONST(21914627));
3170  entry->SetSeq().SetId().push_back(gi_id);
3171 
3173 
 // History id uses the same gi value as the sequence's own gi id.
3174  CRef<CSeq_id> hist_id(new CSeq_id());
3175  hist_id->SetGi(GI_CONST(21914627));
3176  entry->SetSeq().SetInst().SetHist().SetReplaced_by().SetIds().push_back(hist_id);
3177  entry->SetSeq().SetInst().SetHist().SetReplaced_by().SetDate().SetStd().SetYear(2008);
3178 
3179  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error, "HistoryGiCollision", "Replaced by gi (21914627) is same as current Bioseq"));
3180  // AddChromosomeNoLocation(expected_errors, "gb|AY123456.1|");
3181  eval = validator.Validate(seh, options);
3182  CheckErrors(*eval, expected_errors);
3183 
 // Same collision, but recorded under "replaces" instead of "replaced-by".
3184  entry->SetSeq().SetInst().SetHist().ResetReplaced_by();
3185  entry->SetSeq().SetInst().SetHist().SetReplaces().SetIds().push_back(hist_id);
3186  entry->SetSeq().SetInst().SetHist().SetReplaces().SetDate().SetStd().SetYear(2008);
3187  expected_errors[0]->SetErrMsg("Replaces gi (21914627) is same as current Bioseq");
3188  eval = validator.Validate(seh, options);
3189  CheckErrors(*eval, expected_errors);
3190 
3191  CLEAR_ERRORS
3192 
3193  // should not generate errors if date has not been set
3194  entry->SetSeq().SetInst().SetHist().ResetReplaces();
3195  entry->SetSeq().SetInst().SetHist().SetReplaced_by().SetIds().push_back(hist_id);
3196  eval = validator.Validate(seh, options);
3197  // AddChromosomeNoLocation(expected_errors, entry);
3198  CheckErrors(*eval, expected_errors);
3199 
3200  entry->SetSeq().SetInst().SetHist().ResetReplaced_by();
3201  entry->SetSeq().SetInst().SetHist().SetReplaces().SetIds().push_back(hist_id);
3202  eval = validator.Validate(seh, options);
3203  CheckErrors(*eval, expected_errors);
3204 
3205  CLEAR_ERRORS
3206 }
3207 
3208 
// Test case: the sequence's first id is set to gi 123456; an Error-level
// GiWithoutAccession diagnostic is expected on gi|123456.
// NOTE(review): this listing skips source lines 3211 and 3214, so the declarations of
// entry, seh, validator, options, eval and expected_errors are not visible here.
3209 BOOST_AUTO_TEST_CASE(Test_GiWithoutAccession)
3210 {
3212  entry->SetSeq().SetId().front()->SetGi(GI_CONST(123456));
3213 
3215 
3216  expected_errors.push_back(new CExpectedError("gi|123456", eDiag_Error, "GiWithoutAccession", "No accession on sequence with gi number"));
3217  // AddChromosomeNoLocation(expected_errors, entry);
3218  eval = validator.Validate(seh, options);
3219  CheckErrors(*eval, expected_errors);
3220 
3221  CLEAR_ERRORS
3222 }
3223 
3224 
// Helper: builds a sequence carrying gb|AY123456.1, gi 21914627 and the additional
// accession `other_acc`, then validates. An Error-level MultipleAccessions diagnostic
// is always expected; the flags add further expectations.
// @param other_acc  extra Seq-id appended to the sequence's id list.
// @param id_change  also expect the Warning UnexpectedIdentifierChange (its message is
//                   hard-coded to gb|AY123457.1|).
// @param conflict   also expect the Error ConflictingIdsOnBioseq, whose message embeds
//                   other_acc->AsFastaString().
// @param need_hist  also expect the Info HistAssemblyMissing (default false).
// NOTE(review): this listing skips source lines 3227 and 3236, so the declarations of
// entry, seh, validator, options, eval and expected_errors are not visible here.
3225 void TestOneOtherAcc(CRef<CSeq_id> other_acc, bool id_change, bool conflict, bool need_hist = false)
3226 {
3228  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3229  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3230  CRef<CSeq_id> gi_id(new CSeq_id());
3231  gi_id->SetGi(GI_CONST(21914627));
3232  entry->SetSeq().SetId().push_back(gi_id);
3233  entry->SetSeq().SetId().push_back(other_acc);
3234  string acc_str = "gb|AY123456.1|";
3235 
3237 
 // Expectations are pushed in a fixed order: conflict, multiple accessions,
 // identifier change, missing history.
3238  if (conflict) {
3239  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error, "ConflictingIdsOnBioseq",
3240  "Conflicting ids on a Bioseq: (gb|AY123456.1| - " + other_acc->AsFastaString() + ")"));
3241  }
3242  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "MultipleAccessions", "Multiple accessions on sequence with gi number"));
3243  if (id_change) {
3244  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123457.1|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
3245  }
3246  if (need_hist) {
3247  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Info, "HistAssemblyMissing",
3248  "TPA record gb|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
3249  }
3250  // AddChromosomeNoLocation(expected_errors, acc_str);
3251  eval = validator.Validate(seh, options);
3252  CheckErrors(*eval, expected_errors);
3253 
3254  CLEAR_ERRORS
3255 }
3256 
3257 
3258 BOOST_FIXTURE_TEST_CASE(Test_MultipleAccessions, CGenBankFixture)
3259 {
3260  CRef<CSeq_id> other_acc(new CSeq_id());
3261 
3262  // genbank, ddbj, embl, tpg, tpe, tpd, other, pir, swissprot, and prf all count as accessions
3263  // genbank
3264  other_acc->SetGenbank().SetAccession("AY123457");
3265  other_acc->SetGenbank().SetVersion(1);
3266  TestOneOtherAcc(other_acc, true, true);
3267 
3268  // ddbj
3269  other_acc->SetDdbj().SetAccession("AY123457");
3270  other_acc->SetDdbj().SetVersion(1);
3271  TestOneOtherAcc(other_acc, false, true);
3272 
3273  // embl
3274  other_acc->SetEmbl().SetAccession("AY123457");
3275  other_acc->SetEmbl().SetVersion(1);
3276  TestOneOtherAcc(other_acc, false, true);
3277 
3278  // pir
3279  other_acc->SetPir().SetAccession("AY123457");
3280  other_acc->SetPir().SetVersion(1);
3281  TestOneOtherAcc(other_acc, false, false);
3282 
3283  // swissprot
3284  other_acc->SetSwissprot().SetAccession("AY123457");
3285  other_acc->SetSwissprot().SetVersion(1);
3286  TestOneOtherAcc(other_acc, false, false);
3287 
3288  // prf
3289  other_acc->SetPrf().SetAccession("AY123457");
3290  other_acc->SetPrf().SetVersion(1);
3291  TestOneOtherAcc(other_acc, false, false);
3292 
3293  // tpg
3294  other_acc->SetTpg().SetAccession("AY123457");
3295  other_acc->SetTpg().SetVersion(1);
3296  TestOneOtherAcc(other_acc, false, true, true);
3297 
3298  // tpe
3299  other_acc->SetTpe().SetAccession("AY123457");
3300  other_acc->SetTpe().SetVersion(1);
3301  TestOneOtherAcc(other_acc, false, true, true);
3302 
3303  // tpd
3304  other_acc->SetTpd().SetAccession("AY123457");
3305  other_acc->SetTpd().SetVersion(1);
3306  TestOneOtherAcc(other_acc, false, true, true);
3307 
3308  // other
3310  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3311  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3312  CRef<CSeq_id> gi_id(new CSeq_id());
3313  gi_id->SetGi(GI_CONST(21914627));
3314  entry->SetSeq().SetId().push_back(gi_id);
3315  entry->SetSeq().SetId().push_back(other_acc);
3316  other_acc->SetOther().SetAccession("NC_123457");
3317  other_acc->SetOther().SetVersion(1);
3318 
3320 
3321  string acc_str = "gb|AY123456.1|";
3322  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "INSDRefSeqPackaging", "INSD and RefSeq records should not be present in the same set"));
3323  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "MultipleAccessions", "Multiple accessions on sequence with gi number"));
3324  // AddChromosomeNoLocation(expected_errors, acc_str);
3325  eval = validator.Validate(seh, options);
3326  CheckErrors(*eval, expected_errors);
3327 
3328  CLEAR_ERRORS
3329 }
3330 
3331 
// Test_HistAssemblyMissing: expects an Info-level "HistAssemblyMissing"
// report for entries whose first id is a tpg, tpe, or tpd accession, then
// re-validates after adding a "TPA:reassembly" keyword, where the existing
// comment says the error is suppressed.
// NOTE(review): listing lines 3334, 3338 and 3342 are absent from this view;
// presumably they declare tpg_entry, tpe_entry and tpd_entry - confirm.
3332 BOOST_AUTO_TEST_CASE(Test_HistAssemblyMissing)
3333 {
3335  tpg_entry->SetSeq().SetId().front()->SetTpg().SetAccession("AY123456");
3336  tpg_entry->SetSeq().SetId().front()->SetTpg().SetVersion(1);
3337 
3339  tpe_entry->SetSeq().SetId().front()->SetTpe().SetAccession("AY123456");
3340  tpe_entry->SetSeq().SetId().front()->SetTpe().SetVersion(1);
3341 
3343  tpd_entry->SetSeq().SetId().front()->SetTpd().SetAccession("AY123456");
3344  tpd_entry->SetSeq().SetId().front()->SetTpd().SetVersion(1);
3345 
3346  STANDARD_SETUP_NAME(tpg_entry)
3347 
3348  // tpg
3349  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Info, "HistAssemblyMissing", "TPA record tpg|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
3350  // AddChromosomeNoLocation(expected_errors, tpg_entry);
3351  eval = validator.Validate(seh, options);
3352  CheckErrors(*eval, expected_errors);
3353 
3354  // tpe
      // Swap the entry held by the scope, then retarget the single expected
      // error (accession and message) instead of rebuilding the list.
3355  scope.RemoveTopLevelSeqEntry(seh);
3356  seh = scope.AddTopLevelSeqEntry(*tpe_entry);
3357  ChangeErrorAcc(expected_errors, "tpe|AY123456.1|");
3358  expected_errors[0]->SetErrMsg("TPA record tpe|AY123456.1| should have Seq-hist.assembly for PRIMARY block");
3359  eval = validator.Validate(seh, options);
3360  CheckErrors(*eval, expected_errors);
3361 
3362 
3363  // tpd
3364  scope.RemoveTopLevelSeqEntry(seh);
3365  seh = scope.AddTopLevelSeqEntry(*tpd_entry);
3366  ChangeErrorAcc(expected_errors, "tpd|AY123456.1|");
3367  expected_errors[0]->SetErrMsg("TPA record tpd|AY123456.1| should have Seq-hist.assembly for PRIMARY block");
3368  eval = validator.Validate(seh, options);
3369  CheckErrors(*eval, expected_errors);
3370 
3371  CLEAR_ERRORS
3372 
3373  // error suppressed if keyword present
      // First the keyword is placed in a GenBank block descriptor on tpg_entry.
3374  CRef<CSeqdesc> block(new CSeqdesc());
3375  block->SetGenbank().SetKeywords().push_back("TPA:reassembly");
3376  tpg_entry->SetSeq().SetDescr().Set().push_back(block);
3377  scope.RemoveTopLevelSeqEntry(seh);
3378  seh = scope.AddTopLevelSeqEntry(*tpg_entry);
3379  eval = validator.Validate(seh, options);
3380  // AddChromosomeNoLocation(expected_errors, tpg_entry);
3381 
3382  CheckErrors(*eval, expected_errors);
      // Same descriptor object, now given the keyword through SetEmbl(), and
      // validated again against the same expected list.
3383  block->SetEmbl().SetKeywords().push_back("TPA:reassembly");
3384  eval = validator.Validate(seh, options);
3385  CheckErrors(*eval, expected_errors);
3386  CLEAR_ERRORS
3387 }
3388 
// Test_TerminalNs: checks the "TerminalNs" reports (and the accompanying
// HighNpercent / HighNContentPercent / ContigsTooShort reports) for a raw
// sequence and for delta sequences that begin and end with runs of N, and
// how the expected TerminalNs severity changes with the id type: Warning
// for local, RefSeq NC_ and patent ids, Error for a GenBank accession.
// The last stage sets circular topology and expects no TerminalNs entries.
// NOTE(review): several listing lines (3391, 3395, 3421, 3435, 3478) are
// absent from this view; presumably entry construction/setup - confirm.
3389 BOOST_AUTO_TEST_CASE(Test_TerminalNs)
3390 {
3392  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("NNNNNNNNNNAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCCAANNNNNNNNNN");
3393  entry->SetSeq().SetInst().SetLength(62);
3394 
3396 
3397  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at beginning of sequence"));
3398  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at end of sequence"));
3399  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
3400  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3401  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
3402  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3403  // AddChromosomeNoLocation(expected_errors, entry);
3404  eval = validator.Validate(seh, options);
3405  CheckErrors(*eval, expected_errors);
3406 
3407  // warning level changes if not local only
3408  scope.RemoveTopLevelSeqEntry(seh);
3409  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3410  seh = scope.AddTopLevelSeqEntry(*entry);
3411  ChangeErrorAcc(expected_errors, "gb|AY123456|");
      // Indices 0 and 1 are the two TerminalNs expectations pushed above.
3412  expected_errors[0]->SetSeverity(eDiag_Error);
3413  expected_errors[1]->SetSeverity(eDiag_Error);
3414  eval = validator.Validate(seh, options);
3415  CheckErrors(*eval, expected_errors);
3416 
3417  CLEAR_ERRORS
3418 
3419  // also try delta sequence
3420  scope.RemoveTopLevelSeqEntry(seh);
3422  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNNNNNNNCCC");
3423  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("CCCNNNNNNNNN");
3424  seh = scope.AddTopLevelSeqEntry(*entry);
3425 
      // In this list the TerminalNs expectations sit at indices 1 and 2.
3426  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 3 bases"));
3427  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at beginning of sequence"));
3428  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at end of sequence"));
3429  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 52 percent Ns"));
3430  eval = validator.Validate(seh, options);
3431  CheckErrors(*eval, expected_errors);
3432 
3433  // 10 Ns but just local stays at warning
3434  scope.RemoveTopLevelSeqEntry(seh);
3436  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNNNNNNNNCC");
3437  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("CCNNNNNNNNNN");
3438  seh = scope.AddTopLevelSeqEntry(*entry);
3439  expected_errors[0]->SetErrMsg("Maximum contig length is 2 bases");
3440  expected_errors.back()->SetErrMsg("Sequence contains 58 percent Ns");
3441  eval = validator.Validate(seh, options);
3442  CheckErrors(*eval, expected_errors);
3443 
3444  // 10 Ns but now has non-local ID, error
3445  scope.RemoveTopLevelSeqEntry(seh);
3446  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3447  seh = scope.AddTopLevelSeqEntry(*entry);
3448  ChangeErrorAcc(expected_errors, "gb|AY123456|");
3449  expected_errors[1]->SetSeverity(eDiag_Error);
3450  expected_errors[2]->SetSeverity(eDiag_Error);
3451  eval = validator.Validate(seh, options);
3452  CheckErrors(*eval, expected_errors);
3453 
3454  // NC and patent IDs back to warning
3455  scope.RemoveTopLevelSeqEntry(seh);
3456  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
3457  seh = scope.AddTopLevelSeqEntry(*entry);
3458  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
3459  expected_errors[1]->SetSeverity(eDiag_Warning);
3460  expected_errors[2]->SetSeverity(eDiag_Warning);
3461  eval = validator.Validate(seh, options);
3462  CheckErrors(*eval, expected_errors);
3463 
3464  scope.RemoveTopLevelSeqEntry(seh);
3465  entry->SetSeq().SetId().front()->SetPatent().SetSeqid(1);
3466  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetCountry("USA");
3467  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetId().SetNumber("1");
3468  seh = scope.AddTopLevelSeqEntry(*entry);
3469  ChangeErrorAcc(expected_errors, "pat|USA|1|1");
      // Drop the trailing HighNContentPercent expectation for the patent id;
      // the list holds raw pointers, so the element is deleted before pop_back.
3470  delete expected_errors.back();
3471  expected_errors.pop_back();
3472  eval = validator.Validate(seh, options);
3473  CheckErrors(*eval, expected_errors);
3474  CLEAR_ERRORS
3475 
3476  // no more TerminalNs warnings if circular
3477  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
3479  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Error, "ContigsTooShort",
3480  "Maximum contig length is 2 bases"));
3481  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Warning, "UnwantedCompleteFlag",
3482  "Suspicious use of complete"));
3483  // AddChromosomeNoLocation(expected_errors, entry);
3484 
3485  eval = validator.Validate(seh, options);
3486  CheckErrors(*eval, expected_errors);
3487 
3488  CLEAR_ERRORS
3489 }
3490 
3491 
// Test_UnexpectedIdentifierChange: pairs gi 21914627 with an accession that
// differs from the one named in the expected messages (gb|AY123456.1|).
// Stage 1 uses gb|AY123457.1| and expects the "New accession ... does not
// match" warning; stage 2 replaces the first id with tpg|AY123456.1| and
// expects the "Loss of accession" warning plus HistAssemblyMissing.
// NOTE(review): listing lines 3494 and 3501 are absent from this view;
// presumably the `entry` declaration and the standard setup - confirm.
3492 BOOST_FIXTURE_TEST_CASE(Test_UnexpectedIdentifierChange, CGenBankFixture)
3493 {
3495  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123457");
3496  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3497  CRef<CSeq_id> gi_id(new CSeq_id());
3498  gi_id->SetGi(GI_CONST(21914627));
3499  entry->SetSeq().SetId().push_back(gi_id);
3500 
3502 
3503  expected_errors.push_back(new CExpectedError("gb|AY123457.1|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123457.1|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
3504  // AddChromosomeNoLocation(expected_errors, entry);
3505  eval = validator.Validate(seh, options);
3506  CheckErrors(*eval, expected_errors);
3507 
3508  CLEAR_ERRORS
      // Stage 2: same gi, but the first id becomes a tpg accession.
3509  scope.RemoveTopLevelSeqEntry(seh);
3510  entry->SetSeq().SetId().front()->SetTpg().SetAccession("AY123456");
3511  entry->SetSeq().SetId().front()->SetTpg().SetVersion(1);
3512  seh = scope.AddTopLevelSeqEntry(*entry);
3513  // AddChromosomeNoLocation(expected_errors, entry);
3514  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Info, "HistAssemblyMissing", "TPA record tpg|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
3515  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Warning, "UnexpectedIdentifierChange", "Loss of accession (gb|AY123456.1|) on gi (21914627) compared to the NCBI sequence repository"));
3516  eval = validator.Validate(seh, options);
3517  CheckErrors(*eval, expected_errors);
3518 
3519  // TODO - try to instigate other errors
3520 
3521  CLEAR_ERRORS
3522 }
3523 
3524 
// Test_InternalNsInSeqLit: appends delta-seq literals containing internal
// runs of 20, 81 and 101 Ns and expects an "InternalNsInSeqLit" warning
// naming the run length, delta component and start base. The first stage
// sets the tech to wgs and also expects a WGSseqGapProblem error.
// NOTE(review): listing lines 3527, 3531, 3549, 3563, 3567 and 3572 are
// absent from this view; the repeated Validate/CheckErrors pairs presumably
// follow a setting change made on those lines - confirm.
3525 BOOST_AUTO_TEST_CASE(Test_InternalNsInSeqLit)
3526 {
3528  unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNGG");
3529  SetTech(entry, CMolInfo::eTech_wgs);
3530 
3532 
3533  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqLit", "Run of 20 Ns in delta component 5 that starts at base 45"));
3534  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WGSseqGapProblem", "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
3535  /*
3536  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
3537  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3538  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
3539  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3540  */
3541  AddChromosomeNoLocation(expected_errors, entry);
3542 
3543  eval = validator.Validate(seh, options);
3544  CheckErrors(*eval, expected_errors);
3545 
3546  CLEAR_ERRORS
3547 
      // Second literal: a run of 81 Ns, reported for delta component 7.
3548  unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGG");
3550  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqLit",
3551  "Run of 81 Ns in delta component 7 that starts at base 79"));
3552  /*
3553  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
3554  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3555  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
3556  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3557  */
3558  // AddChromosomeNoLocation(expected_errors, entry);
3559 
3560  eval = validator.Validate(seh, options);
3561  CheckErrors(*eval, expected_errors);
3562 
3564  eval = validator.Validate(seh, options);
3565  CheckErrors(*eval, expected_errors);
3566 
3568  eval = validator.Validate(seh, options);
3569  CheckErrors(*eval, expected_errors);
3570 
      // Third literal: a run of 101 Ns; only the message of the existing
      // expectation is updated.
3571  unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGG");
3573  expected_errors[0]->SetErrMsg("Run of 101 Ns in delta component 9 that starts at base 174");
3574  eval = validator.Validate(seh, options);
3575  CheckErrors(*eval, expected_errors);
3576 
3577  CLEAR_ERRORS
3578 }
3579 
3580 
// Test_SeqLitGapLength0: appends a zero-length literal to a delta sequence
// and expects "SeqLitGapLength0". The expected message is unchanged for
// fuzz lim=gt and fuzz p-m, and becomes "... with unknown fuzz ..." for
// fuzz lim=unk. With a Swiss-Prot id the expected severity is Warning
// instead of Error.
// NOTE(review): listing lines 3583 and 3588 are absent from this view;
// presumably the `entry` declaration and the standard setup - confirm.
3581 BOOST_AUTO_TEST_CASE(Test_SeqLitGapLength0)
3582 {
3584  CRef<CDelta_seq> delta_seq(new CDelta_seq());
3585  delta_seq->SetLiteral().SetLength(0);
3586  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(delta_seq);
3587 
3589 
3590  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLitGapLength0", "Gap of length 0 in delta chain"));
3591  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "Last delta seq component is a gap"));
3592  // AddChromosomeNoLocation(expected_errors, entry);
3593  eval = validator.Validate(seh, options);
3594  CheckErrors(*eval, expected_errors);
3595 
3596  // some kinds of fuzz don't trigger other kind of error
3597  delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_gt);
3598  eval = validator.Validate(seh, options);
3599  CheckErrors(*eval, expected_errors);
3600 
3601  delta_seq->SetLiteral().SetFuzz().Reset();
3602  delta_seq->SetLiteral().SetFuzz().SetP_m(10);
3603  eval = validator.Validate(seh, options);
3604  CheckErrors(*eval, expected_errors);
3605 
3606  // others will
3607  delta_seq->SetLiteral().SetFuzz().Reset();
3608  delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
3609  expected_errors[0]->SetErrMsg("Gap of length 0 with unknown fuzz in delta chain");
3610  eval = validator.Validate(seh, options);
3611  CheckErrors(*eval, expected_errors);
3612 
3613  // try again with swissprot, error goes to warning
3614  scope.RemoveTopLevelSeqEntry(seh);
3615  entry->SetSeq().SetId().front()->SetSwissprot().SetAccession("AY123456");
3616  seh = scope.AddTopLevelSeqEntry(*entry);
3617  expected_errors[0]->SetSeverity(eDiag_Warning);
3618  ChangeErrorAcc(expected_errors, "sp|AY123456|");
3619  eval = validator.Validate(seh, options);
3620  CheckErrors(*eval, expected_errors);
3621 
      // Walk back through p-m fuzz, lim=gt fuzz and no fuzz; each of these is
      // expected to produce the plain message again.
3622  delta_seq->SetLiteral().SetFuzz().SetP_m(10);
3623  expected_errors[0]->SetErrMsg("Gap of length 0 in delta chain");
3624  eval = validator.Validate(seh, options);
3625  CheckErrors(*eval, expected_errors);
3626 
3627  delta_seq->SetLiteral().SetFuzz().Reset();
3628  delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_gt);
3629  eval = validator.Validate(seh, options);
3630  CheckErrors(*eval, expected_errors);
3631 
3632  delta_seq->SetLiteral().ResetFuzz();
3633  eval = validator.Validate(seh, options);
3634  CheckErrors(*eval, expected_errors);
3635 
3636  CLEAR_ERRORS
3637 }
3638 
3639 
// Body of a helper whose signature line (listing line 3640) is absent from
// this view. NOTE(review): presumably AddTpaAssemblyUserObject taking the
// Seq-entry as `entry`, judging by the calls in Test_TpaAssemblyProblem - confirm.
// Appends a user-object descriptor of type "TpaAssembly" to the Bioseq's
// descriptors and gives it a single string field (label "Label", data "Data").
3641 {
3642  CRef<CSeqdesc> desc(new CSeqdesc());
3643  desc->SetUser().SetType().SetStr("TpaAssembly");
3644  entry->SetSeq().SetDescr().Set().push_back(desc);
3645 
      // The field is added after the descriptor was pushed; desc refers to the
      // same object, so the stored descriptor receives it.
3646  CRef<CUser_field> field(new CUser_field());
3647  field->SetLabel().SetStr("Label");
3648  field->SetData().SetStr("Data");
3649  desc->SetUser().SetData().push_back(field);
3650 }
3651 
3652 
// Test_TpaAssemblyProblem: builds a set of two members that both carry a
// TpaAssembly user object and checks three situations:
//   1. neither member has a Seq-hist assembly or gi: no errors expected;
//   2. only member1 has a Seq-hist assembly: "TpaAssemblyProblem" error;
//   3. member1 also gets a tpg accession and a gi: an additional
//      "TpaAssemblyProblem" warning and an UnexpectedIdentifierChange warning.
// NOTE(review): listing lines 3656-3657, 3661 and 3666 are absent from this
// view; presumably set construction, member declarations and setup - confirm.
3653 BOOST_FIXTURE_TEST_CASE(Test_TpaAssemblyProblem, CGenBankFixture)
3654 {
3655  CRef<CSeq_entry> entry(new CSeq_entry());
3658  member1->SetSeq().SetId().front()->SetLocal().SetStr("good");
3659  AddTpaAssemblyUserObject(member1);
3660  entry->SetSet().SetSeq_set().push_back(member1);
3662  member2->SetSeq().SetId().front()->SetLocal().SetStr("good2");
3663  AddTpaAssemblyUserObject(member2);
3664  entry->SetSet().SetSeq_set().push_back(member2);
3665 
3667 
3668  // two Tpa sequences, but neither has assembly and neither has GI, so no errors expected
3669  // AddChromosomeNoLocation(expected_errors, "lcl|good");
3670  // AddChromosomeNoLocation(expected_errors, "lcl|good2");
3671  eval = validator.Validate(seh, options);
3672  CheckErrors(*eval, expected_errors);
3673 
3674  // now one has hist, other does not
3675  member1->SetSeq().SetInst().SetHist().SetAssembly().push_back(unit_test_util::BuildGoodAlign());
3676  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TpaAssemblyProblem", "There are 1 TPAs with history and 1 without history in this record."));
3677  eval = validator.Validate(seh, options);
3678  CheckErrors(*eval, expected_errors);
3679 
3680  // now one has gi
3681  scope.RemoveTopLevelSeqEntry(seh);
3682  member1->SetSeq().SetId().front()->SetTpg().SetAccession("AY123456");
3683  member1->SetSeq().SetId().front()->SetTpg().SetVersion(1);
3684  CRef<CSeq_id> gi_id(new CSeq_id());
3685  gi_id->SetGi(GI_CONST(21914627));
3686  member1->SetSeq().SetId().push_back(gi_id);
3687  seh = scope.AddTopLevelSeqEntry(*entry);
3688 
3689  CLEAR_ERRORS
3690 
      // The expected list is rebuilt from scratch because every entry now
      // refers to the tpg accession instead of lcl|good.
3691  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Warning, "UnexpectedIdentifierChange", "Loss of accession (gb|AY123456.1|) on gi (21914627) compared to the NCBI sequence repository"));
3692  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Error, "TpaAssemblyProblem", "There are 1 TPAs with history and 1 without history in this record."));
3693  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Warning, "TpaAssemblyProblem", "There are 1 TPAs without history in this record, but the record has a gi number assignment."));
3694  // AddChromosomeNoLocation(expected_errors, "tpg|AY123456.1|");
3695  // AddChromosomeNoLocation(expected_errors, "lcl|good2");
3696  eval = validator.Validate(seh, options);
3697  CheckErrors(*eval, expected_errors);
3698 
3699  CLEAR_ERRORS
3700 }
3701 
3702 
// Body of a test case whose header line (listing line 3703) is absent from
// this view. NOTE(review): presumably Test_SeqLocLength, judging by the
// commented-out expectation below - confirm.
// Replaces the first delta component with a location on gb|AY123456 covering
// positions 0-9, then 0-10, and validates each against expected_errors; the
// "SeqLocLength" expectation itself is commented out in both stages.
// NOTE(review): listing lines 3706, 3712 and 3721 are also absent;
// presumably entry (re)construction and the standard setup - confirm.
3704 {
3705  // prepare entry
3707  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetGenbank().SetAccession("AY123456");
3708  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
3709  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(9);
3710  entry->SetSeq().SetInst().SetLength(32);
3711 
3713 
3714  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SeqLocLength", "Short length (10) on seq-loc (gb|AY123456|:1-10) of delta seq_ext"));
3715  // AddChromosomeNoLocation(expected_errors, entry);
3716  eval = validator.Validate(seh, options);
3717  CheckErrors(*eval, expected_errors);
3718 
3719  scope.RemoveTopLevelSeqEntry(seh);
3720  // if length 11, should not be a problem
3722  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetGenbank().SetAccession("AY123456");
3723  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
3724  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(10);
3725  entry->SetSeq().SetInst().SetLength(33);
3726  seh = scope.AddTopLevelSeqEntry(*entry);
3727  eval = validator.Validate(seh, options);
3728  CheckErrors(*eval, expected_errors);
3729 
3730  CLEAR_ERRORS
3731 }
3732 
3733 
// Test_MissingGaps: validates a delta sequence whose gaps were removed and
// checks when "MissingGaps" (and "BadHTGSeq") is expected. The visible
// expectations are: nothing at first, then a MissingGaps Error, then
// MissingGaps plus a BadHTGSeq Warning; after switching to a RefSeq NC_ id
// the MissingGaps severity is expected to be Info.
// NOTE(review): the statements that build the entry, strip the gaps and
// change the tech between validations (listing lines 3737, 3739, 3741,
// 3748, 3752, 3757, 3761, 3769, 3778, 3782) are absent from this view, so
// which tech value drives each stage cannot be confirmed from here.
3734 BOOST_AUTO_TEST_CASE(Test_MissingGaps)
3735 {
3736  // prepare entry
3738  // remove gaps
3740 
3742 
3743  // AddChromosomeNoLocation(expected_errors, entry);
3744  // only report errors for specific molinfo tech values
3745  eval = validator.Validate(seh, options);
3746  CheckErrors(*eval, expected_errors);
3747  // htgs_3 should not report
3749  eval = validator.Validate(seh, options);
3750  CheckErrors(*eval, expected_errors);
3751 
3753  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingGaps", "HTGS delta seq should have gaps between all sequence runs"));
3754  eval = validator.Validate(seh, options);
3755  CheckErrors(*eval, expected_errors);
3756 
3758  eval = validator.Validate(seh, options);
3759  CheckErrors(*eval, expected_errors);
3760 
3762  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 delta seq has no gaps and no graphs"));
3763  eval = validator.Validate(seh, options);
3764  CheckErrors(*eval, expected_errors);
3765 
3766  // RefGeneTracking changes severity
3767  scope.RemoveTopLevelSeqEntry(seh);
3768  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
3770  seh = scope.AddTopLevelSeqEntry(*entry);
3771  expected_errors[0]->SetSeverity(eDiag_Info);
3772  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
3773  eval = validator.Validate(seh, options);
3774  CheckErrors(*eval, expected_errors);
      // Remove the BadHTGSeq expectation: index 1 is the last element here,
      // so deleting it and popping the back release the same entry.
3775  delete expected_errors[1];
3776  expected_errors.pop_back();
3777 
3779  eval = validator.Validate(seh, options);
3780  CheckErrors(*eval, expected_errors);
3781 
3783  eval = validator.Validate(seh, options);
3784  CheckErrors(*eval, expected_errors);
3785 
3786  CLEAR_ERRORS
3787 }
3788 
3789 
// Test_CompleteTitleProblem: a GenBank-accessioned sequence with lineage
// "Viruses; foo" and the title "Foo complete genome" is expected to produce
// a "CompleteTitleProblem" warning. Per the existing comments, no error is
// expected once the record is complete, and a delta sequence with the same
// title is expected to produce "CompleteGenomeHasGaps" instead.
// NOTE(review): listing lines 3798, 3809 and 3821 are absent from this view;
// presumably the standard setup and the completeness changes - confirm.
3790 BOOST_AUTO_TEST_CASE(Test_CompleteTitleProblem)
3791 {
3792  // prepare entry
3793  CRef<CSeq_entry> entry = BuildGoodSeq();
3794  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3795  SetLineage(entry, "Viruses; foo");
3796  SetTitle(entry, "Foo complete genome");
3797 
3799 
3800  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning, "CompleteTitleProblem", "Complete genome in title without complete flag set"));
3801  // AddChromosomeNoLocation(expected_errors, entry);
3802 
3803  eval = validator.Validate(seh, options);
3804  CheckErrors(*eval, expected_errors);
3805 
3806  CLEAR_ERRORS
3807 
3808  // should be no error if complete
3810 
3811  eval = validator.Validate(seh, options);
3812  // AddChromosomeNoLocation(expected_errors, entry);
3813  CheckErrors(*eval, expected_errors);
3814 
3815  // different message and code if gaps
      // A fresh delta-sequence entry replaces the raw one in the scope.
3816  scope.RemoveTopLevelSeqEntry(seh);
3817  entry = BuildGoodDeltaSeq();
3818  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3819  unit_test_util::SetLineage(entry, "Viruses; foo");
3820  SetTitle(entry, "Foo complete genome");
3822  seh = scope.AddTopLevelSeqEntry(*entry);
3823 
3824  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning,
3825  "CompleteGenomeHasGaps", "Title contains 'complete genome' but sequence has gaps"));
3826 
3827  eval = validator.Validate(seh, options);
3828  CheckErrors(*eval, expected_errors);
3829 
3830  CLEAR_ERRORS
3831 }
3832 
3833 
// Test_CompleteCircleProblem: a sequence with circular topology is expected
// to produce "Circular topology without complete flag set". The second
// stage uses a GenBank accession and a title that does not mention
// completeness, and expects the "title should say complete sequence or
// complete genome" variant plus an UnwantedCompleteFlag warning.
// NOTE(review): listing lines 3837, 3840 and 3855 are absent from this view;
// presumably entry construction, setup and setting the complete flag - confirm.
3834 BOOST_AUTO_TEST_CASE(Test_CompleteCircleProblem)
3835 {
3836  // prepare entry
3838  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
3839 
3841 
3842  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
3843  "CompleteCircleProblem",
3844  "Circular topology without complete flag set"));
3845  // AddChromosomeNoLocation(expected_errors, entry);
3846 
3847  eval = validator.Validate(seh, options);
3848  CheckErrors(*eval, expected_errors);
3849 
3850  CLEAR_ERRORS
3851 
3852  scope.RemoveTopLevelSeqEntry(seh);
3853  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3854  SetTitle(entry, "This is just a title");
3856  seh = scope.AddTopLevelSeqEntry(*entry);
3857  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning,
3858  "CompleteCircleProblem",
3859  "Circular topology has complete flag set, but title should say complete sequence or complete genome"));
3860  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning,
3861  "UnwantedCompleteFlag",
3862  "Suspicious use of complete"));
3863  // AddChromosomeNoLocation(expected_errors, entry);
3864 
3865  eval = validator.Validate(seh, options);
3866  CheckErrors(*eval, expected_errors);
3867 
3868  CLEAR_ERRORS
3869 }
3870 
3871 
// Test_BadHTGSeq: checks the "BadHTGSeq" reports for htgs_2 and htgs_3
// records, using a delta entry (delta_entry) and a raw entry (raw_entry):
//   - htgs_2 without gaps/graphs expects a BadHTGSeq warning, which the
//     HTGS_ACTIVEFIN keyword disables;
//   - htgs_3 expects one BadHTGSeq error per HTGS_DRAFT, HTGS_PREFIN,
//     HTGS_ACTIVEFIN and HTGS_FULLTOP keyword present.
// NOTE(review): listing lines 3875, 3877 and 3899 are absent from this view;
// presumably the delta_entry/raw_entry declarations and gap removal - confirm.
3872 BOOST_AUTO_TEST_CASE(Test_BadHTGSeq)
3873 {
3874  // prepare entry
3876  // remove gaps
3878 
3879  STANDARD_SETUP_NAME(delta_entry)
3880 
3881  SetTech(delta_entry, CMolInfo::eTech_htgs_2);
3882  // AddChromosomeNoLocation(expected_errors, delta_entry);
3883  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingGaps", "HTGS delta seq should have gaps between all sequence runs"));
3884  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 delta seq has no gaps and no graphs"));
3885  eval = validator.Validate(seh, options);
3886  CheckErrors(*eval, expected_errors);
3887 
      // Drop the BadHTGSeq expectation (index 1 is the last element), leaving
      // only MissingGaps for the keyword check below.
3888  delete expected_errors[1];
3889  expected_errors.pop_back();
3890 
3891  // HTGS_ACTIVEFIN keyword disables BadHTGSeq error
3892  AddGenbankKeyword(delta_entry, "HTGS_ACTIVEFIN");
3893  eval = validator.Validate(seh, options);
3894  CheckErrors(*eval, expected_errors);
3895 
3896  CLEAR_ERRORS
3897 
3898  scope.RemoveTopLevelSeqEntry(seh);
3900  SetTech(raw_entry, CMolInfo::eTech_htgs_2);
3901  seh = scope.AddTopLevelSeqEntry(*raw_entry);
3902  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 raw seq has no gaps and no graphs"));
3903  // AddChromosomeNoLocation(expected_errors, raw_entry);
3904  eval = validator.Validate(seh, options);
3905  CheckErrors(*eval, expected_errors);
3906 
3907  CLEAR_ERRORS
3908 
3909  // HTGS_ACTIVEFIN keyword disables error
3910  AddGenbankKeyword(raw_entry, "HTGS_ACTIVEFIN");
3911  // AddChromosomeNoLocation(expected_errors, raw_entry);
3912  eval = validator.Validate(seh, options);
3913  CheckErrors(*eval, expected_errors);
3914 
3915 
3916  // htg3 errors
      // HTGS_ACTIVEFIN was already added to raw_entry above, so four keyword
      // errors are expected although only three keywords are added here.
3917  SetTech(raw_entry, CMolInfo::eTech_htgs_3);
3918  AddGenbankKeyword(raw_entry, "HTGS_DRAFT");
3919  AddGenbankKeyword(raw_entry, "HTGS_PREFIN");
3920  AddGenbankKeyword(raw_entry, "HTGS_FULLTOP");
3921  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_DRAFT keyword"));
3922  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_PREFIN keyword"));
3923  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_ACTIVEFIN keyword"));
3924  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_FULLTOP keyword"));
3925  eval = validator.Validate(seh, options);
3926  CheckErrors(*eval, expected_errors);
3927 
      // Same four expectations, now against the delta entry (which also
      // received HTGS_ACTIVEFIN earlier in this test).
3928  scope.RemoveTopLevelSeqEntry(seh);
3929  seh = scope.AddTopLevelSeqEntry(*delta_entry);
3930  SetTech(delta_entry, CMolInfo::eTech_htgs_3);
3931  AddGenbankKeyword(delta_entry, "HTGS_DRAFT");
3932  AddGenbankKeyword(delta_entry, "HTGS_PREFIN");
3933  AddGenbankKeyword(delta_entry, "HTGS_FULLTOP");
3934  eval = validator.Validate(seh, options);
3935  CheckErrors(*eval, expected_errors);
3936 
3937  CLEAR_ERRORS
3938 }
3939 
3940 
// Test_GapInProtein_and_BadProteinStart: sets protein sequences containing
// '-' gap symbols and checks the expected reports: "GapInProtein" for an
// internal gap ("PRK-EIN"), "BadProteinStart" for a leading gap
// ("-RKTEIN"), and both for "-RK-EIN".
// NOTE(review): listing lines 3944 and 3947 are absent from this view;
// presumably the `entry` declaration and the standard setup - confirm.
3941 BOOST_AUTO_TEST_CASE(Test_GapInProtein_and_BadProteinStart)
3942 {
3943  // prepare entry
3945  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa().Set("PRK-EIN");
3946 
3948 
3949  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GapInProtein", "[1] internal gap symbols in protein sequence (gene? - fake protein name)"));
3950  // AddChromosomeNoLocation(expected_errors, entry);
3951  eval = validator.Validate(seh, options);
3952  CheckErrors(*eval, expected_errors);
3953 
3954  CLEAR_ERRORS
3955 
3956  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa().Set("-RKTEIN");
3957  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadProteinStart", "gap symbol at start of protein sequence (gene? - fake protein name)"));
3958  // AddChromosomeNoLocation(expected_errors, entry);
3959  eval = validator.Validate(seh, options);
3960  CheckErrors(*eval, expected_errors);
3961 
      // No CLEAR_ERRORS here: the BadProteinStart expectation is kept and
      // GapInProtein is added alongside it.
3962  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa().Set("-RK-EIN");
3963  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GapInProtein", "[1] internal gap symbols in protein sequence (gene? - fake protein name)"));
3964  eval = validator.Validate(seh, options);
3965  CheckErrors(*eval, expected_errors);
3966 
3967  CLEAR_ERRORS
3968 }
3969 
3970 
// Test_TerminalGap: adds a length-only literal (no sequence data set) at
// the front and back of a delta sequence and expects "TerminalGap"
// warnings for the beginning and end of the sequence. The same two
// expectations are then checked with gap length 10, a RefSeq NC_ id and a
// patent id. With circular topology only an UnwantedCompleteFlag warning
// is expected.
// NOTE(review): listing lines 3974, 3983 and 4034 are absent from this view;
// presumably entry construction, setup and setting the complete flag - confirm.
3971 BOOST_AUTO_TEST_CASE(Test_TerminalGap)
3972 {
3973  // prepare entry
3975  CRef<CDelta_seq> first_seg(new CDelta_seq());
3976  first_seg->SetLiteral().SetLength(9);
3977  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_front(first_seg);
3978  CRef<CDelta_seq> last_seg(new CDelta_seq());
3979  last_seg->SetLiteral().SetLength(9);
3980  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(last_seg);
      // Two 9-base gaps were added, so the declared length grows by 18.
3981  entry->SetSeq().SetInst().SetLength(entry->SetSeq().SetInst().GetLength() + 18);
3982 
3984 
3985  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "First delta seq component is a gap"));
3986  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "Last delta seq component is a gap"));
3987  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at beginning of sequence"));
3988  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at end of sequence"));
3989  /*
3990  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
3991  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3992  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
3993  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3994  */
3995  // AddChromosomeNoLocation(expected_errors, entry);
3996 
3997  eval = validator.Validate(seh, options);
3998  CheckErrors(*eval, expected_errors);
3999 
4000  // if gap length is 10, severity is still warning because still all local IDS
4001  scope.RemoveTopLevelSeqEntry(seh);
4002  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetLength(10);
4003  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetLength(10);
      // Each terminal gap grew by one base, hence +2 on the total length.
4004  entry->SetSeq().SetInst().SetLength(entry->SetSeq().SetInst().GetLength() + 2);
4005  seh = scope.AddTopLevelSeqEntry(*entry);
4006  eval = validator.Validate(seh, options);
4007  CheckErrors(*eval, expected_errors);
4008 
4009 
4010  scope.RemoveTopLevelSeqEntry(seh);
4011  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
4012  seh = scope.AddTopLevelSeqEntry(*entry);
4013  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
4014  /*
4015  expected_errors[2]->SetSeverity(eDiag_Warning);
4016  expected_errors[3]->SetSeverity(eDiag_Warning);
4017  */
4018  eval = validator.Validate(seh, options);
4019  CheckErrors(*eval, expected_errors);
4020 
4021  scope.RemoveTopLevelSeqEntry(seh);
4022  entry->SetSeq().SetId().front()->SetPatent().SetSeqid(1);
4023  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetCountry("USA");
4024  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetId().SetNumber("1");
4025  seh = scope.AddTopLevelSeqEntry(*entry);
4026  ChangeErrorAcc(expected_errors, "pat|USA|1|1");
4027  eval = validator.Validate(seh, options);
4028  CheckErrors(*eval, expected_errors);
4029 
4030  CLEAR_ERRORS
4031 
4032  // no more terminal gap warnings if circular - changed to still show first/last delta component
4033  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
4035  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Warning, "UnwantedCompleteFlag",
4036  "Suspicious use of complete"));
4037  // AddChromosomeNoLocation(expected_errors, entry);
4038 
4039  eval = validator.Validate(seh, options);
4040  CheckErrors(*eval, expected_errors);
4041  CLEAR_ERRORS
4042 }
4043 
4044 
// Test_OverlappingDeltaRange: replaces the Seq-ext with four delta ranges on
// gb|AY123456| (0-10, 5-15, 20-30, 25-35) and expects one
// OverlappingDeltaRange warning for each overlapping pair. The expected
// messages show every coordinate one higher than the value passed to
// AddSeqRange (0-10 is reported as 1-11).
// NOTE(review): the embedded listing numbers skip 4048 and 4058, so the lines
// that declare `entry` and set up `seh`, `validator`, `options`, `eval` and
// `expected_errors` are presumably missing from this view - confirm against
// the original file.
4045 BOOST_FIXTURE_TEST_CASE(Test_OverlappingDeltaRange, CGenBankFixture)
4046 {
4047  // prepare entry: drop the existing extension and rebuild it from ranges
4049  entry->SetSeq().SetInst().ResetExt();
4050  CRef<CSeq_id> seqid(new CSeq_id());
4051  seqid->SetGenbank().SetAccession("AY123456");
4052  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 0, 10);
4053  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 5, 15);
4054  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 20, 30);
4055  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 25, 35);
4056  entry->SetSeq().SetInst().SetLength(44);
4057 
4059 
     // one warning per overlapping pair; the later range is named first
4060  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OverlappingDeltaRange", "Overlapping delta range 6-16 and 1-11 on a Bioseq gb|AY123456|"));
4061  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OverlappingDeltaRange", "Overlapping delta range 26-36 and 21-31 on a Bioseq gb|AY123456|"));
4062  // AddChromosomeNoLocation(expected_errors, entry);
4063  eval = validator.Validate(seh, options);
4064  CheckErrors(*eval, expected_errors);
4065 
4066  CLEAR_ERRORS
4067 }
4068 
4069 
// Test_LeadingX: sets the IUPAC amino-acid data to "XROTEIN" and expects a
// single LeadingX warning on lcl|good.
// NOTE(review): listing numbers 4073 and 4076 are absent, so the `entry`
// declaration and the validator setup are presumably missing from this view.
4070 BOOST_AUTO_TEST_CASE(Test_LeadingX)
4071 {
4072  // prepare entry: sequence data that begins with X
4074  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("XROTEIN");
4075 
4077 
4078  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LeadingX", "Sequence starts with leading X"));
4079  // AddChromosomeNoLocation(expected_errors, entry);
4080  eval = validator.Validate(seh, options);
4081  CheckErrors(*eval, expected_errors);
4082 
4083  CLEAR_ERRORS
4084 }
4085 
// Test_InternalNsInSeqRaw: validates a raw nucleotide sequence containing an
// internal run of Ns in three scenarios:
//   1. a 110-base sequence: InternalNsInSeqRaw is expected ("Run of 100 Ns");
//   2. a 30-base sequence: no InternalNsInSeqRaw is expected;
//   3. the same 30-base sequence after SetTech(..., eTech_wgs):
//      InternalNsInSeqRaw is expected ("Run of 20 Ns").
// Every scenario also expects ContigsTooShort, HighNContentPercent,
// HighNpercent5Prime and HighNpercent3Prime.
// NOTE(review): listing numbers 4089 and 4093 are absent, so the `entry`
// declaration and the validator setup are presumably missing from this view.
4086 BOOST_AUTO_TEST_CASE(Test_InternalNsInSeqRaw)
4087 {
4088  // prepare entry
4090  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTTT");
4091  entry->SetSeq().SetInst().SetLength(110);
4092 
4094 
4095  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqRaw", "Run of 100 Ns in raw sequence starting at base 6"));
4096  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 5 bases"));
4097  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 90 percent Ns"));
4098  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4099  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4100  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4101  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4102  // AddChromosomeNoLocation(expected_errors, entry);
4103  eval = validator.Validate(seh, options);
4104  CheckErrors(*eval, expected_errors);
4105 
4106  CLEAR_ERRORS
4107 
4108  // expect no InternalNsInSeqRaw error
     // (the entry is removed from the scope before it is edited, then re-added)
4109  scope.RemoveTopLevelSeqEntry(seh);
4110  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAANNNNNNNNNNNNNNNNNNNNTTTTT");
4111  entry->SetSeq().SetInst().SetLength(30);
4112  seh = scope.AddTopLevelSeqEntry(*entry);
4113  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 5 bases"));
4114  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 66 percent Ns"));
4115  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4116  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4117  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4118  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4119  // AddChromosomeNoLocation(expected_errors, entry);
4120  eval = validator.Validate(seh, options);
4121  CheckErrors(*eval, expected_errors);
4122 
4123  CLEAR_ERRORS
4124 
4125  // WGS has lower threshold
     // (same 30-base sequence; the AddChromosomeNoLocation call below is live
     // in this scenario, whereas it is commented out in the two above)
4126  SetTech(entry, CMolInfo::eTech_wgs);
4127  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqRaw", "Run of 20 Ns in raw sequence starting at base 6"));
4128  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 5 bases"));
4129  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 66 percent Ns"));
4130  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4131  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4132  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4133  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4134  AddChromosomeNoLocation(expected_errors, entry);
4135  eval = validator.Validate(seh, options);
4136  CheckErrors(*eval, expected_errors);
4137 
4138  CLEAR_ERRORS
4139 }
4140 
4141 
// Test_InternalNsAdjacentToGap: rewrites the first delta literal so that it
// ends in "NNN" and the last delta literal so that it starts with "NNN", then
// expects ContigsTooShort plus two InternalNsAdjacentToGap errors (around
// positions 13 and 23).
// NOTE(review): listing numbers 4145 and 4149 are absent, so the `entry`
// declaration and the validator setup are presumably missing from this view.
4142 BOOST_AUTO_TEST_CASE(Test_InternalNsAdjacentToGap)
4143 {
4144  // prepare entry: put Ns at the inner ends of the first and last literals
4146  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("ATGATGATGNNN");
4147  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNATGATGATG");
4148 
4150 
4151  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 9 bases"));
4152  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InternalNsAdjacentToGap", "Ambiguous residue N is adjacent to a gap around position 13"));
4153  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InternalNsAdjacentToGap", "Ambiguous residue N is adjacent to a gap around position 23"));
     // the HighNpercent expectations below are disabled in this test
4154 // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
4155 // "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4156 // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
4157 // "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4158  // AddChromosomeNoLocation(expected_errors, entry);
4159 
4160  eval = validator.Validate(seh, options);
4161  CheckErrors(*eval, expected_errors);
4162 
4163  CLEAR_ERRORS
4164 }
4165 
// Test_DeltaComponentIsGi0: turns the first delta component into a location
// (interval 0-11) whose Seq-id is gi ZERO_GI, and expects a Critical
// DeltaComponentIsGi0 plus an Error DeltaSeqError.
// NOTE(review): listing numbers 4169 and 4174 are absent, so the `entry`
// declaration and the validator setup are presumably missing from this view.
4166 BOOST_AUTO_TEST_CASE(Test_DeltaComponentIsGi0)
4167 {
4168  // prepare entry: first component becomes an interval on gi ZERO_GI
4170  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
4171  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(11);
4172  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetGi(ZERO_GI);
4173 
4175 
4176  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "DeltaComponentIsGi0", "Delta component is gi|0"));
4177  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "DeltaSeqError", "Unable to find far delta sequence component"));
4178  // AddChromosomeNoLocation(expected_errors, entry);
4179 
4180  eval = validator.Validate(seh, options);
4181  CheckErrors(*eval, expected_errors);
4182 
4183  CLEAR_ERRORS
4184 }
4185 
4186 
// Test_InternalGapsInSeqRaw: places a '-' character inside the raw IUPACna
// data and expects a Critical InvalidResidue (position [27]) together with an
// InternalGapsInSeqRaw warning.
// NOTE(review): listing numbers 4190 and 4193 are absent, so the `entry`
// declaration and the validator setup are presumably missing from this view.
4187 BOOST_AUTO_TEST_CASE(Test_InternalGapsInSeqRaw)
4188 {
4189  // prepare entry: the 27th character of the sequence string is '-'
4191  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGGCCAAAATTGGCCAAAATTGG-CAAAATTGGCCAAAATTGGCCAAAATTGGCCAA");
4192 
4194 
4195  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue '-' at position [27]"));
4196  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalGapsInSeqRaw", "Raw nucleotide should not contain gap characters"));
4197  // AddChromosomeNoLocation(expected_errors, entry);
4198 
4199  eval = validator.Validate(seh, options);
4200  CheckErrors(*eval, expected_errors);
4201 
4202  CLEAR_ERRORS
4203 }
4204 
4205 
// Test_SelfReferentialSequence: turns the first delta component into an
// interval (0-11) on local id "good" - the same id the expected errors are
// reported against - and expects a Critical SelfReferentialSequence plus an
// InstantiatedGapMismatch warning.
// NOTE(review): listing numbers 4209 and 4214 are absent, so the `entry`
// declaration and the validator setup are presumably missing from this view.
4206 BOOST_AUTO_TEST_CASE(Test_SelfReferentialSequence)
4207 {
4208  // prepare entry: first component refers to lcl|good
4210  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
4211  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(11);
4212  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetLocal().SetStr("good");
4213 
4215 
4216  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "SelfReferentialSequence", "Self-referential delta sequence"));
4217  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InstantiatedGapMismatch", "Exception 4 in GapByGapInst"));
4218  // AddChromosomeNoLocation(expected_errors, entry);
4219 
4220  eval = validator.Validate(seh, options);
4221  CheckErrors(*eval, expected_errors);
4222 
4223  CLEAR_ERRORS
4224 }
4225 
4226 
4228 {
     // NOTE(review): the header of this block (listing line 4227) is missing
     // from this view; judging by the expected error it is presumably the
     // WholeComponent test case - confirm against the original file. Listing
     // lines 4230 and 4234 (presumably the `entry` declaration and the
     // validator setup) are missing as well.
     //
     // Sets the first delta component to a "whole" location on
     // gb AY123456, sets the length to 507, and expects a single
     // WholeComponent error.
4229  // prepare entry
4231  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetWhole().SetGenbank().SetAccession("AY123456");
4232  entry->SetSeq().SetInst().SetLength(507);
4233 
4235 
4236  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WholeComponent", "Delta seq component should not be of type whole"));
4237  // AddChromosomeNoLocation(expected_errors, entry);
4238 
4239  eval = validator.Validate(seh, options);
4240  CheckErrors(*eval, expected_errors);
4241 
4242  CLEAR_ERRORS
4243 }
4244 
4245 
4247 {
     // NOTE(review): the signature (listing line 4246) is missing from this
     // view; presumably this is s_AddGeneralAndLocal, which is called
     // elsewhere in the file with a Bioseq argument, with `seq` as its
     // parameter - confirm against the original file.
     //
     // Overwrites the first Seq-id of `seq` with general id db "a" / tag "b",
     // appends a local id "x", and points the interval location of the first
     // feature in the first annotation at the general id.
4248  CRef<CSeq_id> gnl(new CSeq_id());
4249  gnl->SetGeneral().SetDb("a");
4250  gnl->SetGeneral().SetTag().SetStr("b");
4251  seq.SetId().front()->Assign(*gnl);
4252  CRef<CSeq_id> lcl(new CSeq_id());
4253  lcl->SetLocal().SetStr("x");
4254  seq.SetId().push_back(lcl);
     // keep the feature location consistent with the replaced first id
4255  seq.SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().Assign(*gnl);
4256 }
4257 
4258 
// Test_ProteinsHaveGeneralID: runs two scenarios.
//   1. s_AddGeneralAndLocal is applied to a stand-alone sequence and no
//      expected errors are registered.
//   2. s_AddGeneralAndLocal is applied to `prot`, the product of `cds` is set
//      to the same general id (db "a", tag "b"), and an Info-level
//      ProteinsHaveGeneralID is expected on lcl|nuc.
// NOTE(review): listing numbers 4262, 4265, 4274, 4275 and 4278 are absent,
// so the lines that declare/rebuild `entry`, `prot` and `cds` and perform the
// validator setup are presumably missing from this view.
4259 BOOST_AUTO_TEST_CASE(Test_ProteinsHaveGeneralID)
4260 {
4261  // prepare entry
4263  s_AddGeneralAndLocal(entry->SetSeq());
4264 
4266 
4267  // no error unless part of nuc-prot set
4268  // AddChromosomeNoLocation(expected_errors,entry);
4269  eval = validator.Validate(seh, options);
4270  CheckErrors(*eval, expected_errors);
4271  CLEAR_ERRORS
4272 
     // second scenario: the protein carries the general id
4273  scope.RemoveTopLevelSeqEntry(seh);
4276  s_AddGeneralAndLocal(prot->SetSeq());
4277 
     // make the coding region's product match the protein's new general id
4279  cds->SetProduct().SetWhole().SetGeneral().SetDb("a");
4280  cds->SetProduct().SetWhole().SetGeneral().SetTag().SetStr("b");
4281  seh = scope.AddTopLevelSeqEntry(*entry);
4282 
4283  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "ProteinsHaveGeneralID", "INDEXER_ONLY - Protein bioseqs have general seq-id."));
4284  // AddChromosomeNoLocation(expected_errors, entry);
4285 
4286  eval = validator.Validate(seh, options);
4287  CheckErrors(*eval, expected_errors);
4288 
4289  CLEAR_ERRORS
4290 }
4291 
4292 
// Test_HighNContentPercent_and_HighNContentStretch: exercises the N-content
// checks on an RNA sequence with tech set to eTech_tsa, in this order:
//   1. 11 internal Ns: HighNContentPercent only;
//   2. 16 internal Ns: HighNContentPercent plus HighNContentStretch;
//   3. GetTSANStretchErrors on the same data (by handle and by Bioseq):
//      HighNContentStretch only;
//   4. runs of 10 Ns near both ends: HighNpercent5Prime/3Prime,
//      HighNContentPercent, HighNcontent5Prime/3Prime;
//   5. GetTSANStretchErrors on that data: HighNcontent5Prime/3Prime only;
//   6. a gap and a further literal appended to the delta extension: no
//      expectations are registered for this scenario.
// NOTE(review): listing numbers 4296, 4300, 4303 and 4357 are absent, so the
// `entry` declaration, the validator setup and at least one statement before
// the delta-extension scenario are presumably missing from this view.
4293 BOOST_AUTO_TEST_CASE(Test_HighNContentPercent_and_HighNContentStretch)
4294 {
4295  // prepare entry
4297  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCNNNNNNNNNNNAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTT");
4298  entry->SetSeq().SetInst().SetLength(100);
4299  SetTech(entry, CMolInfo::eTech_tsa);
4301  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
4302 
4304 
4305  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 11 percent Ns"));
4306  // AddChromosomeNoLocation(expected_errors, entry);
4307  eval = validator.Validate(seh, options);
4308  CheckErrors(*eval, expected_errors);
4309 
     // longer internal run of Ns: the first expectation is reused with a new
     // message and a stretch warning is added
4310  scope.RemoveTopLevelSeqEntry(seh);
4311  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCNNNNNNNNNNNNNNNNTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTT");
4312  seh = scope.AddTopLevelSeqEntry(*entry);
4313  expected_errors[0]->SetErrMsg("Sequence contains 16 percent Ns");
4314  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentStretch", "Sequence has a stretch of 16 Ns"));
4315  eval = validator.Validate(seh, options);
4316  CheckErrors(*eval, expected_errors);
4317 
4318  CLEAR_ERRORS
4319 
     // GetTSANStretchErrors is checked through both overloads used here
     // (Seq-entry handle and Bioseq) against the same expectation
4320  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentStretch", "Sequence has a stretch of 16 Ns"));
4321  eval = validator.GetTSANStretchErrors(seh);
4322  CheckErrors(*eval, expected_errors);
4323  eval = validator.GetTSANStretchErrors(entry->GetSeq());
4324  CheckErrors(*eval, expected_errors);
4325 
4326  CLEAR_ERRORS
4327 
     // runs of 10 Ns close to the 5' and 3' ends
4328  scope.RemoveTopLevelSeqEntry(seh);
4329  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AANNNNNNNNNNGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGTTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCNNNNNNNNNNAAA");
4330  seh = scope.AddTopLevelSeqEntry(*entry);
4331  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4332  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4333  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4334  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4335  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent",
4336  "Sequence contains 20 percent Ns"));
4337  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent5Prime",
4338  "Sequence has a stretch of at least 10 Ns within the first 20 bases"));
4339  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent3Prime",
4340  "Sequence has a stretch of at least 10 Ns within the last 20 bases"));
4341  // AddChromosomeNoLocation(expected_errors, entry);
4342  eval = validator.Validate(seh, options);
4343  CheckErrors(*eval, expected_errors);
4344 
4345  CLEAR_ERRORS
4346 
4347  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent5Prime", "Sequence has a stretch of at least 10 Ns within the first 20 bases"));
4348  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent3Prime", "Sequence has a stretch of at least 10 Ns within the last 20 bases"));
4349  eval = validator.GetTSANStretchErrors(seh);
4350  CheckErrors(*eval, expected_errors);
4351  eval = validator.GetTSANStretchErrors(entry->GetSeq());
4352  CheckErrors(*eval, expected_errors);
4353 
4354  CLEAR_ERRORS
4355 
     // append a 10-base gap and a 10-base literal; length grows by 20
4356  scope.RemoveTopLevelSeqEntry(seh);
4358  CRef<CDelta_seq> gap_seg(new CDelta_seq());
4359  gap_seg->SetLiteral().SetSeq_data().SetGap();
4360  gap_seg->SetLiteral().SetLength(10);
4361  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
4362  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGA", CSeq_inst::eMol_dna);
4363  entry->SetSeq().SetInst().SetLength(entry->GetSeq().GetInst().GetLength() + 20);
4364  seh = scope.AddTopLevelSeqEntry(*entry);
4365 
4366  /*
4367  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
4368  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4369  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
4370  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4371  */
4372  // AddChromosomeNoLocation(expected_errors, entry);
4373 
4374  eval = validator.Validate(seh, options);
4375  CheckErrors(*eval, expected_errors);
4376 
4377  CLEAR_ERRORS
4378 }
4379 
4380 
// Test_SeqLitDataLength0: gives the second delta segment empty IUPACna data
// and length 0, sets the overall length to 24, and expects a single
// SeqLitDataLength0 error.
// NOTE(review): listing numbers 4384 and 4393 are absent, so the `entry`
// declaration and the validator setup are presumably missing from this view.
4381 BOOST_AUTO_TEST_CASE(Test_SeqLitDataLength0)
4382 {
4383  // prepare entry
4385 
     // advance to the second segment of the delta extension
4386  CDelta_ext::Tdata::iterator seg_it = entry->SetSeq().SetInst().SetExt().SetDelta().Set().begin();
4387  ++seg_it;
4388  (*seg_it)->SetLiteral().SetSeq_data().SetIupacna().Set();
4389  (*seg_it)->SetLiteral().SetLength(0);
4390 
4391  entry->SetSeq().SetInst().SetLength(24);
4392 
4394 
4395  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLitDataLength0", "Seq-lit of length 0 in delta chain"));
4396  // AddChromosomeNoLocation(expected_errors, entry);
4397  eval = validator.Validate(seh, options);
4398  CheckErrors(*eval, expected_errors);
4399 
4400  CLEAR_ERRORS
4401 }
4402 
4403 
4405 {
     // NOTE(review): the signature (listing line 4404) and the statement that
     // creates `entry` (listing line 4406) are missing from this view, so the
     // real function name is unknown here - confirm against the original
     // file.
     //
     // Turns `entry` into a delta sequence made of a 12-base literal, a
     // 101-base literal with no sequence data and fuzz lim = unk, and another
     // 12-base literal (total length 125), then returns it.
4408  entry->SetSeq().SetInst().ResetSeq_data();
4409  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
4410  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("ATGATGATGCCC", CSeq_inst::eMol_dna);
4411  CRef<CDelta_seq> gap_seg(new CDelta_seq());
     // length 101 rather than 100, combined with an "unknown" fuzz
4412  gap_seg->SetLiteral().SetLength(101);
4413  gap_seg->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
4414  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
4415  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGATG", CSeq_inst::eMol_dna);
4416  entry->SetSeq().SetInst().SetLength(125);
4417 
4418  return entry;
4419 }
4420 
4421 
// Test_UnknownLengthGapNot100: validates an entry and expects a single
// UnknownLengthGapNot100 warning on lcl|good.
// NOTE(review): listing numbers 4424 and 4426 are absent, so the statement
// that builds `entry` and the validator setup are presumably missing from
// this view - confirm which helper constructs the entry.
4422 BOOST_AUTO_TEST_CASE(Test_UnknownLengthGapNot100)
4423 {
4425 
4427 
4428  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnknownLengthGapNot100", "Gap of unknown length should have length 100"));
     // the HighNpercent expectations below are disabled in this test
4429  /*
4430  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
4431  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4432  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
4433  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4434  */
4435  // AddChromosomeNoLocation(expected_errors, entry);
4436  eval = validator.Validate(seh, options);
4437  CheckErrors(*eval, expected_errors);
4438 
4439  CLEAR_ERRORS
4440 }
4441 
4442 
4444 {
     // NOTE(review): the header of this block (listing line 4443) is missing
     // from this view; judging by the expected error it is presumably the
     // mRNAshouldBeSingleStranded test case - confirm against the original
     // file. Listing lines 4446, 4448 and 4451 are missing as well.
     //
     // With the molecule set to RNA, the strand values ds, mixed and other
     // are each expected to yield the same mRNAshouldBeSingleStranded error;
     // not_set, reset and ss are validated with no expectations registered.
4445  // prepare entry
4447  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
4449  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ds);
4450 
4452 
4453  // double strand
4454  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "mRNAshouldBeSingleStranded", "mRNA should be single stranded not double stranded"));
4455  // AddChromosomeNoLocation(expected_errors, entry);
4456  eval = validator.Validate(seh, options);
4457  CheckErrors(*eval, expected_errors);
4458 
4459  // mixed strand
4460  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_mixed);
4461  eval = validator.Validate(seh, options);
4462  CheckErrors(*eval, expected_errors);
4463 
4464  // other strand
4465  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_other);
4466  eval = validator.Validate(seh, options);
4467  CheckErrors(*eval, expected_errors);
4468 
4469  CLEAR_ERRORS
4470 
4471  // these should not produce errors
4472 
4473  // strand not set
4474  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_not_set);
4475  eval = validator.Validate(seh, options);
4476  // AddChromosomeNoLocation(expected_errors, entry);
4477 
4478  CheckErrors(*eval, expected_errors);
4479 
     // strand field reset entirely
4480  entry->SetSeq().SetInst().ResetStrand();
4481  eval = validator.Validate(seh, options);
4482  CheckErrors(*eval, expected_errors);
4483 
4484  // single strand
4485  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ss);
4486  eval = validator.Validate(seh, options);
4487  CheckErrors(*eval, expected_errors);
4488 
4489  CLEAR_ERRORS
4490 }
4491 
4492 
// Test_BioSourceMissing: calls AddGoodSource on the first member of the set
// and expects a BioSourceMissing warning on lcl|nuc together with a Fatal
// NoOrgFound on lcl|prot.
// NOTE(review): listing numbers 4496, 4497 and 4500 are absent, so the lines
// that build `entry` (and whatever they do to its source descriptors) and the
// validator setup are presumably missing from this view.
4493 BOOST_AUTO_TEST_CASE(Test_BioSourceMissing)
4494 {
4495  // prepare entry
4498  unit_test_util::AddGoodSource(entry->SetSet().SetSeq_set().front());
4499 
4501 
4502  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BioSourceMissing", "Nuc-prot set does not contain expected BioSource descriptor"));
4503  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Fatal, "NoOrgFound", "No organism name included in the source. Other qualifiers may exist."));
4504  // AddChromosomeNoLocation(expected_errors, entry);
4505 
4506  eval = validator.Validate(seh, options);
4507  CheckErrors(*eval, expected_errors);
4508 
4509  CLEAR_ERRORS
4510 }
4511 
4512 
4513 BOOST_AUTO_TEST_CASE(Test_Descr_InvalidForType)
4514 {
4515  // prepare entry
4517  CRef<CSeqdesc> desc;
4518  desc.Reset(new CSeqdesc());
4520  entry->SetDescr().Set().push_back(desc);
4521  desc.Reset(new CSeqdesc());
4522  desc->SetModif().push_back(eGIBB_mod_dna);
4523  entry->SetDescr().Set().push_back(desc);
4524  desc.Reset(new CSeqdesc());
4526  entry->SetDescr().Set().push_back(desc);
4527  desc.Reset(new CSeqdesc());
4528  desc->SetOrg().SetTaxname("Sebaea microphylla");
4529  entry->SetDescr().Set().push_back(desc);
4530  AddTpaAssemblyUserObject(entry);
4531 
4533 
4534  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide",
4535  "Nucleic acid with protein sequence method"));
4536  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4537  "MolType descriptor is obsolete"));
4538  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4539  "Modif descriptor is obsolete"));
4540  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4541  "Method descriptor is obsolete"));
4542  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4543  "OrgRef descriptor is obsolete"));
4544  // AddChromosomeNoLocation(expected_errors, entry);
4545 
4546  // won't complain about TPA assembly if only local ID
4547  eval = validator.Validate(seh, options);
4548  CheckErrors(*eval, expected_errors);
4549 
4550  CLEAR_ERRORS
4551 
4552  scope.RemoveTopLevelSeqEntry(seh);
4553  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
4558  seh = scope.AddTopLevelSeqEntry(*entry);
4559  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TPAassemblyWithoutTPAKeyword",
4560  "Non-TPA record gb|AY123456| should not have TpaAssembly object"));
4561  // AddChromosomeNoLocation(expected_errors, entry);
4562  SetErrorsAccessions(expected_errors, "gb|AY123456|");
4563  eval = validator.Validate(seh, options);
4564  CheckErrors(*eval, expected_errors);
4565 
4566  scope.RemoveTopLevelSeqEntry(seh);
4567  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
4568  seh = scope.AddTopLevelSeqEntry(*entry);
4569  SetErrorsAccessions(expected_errors, "ref|NC_123456|");
4570  expected_errors[0]->SetErrMsg("Non-TPA record ref|NC_123456| should not have TpaAssembly object");
4571  eval = validator.Validate(seh, options);
4572  CheckErrors(*eval, expected_errors);
4573 
4574  desc.Reset(new CSeqdesc());
4576  entry->SetDescr().Set().push_back(desc);
4577  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "InvalidForTypeGIBB",
4578  "Nucleic acid with GIBB-mol = peptide"));
4579  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "InvalidForType",
4580  "MolType descriptor is obsolete"));
4581  eval = validator.Validate(seh, options);
4582  CheckErrors(*eval, expected_errors);
4583 
4585  expected_errors[1]->SetErrMsg("GIBB-mol unknown or other used");
4586  eval = validator.Validate(seh, options);
4587  CheckErrors(*eval, expected_errors);
4588 
4590  eval = validator.Validate(seh, options);
4591  CheckErrors(*eval, expected_errors);
4592 
4593  CLEAR_ERRORS
4594 
4595  scope.RemoveTopLevelSeqEntry(seh);
4597  desc.Reset(new CSeqdesc());
4599  entry->SetDescr().Set().push_back(desc);
4600  seh = scope.AddTopLevelSeqEntry(*entry);
4601  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4602  "GIBB-mol [1] used on protein"));
4603  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4604  "MolType descriptor is obsolete"));
4605  // AddChromosomeNoLocation(expected_errors, entry);
4606  eval = validator.Validate(seh, options);
4607  CheckErrors(*eval, expected_errors);
4608 
4610  expected_errors[0]->SetErrMsg("GIBB-mol [2] used on protein");
4611  eval = validator.Validate(seh, options);
4612  CheckErrors(*eval, expected_errors);
4613 
4614  desc->SetMol_type(eGIBB_mol_mRNA);
4615  expected_errors[0]->SetErrMsg("GIBB-mol [3] used on protein");
4616  eval = validator.Validate(seh, options);
4617  CheckErrors(*eval, expected_errors);
4618 
4619  desc->SetMol_type(eGIBB_mol_rRNA);
4620  expected_errors[0]->SetErrMsg("GIBB-mol [4] used on protein");
4621  eval = validator.Validate(seh, options);
4622  CheckErrors(*eval, expected_errors);
4623 
4624  desc->SetMol_type(eGIBB_mol_tRNA);
4625  expected_errors[0]->SetErrMsg("GIBB-mol [5] used on protein");
4626  eval = validator.Validate(seh, options);
4627  CheckErrors(*eval, expected_errors);
4628 
4630  expected_errors[0]->SetErrMsg("GIBB-mol [6] used on protein");
4631  eval = validator.Validate(seh, options);
4632  CheckErrors(*eval, expected_errors);
4633 
4635  expected_errors[0]->SetErrMsg("GIBB-mol [7] used on protein");
4636  eval = validator.Validate(seh, options);
4637  CheckErrors(*eval, expected_errors);
4638 
4640  expected_errors[0]->SetErrMsg("GIBB-mol [9] used on protein");
4641  eval = validator.Validate(seh, options);
4642  CheckErrors(*eval, expected_errors);
4643 
4645  expected_errors[0]->SetErrMsg("GIBB-mol [10] used on protein");
4646  eval = validator.Validate(seh, options);
4647  CheckErrors(*eval, expected_errors);
4648 
4649  CLEAR_ERRORS
4650 
4651  // invalid modif
4652  desc->SetModif().push_back(eGIBB_mod_dna);
4653  desc->SetModif().push_back(eGIBB_mod_rna);
4654  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4655  "Nucleic acid GIBB-mod [0] on protein"));
4656  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4657  "Nucleic acid GIBB-mod [1] on protein"));
4658  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4659  "Modif descriptor is obsolete"));
4660  // AddChromosomeNoLocation(expected_errors, entry);
4661  eval = validator.Validate(seh, options);
4662  CheckErrors(*eval, expected_errors);
4663 
4664  CLEAR_ERRORS
4665 
4666  scope.RemoveTopLevelSeqEntry(seh);
4667  entry = unit_test_util::BuildGoodSeq();
4668  for (auto& it : entry->SetSeq().SetDescr().Set()) {
4669  if (it->IsSource()) {
4670  it->SetSource().SetOrigin(CBioSource::eOrigin_synthetic);
4671  }
4672  }
4673  seh = scope.AddTopLevelSeqEntry(*entry);
4674  // if biomol not other, should generate error
4675  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InvalidForType",
4676  "Molinfo-biomol other should be used if Biosource-location is synthetic"));
4677  // AddChromosomeNoLocation(expected_errors, entry);
4678  eval = validator.Validate(seh, options);
4679  CheckErrors(*eval, expected_errors);
4680 
4681  CLEAR_ERRORS
4682 
4683  for (auto& it : entry->SetSeq().SetDescr().Set()) {
4684  if (it->IsSource()) {
4685  it->SetSource().ResetOrigin();
4686  }
4687  }
4688 
4690  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidMolInfo",
4691  "Nucleic acid with Molinfo = peptide"));
4692  // AddChromosomeNoLocation(expected_errors, entry);
4693  eval = validator.Validate(seh, options);
4694  CheckErrors(*eval, expected_errors);
4695  CLEAR_ERRORS
4696 
4698  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
4699  "MoltypeOtherGenetic", "Molinfo-biomol = other genetic"));
4700  // AddChromosomeNoLocation(expected_errors, entry);
4701  eval = validator.Validate(seh, options);
4702  CheckErrors(*eval, expected_errors);
4703  CLEAR_ERRORS
4704 
4706  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
4707  "MoltypeUnknown", "Molinfo-biomol unknown used"));
4708  // AddChromosomeNoLocation(expected_errors, entry);
4709  eval = validator.Validate(seh, options);
4710  CheckErrors(*eval, expected_errors);
4711  CLEAR_ERRORS
4712 
4714  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
4715  "MoltypeOther", "Molinfo-biomol other used"));
4716  // AddChromosomeNoLocation(expected_errors, entry);
4717  eval = validator.Validate(seh, options);
4718  CheckErrors(*eval, expected_errors);
4719  CLEAR_ERRORS
4720 
4721  scope.RemoveTopLevelSeqEntry(seh);
4723  seh = scope.AddTopLevelSeqEntry(*entry);
4724 
4725  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
4726  "InvalidForType", "Molinfo-biomol [1] used on protein"));
4727  // AddChromosomeNoLocation(expected_errors, entry);
4729  expected_errors[0]->SetErrMsg("Molinfo-biomol [1] used on protein");
4730  eval = validator.Validate(seh, options);
4731  CheckErrors(*eval, expected_errors);
4732 
4734  expected_errors[0]->SetErrMsg("Molinfo-biomol [2] used on protein");
4735  eval = validator.Validate(seh, options);
4736  CheckErrors(*eval, expected_errors);
4737 
4739  expected_errors[0]->SetErrMsg("Molinfo-biomol [3] used on protein");
4740  eval = validator.Validate(seh, options);
4741  CheckErrors(*eval, expected_errors);
4742 
4744  expected_errors[0]->SetErrMsg("Molinfo-biomol [4] used on protein");
4745  eval = validator.Validate(seh, options);
4746  CheckErrors(*eval, expected_errors);
4747 
4749  expected_errors[0]->SetErrMsg("Molinfo-biomol [5] used on protein");
4750  eval = validator.Validate(seh, options);
4751  CheckErrors(*eval, expected_errors);
4752 
4754  expected_errors[0]->SetErrMsg("Molinfo-biomol [6] used on protein");
4755  eval = validator.Validate(seh, options);
4756  CheckErrors(*eval, expected_errors);
4757 
4759  expected_errors[0]->SetErrMsg("Molinfo-biomol [7] used on protein");
4760  eval = validator.Validate(seh, options);
4761  CheckErrors(*eval, expected_errors);
4762 
4764  expected_errors[0]->SetErrMsg("Molinfo-biomol [10] used on protein");
4765  eval = validator.Validate(seh, options);
4766  CheckErrors(*eval, expected_errors);
4767 
4769  expected_errors[0]->SetErrMsg("Molinfo-biomol [11] used on protein");
4770  eval = validator.Validate(seh, options);
4771  CheckErrors(*eval, expected_errors);
4772 
4774  expected_errors[0]->SetErrMsg("Molinfo-biomol [12] used on protein");
4775  eval = validator.Validate(seh, options);
4776  CheckErrors(*eval, expected_errors);
4777 
4779  expected_errors[0]->SetErrMsg("Molinfo-biomol [13] used on protein");
4780  eval = val