NCBI C++ ToolKit
unit_test_validator.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: unit_test_validator.cpp 102041 2024-03-21 18:50:50Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Colleen Bollin, NCBI
27  *
28  * File Description:
29  * Unit tests for the validator.
30  *
31  * ===========================================================================
32  */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include "unit_test_validator.hpp"
37 
38 #include <corelib/ncbi_system.hpp>
39 
40 // This macro should be defined before the inclusion of test_boost.hpp in
41 // every "*.cpp" file of the executable except one. This mirrors how main()
42 // is defined in only one *.cpp file of a non-Boost.Test executable - the
43 // other files should not define it. If NCBI_BOOST_NO_AUTO_TEST_MAIN is not
44 // defined, test_boost.hpp will define such a "main()" function for the tests.
45 //
46 // Usually if your unit tests contain only one *.cpp file you should not
47 // care about this macro at all.
48 //
49 //#define NCBI_BOOST_NO_AUTO_TEST_MAIN
50 
51 #define BAD_VALIDATOR
52 
53 // This header must be included before all Boost.Test headers if there are any
54 #include <corelib/test_boost.hpp>
55 
56 // for ignoring external config files
57 #include <util/util_misc.hpp>
58 
60 #include <objects/biblio/Title.hpp>
66 #include <objects/pub/Pub.hpp>
68 #include <objects/seq/GIBB_mol.hpp>
69 #include <objects/seq/Seq_ext.hpp>
73 #include <objects/seq/Ref_ext.hpp>
74 #include <objects/seq/Map_ext.hpp>
75 #include <objects/seq/Seg_ext.hpp>
76 #include <objects/seq/Seq_gap.hpp>
77 #include <objects/seq/Seq_data.hpp>
79 #include <objects/seq/Seqdesc.hpp>
80 #include <objects/seq/MolInfo.hpp>
81 #include <objects/seq/Pubdesc.hpp>
82 #include <objects/seq/Seq_hist.hpp>
100 #include <objmgr/object_manager.hpp>
101 #include <objmgr/scope.hpp>
102 #include <objmgr/bioseq_ci.hpp>
103 #include <objmgr/feat_ci.hpp>
104 #include <objmgr/seq_vector.hpp>
105 #include <objmgr/util/sequence.hpp>
106 #include <objmgr/seqdesc_ci.hpp>
107 #include <objmgr/util/sequence.hpp>
115 #include <corelib/ncbiapp.hpp>
116 #include <common/ncbi_export.h>
120 #include <objtools/edit/cds_fix.hpp>
122 
123 // for writing out tmp files
124 #include <serial/objostrasn.hpp>
125 #include <serial/objostrasnb.hpp>
126 
128 
131 
132 using namespace validator;
133 using namespace unit_test_util;
134 
135 
136 CExpectedError::CExpectedError(string accession, EDiagSev severity, string err_code, string err_msg)
137  : m_Accession(accession), m_Severity(severity), m_ErrCode(err_code), m_ErrMsg(err_msg)
138 {
139 }
140 
142 {
143 }
144 
145 
146 bool CExpectedError::Match(const CValidErrItem& err_item, bool ignore_severity)
147 {
148  if (!NStr::IsBlank(m_Accession) && !NStr::IsBlank(err_item.GetAccnver())
149  && !NStr::Equal(err_item.GetAccnver(), m_Accession)) {
150  return false;
151  }
152  if (!NStr::Equal(err_item.GetErrCode(), m_ErrCode)) {
153  return false;
154  }
155  string msg = err_item.GetMsg();
156  size_t pos = NStr::Find(msg, " EXCEPTION: NCBI C++ Exception:");
157  if (pos != string::npos) {
158  msg = msg.substr(0, pos);
159  }
160 
161  if (!NStr::Equal(msg, m_ErrMsg)) {
162  return false;
163  }
164  if (!ignore_severity && m_Severity != err_item.GetSeverity()) {
165  return false;
166  }
167  return true;
168 }
169 
170 
171 void CExpectedError::Test(const CValidErrItem& err_item)
172 {
173  if (!NStr::IsBlank(m_Accession) && !NStr::IsBlank(err_item.GetAccnver())) {
174  BOOST_CHECK_EQUAL(err_item.GetAccnver(), m_Accession);
175  }
176  BOOST_CHECK_EQUAL(err_item.GetSeverity(), m_Severity);
177  BOOST_CHECK_EQUAL(err_item.GetErrCode(), m_ErrCode);
178  string msg = err_item.GetMsg();
179  size_t pos = NStr::Find(msg, " EXCEPTION: NCBI C++ Exception:");
180  if (pos != string::npos) {
181  msg = msg.substr(0, pos);
182  }
183  BOOST_CHECK_EQUAL(msg, m_ErrMsg);
184 }
185 
186 
188 {
189  string description = err_item.GetAccnver() + ":"
190  + CValidErrItem::ConvertSeverity(err_item.GetSeverity()) + ":"
191  + err_item.GetErrCode() + ":"
192  + err_item.GetMsg();
193  printf("%s\n", description.c_str());
194 }
195 
196 
198 {
199  string description = m_Accession + ":"
201  + m_ErrCode + ":"
202  + m_ErrMsg;
203  printf("%s\n", description.c_str());
204 }
205 
206 
// Switched to true by the test initialization code when the "debug_mode"
// command-line flag is given; CheckErrors() then only writes out the errors
// it was handed and performs no checks.
207 static bool s_debugMode = false;
208 
// Writes the contents of a validator result to stdout.
//
// eval        validator result to iterate over
// debug_mode  when true, a "\n-\n" header and an extra "\n\n" trailer are
//             printed around the list
//
// NOTE(review): the statement inside the for loop (original line 215) is
// missing from this listing; presumably it prints each error item - confirm
// against the repository copy.
209 void WriteErrors(const CValidError& eval, bool debug_mode)
210 {
211  if (debug_mode) {
212  printf("\n-\n");
213  }
214  for (CValidError_CI vit(eval); vit; ++vit) {
216  }
217  if (debug_mode) {
218  printf("\n\n");
219  }
220  printf("\n\n");
221 }
222 
223 
// Compares the errors reported in `eval` with `expected_errors` and raises
// Boost.Test failures for every discrepancy.
//
// eval             validator result to inspect
// expected_errors  expectations; a null entry is treated as already found
//
// In debug mode (s_debugMode) the errors are only written out and nothing is
// checked. Otherwise every reported error is matched in two passes: first an
// exact Match(), then a Match() that ignores severity, in which case Test()
// is run so the differing fields are reported. A reported error without any
// match, and an expectation left unmatched, both fail the test. When any
// problem was found, the reported and the expected errors are printed.
//
// NOTE(review): original lines 262 and 273 are missing from this listing;
// presumably they print the offending error item - confirm against the
// repository copy.
224 void CheckErrors(const CValidError& eval,
225  vector<CExpectedError*>& expected_errors)
226 {
227  //static int count(1);
228  //if (count == 1367) {
229  // cerr << "";
230  //}
231  //cerr << count++ << "\n";
232 
233  bool problem_found = false;
234 
235  if (s_debugMode) {
236  WriteErrors(eval, true);
237  return;
238  }
239 
// One flag per expectation; null expectations start out as "found" so they
// are never dereferenced or reported as missing below.
240  vector<bool> expected_found;
241  for (size_t i = 0; i < expected_errors.size(); i++) {
242  if (expected_errors[i]) {
243  expected_found.push_back(false);
244  } else {
245  expected_found.push_back(true);
246  }
247  }
248 
249  for (CValidError_CI vit(eval); vit; ++vit) {
250  bool found = false;
// First pass: exact match, severity included.
251  for (size_t i = 0; i < expected_errors.size(); i++) {
252  if (!expected_found[i] && expected_errors[i]->Match(*vit)) {
253  expected_found[i] = true;
254  found = true;
255  break;
256  }
257  }
// Second pass: match ignoring severity; Test() then reports the difference.
258  if (!found) {
259  for (size_t i = 0; i < expected_errors.size(); i++) {
260  if (!expected_found[i] && expected_errors[i]->Match(*vit, true)) {
261  printf("Problem with ");
263  expected_errors[i]->Test(*vit);
264  expected_found[i] = true;
265  found = true;
266  problem_found = true;
267  break;
268  }
269  }
270  }
// Still unmatched: force a test failure (the two literals always differ).
271  if (!found) {
272  BOOST_CHECK_EQUAL("Unexpected error", "Error not found");
274  problem_found = true;
275  }
276  }
277 
// Every expectation that was never matched is a failure as well.
278  for (size_t i = 0; i < expected_errors.size(); i++) {
279  if (!expected_found[i]) {
280  BOOST_CHECK_EQUAL(expected_errors[i]->GetErrMsg(), "Expected error not found");
281  problem_found = true;
282  }
283  }
284 
285  if (problem_found) {
286  WriteErrors(eval, false);
287 
288  printf("Expected:\n");
289  for (auto it : expected_errors) {
290  if (it) {
291  it->Print();
292  }
293  }
294  }
295 }
296 
297 
298 void CheckStrings(const vector<string>& seen, const vector<string>& expected)
299 {
300  auto it1 = seen.begin();
301  auto it2 = expected.begin();
302  bool any = false;
303  while (it1 != seen.end() && it2 != expected.end()) {
304  BOOST_CHECK_EQUAL(*it1, *it2);
305  if (!NStr::Equal(*it1, *it2)) {
306  any = true;
307  }
308  it1++;
309  it2++;
310  }
311  while (it1 != seen.end()) {
312  BOOST_CHECK_EQUAL(*it1, "Unexpected string");
313  it1++;
314  any = true;
315  }
316  while (it2 != expected.end()) {
317  BOOST_CHECK_EQUAL("Missing string", *it2);
318  it2++;
319  any = true;
320  }
321 
322  if (any) {
323  printf("Seen:\n");
324  auto it1 = seen.begin();
325  while (it1 != seen.end()) {
326  printf("%s\n", (*it1).c_str());
327  it1++;
328  }
329  printf("Expected:\n");
330  auto it2 = expected.begin();
331  while (it2 != expected.end()) {
332  printf("%s\n", (*it2).c_str());
333  it2++;
334  }
335  }
336 }
337 
338 
339 // Not currently used, but I'll leave it here in case
340 // it's useful in the future.
341 
// Disabled helper (compiled out by "#if 0").
// With a blank `country` it removes every subsource of subtype country from
// `src`, if any subsources are set; otherwise it appends `sub` to the
// subsource list.
// NOTE(review): original line 353, which creates `sub`, is missing from this
// listing - presumably it builds a country subsource from `country`; confirm.
342 #if 0
343 static void SetCountryOnSrc(CBioSource& src, string country)
344 {
345  if (NStr::IsBlank(country)) {
346  if (src.IsSetSubtype()) {
347  auto& cont = src.SetSubtype();
348  cont.remove_if([](CSubSource* it) {
349  return (it->IsSetSubtype() && it->GetSubtype() == CSubSource::eSubtype_country);
350  });
351  }
352  } else {
354  src.SetSubtype().push_back(sub);
355  }
356 }
357 #endif
358 
360 static string ToAsn1(const CRef<CSeq_entry>& entry)
361 {
362  CNcbiOstrstream os;
363  os << MSerial_AsnText << entry;
364  return os.str();
365 }
366 
369 
372 
374 {
375  if (!CNcbiApplication::Instance()->GetConfig().HasEntry("NCBI", "Data")) {
376  NCBITEST_DISABLE(Test_Descr_BadStructuredCommentFormat);
377  NCBITEST_DISABLE(Test_Descr_MissingKeyword);
378  }
379 }
380 
381 
382 static void SetErrorsAccessions(vector<CExpectedError*>& expected_errors, string accession)
383 {
384  size_t i, len = expected_errors.size();
385  for (i = 0; i < len; i++) {
386  expected_errors[i]->SetAccession(accession);
387  }
388 }
389 
391 {
392  // Here we make descriptions of command line parameters that we are
393  // going to use.
394 
395  arg_desc->AddFlag(
396  "debug_mode", "Debugging mode writes errors seen for each test");
397 }
398 
400 {
401  // initialization function body
402 
403  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
404  if (args["debug_mode"]) {
405  s_debugMode = true;
406  }
407  g_IgnoreDataFile("institution_codes.txt");
408 }
409 
410 void AddChromosomeNoLocation(vector<CExpectedError*>& expected_errors, const string& id)
411 {
412  expected_errors.push_back(new CExpectedError(id, eDiag_Error,
413  "ChromosomeWithoutLocation",
414  "INDEXER_ONLY - source contains chromosome value '1' but the BioSource location is not set to chromosome"));
415 }
416 
// Adds the "ChromosomeWithoutLocation" expectation for the sequences found
// in `entry`.
//
// A single sequence uses the FASTA string of its best Seq-id. A nuc-prot set
// forwards `nuc_entry`; any other set recurses into each member entry.
//
// NOTE(review): original line 424, which declares `nuc_entry`, is missing
// from this listing - presumably it picks the nucleotide member of the
// nuc-prot set; confirm against the repository copy.
417 void AddChromosomeNoLocation(vector<CExpectedError*>& expected_errors, CRef<CSeq_entry> entry)
418 {
419  if (entry->IsSeq()) {
420  CConstRef<CSeq_id> seqid = sequence::GetId(entry->GetSeq(), sequence::eGetId_Best).GetSeqId();
421  AddChromosomeNoLocation(expected_errors, seqid->AsFastaString());
422  } else if (entry->IsSet()) {
423  if (entry->GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
425  AddChromosomeNoLocation(expected_errors, nuc_entry);
426  } else {
427  for (auto it : entry->SetSet().SetSeq_set()) {
428  AddChromosomeNoLocation(expected_errors, it);
429  }
430  }
431  }
432 }
433 
434 
435 // new case test ground
436 
// Checks the validator messages for a "##MIGS-Data-START##" structured
// comment in combination with, and without, the "GSC:MIGS:2.1" GenBank
// keyword.
//
// NOTE(review): several original lines (440, 467, 486, 494, 500) are missing
// from this listing. The declarations of `entry`, `expected_errors`, `eval`,
// `validator`, `seh` and `options`, and any clean-up between the steps, are
// therefore not visible here - confirm against the repository copy.
437 BOOST_AUTO_TEST_CASE(Test_Descr_MissingKeyword)
438 {
439  // prepare entry
441  CRef<CSeqdesc> sdesc(new CSeqdesc());
442  sdesc->SetUser().SetType().SetStr("StructuredComment");
443  entry->SetSeq().SetDescr().Set().push_back(sdesc);
444 
// Fill in the structured comment fields; "finishing_strategy" is left out
// here and only added further below.
445  sdesc->SetUser().AddField("StructuredCommentPrefix", "##MIGS-Data-START##", CUser_object::eParse_String);
446  sdesc->SetUser().AddField("alt_elev", "foo", CUser_object::eParse_String);
447  sdesc->SetUser().AddField("assembly", "foo", CUser_object::eParse_String);
448  sdesc->SetUser().AddField("collection_date", "foo", CUser_object::eParse_String);
449  sdesc->SetUser().AddField("country", "foo", CUser_object::eParse_String);
450  sdesc->SetUser().AddField("depth", "foo", CUser_object::eParse_String);
451  sdesc->SetUser().AddField("environment", "foo", CUser_object::eParse_String);
452  sdesc->SetUser().AddField("investigation_type", "eukaryote", CUser_object::eParse_String);
453  sdesc->SetUser().AddField("isol_growth_condt", "foo", CUser_object::eParse_String);
454  sdesc->SetUser().AddField("sequencing_meth", "foo", CUser_object::eParse_String);
455  sdesc->SetUser().AddField("project_name", "foo", CUser_object::eParse_String);
456  sdesc->SetUser().AddField("ploidy", "foo", CUser_object::eParse_String);
457  sdesc->SetUser().AddField("num_replicons", "foo", CUser_object::eParse_String);
458  sdesc->SetUser().AddField("estimated_size", "foo", CUser_object::eParse_String);
459  sdesc->SetUser().AddField("trophic_level", "foo", CUser_object::eParse_String);
460  sdesc->SetUser().AddField("propagation", "foo", CUser_object::eParse_String);
461  sdesc->SetUser().AddField("lat_lon", "foo", CUser_object::eParse_String);
462 
// GenBank descriptor carrying the keyword under test.
463  CRef<CSeqdesc> gdesc(new CSeqdesc());
464  gdesc->SetGenbank().SetKeywords().push_back("GSC:MIGS:2.1");
465  entry->SetSeq().SetDescr().Set().push_back(gdesc);
466 
469  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadKeywordForStrucComm",
470  "Structured Comment is non-compliant, keyword should be removed"));
471  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommMissingField",
472  "Required field finishing_strategy is missing when investigation_type has value 'eukaryote'"));
473  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue",
474  "Structured Comment invalid; the field value and/or name are incorrect"));
475  // AddChromosomeNoLocation(expected_errors, entry);
476  eval = validator.Validate(seh, options);
477  CheckErrors(*eval, expected_errors);
478 
479  // if no keyword, no badkeyword error
// pop_back() removes the descriptor pushed last, i.e. gdesc; the slot of the
// keyword expectation is freed and left null, which CheckErrors() accepts.
480  entry->SetSeq().SetDescr().Set().pop_back();
481  delete expected_errors[0];
482  expected_errors[0] = nullptr;
483  eval = validator.Validate(seh, options);
484  CheckErrors(*eval, expected_errors);
485 
488  // make the comment valid, should complain about missing keyword
489  sdesc->SetUser().AddField("finishing_strategy", "foo", CUser_object::eParse_String);
490  // AddChromosomeNoLocation(expected_errors, entry);
491  eval = validator.Validate(seh, options);
492  CheckErrors(*eval, expected_errors);
493 
495  // put keyword back, should have no errors
496  entry->SetSeq().SetDescr().Set().push_back(gdesc);
497  // AddChromosomeNoLocation(expected_errors, entry);
498  eval = validator.Validate(seh, options);
499  CheckErrors(*eval, expected_errors);
501 }
502 
503 
// Checks the "LatLonValue" warnings: wrong hemisphere letters for longitude
// and latitude, and latitude/longitude values that appear to be exchanged.
//
// NOTE(review): many original lines (507-509, 511, 520-521, 527-530, 543,
// 545-548, 554) are missing from this listing, including the entry setup and
// the lat_lon values used for each step - confirm against the repository copy.
504 BOOST_AUTO_TEST_CASE(Test_Descr_LatLonValue)
505 {
506  // prepare entry
510 
512 
513  /*
514  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
515  "Latitude should be set to N (northern hemisphere)"));
516  eval = validator.Validate(seh, options);
517  CheckErrors(*eval, expected_errors);
518  */
519 
522  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
523  "Longitude should be set to W (western hemisphere)"));
524  eval = validator.Validate(seh, options);
525  CheckErrors(*eval, expected_errors);
526 
// Same expectation object, now with the latitude message.
531  expected_errors[0]->SetErrMsg("Latitude should be set to S (southern hemisphere)");
532  eval = validator.Validate(seh, options);
533  CheckErrors(*eval, expected_errors);
534 
535  /*
536  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "");
537  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "25 S 47 W");
538  expected_errors[0]->SetErrMsg("Longitude should be set to E (eastern hemisphere)");
539  eval = validator.Validate(seh, options);
540  CheckErrors(*eval, expected_errors);
541  */
542 
544 
549  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
550  "Latitude and longitude values appear to be exchanged"));
551  eval = validator.Validate(seh, options);
552  CheckErrors(*eval, expected_errors);
553 
555 }
556 
557 
// Validates one country / lat_lon combination and checks both the reported
// error and the complete submitter report.
//
// country    claimed country value
// lat_lon    lat_lon value
// error      expected message text; when empty, no error is expected and the
//            submitter report is not checked
// use_state  selects the branch at original line 567, whose body is not
//            visible in this listing
// err_code   expected error code name (default "LatLonCountry")
//
// NOTE(review): original lines 561-563, 565, 568 and 597 are missing from
// this listing, so how `country`, `lat_lon` and `use_state` are applied to
// the entry cannot be seen here - confirm against the repository copy.
558 void TestOneLatLonCountry(const string& country, const string& lat_lon, const string& error, bool use_state = false, const string& err_code = "LatLonCountry")
559 {
560  // prepare entry
564 
566 
567  if (use_state) {
569  }
570 
571  if (!error.empty()) {
572  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, err_code, error));
573  }
574  eval = validator.Validate(seh, options);
575  CheckErrors(*eval, expected_errors);
576 
577  if (!error.empty()) {
578  CValidErrorFormat format(*objmgr);
// The report header expected here is "LatLonCountry Errors" regardless of
// the err_code argument.
579  vector<string> expected;
580  expected.push_back("LatLonCountry Errors");
581  expected.push_back("lcl|good:" + error);
582  expected.push_back("");
583 
// Flatten the report: every category string is split into its lines.
584  vector<string> seen;
585  vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
586  for (const string& it : cat_list) {
587  vector<string> sublist;
588  NStr::Split(it, "\n", sublist);
589  for (const string& sit : sublist) {
590  seen.push_back(sit);
591  }
592  }
593 
594  CheckStrings(seen, expected);
595  }
596 
598 }
599 
600 
602 {
603  TestOneLatLonCountry("Portugal", "37.7715 N 25.3097 W", "", true);
604 }
605 
606 
607 BOOST_AUTO_TEST_CASE(Test_Descr_LatLonCountry)
608 {
609  TestOneLatLonCountry("Romania", "46.5 N 20 E",
610  "Lat_lon '46.5 N 20 E' maps to 'Hungary' instead of 'Romania' - claimed region 'Romania' is at distance 45 km");
611  TestOneLatLonCountry("Romania", "34 N 65 E", "Lat_lon '34 N 65 E' maps to 'Afghanistan' instead of 'Romania'");
612  TestOneLatLonCountry("Romania", "48 N 15 E", "Lat_lon '48 N 15 E' maps to 'Austria' instead of 'Romania'");
613  TestOneLatLonCountry("Romania", "48 N 15 W", "Lat_lon '48 N 15 W' is in water 'Atlantic Ocean'", false, "LatLonWater");
614  // RW-1137 this had inconsistent behavior in production vs. development tests, possibly due to version skew in
615  // Puerto Rico cleanup code, so commenting out to avoid spurious error reports
616  /*
617  TestOneLatLonCountry("Puerto Rico: Rio Mameyes in Luquillo", "18.47 N 64.23000000000002 W",
618  "Lat_lon '18.47 N 64.23000000000002 W' is in water 'Caribbean Sea', 'Puerto Rico: Rio Mameyes in Luquillo' is 108 km away",
619  false, "LatLonWater");
620  */
621 
622 }
623 
624 
// Builds an entry with splice-consensus, EC number, institution-code and
// lat-lon/country problems, validates it, and compares the output of the
// CValidErrorFormat submitter-report functions with fixed expected strings.
//
// NOTE(review): original lines 627, 630-631, 638, 640, 656-658, 661-662 and
// 664 are missing from this listing. The creation of `entry`, `nuc`, `cds`,
// `other_intron` and `gene`, and the statements that introduce the
// institution-code and lat-lon problems, are therefore not visible here -
// confirm against the repository copy.
625 BOOST_AUTO_TEST_CASE(Test_ValidError_Format)
626 {
628 
629  // Create consensus splice problems
632  cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc->SetSeq().SetId().front()));
633  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[44] = 'A';
634  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[45] = 'G';
635  CRef<CSeq_feat> intron = unit_test_util::MakeIntronForMixLoc(nuc->SetSeq().SetId().front());
636  unit_test_util::AddFeat(intron, nuc);
637 
639  other_intron->SetData().SetImp().SetKey("intron");
641  gene->SetData().SetGene().SetLocus_tag("fake_locustag");
642  AddFeat(gene, nuc);
643 
644  // create EC number problems
645  CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();
646  prot->SetData().SetProt().SetEc().push_back("1.2.3.10");
647  prot->SetData().SetProt().SetEc().push_back("1.1.3.22");
648  prot->SetData().SetProt().SetEc().push_back("1.1.99.n");
649  prot->SetData().SetProt().SetEc().push_back("1.1.1.17");
650  prot->SetData().SetProt().SetEc().push_back("11.22.33.44");
651  prot->SetData().SetProt().SetEc().push_back("11.22.n33.44");
652  prot->SetData().SetProt().SetEc().push_back("11.22.33.n44");
653 
654 
655  // create bad institution code errors
659 
660  // create lat-lon country error
663 
665 
666  eval = validator.Validate(seh, options);
667 
668  CValidErrorFormat format(*objmgr);
669 
// Step 1: one submitter-report line per reported error, in reported order.
670  vector<string> expected;
671  expected.push_back("intron\tlcl|nuc\tGT at 17");
672  expected.push_back("intron\tlcl|nuc\tGT at 1");
673  expected.push_back("intron\tlcl|nuc\tAG at 11");
674  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
675  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
676  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
677  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
678  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
679  expected.push_back("CDS\tlcl|nuc\tGT at 16");
680  expected.push_back("lcl|nuc:Lat_lon '30 N 30 E' maps to 'Egypt' instead of 'Panama'");
681  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
682  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
683  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
684 
685  vector<string> seen;
686  for (CValidError_CI vit(*eval); vit; ++vit) {
687  string val = format.FormatForSubmitterReport(*vit, scope);
688  seen.push_back(val);
689  }
690  CheckStrings(seen, expected);
691 
// Step 2: the error code of every reported error, in reported order.
692  expected.clear();
693  seen.clear();
694  for (CValidError_CI vit(*eval); vit; ++vit) {
695  seen.push_back(vit->GetErrCode());
696  }
697  expected.push_back("NotSpliceConsensusDonor");
698  expected.push_back("NotSpliceConsensusDonorTerminalIntron");
699  expected.push_back("NotSpliceConsensusAcceptor");
700  expected.push_back("DeletedEcNumber");
701  expected.push_back("ReplacedEcNumber");
702  expected.push_back("BadEcNumberValue");
703  expected.push_back("BadEcNumberFormat");
704  expected.push_back("BadEcNumberValue");
705  expected.push_back("NotSpliceConsensusDonor");
706  expected.push_back("LatLonCountry");
707  expected.push_back("BadInstitutionCode");
708  expected.push_back("BadInstitutionCode");
709  expected.push_back("BadInstitutionCode");
710  CheckStrings(seen, expected);
711 
// Step 3: the list returned by GetListOfErrorCodes(), converted to names.
712  seen.clear();
713  expected.clear();
714  vector<CValidErrItem::TErrIndex> codes = format.GetListOfErrorCodes(*eval);
715  for (CValidErrItem::TErrIndex it : codes) {
716  string val = CValidErrItem::ConvertErrCode(it);
717  seen.push_back(val);
718  }
719  expected.push_back("LatLonCountry");
720  expected.push_back("BadInstitutionCode");
721  expected.push_back("BadEcNumberFormat");
722  expected.push_back("BadEcNumberValue");
723  expected.push_back("NotSpliceConsensusDonor");
724  expected.push_back("NotSpliceConsensusAcceptor");
725  expected.push_back("DeletedEcNumber");
726  expected.push_back("ReplacedEcNumber");
727  expected.push_back("NotSpliceConsensusDonorTerminalIntron");
728  CheckStrings(seen, expected);
729 
// Step 4: report restricted to a single error code.
730  string rval = format.FormatForSubmitterReport(*eval, scope, eErr_SEQ_FEAT_NotSpliceConsensusDonor);
731  expected.clear();
732  seen.clear();
733  NStr::Split(rval, "\n", seen);
734  expected.push_back("Not Splice Consensus");
735  expected.push_back("intron\tlcl|nuc\tGT at 17");
736  expected.push_back("CDS\tlcl|nuc\tGT at 16");
737  expected.push_back("");
738  CheckStrings(seen, expected);
739 
// Step 5: report for the consensus-splice error group.
740  rval = format.FormatCategoryForSubmitterReport(*eval, scope, eSubmitterFormatErrorGroup_ConsensusSplice);
741  expected.clear();
742  seen.clear();
743  NStr::Split(rval, "\n", seen);
744  expected.push_back("Not Splice Consensus");
745  expected.push_back("intron\tlcl|nuc\tGT at 17");
746  expected.push_back("intron\tlcl|nuc\tGT at 1");
747  expected.push_back("intron\tlcl|nuc\tAG at 11");
748  expected.push_back("CDS\tlcl|nuc\tGT at 16");
749  expected.push_back("");
750  CheckStrings(seen, expected);
751 
// Step 6: the complete report; every category string is split into lines.
752  expected.clear();
753  seen.clear();
754  vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
755  for (const string& it : cat_list) {
756  vector<string> sublist;
757  NStr::Split(it, "\n", sublist);
758  for (const string& sit : sublist) {
759  seen.push_back(sit);
760  }
761  }
762  expected.push_back("Not Splice Consensus");
763  expected.push_back("intron\tlcl|nuc\tGT at 17");
764  expected.push_back("intron\tlcl|nuc\tGT at 1");
765  expected.push_back("intron\tlcl|nuc\tAG at 11");
766  expected.push_back("CDS\tlcl|nuc\tGT at 16");
767  expected.push_back("");
768  expected.push_back("EC Number Format");
769  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
770  expected.push_back("");
771  expected.push_back("EC Number Value");
772  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
773  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
774  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
775  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
776  expected.push_back("");
777  expected.push_back("Bad Institution Codes");
778  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
779  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
780  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
781  expected.push_back("");
782  expected.push_back("LatLonCountry Errors");
783  expected.push_back("lcl|nuc:Lat_lon '30 N 30 E' maps to 'Egypt' instead of 'Panama'");
784  expected.push_back("");
785  CheckStrings(seen, expected);
786 }
787 
788 
// Checks that, after SetTaxon(entry, 0), the complete submitter report
// consists of a "NoTaxonID" section listing "lcl|good:Sebaea microphylla".
//
// NOTE(review): original lines 792 and 795 are missing from this listing, so
// the creation of `entry` and the declarations of `eval`, `validator`, `seh`,
// `options`, `objmgr` and `scope` are not visible here - confirm against the
// repository copy.
789 BOOST_AUTO_TEST_CASE(Test_GB_6395)
790 {
791  // prepare entry
793  unit_test_util::SetTaxon(entry, 0);
794 
796 
797  eval = validator.Validate(seh, options);
798 
799  CValidErrorFormat format(*objmgr);
800  vector<string> expected;
801  vector<string> seen;
802 
// Flatten the report: every category string is split into its lines.
803  vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
804  for (const string& it : cat_list) {
805  vector<string> sublist;
806  NStr::Split(it, "\n", sublist);
807  for (const string& sit : sublist) {
808  seen.push_back(sit);
809  }
810  }
811  expected.push_back("NoTaxonID");
812  expected.push_back("lcl|good:Sebaea microphylla");
813  expected.push_back("");
814 
815  CheckStrings(seen, expected);
816 }
817 
818 
// Checks the "LatLonState" message for an entry whose country is
// "USA: South Carolina" while the lat_lon named in the expected message
// ('36 N 80 W') maps to North Carolina.
//
// NOTE(review): original lines 822, 824, 826, 831 and 835 are missing from
// this listing, including the creation of `entry` and, presumably, the
// statement that sets the lat_lon value - confirm against the repository copy.
819 BOOST_AUTO_TEST_CASE(Test_Descr_LatLonState)
820 {
821  // prepare entry
823  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, "USA: South Carolina");
825 
827 
828  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "LatLonState",
829  "Lat_lon '36 N 80 W' maps to 'USA: North Carolina' instead of 'USA: South Carolina' - claimed region 'USA: South Carolina' is at distance 130 km"));
830  // AddChromosomeNoLocation(expected_errors, "lcl|good");
832  eval = validator.Validate(seh, options);
833  CheckErrors(*eval, expected_errors);
834 
836 }
837 
838 
840 {
842  CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();
843  prot->SetData().SetProt().SetEc().push_back("1.2.3.10");
844  prot->SetData().SetProt().SetEc().push_back("1.1.3.22");
845  prot->SetData().SetProt().SetEc().push_back("1.1.99.n");
846  prot->SetData().SetProt().SetEc().push_back("1.1.1.17");
847  prot->SetData().SetProt().SetEc().push_back("11.22.33.44");
848  prot->SetData().SetProt().SetEc().push_back("11.22.n33.44");
849  prot->SetData().SetProt().SetEc().push_back("11.22.33.n44");
850  return entry;
851 }
852 
853 
// Checks the EC number messages (deleted, replaced, illegal value, bad
// format, illegal preliminary value), first as reported for "lcl|prot" and
// then, after the EC numbers are moved to qualifiers on an "exon" feature,
// as reported for "lcl|nuc".
//
// NOTE(review): original lines 856, 859, 877 and 892 are missing from this
// listing, so the creation of `entry` and of the feature `misc` is not
// visible here - confirm against the repository copy.
854 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadEcNumberValue)
855 {
857  CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();
858 
860 
861  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "DeletedEcNumber",
862  "EC_number 1.2.3.10 was deleted"));
863  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "ReplacedEcNumber",
864  "EC_number 1.1.3.22 was transferred and is no longer valid"));
865  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "BadEcNumberValue",
866  "11.22.33.44 is not a legal value for qualifier EC_number"));
867  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "BadEcNumberFormat",
868  "11.22.n33.44 is not in proper EC_number format"));
869  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Info, "BadEcNumberValue",
870  "11.22.33.n44 is not a legal preliminary value for qualifier EC_number"));
871  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
872  eval = validator.Validate(seh, options);
873  CheckErrors(*eval, expected_errors);
874 
// Second round: the entry is taken out of the scope, modified and re-added.
// The same EC numbers are now given as qualifiers on `misc`, and the same
// expectation objects are reused with the accession switched to lcl|nuc.
875  scope.RemoveTopLevelSeqEntry(seh);
876  prot->SetData().SetProt().ResetEc();
878  misc->SetData().SetImp().SetKey("exon");
879  misc->AddQualifier("EC_number", "1.2.3.10");
880  misc->AddQualifier("EC_number", "1.1.3.22");
881  misc->AddQualifier("EC_number", "1.1.99.n");
882  misc->AddQualifier("EC_number", "1.1.1.17");
883  misc->AddQualifier("EC_number", "11.22.33.44");
884  misc->AddQualifier("EC_number", "11.22.n33.44");
885  misc->AddQualifier("EC_number", "11.22.33.n44");
886  SetErrorsAccessions(expected_errors, "lcl|nuc");
887  expected_errors[1]->SetErrMsg("EC_number 1.1.3.22 was replaced");
888  seh = scope.AddTopLevelSeqEntry(*entry);
889  eval = validator.Validate(seh, options);
890  CheckErrors(*eval, expected_errors);
891 
893 }
894 
895 
// Checks two messages for a "repeat_region" feature with a rpt_unit_seq
// qualifier: "RepeatSeqDoNotMatch" for the value "ATA", and
// "InvalidRepeatUnitLength" for the value "ATAGTGATAGTG".
//
// NOTE(review): original lines 898-899, 903, 912 and 923 are missing from
// this listing, so the creation of `entry` and the first `misc` feature, and
// whatever happens to `entry` at line 912, are not visible here - confirm
// against the repository copy.
896 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_InvalidQualifierValue)
897 {
900  misc->SetData().SetImp().SetKey("repeat_region");
901  misc->AddQualifier("rpt_unit_seq", "ATA");
902 
904 
905  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "RepeatSeqDoNotMatch",
906  "repeat_region /rpt_unit and underlying sequence do not match"));
907  // AddChromosomeNoLocation(expected_errors, "lcl|good");
908  eval = validator.Validate(seh, options);
909  CheckErrors(*eval, expected_errors);
910 
// Second round: a new repeat_region feature is added and the same
// expectation object is reused with new code, message and severity.
911  scope.RemoveTopLevelSeqEntry(seh);
913  misc = unit_test_util::AddMiscFeature(entry);
914  misc->SetData().SetImp().SetKey("repeat_region");
915  misc->AddQualifier("rpt_unit_seq", "ATAGTGATAGTG");
916  seh = scope.AddTopLevelSeqEntry(*entry);
917  expected_errors[0]->SetErrCode("InvalidRepeatUnitLength");
918  expected_errors[0]->SetErrMsg("Length of rpt_unit_seq is greater than feature length");
919  expected_errors[0]->SetSeverity(eDiag_Info);
920  eval = validator.Validate(seh, options);
921  CheckErrors(*eval, expected_errors);
922 
924 }
925 
926 
// Test_SEQ_INST_ExtNotAllowed: walks one Bioseq through the CSeq_inst repr
// values (virtual, raw, const, map, ref, consen, not_set, other, delta) while
// toggling the presence of Seq-ext and Seq-data. After each change the entry
// is re-validated and the result is compared with the expected list via
// CheckErrors.
// A single CExpectedError (expected_errors[0]) is reused for the whole test:
// only its code, message and severity are updated between validations, so any
// field that is not explicitly reset carries over from the previous step.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are declared on lines missing from this excerpt (original
// lines 929 and 931) -- presumably an entry builder call plus a setup macro;
// confirm against the full file.
927 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ExtNotAllowed)
928 {
930 
932 
933  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "ExtNotAllowed", "Bioseq-ext not allowed on virtual Bioseq"));
934  // AddChromosomeNoLocation(expected_errors, "lcl|good");
935 
936  // repr = virtual
937  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_virtual);
938  entry->SetSeq().SetInst().ResetSeq_data();
939  entry->SetSeq().SetInst().SetExt().SetDelta();
940  eval = validator.Validate(seh, options);
941  CheckErrors(*eval, expected_errors);
942 
943  // repr = raw
944  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_raw);
945  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
946  expected_errors[0]->SetErrMsg("Bioseq-ext not allowed on raw Bioseq");
947  eval = validator.Validate(seh, options);
948  CheckErrors(*eval, expected_errors);
949 
      // raw Bioseq with neither ext nor data: the expected error switches to SeqDataNotFound
950  entry->SetSeq().SetInst().ResetExt();
951  entry->SetSeq().SetInst().ResetSeq_data();
952  expected_errors[0]->SetErrCode("SeqDataNotFound");
953  expected_errors[0]->SetErrMsg("Missing Seq-data on raw Bioseq");
954  expected_errors[0]->SetSeverity(eDiag_Critical);
955  eval = validator.Validate(seh, options);
956  CheckErrors(*eval, expected_errors);
957 
      // Seq-data holding only a gap: the same SeqDataNotFound expectation is checked again
958  entry->SetSeq().SetInst().SetSeq_data().SetGap();
959  eval = validator.Validate(seh, options);
960  CheckErrors(*eval, expected_errors);
961 
962  // repr = const
963  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_const);
964  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
965  entry->SetSeq().SetInst().SetExt().SetDelta();
966  expected_errors[0]->SetErrCode("ExtNotAllowed");
967  expected_errors[0]->SetErrMsg("Bioseq-ext not allowed on constructed Bioseq");
968  eval = validator.Validate(seh, options);
969  CheckErrors(*eval, expected_errors);
970 
971  entry->SetSeq().SetInst().ResetExt();
972  entry->SetSeq().SetInst().ResetSeq_data();
973  expected_errors[0]->SetErrCode("SeqDataNotFound");
974  expected_errors[0]->SetErrMsg("Missing Seq-data on constructed Bioseq");
975  expected_errors[0]->SetSeverity(eDiag_Critical);
976  eval = validator.Validate(seh, options);
977  CheckErrors(*eval, expected_errors);
978 
979  entry->SetSeq().SetInst().SetSeq_data().SetGap();
980  eval = validator.Validate(seh, options);
981  CheckErrors(*eval, expected_errors);
982 
983  // repr = map
984  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_map);
985  entry->SetSeq().SetInst().ResetSeq_data();
986  expected_errors[0]->SetErrCode("ExtBadOrMissing");
987  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on map Bioseq");
988  expected_errors[0]->SetSeverity(eDiag_Error);
989  eval = validator.Validate(seh, options);
990  CheckErrors(*eval, expected_errors);
991 
      // a delta ext, then a ref ext, on a map Bioseq: the ExtBadOrMissing expectation is unchanged
992  entry->SetSeq().SetInst().SetExt().SetDelta();
993  eval = validator.Validate(seh, options);
994  CheckErrors(*eval, expected_errors);
995 
996  entry->SetSeq().SetInst().SetExt().SetRef();
997  eval = validator.Validate(seh, options);
998  CheckErrors(*eval, expected_errors);
999 
      // map ext is now set, but Seq-data is added: expect SeqDataNotAllowed instead
1000  entry->SetSeq().SetInst().SetExt().SetMap();
1001  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
1002  expected_errors[0]->SetErrCode("SeqDataNotAllowed");
1003  expected_errors[0]->SetErrMsg("Seq-data not allowed on map Bioseq");
1004  eval = validator.Validate(seh, options);
1005  CheckErrors(*eval, expected_errors);
1006 
1007 
1008  // repr = ref
      // (expected severity is still eDiag_Error, carried over from the map step)
1009  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_ref);
1010  entry->SetSeq().SetInst().ResetExt();
1011  entry->SetSeq().SetInst().ResetSeq_data();
1012  expected_errors[0]->SetErrCode("ExtBadOrMissing");
1013  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on reference Bioseq");
1014  eval = validator.Validate(seh, options);
1015  CheckErrors(*eval, expected_errors);
1016 
1017  /*
1018  // repr = seg
1019  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
1020  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on seg Bioseq");
1021  eval = validator.Validate(seh, options);
1022  CheckErrors(*eval, expected_errors);
1023  */
1024 
1025  // repr = consen
1026  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_consen);
1027  expected_errors[0]->SetSeverity(eDiag_Critical);
1028  expected_errors[0]->SetErrCode("ReprInvalid");
1029  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 6");
1030  eval = validator.Validate(seh, options);
1031  CheckErrors(*eval, expected_errors);
1032 
1033  // repr = notset
1034  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_not_set);
1035  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 0");
1036  eval = validator.Validate(seh, options);
1037  CheckErrors(*eval, expected_errors);
1038 
1039  // repr = other
1040  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_other);
1041  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 255");
1042  eval = validator.Validate(seh, options);
1043  CheckErrors(*eval, expected_errors);
1044 
1045  // repr = delta
1046  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
1047  entry->SetSeq().SetInst().SetExt().SetDelta();
1048  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
1049  expected_errors[0]->SetSeverity(eDiag_Error);
1050  expected_errors[0]->SetErrCode("SeqDataNotAllowed");
1051  expected_errors[0]->SetErrMsg("Seq-data not allowed on delta Bioseq");
1052  eval = validator.Validate(seh, options);
1053  CheckErrors(*eval, expected_errors);
1054 
      // delta Bioseq with neither ext nor data: expect ExtBadOrMissing
1055  entry->SetSeq().SetInst().ResetExt();
1056  entry->SetSeq().SetInst().ResetSeq_data();
1057  expected_errors[0]->SetSeverity(eDiag_Error);
1058  expected_errors[0]->SetErrCode("ExtBadOrMissing");
1059  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on delta Bioseq");
1060  eval = validator.Validate(seh, options);
1061  CheckErrors(*eval, expected_errors);
1062 
1063  CLEAR_ERRORS
1064 }
1065 
1066 
// Test_SEQ_INST_ReprInvalid: sets the Bioseq repr to not_set, other and consen
// in turn and checks that each validation yields a Critical "ReprInvalid"
// error whose message carries the numeric repr value (0, 255 and 6).
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are declared on lines missing from this excerpt (original
// lines 1069 and 1071) -- confirm against the full file.
1067 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ReprInvalid)
1068 {
1070 
1072  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "ReprInvalid", "Invalid Bioseq->repr = 0"));
1073  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1074 
1075  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_not_set);
1076  eval = validator.Validate(seh, options);
1077  CheckErrors(*eval, expected_errors);
1078 
       // same expected error object, only the message changes for each repr value
1079  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 255");
1080  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_other);
1081  eval = validator.Validate(seh, options);
1082  CheckErrors(*eval, expected_errors);
1083 
1084  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 6");
1085  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_consen);
1086  eval = validator.Validate(seh, options);
1087  CheckErrors(*eval, expected_errors);
1088 
1089  CLEAR_ERRORS
1090 }
1091 
1092 
// Test_CollidingLocusTags: reads a Seq-entry from ASN.1 text, adds it to a
// fresh scope, validates it, and compares the result with nine expected
// errors, all reported against "lcl|LocusCollidesWithLocusTag".
// Unlike the neighbouring tests, this one builds its scope, validator,
// options and expected-error list explicitly.
// NOTE(review): the declarations of `istr` (original line 1097) and `objmgr`
// (line 1101), and the remainder of the `options` expression (lines
// 1109-1111), are missing from this excerpt. `istr` presumably wraps
// sc_TestEntryCollidingLocusTags -- confirm against the full file.
1093 BOOST_AUTO_TEST_CASE(Test_CollidingLocusTags)
1094 {
1095  CRef<CSeq_entry> entry(new CSeq_entry());
1096  {
1098  istr >> MSerial_AsnText >> *entry;
1099  }
1100 
1102  CScope scope(*objmgr);
1103  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
1104 
1105  CValidator validator(*objmgr);
1106 
1107  // Set validator options
1108  unsigned int options = CValidator::eVal_need_isojta
1112 
1113  // list of expected errors
1114  vector<CExpectedError*> expected_errors;
1115  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Warning, "TerminalNs", "N at end of sequence"));
1116  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Warning, "GeneLocusCollidesWithLocusTag", "locus collides with locus_tag in another gene"));
       // CollidingLocusTags is expected twice
1117  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "CollidingLocusTags", "Colliding locus_tags in gene features"));
1118  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "CollidingLocusTags", "Colliding locus_tags in gene features"));
1119  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "NoMolInfoFound", "No Mol-info applies to this Bioseq"));
1120  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "LocusTagGeneLocusMatch", "Gene locus and locus_tag 'foo' match"));
1121  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "NoPubFound", "No publications anywhere on this entire record."));
1122  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Info, "MissingPubRequirement", "No submission citation anywhere on this entire record."));
1123  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "NoSourceDescriptor", "No source information included on this record."));
1124 
1125  CConstRef<CValidError> eval = validator.Validate(seh, options);
1126  CheckErrors(*eval, expected_errors);
1127 
1128  CLEAR_ERRORS
1129 }
1130 
1131 
// ASN.1 text for a single raw DNA Bioseq, local id "LocusCollidesWithLocusTag",
// length 24, sequence "AATTGGCCAANNAATTGGCCAANN" (ends in N), carrying four
// gene features on the plus strand:
//   0-4   : locus "foo",  locus-tag "foo"  (locus equals its own locus-tag)
//   5-9   : locus "bar",  locus-tag "foo"  (same locus-tag as the first gene)
//   10-14 : locus "bar",  locus-tag "baz"
//   15-19 : locus "quux", locus-tag "baz"  (same locus-tag as the third gene)
// The entry has no descriptors.
// The literal is assembled with backslash line continuations, so nothing may
// be inserted between its lines without changing the string.
// NOTE(review): presumably the input parsed by Test_CollidingLocusTags; the
// line that would show this is missing from this excerpt.
1132 const string sc_TestEntryCollidingLocusTags = "Seq-entry ::= seq {\
1133  id {\
1134  local str \"LocusCollidesWithLocusTag\" } ,\
1135  inst {\
1136  repr raw ,\
1137  mol dna ,\
1138  length 24 ,\
1139  seq-data\
1140  iupacna \"AATTGGCCAANNAATTGGCCAANN\" } ,\
1141  annot {\
1142  {\
1143  data\
1144  ftable {\
1145  {\
1146  data\
1147  gene {\
1148  locus \"foo\" ,\
1149  locus-tag \"foo\" } ,\
1150  location\
1151  int {\
1152  from 0 ,\
1153  to 4 ,\
1154  strand plus ,\
1155  id\
1156  local str \"LocusCollidesWithLocusTag\" } } ,\
1157  {\
1158  data\
1159  gene {\
1160  locus \"bar\" ,\
1161  locus-tag \"foo\" } ,\
1162  location\
1163  int {\
1164  from 5 ,\
1165  to 9 ,\
1166  strand plus ,\
1167  id\
1168  local str \"LocusCollidesWithLocusTag\" } } ,\
1169  {\
1170  data\
1171  gene {\
1172  locus \"bar\" ,\
1173  locus-tag \"baz\" } ,\
1174  location\
1175  int {\
1176  from 10 ,\
1177  to 14 ,\
1178  strand plus ,\
1179  id\
1180  local str \"LocusCollidesWithLocusTag\" } } ,\
1181  {\
1182  data\
1183  gene {\
1184  locus \"quux\" ,\
1185  locus-tag \"baz\" } ,\
1186  location\
1187  int {\
1188  from 15 ,\
1189  to 19 ,\
1190  strand plus ,\
1191  id\
1192  local str \"LocusCollidesWithLocusTag\" } } } } } }\
1193 ";
1194 
1195 
// Test_SEQ_INST_CircularProtein: expects an Error-level "CircularProtein"
// result when the topology is circular, tandem or other, and an empty error
// list when the topology is not_set or linear.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are declared on lines missing from this excerpt (original
// lines 1198, 1200 and 1205). Given the error message, the entry is
// presumably a protein Bioseq -- confirm against the full file.
1196 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_CircularProtein)
1197 {
1199 
1201 
1202  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "CircularProtein", "Non-linear topology set on protein"));
1203  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1204 
1206 
       // each non-linear topology is checked against the same single expected error
1207  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
1208  eval = validator.Validate(seh, options);
1209  CheckErrors(*eval, expected_errors);
1210 
1211  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_tandem);
1212  eval = validator.Validate(seh, options);
1213  CheckErrors(*eval, expected_errors);
1214 
1215  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_other);
1216  eval = validator.Validate(seh, options);
1217  CheckErrors(*eval, expected_errors);
1218 
1219  // should be no error for not set or linear
1220  CLEAR_ERRORS
1221 
1222  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_not_set);
1223  eval = validator.Validate(seh, options);
1224  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1225  CheckErrors(*eval, expected_errors);
1226 
1227  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_linear);
1228  eval = validator.Validate(seh, options);
1229  CheckErrors(*eval, expected_errors);
1230 
1231  CLEAR_ERRORS
1232 }
1233 
1234 
// Test_BadProteinMoltype: expects an Error-level "BadProteinMoltype" result
// when the strand is ds, mixed or other, and an empty error list when the
// strand is not_set or ss.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are declared on lines missing from this excerpt (original
// lines 1237 and 1239). Given the error message, the entry is presumably a
// protein Bioseq -- confirm against the full file.
1235 BOOST_AUTO_TEST_CASE(Test_BadProteinMoltype)
1236 {
1238 
1240 
1241  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadProteinMoltype", "Protein not single stranded"));
1242  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1243 
       // each non-single-stranded value is checked against the same single expected error
1244  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ds);
1245  eval = validator.Validate(seh, options);
1246  CheckErrors(*eval, expected_errors);
1247 
1248  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_mixed);
1249  eval = validator.Validate(seh, options);
1250  CheckErrors(*eval, expected_errors);
1251 
1252  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_other);
1253  eval = validator.Validate(seh, options);
1254  CheckErrors(*eval, expected_errors);
1255 
1256  // no errors expected for not set or single strand
1257  CLEAR_ERRORS
1258 
1259  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1260 
1261  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_not_set);
1262  eval = validator.Validate(seh, options);
1263  CheckErrors(*eval, expected_errors);
1264 
1265  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ss);
1266  eval = validator.Validate(seh, options);
1267  CheckErrors(*eval, expected_errors);
1268 
1269  CLEAR_ERRORS
1270 }
1271 
1272 
// Test_SEQ_INST_MolNotSet: sets Seq-inst.mol to not_set, other and na in turn
// and expects a single Error-level result each time, with the code and
// message changing to MolNotSet, MolOther and MolNuclAcid respectively.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are declared on lines missing from this excerpt (original
// lines 1275 and 1277) -- confirm against the full file.
1273 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_MolNotSet)
1274 {
1276 
1278 
1279  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MolNotSet", "Bioseq.mol is 0"));
1280  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1281 
1282  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_not_set);
1283  eval = validator.Validate(seh, options);
1284  CheckErrors(*eval, expected_errors);
1285 
       // the one expected error is re-labelled for each mol value; severity stays eDiag_Error
1286  expected_errors[0]->SetErrCode("MolOther");
1287  expected_errors[0]->SetErrMsg("Bioseq.mol is type other");
1288  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_other);
1289  eval = validator.Validate(seh, options);
1290  CheckErrors(*eval, expected_errors);
1291 
1292  expected_errors[0]->SetErrCode("MolNuclAcid");
1293  expected_errors[0]->SetErrMsg("Bioseq.mol is type nucleic acid");
1294  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_na);
1295  eval = validator.Validate(seh, options);
1296  CheckErrors(*eval, expected_errors);
1297 
1298  CLEAR_ERRORS
1299 }
1300 
1301 
// Test_SEQ_INST_FuzzyLen: sets a fuzz on the Seq-inst and expects an
// Error-level "FuzzyLen" result, first with the "raw" message and then, after
// switching repr to const, with the "const" message. Finally the Seq-data is
// replaced with a gap, and the expectation becomes a Critical
// "SeqDataNotFound" instead of FuzzyLen.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are declared on lines missing from this excerpt (original
// lines 1304 and 1306). The first expected message suggests the entry starts
// out as a raw Bioseq -- confirm against the full file.
1302 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_FuzzyLen)
1303 {
1305 
1307 
1308  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "FuzzyLen", "Fuzzy length on raw Bioseq"));
1309  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1310 
1311  entry->SetSeq().SetInst().SetFuzz();
1312  eval = validator.Validate(seh, options);
1313  CheckErrors(*eval, expected_errors);
1314 
1315  expected_errors[0]->SetErrMsg("Fuzzy length on const Bioseq");
1316  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_const);
1317  eval = validator.Validate(seh, options);
1318  CheckErrors(*eval, expected_errors);
1319 
1320  // shouldn't get fuzzy length if gap
1321  expected_errors[0]->SetErrCode("SeqDataNotFound");
1322  expected_errors[0]->SetErrMsg("Missing Seq-data on constructed Bioseq");
1323  expected_errors[0]->SetSeverity(eDiag_Critical);
1324  entry->SetSeq().SetInst().SetSeq_data().SetGap();
1325  eval = validator.Validate(seh, options);
1326  CheckErrors(*eval, expected_errors);
1327 
1328  CLEAR_ERRORS
1329 }
1330 
1331 
// Test_SEQ_INST_InvalidAlphabet: checks the Critical "InvalidAlphabet" error
// in both directions.
//   1. `prot_entry` has its Seq-data switched to each nucleotide encoding
//      (Iupacna, Ncbi2na, Ncbi4na, Ncbi8na, Ncbipna); the expected message is
//      "Using a nucleic acid alphabet on a protein sequence".
//   2. `entry`, added to a second scope, has its Seq-data switched to each
//      protein encoding (Iupacaa, Ncbi8aa, Ncbieaa, Ncbipaa, Ncbistdaa); the
//      expected message is "Using a protein alphabet on a nucleic acid".
// NOTE(review): the construction of `prot_entry` (original line 1334), of
// `objmgr` (line 1336) and of `entry` (line 1373), and the remainder of the
// `options` expression (lines 1345-1347), are missing from this excerpt --
// confirm against the full file.
1332 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_InvalidAlphabet)
1333 {
1335 
1337  CScope scope(*objmgr);
1338  scope.AddDefaults();
1339  CSeq_entry_Handle prot_seh = scope.AddTopLevelSeqEntry(*prot_entry);
1340 
1341  CValidator validator(*objmgr);
1342 
1343  // Set validator options
1344  unsigned int options = CValidator::eVal_need_isojta
1348 
1349  // list of expected errors
1350  vector<CExpectedError*> expected_errors;
1351  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidAlphabet", "Using a nucleic acid alphabet on a protein sequence"));
1352  // AddChromosomeNoLocation(expected_errors, "lcl|good");
       // part 1: nucleotide encodings on prot_entry
1353  prot_entry->SetSeq().SetInst().SetSeq_data().SetIupacna();
1354  CConstRef<CValidError> eval = validator.Validate(prot_seh, options);
1355  CheckErrors(*eval, expected_errors);
1356 
1357  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi2na();
1358  eval = validator.Validate(prot_seh, options);
1359  CheckErrors(*eval, expected_errors);
1360 
1361  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi4na();
1362  eval = validator.Validate(prot_seh, options);
1363  CheckErrors(*eval, expected_errors);
1364 
1365  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi8na();
1366  eval = validator.Validate(prot_seh, options);
1367  CheckErrors(*eval, expected_errors);
1368 
1369  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbipna();
1370  eval = validator.Validate(prot_seh, options);
1371  CheckErrors(*eval, expected_errors);
1372 
       // part 2: protein encodings on entry, validated through a separate scope
1374  CScope scope2(*objmgr);
1375  scope2.AddDefaults();
1376  CSeq_entry_Handle seh = scope2.AddTopLevelSeqEntry(*entry);
1377 
1378  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa();
1379  expected_errors[0]->SetErrMsg("Using a protein alphabet on a nucleic acid");
1380 
1381  eval = validator.Validate(seh, options);
1382  CheckErrors(*eval, expected_errors);
1383 
1384  entry->SetSeq().SetInst().SetSeq_data().SetNcbi8aa();
1385  eval = validator.Validate(seh, options);
1386  CheckErrors(*eval, expected_errors);
1387 
1388  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa();
1389  eval = validator.Validate(seh, options);
1390  CheckErrors(*eval, expected_errors);
1391 
1392  entry->SetSeq().SetInst().SetSeq_data().SetNcbipaa();
1393  eval = validator.Validate(seh, options);
1394  CheckErrors(*eval, expected_errors);
1395 
1396  entry->SetSeq().SetInst().SetSeq_data().SetNcbistdaa();
1397  eval = validator.Validate(seh, options);
1398  CheckErrors(*eval, expected_errors);
1399 
1400  CLEAR_ERRORS
1401 }
1402 
1403 
// Test_SEQ_INST_InvalidResidue: exercises the Critical "InvalidResidue" error
// in several phases, re-validating after each change.
//   1. Iupacna data made of A-Z twice plus 13 non-ASCII bytes (length 65).
//   2. The same data with mol = rna, where the two 'U' errors are dropped.
//   3. The same byte pattern as Iupacaa on a protein, where all per-letter
//      errors are dropped and only the non-ASCII byte errors remain.
//   4. Lower-case sequence data, on a nucleotide and then a second entry.
//   5. A nucleotide delta sequence with one literal of A-Z twice.
//   6. A delta sequence with one Iupacaa literal "1234567".
// The expected-error vector is index-sensitive in phases 2 and 3: individual
// slots are deleted and set to nullptr rather than erased.
// NOTE(review): null slots are presumably skipped by CheckErrors -- confirm.
// NOTE(review): `entry`, `scope`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are declared on lines missing from this excerpt (original
// lines 1406 and 1408). The lines that rebuild `entry` before phases 4b and 6
// (original lines 1527 and 1577) are also missing -- presumably protein
// builders; confirm against the full file.
1404 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_InvalidResidue)
1405 {
1407 
1409 
       // phase 1: 52 letters (A-Z twice) followed by 13 non-ASCII bytes
       // (3x FB, 3x FC, 3x FD, 2x FE, 2x FF), giving the length of 65 set below
1410  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1411  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1412  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1413  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1414  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1415  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1416  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1417  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1418  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1419  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1420  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFE');
1421  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFE');
1422  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFF');
1423  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFF');
1424  entry->SetSeq().SetInst().SetLength(65);
       // indices 0-21: one expectation per invalid letter (11 per alphabet pass);
       // positions in the messages are 1-based
1425  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'E' at position [5]"));
1426  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'F' at position [6]"));
1427  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'I' at position [9]"));
1428  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'J' at position [10]"));
1429  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'L' at position [12]"));
1430  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'O' at position [15]"));
1431  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'P' at position [16]"));
1432  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Q' at position [17]"));
1433  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'U' at position [21]"));
1434  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'X' at position [24]"));
1435  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Z' at position [26]"));
1436  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'E' at position [31]"));
1437  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'F' at position [32]"));
1438  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'I' at position [35]"));
1439  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'J' at position [36]"));
1440  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'L' at position [38]"));
1441  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'O' at position [41]"));
1442  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'P' at position [42]"));
1443  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Q' at position [43]"));
1444  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'U' at position [47]"));
1445  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'X' at position [50]"));
1446  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Z' at position [52]"));
       // index 22 onward: the non-ASCII bytes are expected by numeric value; only
       // the first ten of the 13 get their own entry, then a "checking stopped" entry
1447  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [53]"));
1448  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [54]"));
1449  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [55]"));
1450  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [56]"));
1451  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [57]"));
1452  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [58]"));
1453  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [59]"));
1454  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [60]"));
1455  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [61]"));
1456  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [254] at position [62]"));
1457  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "More than 10 invalid residues. Checking stopped"));
1458  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Fatal, "NonAsciiAsn", "Non-ASCII character '251' found in item"));
1459  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1460 
1461  eval = validator.Validate(seh, options);
1462  CheckErrors(*eval, expected_errors);
1463 
1464  // now repeat test, but with mRNA - this time Us should not be reported
1465  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
       // indices 8 and 19 are the two 'U' expectations (positions 21 and 47);
       // the slots are nulled rather than erased so the other indices stay put
1466  delete expected_errors[8];
1467  expected_errors[8] = nullptr;
1468  delete expected_errors[19];
1469  expected_errors[19] = nullptr;
1470  eval = validator.Validate(seh, options);
1471  CheckErrors(*eval, expected_errors);
1472 
1473  // now repeat test, but with protein
1474  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_aa);
       // switch any existing Mol-info descriptor to peptide to go with the protein mol
1475  for (auto& it : entry->SetSeq().SetDescr().Set()) {
1476  if (it->IsMolinfo()) {
1477  it->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
1478  }
1479  }
1480  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1481  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1482  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1483  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1484  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1485  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1486  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1487  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1488  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1489  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1490  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFE');
1491  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFE');
1492  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFF');
1493  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFF');
1494  entry->SetSeq().SetInst().SetLength(65);
       // add a protein feature covering the whole sequence (0..64)
1495  CRef<CSeq_feat> feat(new CSeq_feat());
1496  feat->SetData().SetProt().SetName().push_back("fake protein name");
1497  feat->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
1498  feat->SetLocation().SetInt().SetFrom(0);
1499  feat->SetLocation().SetInt().SetTo(64);
1500  unit_test_util::AddFeat(feat, entry);
       // the entry is removed from the scope and added again to obtain a fresh handle
1501  scope.RemoveEntry(*entry);
1502  seh = scope.AddTopLevelSeqEntry(*entry);
1503 
       // indices 0-21 are the per-letter expectations; free whichever are still
       // allocated (8 and 19 are already null) so only index 22 onward remains
1504  for (int j = 0; j < 22; j++) {
1505  if (expected_errors[j]) {
1506  delete expected_errors[j];
1507  expected_errors[j] = nullptr;
1508  }
1509  }
1510  eval = validator.Validate(seh, options);
1511  CheckErrors(*eval, expected_errors);
1512 
1513  CLEAR_ERRORS
1514 
1515  // now look for lowercase characters
1516  scope.RemoveEntry(*entry);
1517  entry = unit_test_util::BuildGoodSeq();
1518  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("abcdefghijklmnopqrstuvwxyz");
1519  entry->SetSeq().SetInst().SetLength(26);
1520  seh = scope.AddTopLevelSeqEntry(*entry);
1521  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Sequence contains lower-case characters"));
1522  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1523  eval = validator.Validate(seh, options);
1524  CheckErrors(*eval, expected_errors);
1525 
       // NOTE(review): the line that rebuilds `entry` here (original line 1527) is
       // missing from this excerpt; the same lower-case expectation is reused below
1526  scope.RemoveEntry(*entry);
1528  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("protein");
1529  seh = scope.AddTopLevelSeqEntry(*entry);
1530  eval = validator.Validate(seh, options);
1531  CheckErrors(*eval, expected_errors);
1532 
1533  CLEAR_ERRORS
1534 
1535  // now try delta sequence
1536  scope.RemoveEntry(*entry);
1537  entry = unit_test_util::BuildGoodSeq();
1538  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
1539  entry->SetSeq().SetInst().ResetSeq_data();
1540  CRef<CDelta_seq> seg(new CDelta_seq());
1541  seg->SetLiteral().SetSeq_data().SetIupacna().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1542  seg->SetLiteral().SetLength(52);
1543  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(seg);
1544  entry->SetSeq().SetInst().SetLength(52);
1545  seh = scope.AddTopLevelSeqEntry(*entry);
1546 
       // same 22 letters and positions as phase 1, but the expected messages use
       // the "Invalid residue [X]" wording
1547  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [E] at position [5]"));
1548  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [F] at position [6]"));
1549  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [I] at position [9]"));
1550  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [J] at position [10]"));
1551  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [L] at position [12]"));
1552  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [O] at position [15]"));
1553  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [P] at position [16]"));
1554  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Q] at position [17]"));
1555  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [U] at position [21]"));
1556  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [X] at position [24]"));
1557  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Z] at position [26]"));
1558  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [E] at position [31]"));
1559  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [F] at position [32]"));
1560  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [I] at position [35]"));
1561  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [J] at position [36]"));
1562  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [L] at position [38]"));
1563  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [O] at position [41]"));
1564  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [P] at position [42]"));
1565  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Q] at position [43]"));
1566  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [U] at position [47]"));
1567  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [X] at position [50]"));
1568  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Z] at position [52]"));
1569  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1570  eval = validator.Validate(seh, options);
1571  CheckErrors(*eval, expected_errors);
1572 
1573  CLEAR_ERRORS
1574 
1575  // try protein delta sequence
       // NOTE(review): the line that rebuilds `entry` here (original line 1577) is
       // missing from this excerpt
1576  scope.RemoveEntry(*entry);
1578  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
1579  entry->SetSeq().SetInst().ResetSeq_data();
1580  CRef<CDelta_seq> seg2(new CDelta_seq());
1581  seg2->SetLiteral().SetSeq_data().SetIupacaa().Set("1234567");
1582  seg2->SetLiteral().SetLength(7);
1583  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(seg2);
1584  entry->SetSeq().SetInst().SetLength(7);
1585  seh = scope.AddTopLevelSeqEntry(*entry);
1586 
       // every one of the seven digit characters is expected to be reported
1587  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [1] at position [1]"));
1588  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [2] at position [2]"));
1589  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [3] at position [3]"));
1590  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [4] at position [4]"));
1591  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [5] at position [5]"));
1592  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [6] at position [6]"));
1593  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [7] at position [7]"));
1594  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1595 
1596  eval = validator.Validate(seh, options);
1597  CheckErrors(*eval, expected_errors);
1598 
1599  CLEAR_ERRORS
1600 }
1601 
1602 
1603 /*
1604 static void WriteOutTemp(CRef<CSeq_entry> entry)
1605 {
1606  // construct a temp file name
1607  CNcbiOstrstream oss;
1608  oss << "test.asn";
1609  string filename = CNcbiOstrstreamToString(oss);
1610  string fullPath = CDirEntry::MakePath(".", filename);
1611 
1612  // initialize a text output stream (opened with IOS_BASE::out only)
1613  unique_ptr<CNcbiOstream> outStream;
1614  outStream.reset(new CNcbiOfstream(
1615  fullPath.c_str(),
1616  IOS_BASE::out));
1617  if (!(*outStream)) {
1618  return;
1619  }
1620 
1621  unique_ptr<CObjectOStream> outObject;
1622  // Associate ASN.1 text serialization methods with the output stream
1623  outObject.reset(new CObjectOStreamAsn(*outStream));
1624 
1625  // write the asn data
1626  try {
1627  *outObject << *entry;
1628  outStream->flush();
1629  } catch (exception&) {
1630  }
1631 }
1632 */
1633 
1634 
// Test case: termination symbols inside a protein product.
// Calls the validator:: helper predicates directly on the coding region and
// then compares the complete validator output with an expected error list,
// for three variations of the same entry.
// NOTE(review): several lines of this test (the setup and the declarations of
// `cds` and `nuc`) are not visible in this listing; the comments below only
// describe the statements that are visible.
1635  BOOST_AUTO_TEST_CASE(Test_SEQ_INST_StopInProtein)
1636  {
1638 
1640 
// The last member of the set receives a protein containing three '*' symbols;
// the first member receives a nucleotide sequence that begins with GTG.
1641  entry->SetSet().SetSeq_set().back()->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MP*K*E*N");
1642  entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("GTGCCCTAAAAATAAGAGTAAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
// Round 1: the coding region carries an exception flag and exception text.
1644  cds->SetExcept(true);
1645  cds->SetExcept_text("unclassified translation discrepancy");
1646 
1647  BOOST_CHECK_EQUAL(validator::HasStopInProtein(*cds, scope), true);
1648  BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
1649 
1650  // list of expected errors
// With the exception present, InternalStop is expected at Warning severity.
1651  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein", "[3] termination symbols in protein sequence (gene? - fake protein name)"));
1652  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "ExceptionProblem", "unclassified translation discrepancy is not a legal exception explanation"));
1653  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InternalStop", "3 internal stops (and illegal start codon). Genetic code [0]"));
1654  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
1655  "CDS has unnecessary translated product replaced exception"));
1656  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
1657 
1658  eval = validator.Validate(seh, options);
1659  CheckErrors(*eval, expected_errors);
1660  // WriteOutTemp(entry);
1661 
// Round 2: the exception is removed; a StartCodon error is now expected and
// InternalStop is expected at Error severity.
1662  CLEAR_ERRORS
1663  cds->ResetExcept();
1664  cds->ResetExcept_text();
1665  BOOST_CHECK_EQUAL(validator::HasStopInProtein(*cds, scope), true);
1666  BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
1667  BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
1668 
1669  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein", "[3] termination symbols in protein sequence (gene? - fake protein name)"));
1670  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StartCodon", "Illegal start codon (and 3 internal stops). Probably wrong genetic code [0]"));
1671  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "InternalStop", "3 internal stops (and illegal start codon). Genetic code [0]"));
1672  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
1673 
1674  eval = validator.Validate(seh, options);
1675  CheckErrors(*eval, expected_errors);
1676  // WriteOutTemp(entry);
1677 
// Round 3: same sequence but beginning with ATG instead of GTG.
// NOTE(review): the declaration of `nuc` is on a line missing from this listing.
1679  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCTAAAAATAAGAGTAAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
1680 
1681  // write out seq-entry
1682  // WriteOutTemp(entry);
1683 
// Slot 1 holds the StartCodon expectation from round 2: it is deleted and
// nulled out, and the InternalStop message in slot 2 drops the start-codon part.
1684  delete expected_errors[1];
1685  expected_errors[1] = nullptr;
1686  expected_errors[2]->SetErrMsg("3 internal stops. Genetic code [0]");
1687  eval = validator.Validate(seh, options);
1688  CheckErrors(*eval, expected_errors);
1689 
1690  CLEAR_ERRORS
1691  }
1692 
1693 
// Test case: PartialInconsistent on a segmented sequence.
// The whole body sits between #if 0 and #endif, so nothing in it is compiled
// and the test case currently does nothing.
// The disabled body turns the entry into a two-segment sequence and, for each
// labelled section, toggles the partial flags of the two segment locations in
// several combinations, validating after each combination.
// NOTE(review): the statement directly after each section label ("unknown",
// "complete", "partial", ...) is missing from this listing - presumably it
// changes the MolInfo completeness; confirm against the original file.
1694  BOOST_AUTO_TEST_CASE(Test_SEQ_INST_PartialInconsistent)
1695  {
1696  #if 0
1697  //We don't care about segmented sets any more
1699 
1701 
// Replace the sequence data with two segment locations (loc1, loc2).
1702  entry->SetSeq().SetInst().ResetSeq_data();
1703  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
1704  CRef<CSeq_id> id(new CSeq_id("gb|AY123456"));
1705  CRef<CSeq_loc> loc1(new CSeq_loc(*id, 0, 3));
1706  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(loc1);
1707  CRef<CSeq_id> id2(new CSeq_id("gb|AY123457"));
1708  CRef<CSeq_loc> loc2(new CSeq_loc(*id2, 0, 2));
1709  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(loc2);
1710 
1711  // list of expected errors
1712  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "PartialInconsistent", "Partial segmented sequence without MolInfo partial"));
1713 
1714  // not-set
1715  loc1->SetPartialStart(true, eExtreme_Biological);
1716  loc2->SetPartialStop(true, eExtreme_Biological);
1717  eval = validator.Validate(seh, options);
1718  CheckErrors(*eval, expected_errors);
1719  loc1->SetPartialStart(true, eExtreme_Biological);
1720  loc2->SetPartialStop(false, eExtreme_Biological);
1721  eval = validator.Validate(seh, options);
1722  CheckErrors(*eval, expected_errors);
1723  loc1->SetPartialStart(false, eExtreme_Biological);
1724  loc2->SetPartialStop(true, eExtreme_Biological);
1725  eval = validator.Validate(seh, options);
1726  CheckErrors(*eval, expected_errors);
1727 
1728  // unknown
1730 
1731  loc1->SetPartialStart(true, eExtreme_Biological);
1732  loc2->SetPartialStop(true, eExtreme_Biological);
1733  eval = validator.Validate(seh, options);
1734  CheckErrors(*eval, expected_errors);
1735  loc1->SetPartialStart(true, eExtreme_Biological);
1736  loc2->SetPartialStop(false, eExtreme_Biological);
1737  eval = validator.Validate(seh, options);
1738  CheckErrors(*eval, expected_errors);
1739  loc1->SetPartialStart(false, eExtreme_Biological);
1740  loc2->SetPartialStop(true, eExtreme_Biological);
1741  eval = validator.Validate(seh, options);
1742  CheckErrors(*eval, expected_errors);
1743 
1744  // complete
1746 
1747  loc1->SetPartialStart(true, eExtreme_Biological);
1748  loc2->SetPartialStop(true, eExtreme_Biological);
1749  eval = validator.Validate(seh, options);
1750  CheckErrors(*eval, expected_errors);
1751  loc1->SetPartialStart(true, eExtreme_Biological);
1752  loc2->SetPartialStop(false, eExtreme_Biological);
1753  eval = validator.Validate(seh, options);
1754  CheckErrors(*eval, expected_errors);
1755  loc1->SetPartialStart(false, eExtreme_Biological);
1756  loc2->SetPartialStop(true, eExtreme_Biological);
1757  eval = validator.Validate(seh, options);
1758  CheckErrors(*eval, expected_errors);
1759 
1760  // partial
1762 
// From here on the single expected error keeps its slot; only its message is
// swapped for each section.
1763  loc1->SetPartialStart(false, eExtreme_Biological);
1764  loc2->SetPartialStop(false, eExtreme_Biological);
1765  expected_errors[0]->SetErrMsg("Complete segmented sequence with MolInfo partial");
1766  eval = validator.Validate(seh, options);
1767  CheckErrors(*eval, expected_errors);
1768 
1769  // no-left
1771 
1772  loc1->SetPartialStart(true, eExtreme_Biological);
1773  loc2->SetPartialStop(true, eExtreme_Biological);
1774  expected_errors[0]->SetErrMsg("No-left inconsistent with segmented SeqLoc");
1775  eval = validator.Validate(seh, options);
1776  CheckErrors(*eval, expected_errors);
1777  loc1->SetPartialStart(false, eExtreme_Biological);
1778  loc2->SetPartialStop(true, eExtreme_Biological);
1779  eval = validator.Validate(seh, options);
1780  CheckErrors(*eval, expected_errors);
1781  loc1->SetPartialStart(false, eExtreme_Biological);
1782  loc2->SetPartialStop(false, eExtreme_Biological);
1783  eval = validator.Validate(seh, options);
1784  CheckErrors(*eval, expected_errors);
1785 
1786  // no-right
1788 
1789  loc1->SetPartialStart(true, eExtreme_Biological);
1790  loc2->SetPartialStop(true, eExtreme_Biological);
1791  expected_errors[0]->SetErrMsg("No-right inconsistent with segmented SeqLoc");
1792  eval = validator.Validate(seh, options);
1793  CheckErrors(*eval, expected_errors);
1794  loc1->SetPartialStart(true, eExtreme_Biological);
1795  loc2->SetPartialStop(false, eExtreme_Biological);
1796  eval = validator.Validate(seh, options);
1797  CheckErrors(*eval, expected_errors);
1798  loc1->SetPartialStart(false, eExtreme_Biological);
1799  loc2->SetPartialStop(false, eExtreme_Biological);
1800  eval = validator.Validate(seh, options);
1801  CheckErrors(*eval, expected_errors);
1802 
1803  // no-ends
1805 
1806  expected_errors[0]->SetErrMsg("No-ends inconsistent with segmented SeqLoc");
1807  loc1->SetPartialStart(true, eExtreme_Biological);
1808  loc2->SetPartialStop(false, eExtreme_Biological);
1809  eval = validator.Validate(seh, options);
1810  CheckErrors(*eval, expected_errors);
1811  loc1->SetPartialStart(false, eExtreme_Biological);
1812  loc2->SetPartialStop(true, eExtreme_Biological);
1813  eval = validator.Validate(seh, options);
1814  CheckErrors(*eval, expected_errors);
1815  loc1->SetPartialStart(false, eExtreme_Biological);
1816  loc2->SetPartialStop(false, eExtreme_Biological);
1817  eval = validator.Validate(seh, options);
1818  CheckErrors(*eval, expected_errors);
1819 
1820  CLEAR_ERRORS
1821  #endif
1822  }
1823 
1824 
// Test case: the ShortSeq warning ("Sequence only N residues").
// Covers a three-residue protein and a nine-base nucleotide, and checks that
// nothing is expected when the sequence carries a PDB id.
// NOTE(review): the setup lines and several statements between validation
// rounds are missing from this listing; comments describe visible code only.
1825  BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ShortSeq)
1826  {
1828 
1830 
// Shrink the sequence to three residues and end the first feature's interval
// at position 2 so that it still fits.
1831  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPR");
1832  entry->SetSeq().SetInst().SetLength(3);
1833  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetTo(2);
1834 
1835  // don't report if pdb
// No expectation has been pushed in the visible code at this point, so the
// check below is made against the list as left by the (not visible) setup.
1836  CRef<CPDB_seq_id> pdb_id(new CPDB_seq_id());
1837  pdb_id->SetMol().Set("foo");
1838  entry->SetSeq().SetId().front()->SetPdb(*pdb_id);
1839  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().SetPdb(*pdb_id);
1840  scope.RemoveTopLevelSeqEntry(seh);
1841  seh = scope.AddTopLevelSeqEntry(*entry);
1842  eval = validator.Validate(seh, options);
1843  // AddChromosomeNoLocation(expected_errors, "pdb|foo| ");
1844  CheckErrors(*eval, expected_errors);
1845 
1846  // new test if no coding region
// Switch both the sequence id and the feature location id back to local "good".
1847  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PartialsInconsistent", "Molinfo completeness and protein feature partials conflict"));
1848  expected_errors[0]->SetAccession("lcl|good");
1849  entry->SetSeq().SetId().front()->SetLocal().SetStr("good");
1850  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
1851  scope.RemoveTopLevelSeqEntry(seh);
1852  seh = scope.AddTopLevelSeqEntry(*entry);
1854 
// NOTE(review): four validate/check rounds follow; the statement that precedes
// each round is missing from this listing (presumably it changes the MolInfo
// completeness - confirm against the original file).
1855  eval = validator.Validate(seh, options);
1856  CheckErrors(*eval, expected_errors);
1858  eval = validator.Validate(seh, options);
1859  CheckErrors(*eval, expected_errors);
1861  eval = validator.Validate(seh, options);
1862  CheckErrors(*eval, expected_errors);
1864  eval = validator.Validate(seh, options);
1865  CheckErrors(*eval, expected_errors);
1866 
1867  CLEAR_ERRORS
1868 
1869  // for all other completeness, report
1870  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortSeq", "Sequence only 3 residues"));
1871  // AddChromosomeNoLocation(expected_errors, "lcl|good");
// Clear the completeness value on every MolInfo descriptor of the sequence.
1872  for (auto& it : entry->SetSeq().SetDescr().Set()) {
1873  if (it->IsMolinfo()) {
1874  it->SetMolinfo().ResetCompleteness();
1875  }
1876  }
// NOTE(review): as above, the statements between the following rounds are
// missing from this listing.
1877  eval = validator.Validate(seh, options);
1878  CheckErrors(*eval, expected_errors);
1880  eval = validator.Validate(seh, options);
1881  CheckErrors(*eval, expected_errors);
1883  eval = validator.Validate(seh, options);
1884  CheckErrors(*eval, expected_errors);
1886  eval = validator.Validate(seh, options);
1887  CheckErrors(*eval, expected_errors);
1888 
1889  // nucleotide
// Start over from unit_test_util::BuildGoodSeq() and cut the sequence to nine
// bases; the existing ShortSeq expectation is reused with a new message.
1890  scope.RemoveTopLevelSeqEntry(seh);
1891  entry = unit_test_util::BuildGoodSeq();
1892  seh = scope.AddTopLevelSeqEntry(*entry);
1893  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCTTT");
1894  entry->SetSeq().SetInst().SetLength(9);
1895  expected_errors[0]->SetErrMsg("Sequence only 9 residues");
1896  eval = validator.Validate(seh, options);
1897  CheckErrors(*eval, expected_errors);
1898 
1899  CLEAR_ERRORS
1900 
1901  // don't report if pdb
// The expected list was just cleared, so no error is expected for the PDB id.
1902  entry->SetSeq().SetId().front()->SetPdb(*pdb_id);
1903  scope.RemoveTopLevelSeqEntry(seh);
1904  seh = scope.AddTopLevelSeqEntry(*entry);
1905  eval = validator.Validate(seh, options);
1906  // AddChromosomeNoLocation(expected_errors, "pdb|foo| ");
1907  CheckErrors(*eval, expected_errors);
1908 
1909  CLEAR_ERRORS
1910  }
1911 
1912 
// Helper body: yields true when `tech` matches one of the listed CMolInfo
// technique values and false for every other value.
// NOTE(review): the signature line and most of the case labels are missing
// from this listing (only eTech_both is visible). This is presumably the
// IsProteinTech(tech) helper called further down in the file - confirm.
1914  {
1915  bool rval = false;
1916 
1917  switch (tech) {
1920  case CMolInfo::eTech_both:
1924  rval = true;
1925  break;
1926  default:
// Any technique without a case label leaves rval at false.
1927  break;
1928  }
1929  return rval;
1930  }
1931 
1932 
// Helper body: creates a new CSeqdesc and appends it to the descriptor list of
// `entry`, using the Bioseq descriptors when the entry is a Bioseq and the
// Bioseq-set descriptors when it is a set. Nothing is appended otherwise.
// NOTE(review): the signature line and the lines that fill in `desc` are
// missing from this listing, so the kind of descriptor cannot be told from here.
1934  {
1935  CRef<CSeqdesc> desc(new CSeqdesc());
1938  if (entry->IsSeq()) {
1939  entry->SetSeq().SetDescr().Set().push_back(desc);
1940  } else if (entry->IsSet()) {
1941  entry->SetSet().SetDescr().Set().push_back(desc);
1942  }
1943  }
1944 
1945 
1946 static void SetRefGeneTrackingStatus(CRef<CSeq_entry> entry, string status)
1947 {
1948  if (entry->IsSeq()) {
1949  for (auto& it : entry->SetSeq().SetDescr().Set()) {
1950  if (it->IsUser() && it->GetUser().IsRefGeneTracking()) {
1951  it->SetUser().SetData().front()->SetData().SetStr(status);
1952  }
1953  }
1954  } else if (entry->IsSet()) {
1955  for (auto& it : entry->SetSet().SetDescr().Set()) {
1956  if (it->IsUser() && it->GetUser().IsRefGeneTracking()) {
1957  it->SetUser().SetData().front()->SetData().SetStr(status);
1958  }
1959  }
1960  }
1961 }
1962 
1963 
1964 static void SetTitle(CRef<CSeq_entry> entry, string title)
1965 {
1966  bool found = false;
1967 
1968  if (entry->IsSetDescr()) {
1969  auto& cont = entry->SetDescr().Set();
1970  for (auto it = cont.begin(); it != cont.end();) {
1971  if ((*it)->IsTitle()) {
1972  found = true;
1973  if (NStr::IsBlank((*it)->GetTitle())) {
1974  it = cont.erase(it);
1975  continue;
1976  } else {
1977  (*it)->SetTitle(title);
1978  }
1979  }
1980  ++it;
1981  }
1982  }
1983 
1984  if (!found && !NStr::IsBlank(title)) {
1985  CRef<CSeqdesc> desc(new CSeqdesc());
1986  desc->SetTitle(title);
1987  entry->SetSeq().SetDescr().Set().push_back(desc);
1988  }
1989 }
1990 
1991 
1992 static void AddGenbankKeyword(CRef<CSeq_entry> entry, string keyword)
1993 {
1994  bool found = false;
1995 
1996  for (auto& it : entry->SetSeq().SetDescr().Set()) {
1997  if (it->IsGenbank()) {
1998  it->SetGenbank().SetKeywords().push_back(keyword);
1999  found = true;
2000  }
2001  }
2002  if (!found) {
2003  CRef<CSeqdesc> desc(new CSeqdesc());
2004  desc->SetGenbank().SetKeywords().push_back(keyword);
2005  entry->SetSeq().SetDescr().Set().push_back(desc);
2006  }
2007 }
2008 
2009 
// Helper body: applies `tech` to the entry, validates it, and checks the
// result against an expected error list that depends on the technique.
// NOTE(review): the signature line and the setup lines that create `entry`,
// `seh`, `validator` and `expected_errors` are missing from this listing.
2011  {
2014 
2015  SetTech(entry, tech);
2016  eval = validator.Validate(seh, options);
// Only barcode, tsa and wgs add expectations; every other technique is checked
// against the list as left by the (not visible) setup.
2017  if (tech == CMolInfo::eTech_barcode) {
2018  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NoKeywordHasTechnique", "Molinfo.tech barcode without BARCODE keyword"));
2019  } else if (tech == CMolInfo::eTech_tsa) {
2020  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TSAseqGapProblem", "TSA Seq_gap NULL"));
2021  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2022  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"genomic\" is not appropriate for sequences that use the TSA technique."));
2023  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAseqGapProblem", "TSA submission includes wrong gap type. Gaps for TSA should be Assembly Gaps with linkage evidence."));
2024  } else if (tech == CMolInfo::eTech_wgs) {
2025  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WGSseqGapProblem", "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
2026  }
// Second, separate wgs check: AddChromosomeNoLocation is called after the
// WGSseqGapProblem expectation has been pushed.
2027  if (tech == CMolInfo::eTech_wgs) {
2028  AddChromosomeNoLocation(expected_errors, "lcl|good");
2029  }
2030 
2031  CheckErrors(*eval, expected_errors);
2032 
2033  CLEAR_ERRORS
2034  }
2035 
2036 
// Helper body: applies `tech` to the entry and expects a BadDeltaSeq error whose
// message contains the numeric value of the technique.
// NOTE(review): the signature line and the setup lines are missing from this
// listing.
2038  {
2041 
2042  SetTech(entry, tech);
// This first result is never checked: eval is assigned again before
// CheckErrors is called.
2043  eval = validator.Validate(seh, options);
// A technique-specific expectation, if any, is pushed ahead of the
// BadDeltaSeq expectation.
2044  if (IsProteinTech(tech)) {
2045  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide", "Nucleic acid with protein sequence method"));
2046  } else if (tech == CMolInfo::eTech_est) {
2047  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ESTshouldBemRNA", "EST sequence should be mRNA"));
2048  }
2049  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDeltaSeq", "Delta seq technique should not be [" + NStr::UIntToString(tech) + "]"));
2050  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2051  eval = validator.Validate(seh, options);
2052  CheckErrors(*eval, expected_errors);
2053  CLEAR_ERRORS
2054  }
2055 
2056 
// Helper body: rearranges the delta sequence so that it begins with a gap,
// ends with a gap-like literal and contains neighbouring gaps, then checks the
// errors reported for the given `tech`.
// NOTE(review): the signature line and the setup lines are missing from this
// listing. AddLiteral(10) presumably appends a length-only literal - confirm
// against the CDelta_ext documentation.
2058  {
2060 
2062 
// A 10-long gap literal goes in front of the existing delta segments.
2063  CRef<CDelta_seq> start_gap_seg(new CDelta_seq());
2064  start_gap_seg->SetLiteral().SetLength(10);
2065  start_gap_seg->SetLiteral().SetSeq_data().SetGap();
2066  entry->SetSeq().SetInst().SetExt().SetDelta().Set().insert(entry->SetSeq().SetInst().SetExt().SetDelta().Set().begin(), start_gap_seg);
2067  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
2068  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
2069  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("AAATTTGGGC", CSeq_inst::eMol_dna);
// Another 10-long gap literal, followed by one more length-only literal, goes
// at the end; the overall length is then set to 94.
2070  CRef<CDelta_seq> end_gap_seg(new CDelta_seq());
2071  end_gap_seg->SetLiteral().SetLength(10);
2072  end_gap_seg->SetLiteral().SetSeq_data().SetGap();
2073  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(end_gap_seg);
2074  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
2075  entry->SetSeq().SetInst().SetLength(94);
2076  SetTech(entry, tech);
// For wgs, AddChromosomeNoLocation is called before any other expectation is
// pushed.
2077  if (tech == CMolInfo::eTech_wgs) {
2078  AddChromosomeNoLocation(expected_errors, "lcl|good");
2079  }
2080  // expected_errors.push_back(new CExpectedError("lcl|good", tech == CMolInfo::eTech_wgs ? eDiag_Warning : eDiag_Error, "BadDeltaSeq", "First delta seq component is a gap"));
2081  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDeltaSeq", "There is 1 adjacent gap in delta seq"));
2082  // expected_errors.push_back(new CExpectedError("lcl|good", tech == CMolInfo::eTech_wgs ? eDiag_Warning : eDiag_Error, "BadDeltaSeq", "Last delta seq component is a gap"));
2083  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at beginning of sequence"));
2084  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at end of sequence"));
2085  /*
2086  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
2087  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
2088  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
2089  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
2090  */
// wgs additionally expects the gap-type error, pushed last.
2091  if (tech == CMolInfo::eTech_wgs) {
2092  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WGSseqGapProblem", "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
2093  }
2094  eval = validator.Validate(seh, options);
2095  CheckErrors(*eval, expected_errors);
2096 
2097  CLEAR_ERRORS
2098  }
2099 
2100 
// Test case: BadDeltaSeq reporting as a function of the MolInfo technique.
// First checks two RefSeq-style accessions against an expected list to which
// the visible code has added nothing, then walks over technique values and
// treats those in `allowed_list` differently from the rest.
// NOTE(review): the setup lines, the header of the technique loop and the
// helper calls inside and after the loop are missing from this listing.
2101  BOOST_AUTO_TEST_CASE(Test_SEQ_INST_BadDeltaSeq)
2102  {
2104 
2106 
// Put every MolInfo descriptor on the "derived" technique.
2107  for (auto& it : entry->SetSeq().SetDescr().Set()) {
2108  if (it->IsMolinfo()) {
2109  it->SetMolinfo().SetTech(CMolInfo::eTech_derived);
2110  }
2111  }
2112 
2113  // don't report if NT or NC
2114  scope.RemoveTopLevelSeqEntry(seh);
2115  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
2116  seh = scope.AddTopLevelSeqEntry(*entry);
2117  eval = validator.Validate(seh, options);
2118  // AddChromosomeNoLocation(expected_errors, "ref|NC_123456|");
2119  CheckErrors(*eval, expected_errors);
2120  CLEAR_ERRORS
2121 
2122  entry->SetSeq().SetId().front()->SetOther().SetAccession("NT_123456");
2123  scope.RemoveTopLevelSeqEntry(seh);
2124  seh = scope.AddTopLevelSeqEntry(*entry);
2125  eval = validator.Validate(seh, options);
2126  // AddChromosomeNoLocation(expected_errors, "ref|NT_123456|");
2127  CheckErrors(*eval, expected_errors);
2128  CLEAR_ERRORS
2129 
2130  // don't report if gen-prod-set
2131 
// Back to the local id "good" for the technique checks below.
2132  entry->SetSeq().SetId().front()->SetLocal().SetStr("good");
2133  scope.RemoveTopLevelSeqEntry(seh);
2134  seh = scope.AddTopLevelSeqEntry(*entry);
2135 
2136  // allowed tech values
2137  vector<CMolInfo::TTech> allowed_list;
2138  allowed_list.push_back(CMolInfo::eTech_htgs_0);
2139  allowed_list.push_back(CMolInfo::eTech_htgs_1);
2140  allowed_list.push_back(CMolInfo::eTech_htgs_2);
2141  allowed_list.push_back(CMolInfo::eTech_htgs_3);
2142  allowed_list.push_back(CMolInfo::eTech_wgs);
2143  allowed_list.push_back(CMolInfo::eTech_composite_wgs_htgs);
2144  allowed_list.push_back(CMolInfo::eTech_unknown);
2145  allowed_list.push_back(CMolInfo::eTech_standard);
2146  allowed_list.push_back(CMolInfo::eTech_htc);
2147  allowed_list.push_back(CMolInfo::eTech_barcode);
2148  allowed_list.push_back(CMolInfo::eTech_tsa);
2149 
// NOTE(review): the loop header that declares `i` is missing from this
// listing. For each `i`, a linear search decides whether it is in allowed_list.
2151  bool allowed = false;
2152  for (CMolInfo::TTech it : allowed_list) {
2153  if (it == i) {
2154  allowed = true;
2155  break;
2156  }
2157  }
// NOTE(review): the statements inside both branches are missing from this
// listing.
2158  if (allowed) {
2159  // don't report for htgs_0
2161  } else {
2163  }
2164  }
2165 
2166  CLEAR_ERRORS
2167 
2170 
2171  CLEAR_ERRORS
2172  }
2173 
2174 
// Reconfigures `gap` in place: resets it, assigns `gap_type`, handles the
// linkage field according to `is_linked`, and rebuilds the linkage-evidence
// list with one item per element of `linkage_evidence`.
//
// gap              - gap object to overwrite
// gap_type         - type value assigned to the gap
// is_linked        - when false, the linkage field is reset
// linkage_evidence - evidence types to attach, in order (duplicates are kept)
//
// NOTE(review): two statements are missing from this listing - the one in the
// `is_linked` branch and the declaration of `ev` inside the loop (presumably a
// new CLinkage_evidence per iteration; confirm against the original file).
2175  void AdjustGap(CSeq_gap& gap, CSeq_gap::EType gap_type, bool is_linked, vector<CLinkage_evidence::EType> linkage_evidence)
2176  {
2177  gap.Reset();
2178  gap.SetType(gap_type);
2179  if (is_linked) {
2181  } else {
2182  gap.ResetLinkage();
2183  }
2184  gap.ResetLinkage_evidence();
2185  for (auto it : linkage_evidence) {
2187  ev->SetType(it);
2188  gap.SetLinkage_evidence().push_back(ev);
2189  }
2190  }
2191 
2192 
// Test case: SeqGapBadLinkage messages for different combinations of gap type,
// linkage and linkage evidence on the gap literals of a delta sequence.
// Each scenario rewrites the gap literals, re-adds the entry to the scope and
// checks for exactly one expected SeqGapBadLinkage message.
// NOTE(review): the line that creates `entry` and a later setup line are
// missing from this listing.
2193  BOOST_AUTO_TEST_CASE(Test_SEQ_INST_SeqGapBadLinkage)
2194  {
2196 
// Scenario 1: every gap literal becomes a short_arm gap, marked as linked,
// with a single align_genus evidence item.
2197  vector<CLinkage_evidence::EType> evidence;
2198  evidence.push_back(CLinkage_evidence::eType_align_genus);
2199  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2200  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2201  && it->GetLiteral().GetSeq_data().IsGap()) {
2202  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2203  CSeq_gap::eType_short_arm, true, evidence);
2204  }
2205  }
2206 
2208 
2209  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2210  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
2211  "SeqGapBadLinkage", "Seq-gap of type 3 should not have linkage evidence"));
2212 
2213  eval = validator.Validate(seh, options);
2214  CheckErrors(*eval, expected_errors);
2215 
2216  CLEAR_ERRORS
2217 
// Scenario 2: linkage and type are reset on each gap; the evidence list set in
// scenario 1 is not touched here.
2218  scope.RemoveTopLevelSeqEntry(seh);
2219  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2220  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2221  && it->GetLiteral().GetSeq_data().IsGap()) {
2222  CSeq_gap& gap = it->SetLiteral().SetSeq_data().SetGap();
2223  gap.ResetLinkage();
2224  gap.ResetType();
2225  }
2226  }
2227  seh = scope.AddTopLevelSeqEntry(*entry);
2228 
2229  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2230  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
2231  "SeqGapBadLinkage", "Seq-gap with linkage evidence must have linkage field set to linked"));
2232 
2233  eval = validator.Validate(seh, options);
2234  CheckErrors(*eval, expected_errors);
2235 
2236  CLEAR_ERRORS
2237 
// Scenario 3: a second align_genus item is added, so the same evidence type is
// attached twice to a linked fragment gap.
2238  scope.RemoveTopLevelSeqEntry(seh);
2239  evidence.push_back(CLinkage_evidence::eType_align_genus);
2240  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2241  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2242  && it->GetLiteral().GetSeq_data().IsGap()) {
2243  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2244  CSeq_gap::eType_fragment, true, evidence);
2245  }
2246  }
2247  seh = scope.AddTopLevelSeqEntry(*entry);
2248 
2249  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2250  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
2251  "SeqGapBadLinkage", "Linkage evidence 'align genus' appears 2 times"));
2252 
2253  eval = validator.Validate(seh, options);
2254  CheckErrors(*eval, expected_errors);
2255 
2256  CLEAR_ERRORS
2257 
// Scenario 4: the duplicate is swapped for "unspecified", leaving align_genus
// plus unspecified on a linked fragment gap.
2258  evidence.pop_back();
2259  evidence.push_back(CLinkage_evidence::eType_unspecified);
2260  scope.RemoveTopLevelSeqEntry(seh);
2261  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2262  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2263  && it->GetLiteral().GetSeq_data().IsGap()) {
2264  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2265  CSeq_gap::eType_fragment, true, evidence);
2266  }
2267  }
2268  seh = scope.AddTopLevelSeqEntry(*entry);
2269 
2270  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2271  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
2272  "SeqGapBadLinkage", "Seq-gap type has unspecified and additional linkage evidence"));
2273 
2274  eval = validator.Validate(seh, options);
2275  CheckErrors(*eval, expected_errors);
2276 
2277  CLEAR_ERRORS
2278 
// Scenario 5: only "unspecified" evidence, on a linked gap of unknown type.
2279  scope.RemoveTopLevelSeqEntry(seh);
2280  evidence.clear();
2281  evidence.push_back(CLinkage_evidence::eType_unspecified);
2282  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2283  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2284  && it->GetLiteral().GetSeq_data().IsGap()) {
2285  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2286  CSeq_gap::eType_unknown, true, evidence);
2287  }
2288  }
2289  seh = scope.AddTopLevelSeqEntry(*entry);
2290 
2291  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2292  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
2293  "SeqGapBadLinkage", "Single Seq-gap has unknown type and unspecified linkage"));
2294 
2295  eval = validator.Validate(seh, options);
2296  CheckErrors(*eval, expected_errors);
2297 
2298  CLEAR_ERRORS
2299 
// Scenario 6: a further gap with the same settings is appended, followed by
// more sequence, so the message now speaks of two gaps.
2300  scope.RemoveTopLevelSeqEntry(seh);
2301  CRef<CDelta_seq> gap_seg(new CDelta_seq());
2302  gap_seg->SetLiteral().SetLength(10);
2303  AdjustGap(gap_seg->SetLiteral().SetSeq_data().SetGap(),
2304  CSeq_gap::eType_unknown, true, evidence);
2305 
2306  // adjust delta to avoid errors about large number of Ns in first and last 50 bp
2307  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("CCCATGATGATGTACCGTACGTTTTCCCATGATGATGTACCGTACGTTTT");
2308  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetLength(50);
2309  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
2310  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGATGTACCGTACGTTTTCCCATGATGATGTACCGTACGTTTT", CSeq_inst::eMol_dna);
2311  entry->SetSeq().SetInst().SetLength(132);
2312 
2313  seh = scope.AddTopLevelSeqEntry(*entry);
2314 
2315  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2316  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
2317  "SeqGapBadLinkage", "All 2 Seq-gaps have unknown type and unspecified linkage"));
2318 
2319  eval = validator.Validate(seh, options);
2320  CheckErrors(*eval, expected_errors);
2321 
2322  CLEAR_ERRORS
2323  }
2324 
2325 
2326 void ChangeErrorAcc(vector<CExpectedError*> expected_errors, const string& acc)
2327 {
2328  for (auto it : expected_errors) {
2329  if (it) {
2330  it->SetAccession(acc);
2331  }
2332  }
2333 }
2334 
2335 
2336 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ConflictingIdsOnBioseq)
2337 {
2339 
2341 
2342  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (lcl|good - lcl|bad)"));
2343  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2344 
2345  // local IDs
2346  scope.RemoveTopLevelSeqEntry(seh);
2347  CRef<CSeq_id> id2(new CSeq_id());
2348  id2->SetLocal().SetStr("bad");
2349  entry->SetSeq().SetId().push_back(id2);
2350  seh = scope.AddTopLevelSeqEntry(*entry);
2351  eval = validator.Validate(seh, options);
2352  CheckErrors(*eval, expected_errors);
2353 
2354  // GIBBSQ
2355  scope.RemoveTopLevelSeqEntry(seh);
2356  CRef<CSeq_id> id1 = entry->SetSeq().SetId().front();
2357  id1->SetGibbsq(1);
2358  id2->SetGibbsq(2);
2359  seh = scope.AddTopLevelSeqEntry(*entry);
2360  ChangeErrorAcc(expected_errors, "bbs|1");
2361  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (bbs|1 - bbs|2)");
2362  eval = validator.Validate(seh, options);
2363  CheckErrors(*eval, expected_errors);
2364 
2365  // GIBBMT
2366  scope.RemoveTopLevelSeqEntry(seh);
2367  id1->SetGibbmt(1);
2368  id2->SetGibbmt(2);
2369  seh = scope.AddTopLevelSeqEntry(*entry);
2370  ChangeErrorAcc(expected_errors, "bbm|1");
2371  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (bbm|1 - bbm|2)");
2372  eval = validator.Validate(seh, options);
2373  CheckErrors(*eval, expected_errors);
2374 
2375  // GI
2376  scope.RemoveTopLevelSeqEntry(seh);
2377  id1->SetGi(GI_CONST(1));
2378  id2->SetGi(GI_CONST(2));
2379  CRef<CSeq_id> id3(new CSeq_id("gb|AY123456.1"));
2380  entry->SetSeq().SetId().push_back(id3);
2381  seh = scope.AddTopLevelSeqEntry(*entry);
2382  ChangeErrorAcc(expected_errors, "gb|AY123456.1|");
2383  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (gi|1 - gi|2)");
2384  eval = validator.Validate(seh, options);
2385  CheckErrors(*eval, expected_errors);
2386  entry->SetSeq().SetId().pop_back();
2387 
2388  // GIIM
2389  scope.RemoveTopLevelSeqEntry(seh);
2390  id1->SetGiim().SetId(1);
2391  id1->SetGiim().SetDb("foo");
2392  id2->SetGiim().SetId(2);
2393  id2->SetGiim().SetDb("foo");
2394  seh = scope.AddTopLevelSeqEntry(*entry);
2395  CLEAR_ERRORS
2396 
2397  expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "IdOnMultipleBioseqs", "BioseqFind (gim|1) unable to find itself - possible internal error"));
2398  expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gim|1 - gim|2)"));
2399  expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "IdOnMultipleBioseqs", "BioseqFind (gim|2) unable to find itself - possible internal error"));
2400  // AddChromosomeNoLocation(expected_errors, "gim|1");
2401  eval = validator.Validate(seh, options);
2402  CheckErrors(*eval, expected_errors);
2403  CLEAR_ERRORS
2404 
2405  // patent
2406  scope.RemoveTopLevelSeqEntry(seh);
2407  id1->SetPatent().SetSeqid(1);
2408  id1->SetPatent().SetCit().SetCountry("USA");
2409  id1->SetPatent().SetCit().SetId().SetNumber("1");
2410  id2->SetPatent().SetSeqid(2);
2411  id2->SetPatent().SetCit().SetCountry("USA");
2412  id2->SetPatent().SetCit().SetId().SetNumber("2");
2413  seh = scope.AddTopLevelSeqEntry(*entry);
2414  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (pat|USA|1|1 - pat|USA|2|2)"));
2415  // AddChromosomeNoLocation(expected_errors, "pat|USA|1|1");
2416  eval = validator.Validate(seh, options);
2417  CheckErrors(*eval, expected_errors);
2418 
2419  // pdb
2420  scope.RemoveTopLevelSeqEntry(seh);
2421  id1->SetPdb().SetMol().Set("good");
2422  id2->SetPdb().SetMol().Set("badd");
2423  seh = scope.AddTopLevelSeqEntry(*entry);
2424  ChangeErrorAcc(expected_errors, "pdb|good| ");
2425  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (pdb|good| - pdb|badd| )");
2426  eval = validator.Validate(seh, options);
2427  CheckErrors(*eval, expected_errors);
2428 
2429  // general
2430  scope.RemoveTopLevelSeqEntry(seh);
2431  id1->SetGeneral().SetDb("a");
2432  id1->SetGeneral().SetTag().SetStr("good");
2433  id2->SetGeneral().SetDb("a");
2434  id2->SetGeneral().SetTag().SetStr("bad");
2435  seh = scope.AddTopLevelSeqEntry(*entry);
2436  ChangeErrorAcc(expected_errors, "gnl|a|good");
2437  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (gnl|a|good - gnl|a|bad)");
2438  eval = validator.Validate(seh, options);
2439  CheckErrors(*eval, expected_errors);
2440 
2441  CLEAR_ERRORS
2442  // should get no error if db values are different
2443  scope.RemoveTopLevelSeqEntry(seh);
2444  id2->SetGeneral().SetDb("b");
2445  seh = scope.AddTopLevelSeqEntry(*entry);
2446  // AddChromosomeNoLocation(expected_errors, "gnl|a|good");
2447  eval = validator.Validate(seh, options);
2448  CheckErrors(*eval, expected_errors);
2449 
2450  // genbank
2451  scope.RemoveTopLevelSeqEntry(seh);
2452  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gb|AY222222|)"));
2453  id1->SetGenbank().SetAccession("AY123456");
2454  id2->SetGenbank().SetAccession("AY222222");
2455  seh = scope.AddTopLevelSeqEntry(*entry);
2456  eval = validator.Validate(seh, options);
2457  CheckErrors(*eval, expected_errors);
2458 
2459  // try genbank with accession same, versions different
2460  scope.RemoveTopLevelSeqEntry(seh);
2461  id2->SetGenbank().SetAccession("AY123456");
2462  id2->SetGenbank().SetVersion(2);
2463  seh = scope.AddTopLevelSeqEntry(*entry);
2464  CLEAR_ERRORS
2465  // AddChromosomeNoLocation(expected_errors, "gb|AY123456.2|");
2466  expected_errors.push_back(new CExpectedError("gb|AY123456.2|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gb|AY123456.2|)"));
2467  eval = validator.Validate(seh, options);
2468  CheckErrors(*eval, expected_errors);
2469 
2470  // try similar id type
2471  scope.RemoveTopLevelSeqEntry(seh);
2472  id2->SetGpipe().SetAccession("AY123456");
2473  seh = scope.AddTopLevelSeqEntry(*entry);
2474  CLEAR_ERRORS
2475  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gpp|AY123456|)"));
2476  // AddChromosomeNoLocation(expected_errors, "gb|AY123456|");
2477  eval = validator.Validate(seh, options);
2478  CheckErrors(*eval, expected_errors);
2479 
2480  // LRG
2481  scope.RemoveTopLevelSeqEntry(seh);
2482  id1->SetGeneral().SetDb("LRG");
2483  id1->SetGeneral().SetTag().SetStr("good");
2484  seh = scope.AddTopLevelSeqEntry(*entry);
2485  ChangeErrorAcc(expected_errors, "gpp|AY123456|");
2486  expected_errors[0]->SetErrMsg("LRG sequence needs NG_ accession");
2487  expected_errors[0]->SetSeverity(eDiag_Critical);
2488  eval = validator.Validate(seh, options);
2489  CheckErrors(*eval, expected_errors);
2490  // no error if has NG
2491  scope.RemoveTopLevelSeqEntry(seh);
2492  id2->SetOther().SetAccession("NG_123456");
2493  seh = scope.AddTopLevelSeqEntry(*entry);
2494  CLEAR_ERRORS
2495  // AddChromosomeNoLocation(expected_errors, "ref|NG_123456|");
2496  eval = validator.Validate(seh, options);
2497  CheckErrors(*eval, expected_errors);
2498 
2499  CLEAR_ERRORS
2500 }
2501 
2502 
// Verifies that a Bioseq whose Seq-inst.mol is set to the generic
// CSeq_inst::eMol_na value is reported with a MolNuclAcid error on lcl|good.
// NOTE(review): entry, seh, validator, options, eval and expected_errors are
// not declared in the visible lines; presumably they are provided by setup
// code that is absent from this listing — confirm against the original file.
2503 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_MolNuclAcid)
2504 {
2506 
2508 
2509  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2510  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MolNuclAcid", "Bioseq.mol is type nucleic acid"));
2511 
2512  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_na);
2513  eval = validator.Validate(seh, options);
2514  CheckErrors(*eval, expected_errors);
2515 
2516  CLEAR_ERRORS
2517 }
2518 
2519 
// Verifies the errors produced when MolInfo.tech conflicts with the molecule
// type / biomol of the sequence.
//
// For each tech value `i` the sequence is set to DNA and the tech is applied.
// Techs in genomic_list are validated twice (as DNA, then as RNA); all other
// techs are validated once with tech-specific expectations. After the loop the
// TSA case is validated explicitly, and GetTSAConflictingBiomolTechErrors is
// called with both the Seq-entry handle and the complete Bioseq.
//
// NOTE(review): the header of the loop over `i` (closed by the brace just
// before the final TSA checks) and the declarations of entry, seh, validator,
// options, eval and expected_errors are not visible in this listing —
// presumably elided setup lines; confirm against the original file.
2520 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ConflictingBiomolTech)
2521 {
2523 
2525 
2526  // allowed tech values
2527  vector<CMolInfo::TTech> genomic_list;
2528  genomic_list.push_back(CMolInfo::eTech_sts);
2529  genomic_list.push_back(CMolInfo::eTech_survey);
2530  genomic_list.push_back(CMolInfo::eTech_wgs);
2531  genomic_list.push_back(CMolInfo::eTech_htgs_0);
2532  genomic_list.push_back(CMolInfo::eTech_htgs_1);
2533  genomic_list.push_back(CMolInfo::eTech_htgs_2);
2534  genomic_list.push_back(CMolInfo::eTech_htgs_3);
2535  genomic_list.push_back(CMolInfo::eTech_composite_wgs_htgs);
2536 
  // Decide whether the current tech `i` is one of the genomic techs above.
2538  bool genomic = false;
2539  for (CMolInfo::TTech it : genomic_list) {
2540  if (it == i) {
2541  genomic = true;
2542  break;
2543  }
2544  }
2545  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
2546  SetTech(entry, i);
  // This expectation is pushed first, so it occupies expected_errors[0].
2548  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolType", "Molecule type (DNA) does not match biomol (RNA)"));
2549  if (i == CMolInfo::eTech_wgs) {
2550  AddChromosomeNoLocation(expected_errors, "lcl|good");
2551  }
2552  if (i == CMolInfo::eTech_est) {
2553  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ESTshouldBemRNA", "EST sequence should be mRNA"));
2554  }
2555  if (i == CMolInfo::eTech_htgs_2) {
2556  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 raw seq has no gaps and no graphs"));
2557  }
2558  if (genomic) {
2559  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "HTGS_STS_GSS_WGSshouldBeGenomic", "HTGS/STS/GSS/WGS sequence should be genomic"));
2560  eval = validator.Validate(seh, options);
2561  CheckErrors(*eval, expected_errors);
  // Second pass as RNA: the InconsistentMolType expectation in slot 0 is
  // deleted and nulled, and the last expectation is rewritten in place.
  // NOTE(review): presumably CheckErrors and CLEAR_ERRORS tolerate a null
  // entry in expected_errors — confirm.
2563  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
2564  delete expected_errors[0];
2565  expected_errors[0] = nullptr;
2566  expected_errors.back()->SetErrCode("HTGS_STS_GSS_WGSshouldNotBeRNA");
2567  expected_errors.back()->SetErrMsg("HTGS/STS/GSS/WGS sequence should not be RNA");
2568  eval = validator.Validate(seh, options);
2569  CheckErrors(*eval, expected_errors);
2570  } else {
2571  if (IsProteinTech(i)) {
2572  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide", "Nucleic acid with protein sequence method"));
2573  }
2574  if (i == CMolInfo::eTech_barcode) {
2575  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NoKeywordHasTechnique", "Molinfo.tech barcode without BARCODE keyword"));
2576  } else if (i == CMolInfo::eTech_tsa) {
2577  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2578  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"cRNA\" is not appropriate for sequences that use the TSA technique."));
2579  }
2580  eval = validator.Validate(seh, options);
2581  CheckErrors(*eval, expected_errors);
2582  }
2583  CLEAR_ERRORS
2584  }
2585 
  // Explicit TSA-on-DNA case through the full validator.
2586  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
2587  SetTech(entry, CMolInfo::eTech_tsa);
2588  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2589  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolType", "Molecule type (DNA) does not match biomol (RNA)"));
2590  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2591  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"cRNA\" is not appropriate for sequences that use the TSA technique."));
2592  eval = validator.Validate(seh, options);
2593  CheckErrors(*eval, expected_errors);
2594 
2595  CLEAR_ERRORS
2596 
  // The dedicated TSA entry point is expected to report only
  // TSAshouldBNotBeDNA, for both the Seq-entry handle and the Bioseq overload.
2597  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2598  eval = validator.GetTSAConflictingBiomolTechErrors(seh);
2599  CheckErrors(*eval, expected_errors);
2600  eval = validator.GetTSAConflictingBiomolTechErrors(*(seh.GetSeq().GetCompleteBioseq()));
2601  CheckErrors(*eval, expected_errors);
2602  CLEAR_ERRORS
2603 }
2604 
2605 
// Verifies that a RefSeq ("other") Seq-id whose name contains a space
// ("good one") is reported with a critical SeqIdNameHasSpace error.
// NOTE(review): entry, seh, validator, options, eval and expected_errors are
// not declared in the visible lines; presumably they come from setup code
// absent from this listing — confirm against the original file.
2606 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_SeqIdNameHasSpace)
2607 {
2609  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
2610  entry->SetSeq().SetId().front()->SetOther().SetName("good one");
2611 
2613 
2614  expected_errors.push_back(new CExpectedError("ref|NC_123456|good one", eDiag_Critical, "SeqIdNameHasSpace", "Seq-id.name 'good one' should be a single word without any spaces"));
2615  // AddChromosomeNoLocation(expected_errors, "ref|NC_123456|good one");
2616 
2617  eval = validator.Validate(seh, options);
2618  CheckErrors(*eval, expected_errors);
2619 
2620  CLEAR_ERRORS
2621 }
2622 
2623 
// Placeholder test case: the whole body is compiled out by `#if 0`
// ("removed per VR-779"), so the test currently does nothing.
// The disabled code built a segmented Bioseq with two segments referencing
// gb|AY123456 and expected SeqLocOrder and DuplicateSegmentReferences errors,
// first with two whole-sequence locations and then with the second segment
// changed to the interval 0..484.
// NOTE(review): several lines of the disabled body (declarations of entry,
// objmgr and eval, and the rest of the options expression) are not visible in
// this listing.
2624 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_DuplicateSegmentReferences)
2625 {
2626 #if 0
2627  // removed per VR-779
2629  entry->SetSeq().SetInst().ResetSeq_data();
2630  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
2631  CRef<CSeq_loc> seg1(new CSeq_loc());
2632  seg1->SetWhole().SetGenbank().SetAccession("AY123456");
2633  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(seg1);
2634  CRef<CSeq_loc> seg2(new CSeq_loc());
2635  seg2->SetWhole().SetGenbank().SetAccession("AY123456");
2636  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(seg2);
2637  entry->SetSeq().SetInst().SetLength(970);
2638 
2640  // need to call this statement before calling AddDefaults
2641  // to make sure that we can fetch the sequence referenced by the
2642  // delta sequence so that we can detect that the loc in the
2643  // delta sequence is longer than the referenced sequence
2645  CScope scope(*objmgr);
2646  scope.AddDefaults();
2647  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
2648 
2649  CValidator validator(*objmgr);
2650 
2651  // Set validator options
2652  unsigned int options = CValidator::eVal_need_isojta
2656 
2657  // list of expected errors
2658  vector<CExpectedError*> expected_errors;
2659  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLocOrder", "Segmented BioseqIntervals out of order in SeqLoc [[gb|AY123456|, gb|AY123456|]]"));
2660  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "DuplicateSegmentReferences", "Segmented sequence has multiple references to gb|AY123456"));
2662 
2663  eval = validator.Validate(seh, options);
2664  CheckErrors(*eval, expected_errors);
2665 
2666  seg2->SetInt().SetId().SetGenbank().SetAccession("AY123456");
2667  seg2->SetInt().SetFrom(0);
2668  seg2->SetInt().SetTo(484);
2669  expected_errors[0]->SetErrMsg("Segmented BioseqIntervals out of order in SeqLoc [[gb|AY123456|, 1-485]]");
2670  expected_errors[1]->SetSeverity(eDiag_Warning);
2671  expected_errors[1]->SetErrMsg("Segmented sequence has multiple references to gb|AY123456 that are not SEQLOC_WHOLE");
2672  eval = validator.Validate(seh, options);
2673  CheckErrors(*eval, expected_errors);
2674 
2675  CLEAR_ERRORS
2676 #endif
2677 }
2678 
2679 
// Verifies the warnings produced for a nuc-prot set whose nucleotide ends in a
// run of Ns and whose protein ends in two Xs: TerminalNs and HighNpercent3Prime
// on lcl|nuc, and TrailingX on lcl|prot.
// NOTE(review): entry, cds_feat, seh, validator, options, eval and
// expected_errors are not declared in the visible lines; presumably they come
// from lines absent from this listing — confirm against the original file.
2680 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_TrailingX)
2681 {
2683  CRef<CSeq_entry> nuc = entry->SetSet().SetSeq_set().front();
2684  CRef<CSeq_entry> prot = entry->SetSet().SetSeq_set().back();
2685  CRef<CSeq_feat> prot_feat = prot->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
  // 27-base nucleotide ending in six Ns; 9-residue protein ending in "XX".
2687  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATANNNNNN");
2688  nuc->SetSeq().SetInst().SetLength(27);
2689  prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPRKTEIXX");
2690  prot->SetSeq().SetInst().SetLength(9);
  // Extend the protein feature to the new last residue (index 8) and mark
  // both the protein and coding-region features as 3'-partial.
2692  prot_feat->SetLocation().SetInt().SetTo(8);
2693  prot_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
2694  prot_feat->SetPartial(true);
2695  cds_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
2696  cds_feat->SetPartial(true);
2697 
2699 
2700  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TerminalNs", "N at end of sequence"));
2701  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "TrailingX", "Sequence ends in 2 trailing Xs"));
2702  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "HighNpercent3Prime",
2703  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
2704  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
2705 
2706  eval = validator.Validate(seh, options);
2707  CheckErrors(*eval, expected_errors);
2708 
2709  CLEAR_ERRORS
2710 }
2711 
2712 
// Helper: puts the GenBank accession `id_str` on the protein of a nuc-prot
// entry (the nucleotide gets local id "nuc") and expects exactly one
// BadSeqIdFormat error, "Bad accession <id_str>", reported on gb|<id_str>|.
//
// @param id_str  accession that is expected to be invalid for a protein
//
// NOTE(review): good_prot_id is constructed but not referenced again in the
// visible lines of this function. entry, seh, validator, options, eval and
// expected_errors are presumably declared by lines absent from this listing —
// confirm against the original file.
2713 void TestBadProtId(const string& id_str)
2714 {
2715  // bad for just prots
2717  CRef<CSeq_id> bad_id(new CSeq_id());
2718  bad_id->SetGenbank().SetAccession(id_str);
2719  CRef<CSeq_id> good_nuc_id(new CSeq_id());
2720  good_nuc_id->SetLocal().SetStr("nuc");
2721  CRef<CSeq_id> good_prot_id(new CSeq_id());
2722  good_prot_id->SetLocal().SetStr("prot");
2723 
2724  unit_test_util::ChangeNucId(entry, good_nuc_id);
2725  unit_test_util::ChangeProtId(entry, bad_id);
2726 
2728 
2729  expected_errors.push_back(new CExpectedError("gb|" + id_str + "|", eDiag_Error, "BadSeqIdFormat", "Bad accession " + id_str));
2730  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
2731 
2732  eval = validator.Validate(seh, options);
2733  CheckErrors(*eval, expected_errors);
2734  CLEAR_ERRORS
2735 }
2736 
2737 
// Helper: puts the GenBank accession `id_str` on the protein of a nuc-prot
// entry (the nucleotide gets local id "nuc") and checks the validator output
// against expected_errors, to which this function adds nothing.
//
// @param id_str  accession that is expected to be acceptable for a protein
//
// NOTE(review): despite its name, `bad_id` holds the accession under test,
// which is expected to be valid here; good_prot_id is constructed but not
// referenced again in the visible lines. entry, seh, validator, options, eval
// and expected_errors are presumably declared by lines absent from this
// listing — confirm against the original file.
2738 void TestGoodProtId(const string& id_str)
2739 {
2741  CRef<CSeq_id> bad_id(new CSeq_id());
2742  bad_id->SetGenbank().SetAccession(id_str);
2743  CRef<CSeq_id> good_nuc_id(new CSeq_id());
2744  good_nuc_id->SetLocal().SetStr("nuc");
2745  CRef<CSeq_id> good_prot_id(new CSeq_id());
2746  good_prot_id->SetLocal().SetStr("prot");
2747 
2748  unit_test_util::ChangeNucId(entry, good_nuc_id);
2749  unit_test_util::ChangeProtId(entry, bad_id);
2750 
2752 
2753  eval = validator.Validate(seh, options);
2754  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
2755 
2756  CheckErrors(*eval, expected_errors);
2757  CLEAR_ERRORS
2758 }
2759 
2760 
// Helper: puts the GenBank accession `id_str` on the nucleotide of a nuc-prot
// entry (the protein gets local id "prot") and checks the validator output.
// The only expectation added here is the one from AddChromosomeNoLocation, and
// only when the accession is treated as WGS.
//
// @param id_str  accession that is expected to be acceptable for a nucleotide
//
// NOTE(review): despite its name, `bad_id` holds the accession under test,
// which is expected to be valid here. entry, seh, validator, options, eval and
// expected_errors are presumably declared by lines absent from this listing —
// confirm against the original file.
2761 void TestGoodNucId(const string& id_str)
2762 {
2764  CRef<CSeq_id> bad_id(new CSeq_id());
2765  bad_id->SetGenbank().SetAccession(id_str);
2766  CRef<CSeq_id> good_prot_id(new CSeq_id());
2767  good_prot_id->SetLocal().SetStr("prot");
2768  unit_test_util::ChangeNucId(entry, bad_id);
2769  unit_test_util::ChangeProtId(entry, good_prot_id);
  // Accessions of 12 to 15 characters get the WGS tech set on the nucleotide.
2770  bool is_wgs = false;
2771  if (id_str.length() == 12 || id_str.length() == 13 || id_str.length() == 14 || id_str.length() == 15) {
2772  SetTech(entry->SetSet().SetSeq_set().front(), CMolInfo::eTech_wgs);
2773  is_wgs = true;
2774  }
2775 
2777 
2778  if (is_wgs) {
2779  AddChromosomeNoLocation(expected_errors, "gb|" + id_str + "|");
2780  }
2781  eval = validator.Validate(seh, options);
2782  CheckErrors(*eval, expected_errors);
2783  CLEAR_ERRORS
2784 }
2785 
2786 
// Exercises the BadSeqIdFormat checks, using the CGenBankFixture fixture.
// Sections, in order:
//   1. accessions bad on both nucleotide and protein, as GenBank, EMBL
//      (first letter replaced by 'B') and DDBJ (first letter replaced by 'C');
//   2. accessions bad only on the nucleotide, then only on the protein;
//   3. accessions good for both (list is empty), for nucleotides, for proteins;
//   4. an accession with version 0 next to a gi;
//   5. a 41-character local id and an NCBIFILE general id (no errors expected);
//   6. a general id whose database name is longer than 20 characters;
//   7. a local id containing a forward slash (no errors expected).
// NOTE(review): entry, scope, seh, validator, options, eval and
// expected_errors are not declared in the visible lines; presumably they come
// from setup code absent from this listing — confirm against the original.
2787 BOOST_FIXTURE_TEST_CASE(Test_SEQ_INST_BadSeqIdFormat, CGenBankFixture)
2788 {
2790  CRef<CSeq_entry> nuc_entry = entry->SetSet().SetSeq_set().front();
2791  CRef<CSeq_entry> prot_entry = entry->SetSet().SetSeq_set().back();
2792  CRef<CSeq_feat> prot_feat = prot_entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
2794 
2796 
  // Template expectation; accession and message are rewritten per id below.
2797  expected_errors.push_back(new CExpectedError("",eDiag_Error, "BadSeqIdFormat", "Bad accession"));
2798 
2799  vector<string> bad_ids;
2800  bad_ids.push_back("AY123456ABC"); // can't have letters after digits
2801  bad_ids.push_back("A1234"); // for a single letter, only acceptable number of digits is 5
2802  bad_ids.push_back("A123456");
2803  bad_ids.push_back("AY12345"); // for two letters, only acceptable number of digits is 6
2804  bad_ids.push_back("AY1234567");
2805  bad_ids.push_back("ABC1234"); // three letters bad unless prot and 5 digits
2806  bad_ids.push_back("ABC123456");
2807  bad_ids.push_back("ABCD1234567"); // four letters
2808  bad_ids.push_back("ABCDE123456"); // five letters
2809  bad_ids.push_back("ABCDE12345678");
2810 
2811  vector<string> bad_nuc_ids;
2812  bad_nuc_ids.push_back("ABC12345");
2813 
2814  vector<string> bad_prot_ids;
2815  bad_prot_ids.push_back("AY123456");
2816  bad_prot_ids.push_back("A12345");
2817 
  // NOTE(review): nothing is ever added to good_ids, so the "good for both"
  // loop further down never executes.
2818  vector<string> good_ids;
2819 
2820  vector<string> good_nuc_ids;
2821  good_nuc_ids.push_back("AY123456");
2822  good_nuc_ids.push_back("A12345");
2823  good_nuc_ids.push_back("ABCD123456789");
2824  good_nuc_ids.push_back("ABCD1234567890");
2825 
2826  vector<string> good_prot_ids;
2827  good_prot_ids.push_back("ABC12345");
2828 
2829 
2830  CRef<CSeq_id> good_nuc_id(new CSeq_id());
2831  good_nuc_id->SetLocal().SetStr("nuc");
2832  CRef<CSeq_id> good_prot_id(new CSeq_id());
2833  good_prot_id->SetLocal().SetStr("prot");
2834 
  // Single Seq-id object that is re-pointed at each accession under test.
2835  CRef<CSeq_id> bad_id(new CSeq_id());
2836 
2837  // bad for both
2838  for (const string& id_str : bad_ids) {
2839  const string acc_str = "gb|" + id_str + "|";
2840  ChangeErrorAcc(expected_errors, acc_str);
2841  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2842 
2843  // GenBank
2844  scope.RemoveTopLevelSeqEntry(seh);
2845  scope.ResetDataAndHistory();
2846  bad_id->SetGenbank().SetAccession(id_str);
2847  unit_test_util::ChangeNucId(entry, bad_id);
2848  unit_test_util::ChangeProtId(entry, good_prot_id);
2849  seh = scope.AddTopLevelSeqEntry(*entry);
2850  eval = validator.Validate(seh, options);
2851  CheckErrors(*eval, expected_errors);
2852  scope.RemoveTopLevelSeqEntry(seh);
2853  scope.ResetDataAndHistory();
2854  unit_test_util::ChangeNucId(entry, good_nuc_id);
2855  unit_test_util::ChangeProtId(entry, bad_id);
2856  seh = scope.AddTopLevelSeqEntry(*entry);
2857  eval = validator.Validate(seh, options);
2858  CheckErrors(*eval, expected_errors);
2859  }
2860 
2861  for (const string& id_it : bad_ids) {
2862  const string id_str = "B" + id_it.substr(1);
  // NOTE(review): this "embl|" accession is overwritten with "emb|" below
  // before the first CheckErrors call, so it never takes part in a comparison.
2863  expected_errors[0]->SetAccession("embl|" + id_str + "|");
2864  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2865 
2866  // EMBL
2867  scope.RemoveTopLevelSeqEntry(seh);
2868  scope.ResetDataAndHistory();
2869  bad_id->SetEmbl().SetAccession(id_str);
2870  unit_test_util::ChangeNucId(entry, bad_id);
2871  unit_test_util::ChangeProtId(entry, good_prot_id);
2872  seh = scope.AddTopLevelSeqEntry(*entry);
2873  eval = validator.Validate(seh, options);
2874  expected_errors[0]->SetAccession("emb|" + id_str + "|");
2875  CheckErrors(*eval, expected_errors);
2876  scope.RemoveTopLevelSeqEntry(seh);
2877  scope.ResetDataAndHistory();
2878  unit_test_util::ChangeNucId(entry, good_nuc_id);
2879  unit_test_util::ChangeProtId(entry, bad_id);
2880  seh = scope.AddTopLevelSeqEntry(*entry);
2881  eval = validator.Validate(seh, options);
2882  CheckErrors(*eval, expected_errors);
2883  }
2884 
2885  for (const string& id_it : bad_ids) {
2886  const string id_str = "C" + id_it.substr(1);
2887  expected_errors[0]->SetAccession("dbj|" + id_str + "|");
2888  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2889 
2890  // DDBJ
2891  scope.RemoveTopLevelSeqEntry(seh);
2892  scope.ResetDataAndHistory();
2893  bad_id->SetDdbj().SetAccession(id_str);
2894  unit_test_util::ChangeNucId(entry, bad_id);
2895  unit_test_util::ChangeProtId(entry, good_prot_id);
2896  seh = scope.AddTopLevelSeqEntry(*entry);
2897  eval = validator.Validate(seh, options);
  // NOTE(review): repeats the SetAccession call made at the top of this loop
  // body with the same value.
2898  expected_errors[0]->SetAccession("dbj|" + id_str + "|");
2899  CheckErrors(*eval, expected_errors);
2900  scope.RemoveTopLevelSeqEntry(seh);
2901  scope.ResetDataAndHistory();
2902  unit_test_util::ChangeNucId(entry, good_nuc_id);
2903  unit_test_util::ChangeProtId(entry, bad_id);
2904  seh = scope.AddTopLevelSeqEntry(*entry);
2905  eval = validator.Validate(seh, options);
2906  CheckErrors(*eval, expected_errors);
2907  }
2908 
2909  // bad for just nucs
2910  for (const string& id_str : bad_nuc_ids) {
2911  bad_id->SetGenbank().SetAccession(id_str);
2912  scope.RemoveTopLevelSeqEntry(seh);
2913  unit_test_util::ChangeNucId(entry, bad_id);
2914  unit_test_util::ChangeProtId(entry, good_prot_id);
2915  expected_errors[0]->SetAccession("gb|" + id_str + "|");
2916  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2917  seh = scope.AddTopLevelSeqEntry(*entry);
2918  eval = validator.Validate(seh, options);
2919  CheckErrors(*eval, expected_errors);
2920  }
2921 
2922  // bad for just prots
2923  for (auto id_it : bad_prot_ids) {
2924  TestBadProtId(id_it);
2925  }
2926 
2927  CLEAR_ERRORS
2928 
2929  // good for both
2930  for (const string& id_str : good_ids) {
2931  bad_id->SetGenbank().SetAccession(id_str);
2932  scope.RemoveTopLevelSeqEntry(seh);
2933  unit_test_util::ChangeNucId(entry, bad_id);
2934  unit_test_util::ChangeProtId(entry, good_prot_id);
2935  seh = scope.AddTopLevelSeqEntry(*entry);
2936  eval = validator.Validate(seh, options);
2937  // AddChromosomeNoLocation(expected_errors, "gb|" + *id_it + "|");
2938  CheckErrors(*eval, expected_errors);
2939  scope.RemoveTopLevelSeqEntry(seh);
2940  unit_test_util::ChangeNucId(entry, good_nuc_id);
2941  unit_test_util::ChangeProtId(entry, bad_id);
2942  seh = scope.AddTopLevelSeqEntry(*entry);
2943  eval = validator.Validate(seh, options);
2944  CheckErrors(*eval, expected_errors);
2945  CLEAR_ERRORS
2946  }
2947 
2948  // good for nucs
2949  for (const string& id_it : good_nuc_ids) {
2950  TestGoodNucId(id_it);
2951  }
2952 
2953  // good for just prots
2954  for (const string& id_it : good_prot_ids) {
2955  TestGoodProtId(id_it);
2956  }
2957 
2958  // if GI, needs version
2959  scope.RemoveTopLevelSeqEntry(seh);
2960  bad_id->SetGenbank().SetAccession("AY123456");
2961  bad_id->SetGenbank().SetVersion(0);
2962  unit_test_util::ChangeNucId(entry, bad_id);
2963  unit_test_util::ChangeProtId(entry, good_prot_id);
2964  CRef<CSeq_id> gi_id(new CSeq_id("gi|21914627"));
2965  nuc_entry->SetSeq().SetId().push_back(gi_id);
2966  seh = scope.AddTopLevelSeqEntry(*entry);
2967  eval = validator.Validate(seh, options);
2968  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Critical, "BadSeqIdFormat",
2969  "Accession AY123456 has 0 version"));
2970  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123456|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
2971  // AddChromosomeNoLocation(expected_errors, "gb|AY123456|");
2972  CheckErrors(*eval, expected_errors);
2973 
2974  CLEAR_ERRORS
2975 
  // Remove the gi id that was appended for the previous check.
2976  nuc_entry->SetSeq().SetId().pop_back();
2977 
2978  // id that is too long
  // No expectation is added between CLEAR_ERRORS and CheckErrors here, so
  // this 41-character local id is checked against an error list this section
  // leaves untouched.
2979  scope.RemoveTopLevelSeqEntry(seh);
2980  bad_id->SetLocal().SetStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234");
2981  unit_test_util::ChangeNucId(entry, bad_id);
2982  seh = scope.AddTopLevelSeqEntry(*entry);
2983  eval = validator.Validate(seh, options);
2984  // AddChromosomeNoLocation(expected_errors, "lcl|ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234");
2985  CheckErrors(*eval, expected_errors);
2986 
2987  CLEAR_ERRORS
2988 
2989  // shouldn't report if ncbifile ID
2990  scope.RemoveTopLevelSeqEntry(seh);
2991  CRef<CSeq_id> ncbifile(new CSeq_id("gnl|NCBIFILE|ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234"));
2992  unit_test_util::ChangeNucId(entry, good_nuc_id);
2993  nuc_entry->SetSeq().SetId().push_back(ncbifile);
2994  seh = scope.AddTopLevelSeqEntry(*entry);
2995  eval = validator.Validate(seh, options);
2996  // AddChromosomeNoLocation(expected_errors, entry);
2997  CheckErrors(*eval, expected_errors);
2998  nuc_entry->SetSeq().SetId().pop_back();
2999  CLEAR_ERRORS
3000 
3001  // report if database name len too long
  // From here on `entry` is replaced by a fresh single-sequence entry from
  // BuildGoodSeq(); nuc_entry and prot_entry still refer to the old set.
3002  scope.RemoveTopLevelSeqEntry(seh);
3003  entry = unit_test_util::BuildGoodSeq();
3004  CRef<CSeq_id> general(new CSeq_id());
3005  general->SetGeneral().SetDb("thisdatabasevalueislong");
3006  general->SetGeneral().SetTag().SetStr("b");
3007  entry->SetSeq().ResetId();
3008  entry->SetSeq().SetId().push_back(general);
3009  seh = scope.AddTopLevelSeqEntry(*entry);
3010  expected_errors.push_back(new CExpectedError("gnl|thisdatabasevalueislong|b", eDiag_Critical, "BadSeqIdFormat",
3011  "General database longer than 20 characters"));
3012 
3013  // AddChromosomeNoLocation(expected_errors, "gnl|thisdatabasevalueislong|b");
3014  eval = validator.Validate(seh, options);
3015  CheckErrors(*eval, expected_errors);
3016 
3017  CLEAR_ERRORS
3018 
3019  // do not report forward slash
3020  scope.RemoveTopLevelSeqEntry(seh);
3021  entry = unit_test_util::BuildGoodSeq();
3022  entry->SetSeq().SetId().front()->SetLocal().SetStr("a/b");
3023  seh = scope.AddTopLevelSeqEntry(*entry);
3024  eval = validator.Validate(seh, options);
3025  // AddChromosomeNoLocation(expected_errors, "lcl|a/b");
3026  CheckErrors(*eval, expected_errors);
3027 
3028  CLEAR_ERRORS
3029 }
3030 
3031 
// Helper: appends a general Seq-id (db, tag) to the test sequence and
// validates it. When `errmsg` is non-empty, one BadSeqIdCharacter warning with
// that message is expected on lcl|good; otherwise this function adds no
// expectation.
//
// @param db      database name for the general id
// @param tag     string tag for the general id
// @param errmsg  expected warning text, or empty when no warning is expected
//
// NOTE(review): entry, seh, validator, options, eval and expected_errors are
// not declared in the visible lines; presumably they come from setup code
// absent from this listing — confirm against the original file.
3032 void TestOneGeneralSeqId(const string& db, const string& tag, const string& errmsg)
3033 {
3035  CRef<CSeq_id> id(new CSeq_id());
3036  id->SetGeneral().SetDb(db);
3037  id->SetGeneral().SetTag().SetStr(tag);
3038  entry->SetSeq().SetId().push_back(id);
3039 
3041 
3042  string acc_str = "lcl|good";
3043  if (!errmsg.empty()) {
3044  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Warning, "BadSeqIdCharacter",
3045  errmsg));
3046  }
3047  // AddChromosomeNoLocation(expected_errors, entry);
3048  eval = validator.Validate(seh, options);
3049  CheckErrors(*eval, expected_errors);
3050 
3051  CLEAR_ERRORS
3052 }
3053 
3054 
// Runs TestOneGeneralSeqId for two general ids containing a space — once in
// the tag and once in the database name — each expecting the corresponding
// "Bad character ' '" message.
// NOTE(review): the declaration line that introduces this body is not present
// in this listing — confirm against the original file.
3056 {
3057  TestOneGeneralSeqId("PRJNA318798", " CpPA02_0001", "Bad character ' ' in sequence ID 'gnl|PRJNA318798| CpPA02_0001'");
3058  TestOneGeneralSeqId("PRJNA3 18798", "CpPA02_0001", "Bad character ' ' in sequence ID 'gnl|PRJNA3 18798|CpPA02_0001'");
3059 }
3060 
3061 
// Helper: appends a general id (db "lgsi") with a tag longer than 50
// characters to the test sequence, optionally adds an EMBL accession
// (AY123457.1), and validates.
//
// @param emb  when true, an EMBL Seq-id is also appended to the sequence
// @param err  when true, a critical BadSeqIdLength error is expected on
//             lcl|good; when false this function adds no expectation
//
// NOTE(review): inside the `if (emb)` block the local CRef named `emb` shadows
// the bool parameter of the same name. entry, seh, validator, options, eval
// and expected_errors are presumably declared by lines absent from this
// listing — confirm against the original file.
3062 void TestOneLongGeneral(bool emb, bool err)
3063 {
3065  CRef<CSeq_id> id(new CSeq_id());
3066  id->SetGeneral().SetDb("lgsi");
3067  id->SetGeneral().SetTag().SetStr("thisidentifierismorethanfiftycharactersinlengthsoitshouldberejected");
3068  entry->SetSeq().SetId().push_back(id);
3069 
3070  if (emb) {
3071  CRef<CSeq_id> emb(new CSeq_id());
3072  emb->SetEmbl().SetAccession("AY123457");
3073  emb->SetEmbl().SetVersion(1);
3074  entry->SetSeq().SetId().push_back(emb);
3075  }
3076 
3078 
3079  if (err) {
3080  string acc_str = "lcl|good";
3081  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Critical, "BadSeqIdLength",
3082  "General identifier longer than 50 characters"));
3083  }
3084 
3085  eval = validator.Validate(seh, options);
3086  CheckErrors(*eval, expected_errors);
3087 
3088  CLEAR_ERRORS
3089 }
3090 
// Checks the over-long general id in two configurations: without an EMBL
// accession the BadSeqIdLength error is expected; with an EMBL accession
// present no error is expected.
3091 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_LongGeneralSeqId)
3092 {
3093  TestOneLongGeneral(false, true);
3094  TestOneLongGeneral(true, false);
3095 }
3096 
3097 
// Verifies that using the primary accession (AY123456) as a secondary
// accession is reported with BadSecondaryAccn, first through a GenBank
// descriptor's extra-accessions and then after the same descriptor object is
// switched to an EMBL block with the accession in extra-acc.
// NOTE(review): entry, seh, validator, options, eval and expected_errors are
// not declared in the visible lines; presumably they come from setup code
// absent from this listing — confirm against the original file.
3098 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_BadSecondaryAccn)
3099 {
3101  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3102 
3104 
3105  CRef<CSeqdesc> gbdesc(new CSeqdesc());
3106  gbdesc->SetGenbank().SetExtra_accessions().push_back("AY123456");
3107  entry->SetSeq().SetDescr().Set().push_back(gbdesc);
3108 
3109  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Error, "BadSecondaryAccn", "AY123456 used for both primary and secondary accession"));
3110  // AddChromosomeNoLocation(expected_errors, "gb|AY123456|");
3111  eval = validator.Validate(seh, options);
3112  CheckErrors(*eval, expected_errors);
3113 
  // Same expectation, now with the accession carried by an EMBL block.
3114  gbdesc->SetEmbl().SetExtra_acc().push_back("AY123456");
3115  eval = validator.Validate(seh, options);
3116  CheckErrors(*eval, expected_errors);
3117 
3118  CLEAR_ERRORS
3119 }
3120 
3121 
// Verifies that a sequence whose only id is gi 0 (ZERO_GI) produces both a
// critical ZeroGiNumber error and a GiWithoutAccession error.
// NOTE(review): entry, seh, validator, options, eval and expected_errors are
// not declared in the visible lines; presumably they come from setup code
// absent from this listing — confirm against the original file.
3122 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ZeroGiNumber)
3123 {
3125  entry->SetSeq().SetId().front()->SetGi(ZERO_GI);
3126 
3128 
3129  expected_errors.push_back(new CExpectedError("gi|0", eDiag_Critical, "ZeroGiNumber", "Invalid GI number"));
3130  expected_errors.push_back(new CExpectedError("gi|0", eDiag_Error, "GiWithoutAccession", "No accession on sequence with gi number"));
3131  // AddChromosomeNoLocation(expected_errors, "gi|0");
3132  eval = validator.Validate(seh, options);
3133  CheckErrors(*eval, expected_errors);
3134 
3135  CLEAR_ERRORS
3136 }
3137 
3138 
// Verifies HistoryGiCollision: a Seq-hist entry (replaced-by or replaces) that
// lists the sequence's own gi (21914627) is reported as an error when the
// history entry has a date, and — per the second half of the test — is checked
// with no added expectations when the date is not set.
// NOTE(review): entry, seh, validator, options, eval and expected_errors are
// not declared in the visible lines; presumably they come from setup code
// absent from this listing — confirm against the original file.
3139 BOOST_AUTO_TEST_CASE(Test_HistoryGiCollision)
3140 {
3142  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3143  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3144  CRef<CSeq_id> gi_id(new CSeq_id());
3145  gi_id->SetGi(GI_CONST(21914627));
3146  entry->SetSeq().SetId().push_back(gi_id);
3147 
3149 
  // History id carrying the same gi as the Bioseq itself.
3150  CRef<CSeq_id> hist_id(new CSeq_id());
3151  hist_id->SetGi(GI_CONST(21914627));
3152  entry->SetSeq().SetInst().SetHist().SetReplaced_by().SetIds().push_back(hist_id);
3153  entry->SetSeq().SetInst().SetHist().SetReplaced_by().SetDate().SetStd().SetYear(2008);
3154 
3155  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error, "HistoryGiCollision", "Replaced by gi (21914627) is same as current Bioseq"));
3156  // AddChromosomeNoLocation(expected_errors, "gb|AY123456.1|");
3157  eval = validator.Validate(seh, options);
3158  CheckErrors(*eval, expected_errors);
3159 
  // Same collision, now through the "replaces" side of the history.
3160  entry->SetSeq().SetInst().SetHist().ResetReplaced_by();
3161  entry->SetSeq().SetInst().SetHist().SetReplaces().SetIds().push_back(hist_id);
3162  entry->SetSeq().SetInst().SetHist().SetReplaces().SetDate().SetStd().SetYear(2008);
3163  expected_errors[0]->SetErrMsg("Replaces gi (21914627) is same as current Bioseq");
3164  eval = validator.Validate(seh, options);
3165  CheckErrors(*eval, expected_errors);
3166 
3167  CLEAR_ERRORS
3168 
3169  // should not generate errors if date has not been set
3170  entry->SetSeq().SetInst().SetHist().ResetReplaces();
3171  entry->SetSeq().SetInst().SetHist().SetReplaced_by().SetIds().push_back(hist_id);
3172  eval = validator.Validate(seh, options);
3173  // AddChromosomeNoLocation(expected_errors, entry);
3174  CheckErrors(*eval, expected_errors);
3175 
3176  entry->SetSeq().SetInst().SetHist().ResetReplaced_by();
3177  entry->SetSeq().SetInst().SetHist().SetReplaces().SetIds().push_back(hist_id);
3178  eval = validator.Validate(seh, options);
3179  CheckErrors(*eval, expected_errors);
3180 
3181  CLEAR_ERRORS
3182 }
3183 
3184 
// Verifies that a sequence whose only id is a gi (123456) is reported with a
// GiWithoutAccession error.
// NOTE(review): entry, seh, validator, options, eval and expected_errors are
// not declared in the visible lines; presumably they come from setup code
// absent from this listing — confirm against the original file.
3185 BOOST_AUTO_TEST_CASE(Test_GiWithoutAccession)
3186 {
3188  entry->SetSeq().SetId().front()->SetGi(GI_CONST(123456));
3189 
3191 
3192  expected_errors.push_back(new CExpectedError("gi|123456", eDiag_Error, "GiWithoutAccession", "No accession on sequence with gi number"));
3193  // AddChromosomeNoLocation(expected_errors, entry);
3194  eval = validator.Validate(seh, options);
3195  CheckErrors(*eval, expected_errors);
3196 
3197  CLEAR_ERRORS
3198 }
3199 
3200 
// Helper: builds a sequence with ids gb|AY123456.1|, gi 21914627 and
// `other_acc`, then validates it. A MultipleAccessions error is always
// expected; the flags add further expectations.
//
// @param other_acc  additional accession id appended to the sequence
// @param id_change  also expect the UnexpectedIdentifierChange warning (its
//                   message is hard-coded for gb|AY123457.1|)
// @param conflict   also expect ConflictingIdsOnBioseq between gb|AY123456.1|
//                   and other_acc
// @param need_hist  also expect the HistAssemblyMissing info message
//
// NOTE(review): entry, seh, validator, options, eval and expected_errors are
// not declared in the visible lines; presumably they come from setup code
// absent from this listing — confirm against the original file.
3201 void TestOneOtherAcc(CRef<CSeq_id> other_acc, bool id_change, bool conflict, bool need_hist = false)
3202 {
3204  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3205  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3206  CRef<CSeq_id> gi_id(new CSeq_id());
3207  gi_id->SetGi(GI_CONST(21914627));
3208  entry->SetSeq().SetId().push_back(gi_id);
3209  entry->SetSeq().SetId().push_back(other_acc);
3210  string acc_str = "gb|AY123456.1|";
3211 
3213 
3214  if (conflict) {
3215  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error, "ConflictingIdsOnBioseq",
3216  "Conflicting ids on a Bioseq: (gb|AY123456.1| - " + other_acc->AsFastaString() + ")"));
3217  }
3218  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "MultipleAccessions", "Multiple accessions on sequence with gi number"));
3219  if (id_change) {
3220  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123457.1|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
3221  }
3222  if (need_hist) {
3223  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Info, "HistAssemblyMissing",
3224  "TPA record gb|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
3225  }
3226  // AddChromosomeNoLocation(expected_errors, acc_str);
3227  eval = validator.Validate(seh, options);
3228  CheckErrors(*eval, expected_errors);
3229 
3230  CLEAR_ERRORS
3231 }
3232 
3233 
// Verifies MultipleAccessions for each accession-bearing Seq-id type, using
// the CGenBankFixture fixture. One CSeq_id object (other_acc) is re-typed for
// each case and passed to TestOneOtherAcc with the flags appropriate to that
// type. The final RefSeq ("other") case is validated inline because it expects
// INSDRefSeqPackaging in addition to MultipleAccessions.
// NOTE(review): for the inline case, entry, seh, validator, options, eval and
// expected_errors are not declared in the visible lines; presumably they come
// from setup code absent from this listing — confirm against the original.
3234 BOOST_FIXTURE_TEST_CASE(Test_MultipleAccessions, CGenBankFixture)
3235 {
3236  CRef<CSeq_id> other_acc(new CSeq_id());
3237 
3238  // genbank, ddbj, embl, tpg, tpe, tpd, other, pir, swissprot, and prf all count as accessions
3239  // genbank
3240  other_acc->SetGenbank().SetAccession("AY123457");
3241  other_acc->SetGenbank().SetVersion(1);
3242  TestOneOtherAcc(other_acc, true, true);
3243 
3244  // ddbj
3245  other_acc->SetDdbj().SetAccession("AY123457");
3246  other_acc->SetDdbj().SetVersion(1);
3247  TestOneOtherAcc(other_acc, false, true);
3248 
3249  // embl
3250  other_acc->SetEmbl().SetAccession("AY123457");
3251  other_acc->SetEmbl().SetVersion(1);
3252  TestOneOtherAcc(other_acc, false, true);
3253 
3254  // pir
3255  other_acc->SetPir().SetAccession("AY123457");
3256  other_acc->SetPir().SetVersion(1);
3257  TestOneOtherAcc(other_acc, false, false);
3258 
3259  // swissprot
3260  other_acc->SetSwissprot().SetAccession("AY123457");
3261  other_acc->SetSwissprot().SetVersion(1);
3262  TestOneOtherAcc(other_acc, false, false);
3263 
3264  // prf
3265  other_acc->SetPrf().SetAccession("AY123457");
3266  other_acc->SetPrf().SetVersion(1);
3267  TestOneOtherAcc(other_acc, false, false);
3268 
3269  // tpg
3270  other_acc->SetTpg().SetAccession("AY123457");
3271  other_acc->SetTpg().SetVersion(1);
3272  TestOneOtherAcc(other_acc, false, true, true);
3273 
3274  // tpe
3275  other_acc->SetTpe().SetAccession("AY123457");
3276  other_acc->SetTpe().SetVersion(1);
3277  TestOneOtherAcc(other_acc, false, true, true);
3278 
3279  // tpd
3280  other_acc->SetTpd().SetAccession("AY123457");
3281  other_acc->SetTpd().SetVersion(1);
3282  TestOneOtherAcc(other_acc, false, true, true);
3283 
3284  // other
3286  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3287  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3288  CRef<CSeq_id> gi_id(new CSeq_id());
3289  gi_id->SetGi(GI_CONST(21914627));
3290  entry->SetSeq().SetId().push_back(gi_id);
  // other_acc is appended first and only then re-typed as a RefSeq accession;
  // the id list holds a reference to the same object, so it sees the change.
3291  entry->SetSeq().SetId().push_back(other_acc);
3292  other_acc->SetOther().SetAccession("NC_123457");
3293  other_acc->SetOther().SetVersion(1);
3294 
3296 
3297  string acc_str = "gb|AY123456.1|";
3298  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "INSDRefSeqPackaging", "INSD and RefSeq records should not be present in the same set"));
3299  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "MultipleAccessions", "Multiple accessions on sequence with gi number"));
3300  // AddChromosomeNoLocation(expected_errors, acc_str);
3301  eval = validator.Validate(seh, options);
3302  CheckErrors(*eval, expected_errors);
3303 
3304  CLEAR_ERRORS
3305 }
3306 
3307 
// Test_HistAssemblyMissing: validates three entries whose first id is a TPG, TPE and
// TPD accession respectively and expects an Info-level HistAssemblyMissing message for
// each; then adds a "TPA:reassembly" keyword (first in a GenBank block descriptor, then
// in an EMBL block descriptor) and expects no errors.
// NOTE(review): original lines 3310, 3314 and 3318 are absent from this listing;
// presumably they declare tpg_entry, tpe_entry and tpd_entry - confirm against the
// repository.
3308 BOOST_AUTO_TEST_CASE(Test_HistAssemblyMissing)
3309 {
3311  tpg_entry->SetSeq().SetId().front()->SetTpg().SetAccession("AY123456");
3312  tpg_entry->SetSeq().SetId().front()->SetTpg().SetVersion(1);
3313 
3315  tpe_entry->SetSeq().SetId().front()->SetTpe().SetAccession("AY123456");
3316  tpe_entry->SetSeq().SetId().front()->SetTpe().SetVersion(1);
3317 
3319  tpd_entry->SetSeq().SetId().front()->SetTpd().SetAccession("AY123456");
3320  tpd_entry->SetSeq().SetId().front()->SetTpd().SetVersion(1);
3321 
3322  STANDARD_SETUP_NAME(tpg_entry)
3323 
3324  // tpg
3325  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Info, "HistAssemblyMissing", "TPA record tpg|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
3326  // AddChromosomeNoLocation(expected_errors, tpg_entry);
3327  eval = validator.Validate(seh, options);
3328  CheckErrors(*eval, expected_errors);
3329 
  // The single expected error is reused for tpe/tpd: only its accession and message
  // text are rewritten before each validation.
3330  // tpe
3331  scope.RemoveTopLevelSeqEntry(seh);
3332  seh = scope.AddTopLevelSeqEntry(*tpe_entry);
3333  ChangeErrorAcc(expected_errors, "tpe|AY123456.1|");
3334  expected_errors[0]->SetErrMsg("TPA record tpe|AY123456.1| should have Seq-hist.assembly for PRIMARY block");
3335  eval = validator.Validate(seh, options);
3336  CheckErrors(*eval, expected_errors);
3337 
3338 
3339  // tpd
3340  scope.RemoveTopLevelSeqEntry(seh);
3341  seh = scope.AddTopLevelSeqEntry(*tpd_entry);
3342  ChangeErrorAcc(expected_errors, "tpd|AY123456.1|");
3343  expected_errors[0]->SetErrMsg("TPA record tpd|AY123456.1| should have Seq-hist.assembly for PRIMARY block");
3344  eval = validator.Validate(seh, options);
3345  CheckErrors(*eval, expected_errors);
3346 
3347  CLEAR_ERRORS
3348 
3349  // error suppressed if keyword present
3350  CRef<CSeqdesc> block(new CSeqdesc());
3351  block->SetGenbank().SetKeywords().push_back("TPA:reassembly");
3352  tpg_entry->SetSeq().SetDescr().Set().push_back(block);
3353  scope.RemoveTopLevelSeqEntry(seh);
3354  seh = scope.AddTopLevelSeqEntry(*tpg_entry);
3355  eval = validator.Validate(seh, options);
3356  // AddChromosomeNoLocation(expected_errors, tpg_entry);
3357 
3358  CheckErrors(*eval, expected_errors);
  // Same descriptor object, now switched to an EMBL block carrying the keyword.
3359  block->SetEmbl().SetKeywords().push_back("TPA:reassembly");
3360  eval = validator.Validate(seh, options);
3361  CheckErrors(*eval, expected_errors);
3362  CLEAR_ERRORS
3363 }
3364 
// Test_TerminalNs: checks the TerminalNs diagnostics (plus the accompanying
// HighNpercent*/HighNContentPercent/ContigsTooShort messages) for a raw sequence with
// ten Ns at each end and for delta sequences with leading/trailing Ns. The expected
// TerminalNs severity is Warning for a local id, Error for a GenBank accession, and
// Warning again for RefSeq (NC_) and patent ids; after the topology is set to circular
// no TerminalNs message is expected.
// NOTE(review): original lines 3367, 3371, 3397, 3411 and 3454 are absent from this
// listing; presumably they build/rebuild `entry`, run the standard setup and adjust the
// entry before the circular check - confirm against the repository.
3365 BOOST_AUTO_TEST_CASE(Test_TerminalNs)
3366 {
3368  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("NNNNNNNNNNAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCCAANNNNNNNNNN");
3369  entry->SetSeq().SetInst().SetLength(62);
3370 
3372 
3373  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at beginning of sequence"));
3374  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at end of sequence"));
3375  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
3376  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3377  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
3378  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3379  // AddChromosomeNoLocation(expected_errors, entry);
3380  eval = validator.Validate(seh, options);
3381  CheckErrors(*eval, expected_errors);
3382 
3383  // warning level changes if not local only
3384  scope.RemoveTopLevelSeqEntry(seh);
3385  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3386  seh = scope.AddTopLevelSeqEntry(*entry);
3387  ChangeErrorAcc(expected_errors, "gb|AY123456|");
3388  expected_errors[0]->SetSeverity(eDiag_Error);
3389  expected_errors[1]->SetSeverity(eDiag_Error);
3390  eval = validator.Validate(seh, options);
3391  CheckErrors(*eval, expected_errors);
3392 
3393  CLEAR_ERRORS
3394 
3395  // also try delta sequence
3396  scope.RemoveTopLevelSeqEntry(seh);
3398  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNNNNNNNCCC");
3399  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("CCCNNNNNNNNN");
3400  seh = scope.AddTopLevelSeqEntry(*entry);
3401 
  // In this list index 0 is ContigsTooShort, indices 1 and 2 are the TerminalNs pair,
  // and back() is HighNContentPercent; the index-based updates below rely on that order.
3402  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 3 bases"));
3403  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at beginning of sequence"));
3404  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at end of sequence"));
3405  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 52 percent Ns"));
3406  eval = validator.Validate(seh, options);
3407  CheckErrors(*eval, expected_errors);
3408 
3409  // 10 Ns but just local stays at warning
3410  scope.RemoveTopLevelSeqEntry(seh);
3412  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNNNNNNNNCC");
3413  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("CCNNNNNNNNNN");
3414  seh = scope.AddTopLevelSeqEntry(*entry);
3415  expected_errors[0]->SetErrMsg("Maximum contig length is 2 bases");
3416  expected_errors.back()->SetErrMsg("Sequence contains 58 percent Ns");
3417  eval = validator.Validate(seh, options);
3418  CheckErrors(*eval, expected_errors);
3419 
3420  // 10 Ns but now has non-local ID, error
3421  scope.RemoveTopLevelSeqEntry(seh);
3422  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3423  seh = scope.AddTopLevelSeqEntry(*entry);
3424  ChangeErrorAcc(expected_errors, "gb|AY123456|");
3425  expected_errors[1]->SetSeverity(eDiag_Error);
3426  expected_errors[2]->SetSeverity(eDiag_Error);
3427  eval = validator.Validate(seh, options);
3428  CheckErrors(*eval, expected_errors);
3429 
3430  // NC and patent IDs back to warning
3431  scope.RemoveTopLevelSeqEntry(seh);
3432  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
3433  seh = scope.AddTopLevelSeqEntry(*entry);
3434  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
3435  expected_errors[1]->SetSeverity(eDiag_Warning);
3436  expected_errors[2]->SetSeverity(eDiag_Warning);
3437  eval = validator.Validate(seh, options);
3438  CheckErrors(*eval, expected_errors);
3439 
3440  scope.RemoveTopLevelSeqEntry(seh);
3441  entry->SetSeq().SetId().front()->SetPatent().SetSeqid(1);
3442  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetCountry("USA");
3443  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetId().SetNumber("1");
3444  seh = scope.AddTopLevelSeqEntry(*entry);
3445  ChangeErrorAcc(expected_errors, "pat|USA|1|1");
  // For the patent id the last expectation (HighNContentPercent) is dropped; the
  // element is deleted by hand before pop_back because the vector holds raw pointers.
3446  delete expected_errors.back();
3447  expected_errors.pop_back();
3448  eval = validator.Validate(seh, options);
3449  CheckErrors(*eval, expected_errors);
3450  CLEAR_ERRORS
3451 
3452  // no more TerminalNs warnings if circular
3453  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
3455  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Error, "ContigsTooShort",
3456  "Maximum contig length is 2 bases"));
3457  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Warning, "UnwantedCompleteFlag",
3458  "Suspicious use of complete"));
3459  // AddChromosomeNoLocation(expected_errors, entry);
3460 
3461  eval = validator.Validate(seh, options);
3462  CheckErrors(*eval, expected_errors);
3463 
3464  CLEAR_ERRORS
3465 }
3466 
3467 
// Test_UnexpectedIdentifierChange (uses CGenBankFixture): pairs gi 21914627 with an
// accession other than the one the expected messages report for that gi in the NCBI
// sequence repository (gb|AY123456.1|). First a different GenBank accession is used
// ("New accession ... does not match"), then a TPG accession ("Loss of accession",
// together with the Info-level HistAssemblyMissing message).
// NOTE(review): original lines 3470 and 3477 are absent from this listing; presumably
// they declare `entry` and run the standard setup - confirm against the repository.
3468 BOOST_FIXTURE_TEST_CASE(Test_UnexpectedIdentifierChange, CGenBankFixture)
3469 {
3471  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123457");
3472  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3473  CRef<CSeq_id> gi_id(new CSeq_id());
3474  gi_id->SetGi(GI_CONST(21914627));
3475  entry->SetSeq().SetId().push_back(gi_id);
3476 
3478 
3479  expected_errors.push_back(new CExpectedError("gb|AY123457.1|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123457.1|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
3480  // AddChromosomeNoLocation(expected_errors, entry);
3481  eval = validator.Validate(seh, options);
3482  CheckErrors(*eval, expected_errors);
3483 
3484  CLEAR_ERRORS
  // Second scenario: the first id becomes a TPG accession while the gi stays attached.
3485  scope.RemoveTopLevelSeqEntry(seh);
3486  entry->SetSeq().SetId().front()->SetTpg().SetAccession("AY123456");
3487  entry->SetSeq().SetId().front()->SetTpg().SetVersion(1);
3488  seh = scope.AddTopLevelSeqEntry(*entry);
3489  // AddChromosomeNoLocation(expected_errors, entry);
3490  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Info, "HistAssemblyMissing", "TPA record tpg|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
3491  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Warning, "UnexpectedIdentifierChange", "Loss of accession (gb|AY123456.1|) on gi (21914627) compared to the NCBI sequence repository"));
3492  eval = validator.Validate(seh, options);
3493  CheckErrors(*eval, expected_errors);
3494 
3495  // TODO - try to instigate other errors
3496 
3497  CLEAR_ERRORS
3498 }
3499 
3500 
// Test_InternalNsInSeqLit: appends literals containing long internal runs of Ns
// (20, 81 and 101 Ns per the expected messages) to a delta sequence and expects a
// Warning-level InternalNsInSeqLit message naming the run length, the delta component
// and its start base; the first check (tech set to wgs) also expects WGSseqGapProblem.
// NOTE(review): original lines 3503, 3507, 3525, 3539, 3543 and 3548 are absent from
// this listing; presumably they declare `entry`, run the standard setup and change the
// entry (e.g. its tech) between the repeated validations - confirm against the
// repository.
3501 BOOST_AUTO_TEST_CASE(Test_InternalNsInSeqLit)
3502 {
3504  unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNGG");
3505  SetTech(entry, CMolInfo::eTech_wgs);
3506 
3508 
3509  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqLit", "Run of 20 Ns in delta component 5 that starts at base 45"));
3510  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WGSseqGapProblem", "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
3511  /*
3512  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
3513  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3514  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
3515  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3516  */
3517  AddChromosomeNoLocation(expected_errors, entry);
3518 
3519  eval = validator.Validate(seh, options);
3520  CheckErrors(*eval, expected_errors);
3521 
3522  CLEAR_ERRORS
3523 
3524  unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGG");
3526  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqLit",
3527  "Run of 81 Ns in delta component 7 that starts at base 79"));
3528  /*
3529  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
3530  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3531  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
3532  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3533  */
3534  // AddChromosomeNoLocation(expected_errors, entry);
3535 
3536  eval = validator.Validate(seh, options);
3537  CheckErrors(*eval, expected_errors);
3538 
  // The same expectation list is re-checked twice more; the statements that change
  // the entry before each of these validations are missing from this listing.
3540  eval = validator.Validate(seh, options);
3541  CheckErrors(*eval, expected_errors);
3542 
3544  eval = validator.Validate(seh, options);
3545  CheckErrors(*eval, expected_errors);
3546 
3547  unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGG");
3549  expected_errors[0]->SetErrMsg("Run of 101 Ns in delta component 9 that starts at base 174");
3550  eval = validator.Validate(seh, options);
3551  CheckErrors(*eval, expected_errors);
3552 
3553  CLEAR_ERRORS
3554 }
3555 
3556 
// Test_SeqLitGapLength0: appends a zero-length literal (a gap) to the delta chain and
// expects SeqLitGapLength0. The message text depends on the literal's fuzz: with no
// fuzz, lim=gt or a plus/minus value it is "Gap of length 0 in delta chain"; with
// lim=unk it is "Gap of length 0 with unknown fuzz in delta chain". The expected
// severity is Error for the initial id and Warning once the id is a Swiss-Prot
// accession.
// NOTE(review): original lines 3559 and 3564 are absent from this listing; presumably
// they declare `entry` and run the standard setup - confirm against the repository.
3557 BOOST_AUTO_TEST_CASE(Test_SeqLitGapLength0)
3558 {
3560  CRef<CDelta_seq> delta_seq(new CDelta_seq());
3561  delta_seq->SetLiteral().SetLength(0);
3562  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(delta_seq);
3563 
3565 
3566  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLitGapLength0", "Gap of length 0 in delta chain"));
3567  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "Last delta seq component is a gap"));
3568  // AddChromosomeNoLocation(expected_errors, entry);
3569  eval = validator.Validate(seh, options);
3570  CheckErrors(*eval, expected_errors);
3571 
3572  // some kinds of fuzz don't trigger other kind of error
3573  delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_gt);
3574  eval = validator.Validate(seh, options);
3575  CheckErrors(*eval, expected_errors);
3576 
3577  delta_seq->SetLiteral().SetFuzz().Reset();
3578  delta_seq->SetLiteral().SetFuzz().SetP_m(10);
3579  eval = validator.Validate(seh, options);
3580  CheckErrors(*eval, expected_errors);
3581 
3582  // others will
3583  delta_seq->SetLiteral().SetFuzz().Reset();
3584  delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
3585  expected_errors[0]->SetErrMsg("Gap of length 0 with unknown fuzz in delta chain");
3586  eval = validator.Validate(seh, options);
3587  CheckErrors(*eval, expected_errors);
3588 
3589  // try again with swissprot, error goes to warning
  // (the fuzz is still lim=unk here, so the "unknown fuzz" message text is kept)
3590  scope.RemoveTopLevelSeqEntry(seh);
3591  entry->SetSeq().SetId().front()->SetSwissprot().SetAccession("AY123456");
3592  seh = scope.AddTopLevelSeqEntry(*entry);
3593  expected_errors[0]->SetSeverity(eDiag_Warning);
3594  ChangeErrorAcc(expected_errors, "sp|AY123456|");
3595  eval = validator.Validate(seh, options);
3596  CheckErrors(*eval, expected_errors);
3597 
3598  delta_seq->SetLiteral().SetFuzz().SetP_m(10);
3599  expected_errors[0]->SetErrMsg("Gap of length 0 in delta chain");
3600  eval = validator.Validate(seh, options);
3601  CheckErrors(*eval, expected_errors);
3602 
3603  delta_seq->SetLiteral().SetFuzz().Reset();
3604  delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_gt);
3605  eval = validator.Validate(seh, options);
3606  CheckErrors(*eval, expected_errors);
3607 
3608  delta_seq->SetLiteral().ResetFuzz();
3609  eval = validator.Validate(seh, options);
3610  CheckErrors(*eval, expected_errors);
3611 
3612  CLEAR_ERRORS
3613 }
3614 
3615 
// Helper body: appends a user-object descriptor of type "TpaAssembly" to the Bioseq
// descriptors of `entry` and gives it one field (label "Label", string data "Data").
// NOTE(review): the signature line (original line 3616) is missing from this listing;
// the calls AddTpaAssemblyUserObject(member1)/(member2) elsewhere in this file suggest
// this is AddTpaAssemblyUserObject taking the Seq-entry - confirm against the repository.
3617 {
3618  CRef<CSeqdesc> desc(new CSeqdesc());
3619  desc->SetUser().SetType().SetStr("TpaAssembly");
3620  entry->SetSeq().SetDescr().Set().push_back(desc);
3621 
  // The field is attached after the descriptor was pushed; the descriptor list holds
  // the same object through the CRef, so the entry sees the added field.
3622  CRef<CUser_field> field(new CUser_field());
3623  field->SetLabel().SetStr("Label");
3624  field->SetData().SetStr("Data");
3625  desc->SetUser().SetData().push_back(field);
3626 }
3627 
3628 
// Test_TpaAssemblyProblem (uses CGenBankFixture): builds a set with two members that
// both carry the TpaAssembly user object and checks three stages:
//   1. neither member has a history assembly or a gi -> no errors expected;
//   2. member1 gets a Seq-hist assembly -> TpaAssemblyProblem (Error, "1 with / 1 without");
//   3. member1 also gets a TPG accession and a gi -> additionally a Warning-level
//      TpaAssemblyProblem about the gi assignment and an UnexpectedIdentifierChange.
// NOTE(review): original lines 3632-3633, 3637 and 3642 are absent from this listing;
// presumably they set up the set, declare member1/member2 and run the standard setup -
// confirm against the repository.
3629 BOOST_FIXTURE_TEST_CASE(Test_TpaAssemblyProblem, CGenBankFixture)
3630 {
3631  CRef<CSeq_entry> entry(new CSeq_entry());
3634  member1->SetSeq().SetId().front()->SetLocal().SetStr("good");
3635  AddTpaAssemblyUserObject(member1);
3636  entry->SetSet().SetSeq_set().push_back(member1);
3638  member2->SetSeq().SetId().front()->SetLocal().SetStr("good2");
3639  AddTpaAssemblyUserObject(member2);
3640  entry->SetSet().SetSeq_set().push_back(member2);
3641 
3643 
3644  // two Tpa sequences, but neither has assembly and neither has GI, so no errors expected
3645  // AddChromosomeNoLocation(expected_errors, "lcl|good");
3646  // AddChromosomeNoLocation(expected_errors, "lcl|good2");
3647  eval = validator.Validate(seh, options);
3648  CheckErrors(*eval, expected_errors);
3649 
3650  // now one has hist, other does not
3651  member1->SetSeq().SetInst().SetHist().SetAssembly().push_back(unit_test_util::BuildGoodAlign());
3652  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TpaAssemblyProblem", "There are 1 TPAs with history and 1 without history in this record."));
3653  eval = validator.Validate(seh, options);
3654  CheckErrors(*eval, expected_errors);
3655 
3656  // now one has gi
3657  scope.RemoveTopLevelSeqEntry(seh);
3658  member1->SetSeq().SetId().front()->SetTpg().SetAccession("AY123456");
3659  member1->SetSeq().SetId().front()->SetTpg().SetVersion(1);
3660  CRef<CSeq_id> gi_id(new CSeq_id());
3661  gi_id->SetGi(GI_CONST(21914627));
3662  member1->SetSeq().SetId().push_back(gi_id);
3663  seh = scope.AddTopLevelSeqEntry(*entry);
3664 
3665  CLEAR_ERRORS
3666 
3667  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Warning, "UnexpectedIdentifierChange", "Loss of accession (gb|AY123456.1|) on gi (21914627) compared to the NCBI sequence repository"));
3668  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Error, "TpaAssemblyProblem", "There are 1 TPAs with history and 1 without history in this record."));
3669  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Warning, "TpaAssemblyProblem", "There are 1 TPAs without history in this record, but the record has a gi number assignment."));
3670  // AddChromosomeNoLocation(expected_errors, "tpg|AY123456.1|");
3671  // AddChromosomeNoLocation(expected_errors, "lcl|good2");
3672  eval = validator.Validate(seh, options);
3673  CheckErrors(*eval, expected_errors);
3674 
3675  CLEAR_ERRORS
3676 }
3677 
3678 
// Test body: turns the first delta component into an interval on gb|AY123456, first
// 10 bases long (0-9) and then 11 bases long (0-10), and validates each time with an
// empty expectation list (the former SeqLocLength expectation is commented out below).
// NOTE(review): the test-case header (original line 3679) and lines 3682, 3688 and 3697
// are missing from this listing; the commented-out "SeqLocLength" expectation suggests
// this is the SeqLocLength test, and the missing lines presumably build/rebuild `entry`
// and run the standard setup - confirm against the repository.
3680 {
3681  // prepare entry
3683  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetGenbank().SetAccession("AY123456");
3684  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
3685  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(9);
3686  entry->SetSeq().SetInst().SetLength(32);
3687 
3689 
3690  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SeqLocLength", "Short length (10) on seq-loc (gb|AY123456|:1-10) of delta seq_ext"));
3691  // AddChromosomeNoLocation(expected_errors, entry);
3692  eval = validator.Validate(seh, options);
3693  CheckErrors(*eval, expected_errors);
3694 
3695  scope.RemoveTopLevelSeqEntry(seh);
3696  // if length 11, should not be a problem
3698  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetGenbank().SetAccession("AY123456");
3699  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
3700  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(10);
3701  entry->SetSeq().SetInst().SetLength(33);
3702  seh = scope.AddTopLevelSeqEntry(*entry);
3703  eval = validator.Validate(seh, options);
3704  CheckErrors(*eval, expected_errors);
3705 
3706  CLEAR_ERRORS
3707 }
3708 
3709 
// Test_MissingGaps: validates a delta sequence with its gaps removed under a series of
// settings. The visible expectations are: no errors at first, then MissingGaps (Error),
// then additionally BadHTGSeq (Warning); after switching to a RefSeq NC_ accession the
// MissingGaps severity is expected to be Info.
// NOTE(review): original lines 3713, 3715, 3717, 3724, 3728, 3733, 3737, 3745, 3754 and
// 3758 are absent from this listing; presumably they build `entry`, remove the gaps,
// run the standard setup and change the molinfo tech (and add the RefGeneTracking
// object) between validations - confirm against the repository.
3710 BOOST_AUTO_TEST_CASE(Test_MissingGaps)
3711 {
3712  // prepare entry
3714  // remove gaps
3716 
3718 
3719  // AddChromosomeNoLocation(expected_errors, entry);
3720  // only report errors for specific molinfo tech values
3721  eval = validator.Validate(seh, options);
3722  CheckErrors(*eval, expected_errors);
3723  // htgs_3 should not report
3725  eval = validator.Validate(seh, options);
3726  CheckErrors(*eval, expected_errors);
3727 
3729  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingGaps", "HTGS delta seq should have gaps between all sequence runs"));
3730  eval = validator.Validate(seh, options);
3731  CheckErrors(*eval, expected_errors);
3732 
3734  eval = validator.Validate(seh, options);
3735  CheckErrors(*eval, expected_errors);
3736 
3738  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 delta seq has no gaps and no graphs"));
3739  eval = validator.Validate(seh, options);
3740  CheckErrors(*eval, expected_errors);
3741 
3742  // RefGeneTracking changes severity
3743  scope.RemoveTopLevelSeqEntry(seh);
3744  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
3746  seh = scope.AddTopLevelSeqEntry(*entry);
3747  expected_errors[0]->SetSeverity(eDiag_Info);
3748  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
3749  eval = validator.Validate(seh, options);
3750  CheckErrors(*eval, expected_errors);
  // Drop the BadHTGSeq expectation (index 1 is the last element at this point); it is
  // deleted by hand because the vector holds raw pointers.
3751  delete expected_errors[1];
3752  expected_errors.pop_back();
3753 
3755  eval = validator.Validate(seh, options);
3756  CheckErrors(*eval, expected_errors);
3757 
3759  eval = validator.Validate(seh, options);
3760  CheckErrors(*eval, expected_errors);
3761 
3762  CLEAR_ERRORS
3763 }
3764 
3765 
// Test_CompleteTitleProblem: a GenBank record with lineage "Viruses; foo" and title
// "Foo complete genome" is expected to produce a Warning-level CompleteTitleProblem;
// after a further change to the entry no error is expected; a delta sequence with the
// same lineage and title is expected to produce CompleteGenomeHasGaps instead.
// NOTE(review): original lines 3774, 3785 and 3797 are absent from this listing;
// presumably they run the standard setup and set the completeness flag on the entry -
// confirm against the repository.
3766 BOOST_AUTO_TEST_CASE(Test_CompleteTitleProblem)
3767 {
3768  // prepare entry
3769  CRef<CSeq_entry> entry = BuildGoodSeq();
3770  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3771  SetLineage(entry, "Viruses; foo");
3772  SetTitle(entry, "Foo complete genome");
3773 
3775 
3776  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning, "CompleteTitleProblem", "Complete genome in title without complete flag set"));
3777  // AddChromosomeNoLocation(expected_errors, entry);
3778 
3779  eval = validator.Validate(seh, options);
3780  CheckErrors(*eval, expected_errors);
3781 
3782  CLEAR_ERRORS
3783 
3784  // should be no error if complete
3786 
3787  eval = validator.Validate(seh, options);
3788  // AddChromosomeNoLocation(expected_errors, entry);
3789  CheckErrors(*eval, expected_errors);
3790 
3791  // different message and code if gaps
3792  scope.RemoveTopLevelSeqEntry(seh);
3793  entry = BuildGoodDeltaSeq();
3794  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3795  unit_test_util::SetLineage(entry, "Viruses; foo");
3796  SetTitle(entry, "Foo complete genome");
3798  seh = scope.AddTopLevelSeqEntry(*entry);
3799 
3800  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning,
3801  "CompleteGenomeHasGaps", "Title contains 'complete genome' but sequence has gaps"));
3802 
3803  eval = validator.Validate(seh, options);
3804  CheckErrors(*eval, expected_errors);
3805 
3806  CLEAR_ERRORS
3807 }
3808 
3809 
// Test_CompleteCircleProblem: a circular-topology sequence is expected to produce
// CompleteCircleProblem ("Circular topology without complete flag set"); with a GenBank
// accession and the title "This is just a title" the expected messages are
// CompleteCircleProblem about the title wording plus UnwantedCompleteFlag.
// NOTE(review): original lines 3813, 3816 and 3831 are absent from this listing;
// presumably they declare `entry`, run the standard setup and set the completeness
// flag - confirm against the repository.
3810 BOOST_AUTO_TEST_CASE(Test_CompleteCircleProblem)
3811 {
3812  // prepare entry
3814  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
3815 
3817 
3818  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
3819  "CompleteCircleProblem",
3820  "Circular topology without complete flag set"));
3821  // AddChromosomeNoLocation(expected_errors, entry);
3822 
3823  eval = validator.Validate(seh, options);
3824  CheckErrors(*eval, expected_errors);
3825 
3826  CLEAR_ERRORS
3827 
3828  scope.RemoveTopLevelSeqEntry(seh);
3829  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3830  SetTitle(entry, "This is just a title");
3832  seh = scope.AddTopLevelSeqEntry(*entry);
3833  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning,
3834  "CompleteCircleProblem",
3835  "Circular topology has complete flag set, but title should say complete sequence or complete genome"));
3836  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning,
3837  "UnwantedCompleteFlag",
3838  "Suspicious use of complete"));
3839  // AddChromosomeNoLocation(expected_errors, entry);
3840 
3841  eval = validator.Validate(seh, options);
3842  CheckErrors(*eval, expected_errors);
3843 
3844  CLEAR_ERRORS
3845 }
3846 
3847 
// Test_BadHTGSeq: checks the BadHTGSeq diagnostics for HTGS 2 delta and raw sequences
// (expected as a Warning, and no longer expected once the HTGS_ACTIVEFIN keyword is
// added) and for HTGS 3 sequences carrying the HTGS_DRAFT, HTGS_PREFIN, HTGS_ACTIVEFIN
// and HTGS_FULLTOP keywords (one Error expected per keyword).
// NOTE(review): original lines 3851, 3853 and 3875 are absent from this listing;
// presumably they declare delta_entry, remove its gaps and declare raw_entry - confirm
// against the repository.
3848 BOOST_AUTO_TEST_CASE(Test_BadHTGSeq)
3849 {
3850  // prepare entry
3852  // remove gaps
3854 
3855  STANDARD_SETUP_NAME(delta_entry)
3856 
3857  SetTech(delta_entry, CMolInfo::eTech_htgs_2);
3858  // AddChromosomeNoLocation(expected_errors, delta_entry);
3859  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingGaps", "HTGS delta seq should have gaps between all sequence runs"));
3860  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 delta seq has no gaps and no graphs"));
3861  eval = validator.Validate(seh, options);
3862  CheckErrors(*eval, expected_errors);
3863 
  // Drop the BadHTGSeq expectation (index 1 is the last element), keeping MissingGaps;
  // it is deleted by hand because the vector holds raw pointers.
3864  delete expected_errors[1];
3865  expected_errors.pop_back();
3866 
3867  // HTGS_ACTIVEFIN keyword disables BadHTGSeq error
3868  AddGenbankKeyword(delta_entry, "HTGS_ACTIVEFIN");
3869  eval = validator.Validate(seh, options);
3870  CheckErrors(*eval, expected_errors);
3871 
3872  CLEAR_ERRORS
3873 
3874  scope.RemoveTopLevelSeqEntry(seh);
3876  SetTech(raw_entry, CMolInfo::eTech_htgs_2);
3877  seh = scope.AddTopLevelSeqEntry(*raw_entry);
3878  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 raw seq has no gaps and no graphs"));
3879  // AddChromosomeNoLocation(expected_errors, raw_entry);
3880  eval = validator.Validate(seh, options);
3881  CheckErrors(*eval, expected_errors);
3882 
3883  CLEAR_ERRORS
3884 
3885  // HTGS_ACTIVEFIN keyword disables error
3886  AddGenbankKeyword(raw_entry, "HTGS_ACTIVEFIN");
3887  // AddChromosomeNoLocation(expected_errors, raw_entry);
3888  eval = validator.Validate(seh, options);
3889  CheckErrors(*eval, expected_errors);
3890 
3891 
3892  // htg3 errors
  // HTGS_ACTIVEFIN is already on raw_entry from the step above, so with the three
  // keywords added here four keyword errors are expected.
3893  SetTech(raw_entry, CMolInfo::eTech_htgs_3);
3894  AddGenbankKeyword(raw_entry, "HTGS_DRAFT");
3895  AddGenbankKeyword(raw_entry, "HTGS_PREFIN");
3896  AddGenbankKeyword(raw_entry, "HTGS_FULLTOP");
3897  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_DRAFT keyword"));
3898  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_PREFIN keyword"));
3899  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_ACTIVEFIN keyword"));
3900  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_FULLTOP keyword"));
3901  eval = validator.Validate(seh, options);
3902  CheckErrors(*eval, expected_errors);
3903 
  // Same four expectations are reused for the delta entry, which also received
  // HTGS_ACTIVEFIN earlier in this test.
3904  scope.RemoveTopLevelSeqEntry(seh);
3905  seh = scope.AddTopLevelSeqEntry(*delta_entry);
3906  SetTech(delta_entry, CMolInfo::eTech_htgs_3);
3907  AddGenbankKeyword(delta_entry, "HTGS_DRAFT");
3908  AddGenbankKeyword(delta_entry, "HTGS_PREFIN");
3909  AddGenbankKeyword(delta_entry, "HTGS_FULLTOP");
3910  eval = validator.Validate(seh, options);
3911  CheckErrors(*eval, expected_errors);
3912 
3913  CLEAR_ERRORS
3914 }
3915 
3916 
// Test_GapInProtein_and_BadProteinStart: sets protein sequence data containing '-' and
// checks the expected diagnostics: an internal '-' -> GapInProtein, a leading '-' ->
// BadProteinStart, and both together -> both messages.
// NOTE(review): original lines 3920 and 3923 are absent from this listing; presumably
// they declare `entry` and run the standard setup - confirm against the repository.
3917 BOOST_AUTO_TEST_CASE(Test_GapInProtein_and_BadProteinStart)
3918 {
3919  // prepare entry
3921  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa().Set("PRK-EIN");
3922 
3924 
3925  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GapInProtein", "[1] internal gap symbols in protein sequence (gene? - fake protein name)"));
3926  // AddChromosomeNoLocation(expected_errors, entry);
3927  eval = validator.Validate(seh, options);
3928  CheckErrors(*eval, expected_errors);
3929 
3930  CLEAR_ERRORS
3931 
3932  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa().Set("-RKTEIN");
3933  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadProteinStart", "gap symbol at start of protein sequence (gene? - fake protein name)"));
3934  // AddChromosomeNoLocation(expected_errors, entry);
3935  eval = validator.Validate(seh, options);
3936  CheckErrors(*eval, expected_errors);
3937 
  // The BadProteinStart expectation from the previous step is kept (no CLEAR_ERRORS in
  // between) and GapInProtein is appended for the sequence that has both problems.
3938  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa().Set("-RK-EIN");
3939  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GapInProtein", "[1] internal gap symbols in protein sequence (gene? - fake protein name)"));
3940  eval = validator.Validate(seh, options);
3941  CheckErrors(*eval, expected_errors);
3942 
3943  CLEAR_ERRORS
3944 }
3945 
3946 
// Test_TerminalGap: adds a data-less literal (a gap) of length 9 at each end of a delta
// sequence and expects Warning-level TerminalGap messages for the beginning and the end.
// The same two warnings are expected after growing the gaps to length 10 and after
// switching the id to a RefSeq NC_ accession and to a patent id. With circular topology
// only UnwantedCompleteFlag is expected.
// NOTE(review): original lines 3950, 3959 and 4010 are absent from this listing;
// presumably they declare `entry`, run the standard setup and set the completeness
// flag - confirm against the repository.
3947 BOOST_AUTO_TEST_CASE(Test_TerminalGap)
3948 {
3949  // prepare entry
3951  CRef<CDelta_seq> first_seg(new CDelta_seq());
3952  first_seg->SetLiteral().SetLength(9);
3953  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_front(first_seg);
3954  CRef<CDelta_seq> last_seg(new CDelta_seq());
3955  last_seg->SetLiteral().SetLength(9);
3956  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(last_seg);
  // Two 9-base gaps were added, so the declared Bioseq length grows by 18.
3957  entry->SetSeq().SetInst().SetLength(entry->SetSeq().SetInst().GetLength() + 18);
3958 
3960 
3961  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "First delta seq component is a gap"));
3962  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "Last delta seq component is a gap"));
3963  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at beginning of sequence"));
3964  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at end of sequence"));
3965  /*
3966  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
3967  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3968  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
3969  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3970  */
3971  // AddChromosomeNoLocation(expected_errors, entry);
3972 
3973  eval = validator.Validate(seh, options);
3974  CheckErrors(*eval, expected_errors);
3975 
3976  // if gap length is 10, severity is still warning because still all local IDS
3977  scope.RemoveTopLevelSeqEntry(seh);
3978  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetLength(10);
3979  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetLength(10);
  // Each gap grew from 9 to 10, so the declared length grows by 2.
3980  entry->SetSeq().SetInst().SetLength(entry->SetSeq().SetInst().GetLength() + 2);
3981  seh = scope.AddTopLevelSeqEntry(*entry);
3982  eval = validator.Validate(seh, options);
3983  CheckErrors(*eval, expected_errors);
3984 
3985 
3986  scope.RemoveTopLevelSeqEntry(seh);
3987  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
3988  seh = scope.AddTopLevelSeqEntry(*entry);
3989  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
3990  /*
3991  expected_errors[2]->SetSeverity(eDiag_Warning);
3992  expected_errors[3]->SetSeverity(eDiag_Warning);
3993  */
3994  eval = validator.Validate(seh, options);
3995  CheckErrors(*eval, expected_errors);
3996 
3997  scope.RemoveTopLevelSeqEntry(seh);
3998  entry->SetSeq().SetId().front()->SetPatent().SetSeqid(1);
3999  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetCountry("USA");
4000  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetId().SetNumber("1");
4001  seh = scope.AddTopLevelSeqEntry(*entry);
4002  ChangeErrorAcc(expected_errors, "pat|USA|1|1");
4003  eval = validator.Validate(seh, options);
4004  CheckErrors(*eval, expected_errors);
4005 
4006  CLEAR_ERRORS
4007 
4008  // no more terminal gap warnings if circular - changed to still show first/last delta component
4009  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
4011  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Warning, "UnwantedCompleteFlag",
4012  "Suspicious use of complete"));
4013  // AddChromosomeNoLocation(expected_errors, entry);
4014 
4015  eval = validator.Validate(seh, options);
4016  CheckErrors(*eval, expected_errors);
4017  CLEAR_ERRORS
4018 }
4019 
4020 
// Checks that overlapping ranges on the same far Seq-id inside a delta
// extension are reported as OverlappingDeltaRange warnings.
// NOTE(review): the statements that declare `entry` and set up `seh`,
// `validator`, `options`, `eval` and `expected_errors` are not visible in this
// listing (the embedded numbering skips 4024 and 4034) - confirm against the
// original file.
4021 BOOST_FIXTURE_TEST_CASE(Test_OverlappingDeltaRange, CGenBankFixture)
4022 {
4023  // prepare entry
      // Replace the existing extension with four ranges on gb|AY123456|;
      // 0-10 overlaps 5-15, and 20-30 overlaps 25-35.
4025  entry->SetSeq().SetInst().ResetExt();
4026  CRef<CSeq_id> seqid(new CSeq_id());
4027  seqid->SetGenbank().SetAccession("AY123456");
4028  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 0, 10);
4029  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 5, 15);
4030  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 20, 30);
4031  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 25, 35);
4032  entry->SetSeq().SetInst().SetLength(44);
4033 
4035 
      // One warning per overlapping pair; the expected messages quote the
      // ranges 1-based (the 0-10 range appears as 1-11).
4036  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OverlappingDeltaRange", "Overlapping delta range 6-16 and 1-11 on a Bioseq gb|AY123456|"));
4037  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OverlappingDeltaRange", "Overlapping delta range 26-36 and 21-31 on a Bioseq gb|AY123456|"));
4038  // AddChromosomeNoLocation(expected_errors, entry);
4039  eval = validator.Validate(seh, options);
4040  CheckErrors(*eval, expected_errors);
4041 
4042  CLEAR_ERRORS
4043 }
4044 
4045 
// Checks that a sequence whose IUPAC amino-acid data begins with 'X' is
// reported with a LeadingX warning.
// NOTE(review): the `entry` declaration and the setup statement are not visible
// in this listing (embedded numbers 4049 and 4052 are skipped).
4046 BOOST_AUTO_TEST_CASE(Test_LeadingX)
4047 {
4048  // prepare entry
      // Sequence data starts with 'X'.
4050  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("XROTEIN");
4051 
4053 
4054  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LeadingX", "Sequence starts with leading X"));
4055  // AddChromosomeNoLocation(expected_errors, entry);
4056  eval = validator.Validate(seh, options);
4057  CheckErrors(*eval, expected_errors);
4058 
4059  CLEAR_ERRORS
4060 }
4061 
// Exercises the InternalNsInSeqRaw check on a raw sequence in three scenarios:
//   1. a long internal run of Ns (warning expected, message reports 100 Ns),
//   2. a shorter run (no InternalNsInSeqRaw expected),
//   3. the same shorter run after the technique is set to WGS (warning
//      expected, message reports 20 Ns).
// Every scenario also expects ContigsTooShort, HighNContentPercent and the
// HighNpercent5Prime/3Prime messages.
// NOTE(review): the `entry` declaration and the setup statement are not visible
// in this listing (embedded numbers 4065 and 4069 are skipped).
4062 BOOST_AUTO_TEST_CASE(Test_InternalNsInSeqRaw)
4063 {
4064  // prepare entry
4066  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTTT");
4067  entry->SetSeq().SetInst().SetLength(110);
4068 
4070 
      // Scenario 1: long run of Ns between five As and five Ts.
4071  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqRaw", "Run of 100 Ns in raw sequence starting at base 6"));
4072  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 5 bases"));
4073  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 90 percent Ns"));
4074  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4075  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4076  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4077  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4078  // AddChromosomeNoLocation(expected_errors, entry);
4079  eval = validator.Validate(seh, options);
4080  CheckErrors(*eval, expected_errors);
4081 
4082  CLEAR_ERRORS
4083 
      // Scenario 2: shorter run of Ns; the entry is removed from the scope,
      // edited, and re-added before validating again.
4084  // expect no InternalNsInSeqRaw error
4085  scope.RemoveTopLevelSeqEntry(seh);
4086  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAANNNNNNNNNNNNNNNNNNNNTTTTT");
4087  entry->SetSeq().SetInst().SetLength(30);
4088  seh = scope.AddTopLevelSeqEntry(*entry);
4089  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 5 bases"));
4090  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 66 percent Ns"));
4091  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4092  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4093  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4094  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4095  // AddChromosomeNoLocation(expected_errors, entry);
4096  eval = validator.Validate(seh, options);
4097  CheckErrors(*eval, expected_errors);
4098 
4099  CLEAR_ERRORS
4100 
      // Scenario 3: same sequence, technique switched to WGS; the run of 20 Ns
      // is now expected to be reported. This is the only scenario in this
      // test where the AddChromosomeNoLocation call is not commented out.
4101  // WGS has lower threshold
4102  SetTech(entry, CMolInfo::eTech_wgs);
4103  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqRaw", "Run of 20 Ns in raw sequence starting at base 6"));
4104  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 5 bases"));
4105  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 66 percent Ns"));
4106  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4107  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4108  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4109  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4110  AddChromosomeNoLocation(expected_errors, entry);
4111  eval = validator.Validate(seh, options);
4112  CheckErrors(*eval, expected_errors);
4113 
4114  CLEAR_ERRORS
4115 }
4116 
4117 
// Checks that Ns placed at the inner ends of the first and last delta literals
// are reported as InternalNsAdjacentToGap errors (one per literal), together
// with a ContigsTooShort error.
// NOTE(review): the `entry` declaration and the setup statement are not visible
// in this listing (embedded numbers 4121 and 4125 are skipped); presumably the
// entry is a delta sequence with a gap between the two literals - confirm.
4118 BOOST_AUTO_TEST_CASE(Test_InternalNsAdjacentToGap)
4119 {
4120  // prepare entry
      // First literal ends with NNN, last literal starts with NNN.
4122  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("ATGATGATGNNN");
4123  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNATGATGATG");
4124 
4126 
4127  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 9 bases"));
4128  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InternalNsAdjacentToGap", "Ambiguous residue N is adjacent to a gap around position 13"));
4129  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InternalNsAdjacentToGap", "Ambiguous residue N is adjacent to a gap around position 23"));
      // The HighNpercent5Prime/3Prime expectations below are disabled.
4130 // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
4131 // "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4132 // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
4133 // "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4134  // AddChromosomeNoLocation(expected_errors, entry);
4135 
4136  eval = validator.Validate(seh, options);
4137  CheckErrors(*eval, expected_errors);
4138 
4139  CLEAR_ERRORS
4140 }
4141 
// Checks that a delta component whose location refers to GI zero is reported
// as a Critical DeltaComponentIsGi0, accompanied by a DeltaSeqError about the
// far component not being found.
// NOTE(review): the `entry` declaration and the setup statement are not visible
// in this listing (embedded numbers 4145 and 4150 are skipped).
4142 BOOST_AUTO_TEST_CASE(Test_DeltaComponentIsGi0)
4143 {
4144  // prepare entry
      // Turn the first delta component into the interval 0-11 on ZERO_GI.
4146  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
4147  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(11);
4148  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetGi(ZERO_GI);
4149 
4151 
4152  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "DeltaComponentIsGi0", "Delta component is gi|0"));
4153  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "DeltaSeqError", "Unable to find far delta sequence component"));
4154  // AddChromosomeNoLocation(expected_errors, entry);
4155 
4156  eval = validator.Validate(seh, options);
4157  CheckErrors(*eval, expected_errors);
4158 
4159  CLEAR_ERRORS
4160 }
4161 
4162 
// Checks that a '-' character inside raw IUPAC nucleotide data is reported both
// as a Critical InvalidResidue and as an InternalGapsInSeqRaw warning.
// NOTE(review): the `entry` declaration and the setup statement are not visible
// in this listing (embedded numbers 4166 and 4169 are skipped).
4163 BOOST_AUTO_TEST_CASE(Test_InternalGapsInSeqRaw)
4164 {
4165  // prepare entry
      // The expected InvalidResidue message places the '-' at position [27].
4167  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGGCCAAAATTGGCCAAAATTGG-CAAAATTGGCCAAAATTGGCCAAAATTGGCCAA");
4168 
4170 
4171  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue '-' at position [27]"));
4172  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalGapsInSeqRaw", "Raw nucleotide should not contain gap characters"));
4173  // AddChromosomeNoLocation(expected_errors, entry);
4174 
4175  eval = validator.Validate(seh, options);
4176  CheckErrors(*eval, expected_errors);
4177 
4178  CLEAR_ERRORS
4179 }
4180 
4181 
// Checks that a delta component pointing at local id "good" - the same id the
// expected errors are reported against ("lcl|good") - is flagged as a Critical
// SelfReferentialSequence. An InstantiatedGapMismatch warning is also expected.
// NOTE(review): the `entry` declaration and the setup statement are not visible
// in this listing (embedded numbers 4185 and 4190 are skipped).
4182 BOOST_AUTO_TEST_CASE(Test_SelfReferentialSequence)
4183 {
4184  // prepare entry
      // Turn the first delta component into the interval 0-11 on lcl|good.
4186  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
4187  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(11);
4188  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetLocal().SetStr("good");
4189 
4191 
4192  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "SelfReferentialSequence", "Self-referential delta sequence"));
4193  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InstantiatedGapMismatch", "Exception 4 in GapByGapInst"));
4194  // AddChromosomeNoLocation(expected_errors, entry);
4195 
4196  eval = validator.Validate(seh, options);
4197  CheckErrors(*eval, expected_errors);
4198 
4199  CLEAR_ERRORS
4200 }
4201 
4202 
4204 {
      // Checks that a delta component whose location is of type "whole" is
      // reported with a WholeComponent error.
      // NOTE(review): the declaration line of this test case (embedded number
      // 4203), the `entry` declaration (4206) and the setup statement (4210)
      // are not visible in this listing - confirm against the original file.
4205  // prepare entry
      // First delta component becomes a whole-sequence reference to gb AY123456.
4207  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetWhole().SetGenbank().SetAccession("AY123456");
4208  entry->SetSeq().SetInst().SetLength(507);
4209 
4211 
4212  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WholeComponent", "Delta seq component should not be of type whole"));
4213  // AddChromosomeNoLocation(expected_errors, entry);
4214 
4215  eval = validator.Validate(seh, options);
4216  CheckErrors(*eval, expected_errors);
4217 
4218  CLEAR_ERRORS
4219 }
4220 
4221 
4223 {
      // Rewrites the ids of `seq`: the first id becomes the general id a|b, a
      // local id "x" is appended, and the location of the first feature in the
      // first annotation's feature table is re-pointed at the general id.
      // NOTE(review): the function's declaration line (embedded number 4222)
      // is not visible in this listing; presumably this is
      // s_AddGeneralAndLocal, which Test_ProteinsHaveGeneralID calls - confirm.
4224  CRef<CSeq_id> gnl(new CSeq_id());
4225  gnl->SetGeneral().SetDb("a");
4226  gnl->SetGeneral().SetTag().SetStr("b");
      // Overwrite the existing first id in place.
4227  seq.SetId().front()->Assign(*gnl);
4228  CRef<CSeq_id> lcl(new CSeq_id());
4229  lcl->SetLocal().SetStr("x");
4230  seq.SetId().push_back(lcl);
      // Assumes seq has at least one annot holding a non-empty feature table.
4231  seq.SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().Assign(*gnl);
4232 }
4233 
4234 
// Checks the ProteinsHaveGeneralID message in two stages: first on an entry
// where no message is expected, then on an entry where the Info-level message
// is expected against "lcl|nuc".
// NOTE(review): several statements are not visible in this listing (embedded
// numbers 4238, 4241, 4250-4251 and 4254 are skipped), including the ones that
// declare `entry`, `prot` and `cds`; presumably the second stage rebuilds
// `entry` as a nuc-prot set - confirm against the original file.
4235 BOOST_AUTO_TEST_CASE(Test_ProteinsHaveGeneralID)
4236 {
4237  // prepare entry
4239  s_AddGeneralAndLocal(entry->SetSeq());
4240 
4242 
      // Stage 1: expected_errors has no entries added before this check.
4243  // no error unless part of nuc-prot set
4244  // AddChromosomeNoLocation(expected_errors,entry);
4245  eval = validator.Validate(seh, options);
4246  CheckErrors(*eval, expected_errors);
4247  CLEAR_ERRORS
4248 
      // Stage 2: give the protein the general + local ids and point the CDS
      // product at the general id a|b.
4249  scope.RemoveTopLevelSeqEntry(seh);
4252  s_AddGeneralAndLocal(prot->SetSeq());
4253 
4255  cds->SetProduct().SetWhole().SetGeneral().SetDb("a");
4256  cds->SetProduct().SetWhole().SetGeneral().SetTag().SetStr("b");
4257  seh = scope.AddTopLevelSeqEntry(*entry);
4258 
4259  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "ProteinsHaveGeneralID", "INDEXER_ONLY - Protein bioseqs have general seq-id."));
4260  // AddChromosomeNoLocation(expected_errors, entry);
4261 
4262  eval = validator.Validate(seh, options);
4263  CheckErrors(*eval, expected_errors);
4264 
4265  CLEAR_ERRORS
4266 }
4267 
4268 
// Exercises the N-content messages (HighNContentPercent, HighNContentStretch,
// HighNcontent5Prime/3Prime, HighNpercent5Prime/3Prime) on an entry tagged
// with TSA technique and RNA molecule type, through full validation and
// through validator.GetTSANStretchErrors().
// NOTE(review): some statements are not visible in this listing (embedded
// numbers 4272, 4276, 4279 and 4333 are skipped), including the `entry`
// declaration and the setup statement - confirm against the original file.
4269 BOOST_AUTO_TEST_CASE(Test_HighNContentPercent_and_HighNContentStretch)
4270 {
4271  // prepare entry
4273  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCNNNNNNNNNNNAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTT");
4274  entry->SetSeq().SetInst().SetLength(100);
4275  SetTech(entry, CMolInfo::eTech_tsa);
4277  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
4278 
4280 
      // Stage 1: only the percentage warning is expected (11 percent).
4281  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 11 percent Ns"));
4282  // AddChromosomeNoLocation(expected_errors, entry);
4283  eval = validator.Validate(seh, options);
4284  CheckErrors(*eval, expected_errors);
4285 
      // Stage 2: longer internal run of Ns; the existing expectation is
      // updated to 16 percent and a HighNContentStretch warning is added.
4286  scope.RemoveTopLevelSeqEntry(seh);
4287  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCNNNNNNNNNNNNNNNNTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTT");
4288  seh = scope.AddTopLevelSeqEntry(*entry);
4289  expected_errors[0]->SetErrMsg("Sequence contains 16 percent Ns");
4290  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentStretch", "Sequence has a stretch of 16 Ns"));
4291  eval = validator.Validate(seh, options);
4292  CheckErrors(*eval, expected_errors);
4293 
4294  CLEAR_ERRORS
4295 
      // Stage 3: GetTSANStretchErrors is called both with the entry handle and
      // with the Bioseq itself; only the stretch warning is expected from it.
4296  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentStretch", "Sequence has a stretch of 16 Ns"));
4297  eval = validator.GetTSANStretchErrors(seh);
4298  CheckErrors(*eval, expected_errors);
4299  eval = validator.GetTSANStretchErrors(entry->GetSeq());
4300  CheckErrors(*eval, expected_errors);
4301 
4302  CLEAR_ERRORS
4303 
      // Stage 4: runs of Ns near both ends of the sequence; full validation
      // is expected to report the five messages below.
4304  scope.RemoveTopLevelSeqEntry(seh);
4305  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AANNNNNNNNNNGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGTTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCNNNNNNNNNNAAA");
4306  seh = scope.AddTopLevelSeqEntry(*entry);
4307  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4308  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4309  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4310  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4311  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent",
4312  "Sequence contains 20 percent Ns"));
4313  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent5Prime",
4314  "Sequence has a stretch of at least 10 Ns within the first 20 bases"));
4315  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent3Prime",
4316  "Sequence has a stretch of at least 10 Ns within the last 20 bases"));
4317  // AddChromosomeNoLocation(expected_errors, entry);
4318  eval = validator.Validate(seh, options);
4319  CheckErrors(*eval, expected_errors);
4320 
4321  CLEAR_ERRORS
4322 
      // Stage 5: for the same entry, GetTSANStretchErrors is expected to
      // return only the HighNcontent5Prime/3Prime warnings.
4323  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent5Prime", "Sequence has a stretch of at least 10 Ns within the first 20 bases"));
4324  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent3Prime", "Sequence has a stretch of at least 10 Ns within the last 20 bases"));
4325  eval = validator.GetTSANStretchErrors(seh);
4326  CheckErrors(*eval, expected_errors);
4327  eval = validator.GetTSANStretchErrors(entry->GetSeq());
4328  CheckErrors(*eval, expected_errors);
4329 
4330  CLEAR_ERRORS
4331 
      // Stage 6: a gap literal of length 10 and a 10-base literal are appended
      // to the delta extension and the length grows by 20.
      // NOTE(review): the statement at embedded number 4333 is not visible;
      // presumably it rebuilds `entry` as a delta sequence - confirm.
4332  scope.RemoveTopLevelSeqEntry(seh);
4334  CRef<CDelta_seq> gap_seg(new CDelta_seq());
4335  gap_seg->SetLiteral().SetSeq_data().SetGap();
4336  gap_seg->SetLiteral().SetLength(10);
4337  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
4338  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGA", CSeq_inst::eMol_dna);
4339  entry->SetSeq().SetInst().SetLength(entry->GetSeq().GetInst().GetLength() + 20);
4340  seh = scope.AddTopLevelSeqEntry(*entry);
4341 
      // The HighNpercent expectations for this stage are disabled, so nothing
      // is added to expected_errors before the final check.
4342  /*
4343  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
4344  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4345  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
4346  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4347  */
4348  // AddChromosomeNoLocation(expected_errors, entry);
4349 
4350  eval = validator.Validate(seh, options);
4351  CheckErrors(*eval, expected_errors);
4352 
4353  CLEAR_ERRORS
4354 }
4355 
4356 
// Checks that a delta literal with empty sequence data and length 0 is reported
// with a SeqLitDataLength0 error.
// NOTE(review): the `entry` declaration and the setup statement are not visible
// in this listing (embedded numbers 4360 and 4369 are skipped).
4357 BOOST_AUTO_TEST_CASE(Test_SeqLitDataLength0)
4358 {
4359  // prepare entry
4361 
      // Advance to the second delta segment and make it an empty literal.
4362  CDelta_ext::Tdata::iterator seg_it = entry->SetSeq().SetInst().SetExt().SetDelta().Set().begin();
4363  ++seg_it;
4364  (*seg_it)->SetLiteral().SetSeq_data().SetIupacna().Set();
4365  (*seg_it)->SetLiteral().SetLength(0);
4366 
      // Overall Bioseq length is set to 24 after emptying that segment.
4367  entry->SetSeq().SetInst().SetLength(24);
4368 
4370 
4371  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLitDataLength0", "Seq-lit of length 0 in delta chain"));
4372  // AddChromosomeNoLocation(expected_errors, entry);
4373  eval = validator.Validate(seh, options);
4374  CheckErrors(*eval, expected_errors);
4375 
4376  CLEAR_ERRORS
4377 }
4378 
4379 
4381 {
      // Builds and returns an entry whose Bioseq is a delta sequence made of a
      // 12-base literal, a literal of length 101 carrying an "unknown" length
      // fuzz, and another 12-base literal (total length 125).
      // NOTE(review): the function's declaration line (embedded number 4380)
      // and the statement that creates `entry` (4382) are not visible in this
      // listing - the function name used for this block is a guess; confirm.
4383 
      // Switch the representation from inline sequence data to delta.
4384  entry->SetSeq().SetInst().ResetSeq_data();
4385  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
4386  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("ATGATGATGCCC", CSeq_inst::eMol_dna);
      // Middle segment: no sequence data, length 101, fuzz limit "unk".
4387  CRef<CDelta_seq> gap_seg(new CDelta_seq());
4388  gap_seg->SetLiteral().SetLength(101);
4389  gap_seg->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
4390  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
4391  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGATG", CSeq_inst::eMol_dna);
      // 12 + 101 + 12 = 125.
4392  entry->SetSeq().SetInst().SetLength(125);
4393 
4394  return entry;
4395 }
4396 
4397 
// Checks that the entry under test produces the UnknownLengthGapNot100 warning
// ("Gap of unknown length should have length 100").
// NOTE(review): the statements that create `entry` and perform the setup are
// not visible in this listing (embedded numbers 4400 and 4402 are skipped), so
// how the entry is built cannot be confirmed from here.
4398 BOOST_AUTO_TEST_CASE(Test_UnknownLengthGapNot100)
4399 {
4401 
4403 
4404  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnknownLengthGapNot100", "Gap of unknown length should have length 100"));
      // The HighNpercent5Prime/3Prime expectations below are disabled.
4405  /*
4406  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
4407  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4408  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
4409  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4410  */
4411  // AddChromosomeNoLocation(expected_errors, entry);
4412  eval = validator.Validate(seh, options);
4413  CheckErrors(*eval, expected_errors);
4414 
4415  CLEAR_ERRORS
4416 }
4417 
4418 
4420 {
      // Checks the mRNAshouldBeSingleStranded error against each strand value:
      // ds, mixed and other are expected to produce it; not_set, an unset
      // strand and ss are expected to produce no errors.
      // NOTE(review): the declaration line of this test case (embedded number
      // 4419) and some statements (4422, 4424, 4427) are not visible in this
      // listing, including the `entry` declaration and the setup - confirm
      // against the original file.
4421  // prepare entry
4423  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
4425  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ds);
4426 
4428 
4429  // double strand
4430  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "mRNAshouldBeSingleStranded", "mRNA should be single stranded not double stranded"));
4431  // AddChromosomeNoLocation(expected_errors, entry);
4432  eval = validator.Validate(seh, options);
4433  CheckErrors(*eval, expected_errors);
4434 
      // The same expected error is reused for the next two strand values.
4435  // mixed strand
4436  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_mixed);
4437  eval = validator.Validate(seh, options);
4438  CheckErrors(*eval, expected_errors);
4439 
4440  // other strand
4441  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_other);
4442  eval = validator.Validate(seh, options);
4443  CheckErrors(*eval, expected_errors);
4444 
4445  CLEAR_ERRORS
4446 
4447  // these should not produce errors
4448 
4449  // strand not set
4450  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_not_set);
4451  eval = validator.Validate(seh, options);
4452  // AddChromosomeNoLocation(expected_errors, entry);
4453 
4454  CheckErrors(*eval, expected_errors);
4455 
      // strand field reset (no value present at all)
4456  entry->SetSeq().SetInst().ResetStrand();
4457  eval = validator.Validate(seh, options);
4458  CheckErrors(*eval, expected_errors);
4459 
4460  // single strand
4461  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ss);
4462  eval = validator.Validate(seh, options);
4463  CheckErrors(*eval, expected_errors);
4464 
4465  CLEAR_ERRORS
4466 }
4467 
4468 
// Checks that a set whose source is attached only to its first member produces
// a BioSourceMissing warning on "lcl|nuc" and a Fatal NoOrgFound on "lcl|prot".
// NOTE(review): some statements are not visible in this listing (embedded
// numbers 4472-4473 and 4476 are skipped), including the `entry` declaration
// and the setup; presumably the set-level source is removed there - confirm.
4469 BOOST_AUTO_TEST_CASE(Test_BioSourceMissing)
4470 {
4471  // prepare entry
      // Attach a source to the first member of the set only.
4474  unit_test_util::AddGoodSource(entry->SetSet().SetSeq_set().front());
4475 
4477 
4478  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BioSourceMissing", "Nuc-prot set does not contain expected BioSource descriptor"));
4479  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Fatal, "NoOrgFound", "No organism name included in the source. Other qualifiers may exist."));
4480  // AddChromosomeNoLocation(expected_errors, entry);
4481 
4482  eval = validator.Validate(seh, options);
4483  CheckErrors(*eval, expected_errors);
4484 
4485  CLEAR_ERRORS
4486 }
4487 
4488 
4489 BOOST_AUTO_TEST_CASE(Test_Descr_InvalidForType)
4490 {
4491  // prepare entry
4493  CRef<CSeqdesc> desc;
4494  desc.Reset(new CSeqdesc());
4496  entry->SetDescr().Set().push_back(desc);
4497  desc.Reset(new CSeqdesc());
4498  desc->SetModif().push_back(eGIBB_mod_dna);
4499  entry->SetDescr().Set().push_back(desc);
4500  desc.Reset(new CSeqdesc());
4502  entry->SetDescr().Set().push_back(desc);
4503  desc.Reset(new CSeqdesc());
4504  desc->SetOrg().SetTaxname("Sebaea microphylla");
4505  entry->SetDescr().Set().push_back(desc);
4506  AddTpaAssemblyUserObject(entry);
4507 
4509 
4510  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide",
4511  "Nucleic acid with protein sequence method"));
4512  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4513  "MolType descriptor is obsolete"));
4514  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4515  "Modif descriptor is obsolete"));
4516  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4517  "Method descriptor is obsolete"));
4518  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4519  "OrgRef descriptor is obsolete"));
4520  // AddChromosomeNoLocation(expected_errors, entry);
4521 
4522  // won't complain about TPA assembly if only local ID
4523  eval = validator.Validate(seh, options);
4524  CheckErrors(*eval, expected_errors);
4525 
4526  CLEAR_ERRORS
4527 
4528  scope.RemoveTopLevelSeqEntry(seh);
4529  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
4534  seh = scope.AddTopLevelSeqEntry(*entry);
4535  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TPAassemblyWithoutTPAKeyword",
4536  "Non-TPA record gb|AY123456| should not have TpaAssembly object"));
4537  // AddChromosomeNoLocation(expected_errors, entry);
4538  SetErrorsAccessions(expected_errors, "gb|AY123456|");
4539  eval = validator.Validate(seh, options);
4540  CheckErrors(*eval, expected_errors);
4541 
4542  scope.RemoveTopLevelSeqEntry(seh);
4543  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
4544  seh = scope.AddTopLevelSeqEntry(*entry);
4545  SetErrorsAccessions(expected_errors, "ref|NC_123456|");
4546  expected_errors[0]->SetErrMsg("Non-TPA record ref|NC_123456| should not have TpaAssembly object");
4547  eval = validator.Validate(seh, options);
4548  CheckErrors(*eval, expected_errors);
4549 
4550  desc.Reset(new CSeqdesc());
4552  entry->SetDescr().Set().push_back(desc);
4553  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "InvalidForTypeGIBB",
4554  "Nucleic acid with GIBB-mol = peptide"));
4555  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "InvalidForType",
4556  "MolType descriptor is obsolete"));
4557  eval = validator.Validate(seh, options);
4558  CheckErrors(*eval, expected_errors);
4559 
4561  expected_errors[1]->SetErrMsg("GIBB-mol unknown or other used");
4562  eval = validator.Validate(seh, options);
4563  CheckErrors(*eval, expected_errors);
4564 
4566  eval = validator.Validate(seh, options);
4567  CheckErrors(*eval, expected_errors);
4568 
4569  CLEAR_ERRORS
4570 
4571  scope.RemoveTopLevelSeqEntry(seh);
4573  desc.Reset(new CSeqdesc());
4575  entry->SetDescr().Set().push_back(desc);
4576  seh = scope.AddTopLevelSeqEntry(*entry);
4577  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4578  "GIBB-mol [1] used on protein"));
4579  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4580  "MolType descriptor is obsolete"));
4581  // AddChromosomeNoLocation(expected_errors, entry);
4582  eval = validator.Validate(seh, options);
4583  CheckErrors(*eval, expected_errors);
4584 
4586  expected_errors[0]->SetErrMsg("GIBB-mol [2] used on protein");
4587  eval = validator.Validate(seh, options);
4588  CheckErrors(*eval, expected_errors);
4589 
4590  desc->SetMol_type(eGIBB_mol_mRNA);
4591  expected_errors[0]->SetErrMsg("GIBB-mol [3] used on protein");
4592  eval = validator.Validate(seh, options);
4593  CheckErrors(*eval, expected_errors);
4594 
4595  desc->SetMol_type(eGIBB_mol_rRNA);
4596  expected_errors[0]->SetErrMsg("GIBB-mol [4] used on protein");
4597  eval = validator.Validate(seh, options);
4598  CheckErrors(*eval, expected_errors);
4599 
4600  desc->SetMol_type(eGIBB_mol_tRNA);
4601  expected_errors[0]->SetErrMsg("GIBB-mol [5] used on protein");
4602  eval = validator.Validate(seh, options);
4603  CheckErrors(*eval, expected_errors);
4604 
4606  expected_errors[0]->SetErrMsg("GIBB-mol [6] used on protein");
4607  eval = validator.Validate(seh, options);
4608  CheckErrors(*eval, expected_errors);
4609 
4611  expected_errors[0]->SetErrMsg("GIBB-mol [7] used on protein");
4612  eval = validator.Validate(seh, options);
4613  CheckErrors(*eval, expected_errors);
4614 
4616  expected_errors[0]->SetErrMsg("GIBB-mol [9] used on protein");
4617  eval = validator.Validate(seh, options);
4618  CheckErrors(*eval, expected_errors);
4619 
4621  expected_errors[0]->SetErrMsg("GIBB-mol [10] used on protein");
4622  eval = validator.Validate(seh, options);
4623  CheckErrors(*eval, expected_errors);
4624 
4625  CLEAR_ERRORS
4626 
4627  // invalid modif
4628  desc->SetModif().push_back(eGIBB_mod_dna);
4629  desc->SetModif().push_back(eGIBB_mod_rna);
4630  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4631  "Nucleic acid GIBB-mod [0] on protein"));
4632  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4633  "Nucleic acid GIBB-mod [1] on protein"));
4634  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4635  "Modif descriptor is obsolete"));
4636  // AddChromosomeNoLocation(expected_errors, entry);
4637  eval = validator.Validate(seh, options);
4638  CheckErrors(*eval, expected_errors);
4639 
4640  CLEAR_ERRORS
4641 
4642  scope.RemoveTopLevelSeqEntry(seh);
4643  entry = unit_test_util::BuildGoodSeq();
4644  for (auto& it : entry->SetSeq().SetDescr().Set()) {
4645  if (it->IsSource()) {
4646  it->SetSource().SetOrigin(CBioSource::eOrigin_synthetic);
4647  }
4648  }
4649  seh = scope.AddTopLevelSeqEntry(*entry);
4650  // if biomol not other, should generate error
4651  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InvalidForType",
4652  "Molinfo-biomol other should be used if Biosource-location is synthetic"));
4653  // AddChromosomeNoLocation(expected_errors, entry);
4654  eval = validator.Validate(seh, options);
4655  CheckErrors(*eval, expected_errors);
4656 
4657  CLEAR_ERRORS
4658 
4659  for (auto& it : entry->SetSeq().SetDescr().Set()) {
4660  if (it->IsSource()) {
4661  it->SetSource().ResetOrigin();
4662  }
4663  }
4664 
4666  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidMolInfo",
4667  "Nucleic acid with Molinfo = peptide"));
4668  // AddChromosomeNoLocation(expected_errors, entry);
4669  eval = validator.Validate(seh, options);
4670  CheckErrors(*eval, expected_errors);
4671  CLEAR_ERRORS
4672 
4674  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
4675  "MoltypeOtherGenetic", "Molinfo-biomol = other genetic"));
4676  // AddChromosomeNoLocation(expected_errors, entry);
4677  eval = validator.Validate(seh, options);
4678  CheckErrors(*eval, expected_errors);
4679  CLEAR_ERRORS
4680 
4682  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
4683  "MoltypeUnknown", "Molinfo-biomol unknown used"));
4684  // AddChromosomeNoLocation(expected_errors, entry);
4685  eval = validator.Validate(seh, options);
4686  CheckErrors(*eval, expected_errors);
4687  CLEAR_ERRORS
4688 
4690  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
4691  "MoltypeOther", "Molinfo-biomol other used"));
4692  // AddChromosomeNoLocation(expected_errors, entry);
4693  eval = validator.Validate(seh, options);
4694  CheckErrors(*eval, expected_errors);
4695  CLEAR_ERRORS
4696 
4697  scope.RemoveTopLevelSeqEntry(seh);
4699  seh = scope.AddTopLevelSeqEntry(*entry);
4700 
4701  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
4702  "InvalidForType", "Molinfo-biomol [1] used on protein"));
4703  // AddChromosomeNoLocation(expected_errors, entry);
4705  expected_errors[0]->SetErrMsg("Molinfo-biomol [1] used on protein");
4706  eval = validator.Validate(seh, options);
4707  CheckErrors(*eval, expected_errors);
4708 
4710  expected_errors[0]->SetErrMsg("Molinfo-biomol [2] used on protein");
4711  eval = validator.Validate(seh, options);
4712  CheckErrors(*eval, expected_errors);
4713 
4715  expected_errors[0]->SetErrMsg("Molinfo-biomol [3] used on protein");
4716  eval = validator.Validate(seh, options);
4717  CheckErrors(*eval, expected_errors);
4718 
4720  expected_errors[0]->SetErrMsg("Molinfo-biomol [4] used on protein");
4721  eval = validator.Validate(seh, options);
4722  CheckErrors(*eval, expected_errors);
4723 
4725  expected_errors[0]->SetErrMsg("Molinfo-biomol [5] used on protein");
4726  eval = validator.Validate(seh, options);
4727  CheckErrors(*eval, expected_errors);
4728 
4730  expected_errors[0]->SetErrMsg("Molinfo-biomol [6] used on protein");
4731  eval = validator.Validate(seh, options);
4732  CheckErrors(*eval, expected_errors);
4733 
4735  expected_errors[0]->SetErrMsg("Molinfo-biomol [7] used on protein");
4736  eval = validator.Validate(seh, options);
4737  CheckErrors(*eval, expected_errors);
4738 
4740  expected_errors[0]->SetErrMsg("Molinfo-biomol [10] used on protein");
4741  eval = validator.Validate(seh, options);
4742  CheckErrors(*eval, expected_errors);
4743 
4745  expected_errors[0]->SetErrMsg("Molinfo-biomol [11] used on protein");
4746  eval = validator.Validate(seh, options);
4747  CheckErrors(*eval, expected_errors);
4748 
4750  expected_errors[0]->SetErrMsg("Molinfo-biomol [12] used on protein");
4751  eval = validator.Validate(seh, options);
4752  CheckErrors(*eval, expected_errors);
4753 
4755  expected_errors[0]->SetErrMsg("Molinfo-biomol [13] used on protein");
4756  eval = validator.Validate(seh, options);
4757  CheckErrors(*eval, expected_errors);
4758 
4760  expected_errors[0]->SetErrMsg("Molinfo-biomol [14] used on protein");
4761  eval = validator.Validate(seh, options);
4762  CheckErrors(*eval, expected_errors);
4763 
4765  expected_errors[0]->SetErrMsg("Molinfo-biomol [15] used on protein");
4766  eval = validator.Validate(seh, options);
4767  CheckErrors(*eval, expected_errors);
4768 
4769  CLEAR_ERRORS
4770 
4771  scope.RemoveTopLevelSeqEntry(seh);
4772  entry = unit_test_util::BuildGoodSeq();
4773  seh = scope.AddTopLevelSeqEntry(*entry);
4775  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SyntheticConstructWrongMolType",
4776  "synthetic construct should have other-genetic"));
4777  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SyntheticConstructNeedsArtificial",
4778  "synthetic construct should have artificial origin"));
4779  // AddChromosomeNoLocation(expected_errors, entry);
4780  eval = validator.Validate(seh, options);<