NCBI C++ ToolKit
unit_test_validator.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: unit_test_validator.cpp 102041 2024-03-21 18:50:50Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Colleen Bollin, NCBI
27  *
28  * File Description:
29  * Unit tests for the validator.
30  *
31  * ===========================================================================
32  */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include "unit_test_validator.hpp"
37 
38 #include <corelib/ncbi_system.hpp>
39 
40 // This macro should be defined before including test_boost.hpp in every
41 // "*.cpp" file of the executable except one. This mirrors how main() in a
42 // non-Boost.Test executable is defined in only one *.cpp file, and no other
43 // file may define it. If NCBI_BOOST_NO_AUTO_TEST_MAIN is not defined,
44 // test_boost.hpp defines that "main()" function for the tests.
45 //
46 // Usually if your unit tests contain only one *.cpp file you should not
47 // care about this macro at all.
48 //
49 //#define NCBI_BOOST_NO_AUTO_TEST_MAIN
50 
51 #define BAD_VALIDATOR
52 
53 // This header must be included before all Boost.Test headers if there are any
54 #include <corelib/test_boost.hpp>
55 
56 // for ignoring external config files
57 #include <util/util_misc.hpp>
58 
60 #include <objects/biblio/Title.hpp>
66 #include <objects/pub/Pub.hpp>
68 #include <objects/seq/GIBB_mol.hpp>
69 #include <objects/seq/Seq_ext.hpp>
73 #include <objects/seq/Ref_ext.hpp>
74 #include <objects/seq/Map_ext.hpp>
75 #include <objects/seq/Seg_ext.hpp>
76 #include <objects/seq/Seq_gap.hpp>
77 #include <objects/seq/Seq_data.hpp>
79 #include <objects/seq/Seqdesc.hpp>
80 #include <objects/seq/MolInfo.hpp>
81 #include <objects/seq/Pubdesc.hpp>
82 #include <objects/seq/Seq_hist.hpp>
100 #include <objmgr/object_manager.hpp>
101 #include <objmgr/scope.hpp>
102 #include <objmgr/bioseq_ci.hpp>
103 #include <objmgr/feat_ci.hpp>
104 #include <objmgr/seq_vector.hpp>
105 #include <objmgr/util/sequence.hpp>
106 #include <objmgr/seqdesc_ci.hpp>
107 #include <objmgr/util/sequence.hpp>
115 #include <corelib/ncbiapp.hpp>
116 #include <common/ncbi_export.h>
120 #include <objtools/edit/cds_fix.hpp>
122 
123 // for writing out tmp files
124 #include <serial/objostrasn.hpp>
125 #include <serial/objostrasnb.hpp>
126 
128 
131 
132 using namespace validator;
133 using namespace unit_test_util;
134 
135 
136 CExpectedError::CExpectedError(string accession, EDiagSev severity, string err_code, string err_msg)
137  : m_Accession(accession), m_Severity(severity), m_ErrCode(err_code), m_ErrMsg(err_msg)
138 {
139 }
140 
142 {
143 }
144 
145 
146 bool CExpectedError::Match(const CValidErrItem& err_item, bool ignore_severity)
147 {
148  if (!NStr::IsBlank(m_Accession) && !NStr::IsBlank(err_item.GetAccnver())
149  && !NStr::Equal(err_item.GetAccnver(), m_Accession)) {
150  return false;
151  }
152  if (!NStr::Equal(err_item.GetErrCode(), m_ErrCode)) {
153  return false;
154  }
155  string msg = err_item.GetMsg();
156  size_t pos = NStr::Find(msg, " EXCEPTION: NCBI C++ Exception:");
157  if (pos != string::npos) {
158  msg = msg.substr(0, pos);
159  }
160 
161  if (!NStr::Equal(msg, m_ErrMsg)) {
162  return false;
163  }
164  if (!ignore_severity && m_Severity != err_item.GetSeverity()) {
165  return false;
166  }
167  return true;
168 }
169 
170 
171 void CExpectedError::Test(const CValidErrItem& err_item)
172 {
173  if (!NStr::IsBlank(m_Accession) && !NStr::IsBlank(err_item.GetAccnver())) {
174  BOOST_CHECK_EQUAL(err_item.GetAccnver(), m_Accession);
175  }
176  BOOST_CHECK_EQUAL(err_item.GetSeverity(), m_Severity);
177  BOOST_CHECK_EQUAL(err_item.GetErrCode(), m_ErrCode);
178  string msg = err_item.GetMsg();
179  size_t pos = NStr::Find(msg, " EXCEPTION: NCBI C++ Exception:");
180  if (pos != string::npos) {
181  msg = msg.substr(0, pos);
182  }
183  BOOST_CHECK_EQUAL(msg, m_ErrMsg);
184 }
185 
186 
188 {
189  string description = err_item.GetAccnver() + ":"
190  + CValidErrItem::ConvertSeverity(err_item.GetSeverity()) + ":"
191  + err_item.GetErrCode() + ":"
192  + err_item.GetMsg();
193  printf("%s\n", description.c_str());
194 }
195 
196 
198 {
199  string description = m_Accession + ":"
201  + m_ErrCode + ":"
202  + m_ErrMsg;
203  printf("%s\n", description.c_str());
204 }
205 
206 
// File-local switch, off by default. It is turned on when the "debug_mode"
// command-line flag is given; while on, CheckErrors only writes out the
// errors it was handed and returns without comparing them.
static bool s_debugMode{false};
208 
209 void WriteErrors(const CValidError& eval, bool debug_mode)
210 {
211  if (debug_mode) {
212  printf("\n-\n");
213  }
214  for (CValidError_CI vit(eval); vit; ++vit) {
216  }
217  if (debug_mode) {
218  printf("\n\n");
219  }
220  printf("\n\n");
221 }
222 
223 
224 void CheckErrors(const CValidError& eval,
225  vector<CExpectedError*>& expected_errors)
226 {
227  //static int count(1);
228  //if (count == 1367) {
229  // cerr << "";
230  //}
231  //cerr << count++ << "\n";
232 
233  bool problem_found = false;
234 
235  if (s_debugMode) {
236  WriteErrors(eval, true);
237  return;
238  }
239 
240  vector<bool> expected_found;
241  for (size_t i = 0; i < expected_errors.size(); i++) {
242  if (expected_errors[i]) {
243  expected_found.push_back(false);
244  } else {
245  expected_found.push_back(true);
246  }
247  }
248 
249  for (CValidError_CI vit(eval); vit; ++vit) {
250  bool found = false;
251  for (size_t i = 0; i < expected_errors.size(); i++) {
252  if (!expected_found[i] && expected_errors[i]->Match(*vit)) {
253  expected_found[i] = true;
254  found = true;
255  break;
256  }
257  }
258  if (!found) {
259  for (size_t i = 0; i < expected_errors.size(); i++) {
260  if (!expected_found[i] && expected_errors[i]->Match(*vit, true)) {
261  printf("Problem with ");
263  expected_errors[i]->Test(*vit);
264  expected_found[i] = true;
265  found = true;
266  problem_found = true;
267  break;
268  }
269  }
270  }
271  if (!found) {
272  BOOST_CHECK_EQUAL("Unexpected error", "Error not found");
274  problem_found = true;
275  }
276  }
277 
278  for (size_t i = 0; i < expected_errors.size(); i++) {
279  if (!expected_found[i]) {
280  BOOST_CHECK_EQUAL(expected_errors[i]->GetErrMsg(), "Expected error not found");
281  problem_found = true;
282  }
283  }
284 
285  if (problem_found) {
286  WriteErrors(eval, false);
287 
288  printf("Expected:\n");
289  for (auto it : expected_errors) {
290  if (it) {
291  it->Print();
292  }
293  }
294  }
295 }
296 
297 
298 void CheckStrings(const vector<string>& seen, const vector<string>& expected)
299 {
300  auto it1 = seen.begin();
301  auto it2 = expected.begin();
302  bool any = false;
303  while (it1 != seen.end() && it2 != expected.end()) {
304  BOOST_CHECK_EQUAL(*it1, *it2);
305  if (!NStr::Equal(*it1, *it2)) {
306  any = true;
307  }
308  it1++;
309  it2++;
310  }
311  while (it1 != seen.end()) {
312  BOOST_CHECK_EQUAL(*it1, "Unexpected string");
313  it1++;
314  any = true;
315  }
316  while (it2 != expected.end()) {
317  BOOST_CHECK_EQUAL("Missing string", *it2);
318  it2++;
319  any = true;
320  }
321 
322  if (any) {
323  printf("Seen:\n");
324  auto it1 = seen.begin();
325  while (it1 != seen.end()) {
326  printf("%s\n", (*it1).c_str());
327  it1++;
328  }
329  printf("Expected:\n");
330  auto it2 = expected.begin();
331  while (it2 != expected.end()) {
332  printf("%s\n", (*it2).c_str());
333  it2++;
334  }
335  }
336 }
337 
338 
339 // Not currently used, but I'll leave it here in case
340 // it's useful in the future.
341 
342 #if 0
343 static void SetCountryOnSrc(CBioSource& src, string country)
344 {
345  if (NStr::IsBlank(country)) {
346  if (src.IsSetSubtype()) {
347  auto& cont = src.SetSubtype();
348  cont.remove_if([](CSubSource* it) {
349  return (it->IsSetSubtype() && it->GetSubtype() == CSubSource::eSubtype_country);
350  });
351  }
352  } else {
354  src.SetSubtype().push_back(sub);
355  }
356 }
357 #endif
358 
360 static string ToAsn1(const CRef<CSeq_entry>& entry)
361 {
362  CNcbiOstrstream os;
363  os << MSerial_AsnText << entry;
364  return os.str();
365 }
366 
369 
372 
374 {
375  if (!CNcbiApplication::Instance()->GetConfig().HasEntry("NCBI", "Data")) {
376  NCBITEST_DISABLE(Test_Descr_BadStructuredCommentFormat);
377  NCBITEST_DISABLE(Test_Descr_MissingKeyword);
378  }
379 }
380 
381 
382 static void SetErrorsAccessions(vector<CExpectedError*>& expected_errors, string accession)
383 {
384  size_t i, len = expected_errors.size();
385  for (i = 0; i < len; i++) {
386  expected_errors[i]->SetAccession(accession);
387  }
388 }
389 
391 {
392  // Here we make descriptions of command line parameters that we are
393  // going to use.
394 
395  arg_desc->AddFlag(
396  "debug_mode", "Debugging mode writes errors seen for each test");
397 }
398 
400 {
401  // initialization function body
402 
403  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
404  if (args["debug_mode"]) {
405  s_debugMode = true;
406  }
407  g_IgnoreDataFile("institution_codes.txt");
408 }
409 
410 void AddChromosomeNoLocation(vector<CExpectedError*>& expected_errors, const string& id)
411 {
412  expected_errors.push_back(new CExpectedError(id, eDiag_Error,
413  "ChromosomeWithoutLocation",
414  "INDEXER_ONLY - source contains chromosome value '1' but the BioSource location is not set to chromosome"));
415 }
416 
417 void AddChromosomeNoLocation(vector<CExpectedError*>& expected_errors, CRef<CSeq_entry> entry)
418 {
419  if (entry->IsSeq()) {
420  CConstRef<CSeq_id> seqid = sequence::GetId(entry->GetSeq(), sequence::eGetId_Best).GetSeqId();
421  AddChromosomeNoLocation(expected_errors, seqid->AsFastaString());
422  } else if (entry->IsSet()) {
423  if (entry->GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
425  AddChromosomeNoLocation(expected_errors, nuc_entry);
426  } else {
427  for (auto it : entry->SetSet().SetSeq_set()) {
428  AddChromosomeNoLocation(expected_errors, it);
429  }
430  }
431  }
432 }
433 
434 
435 // new case test ground
436 
437 BOOST_AUTO_TEST_CASE(Test_Descr_MissingKeyword)
438 {
439  // prepare entry
441  CRef<CSeqdesc> sdesc(new CSeqdesc());
442  sdesc->SetUser().SetType().SetStr("StructuredComment");
443  entry->SetSeq().SetDescr().Set().push_back(sdesc);
444 
445  sdesc->SetUser().AddField("StructuredCommentPrefix", "##MIGS-Data-START##", CUser_object::eParse_String);
446  sdesc->SetUser().AddField("alt_elev", "foo", CUser_object::eParse_String);
447  sdesc->SetUser().AddField("assembly", "foo", CUser_object::eParse_String);
448  sdesc->SetUser().AddField("collection_date", "foo", CUser_object::eParse_String);
449  sdesc->SetUser().AddField("country", "foo", CUser_object::eParse_String);
450  sdesc->SetUser().AddField("depth", "foo", CUser_object::eParse_String);
451  sdesc->SetUser().AddField("environment", "foo", CUser_object::eParse_String);
452  sdesc->SetUser().AddField("investigation_type", "eukaryote", CUser_object::eParse_String);
453  sdesc->SetUser().AddField("isol_growth_condt", "foo", CUser_object::eParse_String);
454  sdesc->SetUser().AddField("sequencing_meth", "foo", CUser_object::eParse_String);
455  sdesc->SetUser().AddField("project_name", "foo", CUser_object::eParse_String);
456  sdesc->SetUser().AddField("ploidy", "foo", CUser_object::eParse_String);
457  sdesc->SetUser().AddField("num_replicons", "foo", CUser_object::eParse_String);
458  sdesc->SetUser().AddField("estimated_size", "foo", CUser_object::eParse_String);
459  sdesc->SetUser().AddField("trophic_level", "foo", CUser_object::eParse_String);
460  sdesc->SetUser().AddField("propagation", "foo", CUser_object::eParse_String);
461  sdesc->SetUser().AddField("lat_lon", "foo", CUser_object::eParse_String);
462 
463  CRef<CSeqdesc> gdesc(new CSeqdesc());
464  gdesc->SetGenbank().SetKeywords().push_back("GSC:MIGS:2.1");
465  entry->SetSeq().SetDescr().Set().push_back(gdesc);
466 
468 
469  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadKeywordForStrucComm",
470  "Structured Comment is non-compliant, keyword should be removed"));
471  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommMissingField",
472  "Required field finishing_strategy is missing when investigation_type has value 'eukaryote'"));
473  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue",
474  "Structured Comment invalid; the field value and/or name are incorrect"));
475  // AddChromosomeNoLocation(expected_errors, entry);
476  eval = validator.Validate(seh, options);
477  CheckErrors(*eval, expected_errors);
478 
479  // if no keyword, no badkeyword error
480  entry->SetSeq().SetDescr().Set().pop_back();
481  delete expected_errors[0];
482  expected_errors[0] = nullptr;
483  eval = validator.Validate(seh, options);
484  CheckErrors(*eval, expected_errors);
485 
487 
488  // make the comment valid, should complain about missing keyword
489  sdesc->SetUser().AddField("finishing_strategy", "foo", CUser_object::eParse_String);
490  // AddChromosomeNoLocation(expected_errors, entry);
491  eval = validator.Validate(seh, options);
492  CheckErrors(*eval, expected_errors);
493 
495  // put keyword back, should have no errors
496  entry->SetSeq().SetDescr().Set().push_back(gdesc);
497  // AddChromosomeNoLocation(expected_errors, entry);
498  eval = validator.Validate(seh, options);
499  CheckErrors(*eval, expected_errors);
501 }
502 
503 
504 BOOST_AUTO_TEST_CASE(Test_Descr_LatLonValue)
505 {
506  // prepare entry
510 
512 
513  /*
514  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
515  "Latitude should be set to N (northern hemisphere)"));
516  eval = validator.Validate(seh, options);
517  CheckErrors(*eval, expected_errors);
518  */
519 
522  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
523  "Longitude should be set to W (western hemisphere)"));
524  eval = validator.Validate(seh, options);
525  CheckErrors(*eval, expected_errors);
526 
531  expected_errors[0]->SetErrMsg("Latitude should be set to S (southern hemisphere)");
532  eval = validator.Validate(seh, options);
533  CheckErrors(*eval, expected_errors);
534 
535  /*
536  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "");
537  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "25 S 47 W");
538  expected_errors[0]->SetErrMsg("Longitude should be set to E (eastern hemisphere)");
539  eval = validator.Validate(seh, options);
540  CheckErrors(*eval, expected_errors);
541  */
542 
544 
549  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
550  "Latitude and longitude values appear to be exchanged"));
551  eval = validator.Validate(seh, options);
552  CheckErrors(*eval, expected_errors);
553 
555 }
556 
557 
558 void TestOneLatLonCountry(const string& country, const string& lat_lon, const string& error, bool use_state = false, const string& err_code = "LatLonCountry")
559 {
560  // prepare entry
564 
566 
567  if (use_state) {
569  }
570 
571  if (!error.empty()) {
572  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, err_code, error));
573  }
574  eval = validator.Validate(seh, options);
575  CheckErrors(*eval, expected_errors);
576 
577  if (!error.empty()) {
578  CValidErrorFormat format(*objmgr);
579  vector<string> expected;
580  expected.push_back("LatLonCountry Errors");
581  expected.push_back("lcl|good:" + error);
582  expected.push_back("");
583 
584  vector<string> seen;
585  vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
586  for (const string& it : cat_list) {
587  vector<string> sublist;
588  NStr::Split(it, "\n", sublist);
589  for (const string& sit : sublist) {
590  seen.push_back(sit);
591  }
592  }
593 
594  CheckStrings(seen, expected);
595  }
596 
598 }
599 
600 
602 {
603  TestOneLatLonCountry("Portugal", "37.7715 N 25.3097 W", "", true);
604 }
605 
606 
607 BOOST_AUTO_TEST_CASE(Test_Descr_LatLonCountry)
608 {
609  TestOneLatLonCountry("Romania", "46.5 N 20 E",
610  "Lat_lon '46.5 N 20 E' maps to 'Hungary' instead of 'Romania' - claimed region 'Romania' is at distance 45 km");
611  TestOneLatLonCountry("Romania", "34 N 65 E", "Lat_lon '34 N 65 E' maps to 'Afghanistan' instead of 'Romania'");
612  TestOneLatLonCountry("Romania", "48 N 15 E", "Lat_lon '48 N 15 E' maps to 'Austria' instead of 'Romania'");
613  TestOneLatLonCountry("Romania", "48 N 15 W", "Lat_lon '48 N 15 W' is in water 'Atlantic Ocean'", false, "LatLonWater");
614  // RW-1137 this had inconsistent behavior in production vs. development tests, possibly due to version skew in
615  // Puerto Rico cleanup code, so commenting out to avoid spurious error reports
616  /*
617  TestOneLatLonCountry("Puerto Rico: Rio Mameyes in Luquillo", "18.47 N 64.23000000000002 W",
618  "Lat_lon '18.47 N 64.23000000000002 W' is in water 'Caribbean Sea', 'Puerto Rico: Rio Mameyes in Luquillo' is 108 km away",
619  false, "LatLonWater");
620  */
621 
622 }
623 
624 
625 BOOST_AUTO_TEST_CASE(Test_ValidError_Format)
626 {
628 
629  // Create consensus splice problems
632  cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc->SetSeq().SetId().front()));
633  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[44] = 'A';
634  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[45] = 'G';
635  CRef<CSeq_feat> intron = unit_test_util::MakeIntronForMixLoc(nuc->SetSeq().SetId().front());
636  unit_test_util::AddFeat(intron, nuc);
637 
639  other_intron->SetData().SetImp().SetKey("intron");
641  gene->SetData().SetGene().SetLocus_tag("fake_locustag");
642  AddFeat(gene, nuc);
643 
644  // create EC number problems
645  CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();
646  prot->SetData().SetProt().SetEc().push_back("1.2.3.10");
647  prot->SetData().SetProt().SetEc().push_back("1.1.3.22");
648  prot->SetData().SetProt().SetEc().push_back("1.1.99.n");
649  prot->SetData().SetProt().SetEc().push_back("1.1.1.17");
650  prot->SetData().SetProt().SetEc().push_back("11.22.33.44");
651  prot->SetData().SetProt().SetEc().push_back("11.22.n33.44");
652  prot->SetData().SetProt().SetEc().push_back("11.22.33.n44");
653 
654 
655  // create bad institution code errors
659 
660  // create lat-lon country error
663 
665 
666  eval = validator.Validate(seh, options);
667 
668  CValidErrorFormat format(*objmgr);
669 
670  vector<string> expected;
671  expected.push_back("intron\tlcl|nuc\tGT at 17");
672  expected.push_back("intron\tlcl|nuc\tGT at 1");
673  expected.push_back("intron\tlcl|nuc\tAG at 11");
674  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
675  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
676  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
677  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
678  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
679  expected.push_back("CDS\tlcl|nuc\tGT at 16");
680  expected.push_back("lcl|nuc:Lat_lon '30 N 30 E' maps to 'Egypt' instead of 'Panama'");
681  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
682  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
683  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
684 
685  vector<string> seen;
686  for (CValidError_CI vit(*eval); vit; ++vit) {
687  string val = format.FormatForSubmitterReport(*vit, scope);
688  seen.push_back(val);
689  }
690  CheckStrings(seen, expected);
691 
692  expected.clear();
693  seen.clear();
694  for (CValidError_CI vit(*eval); vit; ++vit) {
695  seen.push_back(vit->GetErrCode());
696  }
697  expected.push_back("NotSpliceConsensusDonor");
698  expected.push_back("NotSpliceConsensusDonorTerminalIntron");
699  expected.push_back("NotSpliceConsensusAcceptor");
700  expected.push_back("DeletedEcNumber");
701  expected.push_back("ReplacedEcNumber");
702  expected.push_back("BadEcNumberValue");
703  expected.push_back("BadEcNumberFormat");
704  expected.push_back("BadEcNumberValue");
705  expected.push_back("NotSpliceConsensusDonor");
706  expected.push_back("LatLonCountry");
707  expected.push_back("BadInstitutionCode");
708  expected.push_back("BadInstitutionCode");
709  expected.push_back("BadInstitutionCode");
710  CheckStrings(seen, expected);
711 
712  seen.clear();
713  expected.clear();
714  vector<CValidErrItem::TErrIndex> codes = format.GetListOfErrorCodes(*eval);
715  for (CValidErrItem::TErrIndex it : codes) {
716  string val = CValidErrItem::ConvertErrCode(it);
717  seen.push_back(val);
718  }
719  expected.push_back("LatLonCountry");
720  expected.push_back("BadInstitutionCode");
721  expected.push_back("BadEcNumberFormat");
722  expected.push_back("BadEcNumberValue");
723  expected.push_back("NotSpliceConsensusDonor");
724  expected.push_back("NotSpliceConsensusAcceptor");
725  expected.push_back("DeletedEcNumber");
726  expected.push_back("ReplacedEcNumber");
727  expected.push_back("NotSpliceConsensusDonorTerminalIntron");
728  CheckStrings(seen, expected);
729 
730  string rval = format.FormatForSubmitterReport(*eval, scope, eErr_SEQ_FEAT_NotSpliceConsensusDonor);
731  expected.clear();
732  seen.clear();
733  NStr::Split(rval, "\n", seen);
734  expected.push_back("Not Splice Consensus");
735  expected.push_back("intron\tlcl|nuc\tGT at 17");
736  expected.push_back("CDS\tlcl|nuc\tGT at 16");
737  expected.push_back("");
738  CheckStrings(seen, expected);
739 
740  rval = format.FormatCategoryForSubmitterReport(*eval, scope, eSubmitterFormatErrorGroup_ConsensusSplice);
741  expected.clear();
742  seen.clear();
743  NStr::Split(rval, "\n", seen);
744  expected.push_back("Not Splice Consensus");
745  expected.push_back("intron\tlcl|nuc\tGT at 17");
746  expected.push_back("intron\tlcl|nuc\tGT at 1");
747  expected.push_back("intron\tlcl|nuc\tAG at 11");
748  expected.push_back("CDS\tlcl|nuc\tGT at 16");
749  expected.push_back("");
750  CheckStrings(seen, expected);
751 
752  expected.clear();
753  seen.clear();
754  vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
755  for (const string& it : cat_list) {
756  vector<string> sublist;
757  NStr::Split(it, "\n", sublist);
758  for (const string& sit : sublist) {
759  seen.push_back(sit);
760  }
761  }
762  expected.push_back("Not Splice Consensus");
763  expected.push_back("intron\tlcl|nuc\tGT at 17");
764  expected.push_back("intron\tlcl|nuc\tGT at 1");
765  expected.push_back("intron\tlcl|nuc\tAG at 11");
766  expected.push_back("CDS\tlcl|nuc\tGT at 16");
767  expected.push_back("");
768  expected.push_back("EC Number Format");
769  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
770  expected.push_back("");
771  expected.push_back("EC Number Value");
772  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
773  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
774  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
775  expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
776  expected.push_back("");
777  expected.push_back("Bad Institution Codes");
778  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
779  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
780  expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
781  expected.push_back("");
782  expected.push_back("LatLonCountry Errors");
783  expected.push_back("lcl|nuc:Lat_lon '30 N 30 E' maps to 'Egypt' instead of 'Panama'");
784  expected.push_back("");
785  CheckStrings(seen, expected);
786 }
787 
788 
789 BOOST_AUTO_TEST_CASE(Test_GB_6395)
790 {
791  // prepare entry
793  unit_test_util::SetTaxon(entry, 0);
794 
796 
797  eval = validator.Validate(seh, options);
798 
799  CValidErrorFormat format(*objmgr);
800  vector<string> expected;
801  vector<string> seen;
802 
803  vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
804  for (const string& it : cat_list) {
805  vector<string> sublist;
806  NStr::Split(it, "\n", sublist);
807  for (const string& sit : sublist) {
808  seen.push_back(sit);
809  }
810  }
811  expected.push_back("NoTaxonID");
812  expected.push_back("lcl|good:Sebaea microphylla");
813  expected.push_back("");
814 
815  CheckStrings(seen, expected);
816 }
817 
818 
819 BOOST_AUTO_TEST_CASE(Test_Descr_LatLonState)
820 {
821  // prepare entry
823  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, "USA: South Carolina");
825 
827 
828  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "LatLonState",
829  "Lat_lon '36 N 80 W' maps to 'USA: North Carolina' instead of 'USA: South Carolina' - claimed region 'USA: South Carolina' is at distance 130 km"));
830  // AddChromosomeNoLocation(expected_errors, "lcl|good");
832  eval = validator.Validate(seh, options);
833  CheckErrors(*eval, expected_errors);
834 
836 }
837 
838 
840 {
842  CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();
843  prot->SetData().SetProt().SetEc().push_back("1.2.3.10");
844  prot->SetData().SetProt().SetEc().push_back("1.1.3.22");
845  prot->SetData().SetProt().SetEc().push_back("1.1.99.n");
846  prot->SetData().SetProt().SetEc().push_back("1.1.1.17");
847  prot->SetData().SetProt().SetEc().push_back("11.22.33.44");
848  prot->SetData().SetProt().SetEc().push_back("11.22.n33.44");
849  prot->SetData().SetProt().SetEc().push_back("11.22.33.n44");
850  return entry;
851 }
852 
853 
854 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadEcNumberValue)
855 {
857  CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();
858 
860 
861  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "DeletedEcNumber",
862  "EC_number 1.2.3.10 was deleted"));
863  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "ReplacedEcNumber",
864  "EC_number 1.1.3.22 was transferred and is no longer valid"));
865  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "BadEcNumberValue",
866  "11.22.33.44 is not a legal value for qualifier EC_number"));
867  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "BadEcNumberFormat",
868  "11.22.n33.44 is not in proper EC_number format"));
869  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Info, "BadEcNumberValue",
870  "11.22.33.n44 is not a legal preliminary value for qualifier EC_number"));
871  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
872  eval = validator.Validate(seh, options);
873  CheckErrors(*eval, expected_errors);
874 
875  scope.RemoveTopLevelSeqEntry(seh);
876  prot->SetData().SetProt().ResetEc();
878  misc->SetData().SetImp().SetKey("exon");
879  misc->AddQualifier("EC_number", "1.2.3.10");
880  misc->AddQualifier("EC_number", "1.1.3.22");
881  misc->AddQualifier("EC_number", "1.1.99.n");
882  misc->AddQualifier("EC_number", "1.1.1.17");
883  misc->AddQualifier("EC_number", "11.22.33.44");
884  misc->AddQualifier("EC_number", "11.22.n33.44");
885  misc->AddQualifier("EC_number", "11.22.33.n44");
886  SetErrorsAccessions(expected_errors, "lcl|nuc");
887  expected_errors[1]->SetErrMsg("EC_number 1.1.3.22 was replaced");
888  seh = scope.AddTopLevelSeqEntry(*entry);
889  eval = validator.Validate(seh, options);
890  CheckErrors(*eval, expected_errors);
891 
893 }
894 
895 
896 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_InvalidQualifierValue)
897 {
900  misc->SetData().SetImp().SetKey("repeat_region");
901  misc->AddQualifier("rpt_unit_seq", "ATA");
902 
904 
905  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "RepeatSeqDoNotMatch",
906  "repeat_region /rpt_unit and underlying sequence do not match"));
907  // AddChromosomeNoLocation(expected_errors, "lcl|good");
908  eval = validator.Validate(seh, options);
909  CheckErrors(*eval, expected_errors);
910 
911  scope.RemoveTopLevelSeqEntry(seh);
913  misc = unit_test_util::AddMiscFeature(entry);
914  misc->SetData().SetImp().SetKey("repeat_region");
915  misc->AddQualifier("rpt_unit_seq", "ATAGTGATAGTG");
916  seh = scope.AddTopLevelSeqEntry(*entry);
917  expected_errors[0]->SetErrCode("InvalidRepeatUnitLength");
918  expected_errors[0]->SetErrMsg("Length of rpt_unit_seq is greater than feature length");
919  expected_errors[0]->SetSeverity(eDiag_Info);
920  eval = validator.Validate(seh, options);
921  CheckErrors(*eval, expected_errors);
922 
924 }
925 
926 
// Test_SEQ_INST_ExtNotAllowed: walks one Bioseq through the CSeq_inst repr
// values (virtual, raw, const, map, ref, consen, not_set, other, delta),
// setting or resetting Seq-ext and Seq-data along the way. After every change
// the entry is re-validated and the result is compared against a single
// expected error whose code, message and severity are updated in place.
// NOTE(review): the declarations of entry, seh, validator, options, eval and
// expected_errors are not visible in this excerpt (listing lines 929 and 931
// are missing) -- presumably provided by setup code; confirm in the full file.
927 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ExtNotAllowed)
928 {
930 
932 
933  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "ExtNotAllowed", "Bioseq-ext not allowed on virtual Bioseq"));
934  // AddChromosomeNoLocation(expected_errors, "lcl|good");
935 
936  // repr = virtual
937  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_virtual);
938  entry->SetSeq().SetInst().ResetSeq_data();
939  entry->SetSeq().SetInst().SetExt().SetDelta();
940  eval = validator.Validate(seh, options);
941  CheckErrors(*eval, expected_errors);
942 
943  // repr = raw
944  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_raw);
945  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
946  expected_errors[0]->SetErrMsg("Bioseq-ext not allowed on raw Bioseq");
947  eval = validator.Validate(seh, options);
948  CheckErrors(*eval, expected_errors);
949 
950  entry->SetSeq().SetInst().ResetExt();
951  entry->SetSeq().SetInst().ResetSeq_data();
952  expected_errors[0]->SetErrCode("SeqDataNotFound");
953  expected_errors[0]->SetErrMsg("Missing Seq-data on raw Bioseq");
954  expected_errors[0]->SetSeverity(eDiag_Critical);
955  eval = validator.Validate(seh, options);
956  CheckErrors(*eval, expected_errors);
957 
958  entry->SetSeq().SetInst().SetSeq_data().SetGap(); // gap-only Seq-data: same expected error as above
959  eval = validator.Validate(seh, options);
960  CheckErrors(*eval, expected_errors);
961 
962  // repr = const
963  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_const);
964  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
965  entry->SetSeq().SetInst().SetExt().SetDelta();
966  expected_errors[0]->SetErrCode("ExtNotAllowed");
967  expected_errors[0]->SetErrMsg("Bioseq-ext not allowed on constructed Bioseq");
968  eval = validator.Validate(seh, options);
969  CheckErrors(*eval, expected_errors);
970 
971  entry->SetSeq().SetInst().ResetExt();
972  entry->SetSeq().SetInst().ResetSeq_data();
973  expected_errors[0]->SetErrCode("SeqDataNotFound");
974  expected_errors[0]->SetErrMsg("Missing Seq-data on constructed Bioseq");
975  expected_errors[0]->SetSeverity(eDiag_Critical);
976  eval = validator.Validate(seh, options);
977  CheckErrors(*eval, expected_errors);
978 
979  entry->SetSeq().SetInst().SetSeq_data().SetGap(); // gap-only Seq-data: same expected error as above
980  eval = validator.Validate(seh, options);
981  CheckErrors(*eval, expected_errors);
982 
983  // repr = map
984  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_map);
985  entry->SetSeq().SetInst().ResetSeq_data();
986  expected_errors[0]->SetErrCode("ExtBadOrMissing");
987  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on map Bioseq");
988  expected_errors[0]->SetSeverity(eDiag_Error);
989  eval = validator.Validate(seh, options);
990  CheckErrors(*eval, expected_errors);
991 
992  entry->SetSeq().SetInst().SetExt().SetDelta(); // delta ext on a map Bioseq: same error still expected
993  eval = validator.Validate(seh, options);
994  CheckErrors(*eval, expected_errors);
995 
996  entry->SetSeq().SetInst().SetExt().SetRef(); // ref ext on a map Bioseq: same error still expected
997  eval = validator.Validate(seh, options);
998  CheckErrors(*eval, expected_errors);
999 
1000  entry->SetSeq().SetInst().SetExt().SetMap();
1001  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
1002  expected_errors[0]->SetErrCode("SeqDataNotAllowed");
1003  expected_errors[0]->SetErrMsg("Seq-data not allowed on map Bioseq");
1004  eval = validator.Validate(seh, options);
1005  CheckErrors(*eval, expected_errors);
1006 
1007 
1008  // repr = ref
1009  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_ref);
1010  entry->SetSeq().SetInst().ResetExt();
1011  entry->SetSeq().SetInst().ResetSeq_data();
1012  expected_errors[0]->SetErrCode("ExtBadOrMissing");
1013  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on reference Bioseq");
1014  eval = validator.Validate(seh, options);
1015  CheckErrors(*eval, expected_errors);
1016 
1017  /*
1018  // repr = seg
1019  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
1020  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on seg Bioseq");
1021  eval = validator.Validate(seh, options);
1022  CheckErrors(*eval, expected_errors);
1023  */
1024 
1025  // repr = consen
1026  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_consen);
1027  expected_errors[0]->SetSeverity(eDiag_Critical);
1028  expected_errors[0]->SetErrCode("ReprInvalid");
1029  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 6");
1030  eval = validator.Validate(seh, options);
1031  CheckErrors(*eval, expected_errors);
1032 
1033  // repr = notset
1034  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_not_set);
1035  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 0");
1036  eval = validator.Validate(seh, options);
1037  CheckErrors(*eval, expected_errors);
1038 
1039  // repr = other
1040  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_other);
1041  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 255");
1042  eval = validator.Validate(seh, options);
1043  CheckErrors(*eval, expected_errors);
1044 
1045  // repr = delta
1046  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
1047  entry->SetSeq().SetInst().SetExt().SetDelta();
1048  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
1049  expected_errors[0]->SetSeverity(eDiag_Error);
1050  expected_errors[0]->SetErrCode("SeqDataNotAllowed");
1051  expected_errors[0]->SetErrMsg("Seq-data not allowed on delta Bioseq");
1052  eval = validator.Validate(seh, options);
1053  CheckErrors(*eval, expected_errors);
1054 
1055  entry->SetSeq().SetInst().ResetExt();
1056  entry->SetSeq().SetInst().ResetSeq_data();
1057  expected_errors[0]->SetSeverity(eDiag_Error);
1058  expected_errors[0]->SetErrCode("ExtBadOrMissing");
1059  expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on delta Bioseq");
1060  eval = validator.Validate(seh, options);
1061  CheckErrors(*eval, expected_errors);
1062 
1063  CLEAR_ERRORS
1064 }
1065 
1066 
// Test_SEQ_INST_ReprInvalid: sets the Bioseq repr to not_set, other and consen
// in turn and expects a Critical "ReprInvalid" error each time, with the
// message carrying the numeric repr value (0, 255 and 6 respectively).
// NOTE(review): the setup that declares entry, seh, validator, options, eval
// and expected_errors is not visible in this excerpt; confirm in the full file.
1067 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ReprInvalid)
1068 {
1070 
1072  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "ReprInvalid", "Invalid Bioseq->repr = 0"));
1073  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1074 
1075  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_not_set);
1076  eval = validator.Validate(seh, options);
1077  CheckErrors(*eval, expected_errors);
1078 
1079  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 255");
1080  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_other);
1081  eval = validator.Validate(seh, options);
1082  CheckErrors(*eval, expected_errors);
1083 
1084  expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 6");
1085  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_consen);
1086  eval = validator.Validate(seh, options);
1087  CheckErrors(*eval, expected_errors);
1088 
1089  CLEAR_ERRORS
1090 }
1091 
1092 
// Test_CollidingLocusTags: deserializes a Seq-entry from ASN.1 text, adds it to
// a fresh scope, validates it, and compares the result against nine expected
// errors reported on lcl|LocusCollidesWithLocusTag.
// NOTE(review): the declarations of istr and objmgr and the remainder of the
// options expression are missing from this excerpt (listing lines 1097, 1101
// and 1109-1111); istr presumably wraps sc_TestEntryCollidingLocusTags --
// confirm in the full file.
1093 BOOST_AUTO_TEST_CASE(Test_CollidingLocusTags)
1094 {
1095  CRef<CSeq_entry> entry(new CSeq_entry());
1096  {
1098  istr >> MSerial_AsnText >> *entry;
1099  }
1100 
1102  CScope scope(*objmgr);
1103  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
1104 
1105  CValidator validator(*objmgr);
1106 
1107  // Set validator options
1108  unsigned int options = CValidator::eVal_need_isojta
1112 
1113  // list of expected errors
1114  vector<CExpectedError*> expected_errors;
1115  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Warning, "TerminalNs", "N at end of sequence"));
1116  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Warning, "GeneLocusCollidesWithLocusTag", "locus collides with locus_tag in another gene"));
1117  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "CollidingLocusTags", "Colliding locus_tags in gene features"));
1118  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "CollidingLocusTags", "Colliding locus_tags in gene features"));
1119  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "NoMolInfoFound", "No Mol-info applies to this Bioseq"));
1120  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "LocusTagGeneLocusMatch", "Gene locus and locus_tag 'foo' match"));
1121  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "NoPubFound", "No publications anywhere on this entire record."));
1122  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Info, "MissingPubRequirement", "No submission citation anywhere on this entire record."));
1123  expected_errors.push_back(new CExpectedError("lcl|LocusCollidesWithLocusTag", eDiag_Error, "NoSourceDescriptor", "No source information included on this record."));
1124 
1125  CConstRef<CValidError> eval = validator.Validate(seh, options);
1126  CheckErrors(*eval, expected_errors);
1127 
1128  CLEAR_ERRORS
1129 }
1130 
1131 
// ASN.1 text for a Seq-entry holding one raw DNA Bioseq of length 24 with local
// id "LocusCollidesWithLocusTag" and four gene features whose locus/locus-tag
// pairs are foo/foo, bar/foo, bar/baz and quux/baz, on the plus strand at
// 0-4, 5-9, 10-14 and 15-19. The literal is built with backslash line
// continuations, so no comments can be placed inside it.
1132 const string sc_TestEntryCollidingLocusTags = "Seq-entry ::= seq {\
1133  id {\
1134  local str \"LocusCollidesWithLocusTag\" } ,\
1135  inst {\
1136  repr raw ,\
1137  mol dna ,\
1138  length 24 ,\
1139  seq-data\
1140  iupacna \"AATTGGCCAANNAATTGGCCAANN\" } ,\
1141  annot {\
1142  {\
1143  data\
1144  ftable {\
1145  {\
1146  data\
1147  gene {\
1148  locus \"foo\" ,\
1149  locus-tag \"foo\" } ,\
1150  location\
1151  int {\
1152  from 0 ,\
1153  to 4 ,\
1154  strand plus ,\
1155  id\
1156  local str \"LocusCollidesWithLocusTag\" } } ,\
1157  {\
1158  data\
1159  gene {\
1160  locus \"bar\" ,\
1161  locus-tag \"foo\" } ,\
1162  location\
1163  int {\
1164  from 5 ,\
1165  to 9 ,\
1166  strand plus ,\
1167  id\
1168  local str \"LocusCollidesWithLocusTag\" } } ,\
1169  {\
1170  data\
1171  gene {\
1172  locus \"bar\" ,\
1173  locus-tag \"baz\" } ,\
1174  location\
1175  int {\
1176  from 10 ,\
1177  to 14 ,\
1178  strand plus ,\
1179  id\
1180  local str \"LocusCollidesWithLocusTag\" } } ,\
1181  {\
1182  data\
1183  gene {\
1184  locus \"quux\" ,\
1185  locus-tag \"baz\" } ,\
1186  location\
1187  int {\
1188  from 15 ,\
1189  to 19 ,\
1190  strand plus ,\
1191  id\
1192  local str \"LocusCollidesWithLocusTag\" } } } } } }\
1193 ";
1194 
1195 
// Test_SEQ_INST_CircularProtein: expects the "CircularProtein" error while the
// topology is circular, tandem or other, then clears the expectations and
// checks that not_set and linear topologies are validated against an empty
// expected-error list.
// NOTE(review): entry construction and setup are not visible in this excerpt
// (listing lines 1198, 1200 and 1205 are missing); confirm in the full file.
1196 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_CircularProtein)
1197 {
1199 
1201 
1202  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "CircularProtein", "Non-linear topology set on protein"));
1203  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1204 
1206 
1207  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
1208  eval = validator.Validate(seh, options);
1209  CheckErrors(*eval, expected_errors);
1210 
1211  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_tandem);
1212  eval = validator.Validate(seh, options);
1213  CheckErrors(*eval, expected_errors);
1214 
1215  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_other);
1216  eval = validator.Validate(seh, options);
1217  CheckErrors(*eval, expected_errors);
1218 
1219  // should be no error for not set or linear
1220  CLEAR_ERRORS
1221 
1222  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_not_set);
1223  eval = validator.Validate(seh, options);
1224  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1225  CheckErrors(*eval, expected_errors);
1226 
1227  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_linear);
1228  eval = validator.Validate(seh, options);
1229  CheckErrors(*eval, expected_errors);
1230 
1231  CLEAR_ERRORS
1232 }
1233 
1234 
// Test_BadProteinMoltype: expects the "BadProteinMoltype" error while the
// strand is ds, mixed or other, then clears the expectations and validates
// again with the strand set to not_set and to ss.
// NOTE(review): entry construction and setup are not visible in this excerpt
// (listing lines 1237 and 1239 are missing); confirm in the full file.
1235 BOOST_AUTO_TEST_CASE(Test_BadProteinMoltype)
1236 {
1238 
1240 
1241  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadProteinMoltype", "Protein not single stranded"));
1242  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1243 
1244  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ds);
1245  eval = validator.Validate(seh, options);
1246  CheckErrors(*eval, expected_errors);
1247 
1248  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_mixed);
1249  eval = validator.Validate(seh, options);
1250  CheckErrors(*eval, expected_errors);
1251 
1252  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_other);
1253  eval = validator.Validate(seh, options);
1254  CheckErrors(*eval, expected_errors);
1255 
1256  // no errors expected for not set or single strand
1257  CLEAR_ERRORS
1258 
1259  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1260 
1261  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_not_set);
1262  eval = validator.Validate(seh, options);
1263  CheckErrors(*eval, expected_errors);
1264 
1265  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ss);
1266  eval = validator.Validate(seh, options);
1267  CheckErrors(*eval, expected_errors);
1268 
1269  CLEAR_ERRORS
1270 }
1271 
1272 
// Test_SEQ_INST_MolNotSet: sets Bioseq.mol to not_set, other and na in turn and
// expects "MolNotSet", "MolOther" and "MolNuclAcid" respectively, reusing one
// expected-error object whose code and message are updated in place.
// NOTE(review): entry construction and setup are not visible in this excerpt
// (listing lines 1275 and 1277 are missing); confirm in the full file.
1273 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_MolNotSet)
1274 {
1276 
1278 
1279  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MolNotSet", "Bioseq.mol is 0"));
1280  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1281 
1282  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_not_set);
1283  eval = validator.Validate(seh, options);
1284  CheckErrors(*eval, expected_errors);
1285 
1286  expected_errors[0]->SetErrCode("MolOther");
1287  expected_errors[0]->SetErrMsg("Bioseq.mol is type other");
1288  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_other);
1289  eval = validator.Validate(seh, options);
1290  CheckErrors(*eval, expected_errors);
1291 
1292  expected_errors[0]->SetErrCode("MolNuclAcid");
1293  expected_errors[0]->SetErrMsg("Bioseq.mol is type nucleic acid");
1294  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_na);
1295  eval = validator.Validate(seh, options);
1296  CheckErrors(*eval, expected_errors);
1297 
1298  CLEAR_ERRORS
1299 }
1300 
1301 
// Test_SEQ_INST_FuzzyLen: sets a fuzz on the Bioseq length and expects
// "FuzzyLen" for the initial repr ("raw" per the expected message) and again
// after switching to repr const. Once the Seq-data is replaced by a gap, the
// expectation changes to a Critical "SeqDataNotFound" error instead.
// NOTE(review): entry construction and setup are not visible in this excerpt
// (listing lines 1304 and 1306 are missing); confirm in the full file.
1302 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_FuzzyLen)
1303 {
1305 
1307 
1308  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "FuzzyLen", "Fuzzy length on raw Bioseq"));
1309  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1310 
1311  entry->SetSeq().SetInst().SetFuzz();
1312  eval = validator.Validate(seh, options);
1313  CheckErrors(*eval, expected_errors);
1314 
1315  expected_errors[0]->SetErrMsg("Fuzzy length on const Bioseq");
1316  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_const);
1317  eval = validator.Validate(seh, options);
1318  CheckErrors(*eval, expected_errors);
1319 
1320  // shouldn't get fuzzy length if gap
1321  expected_errors[0]->SetErrCode("SeqDataNotFound");
1322  expected_errors[0]->SetErrMsg("Missing Seq-data on constructed Bioseq");
1323  expected_errors[0]->SetSeverity(eDiag_Critical);
1324  entry->SetSeq().SetInst().SetSeq_data().SetGap();
1325  eval = validator.Validate(seh, options);
1326  CheckErrors(*eval, expected_errors);
1327 
1328  CLEAR_ERRORS
1329 }
1330 
1331 
// Test_SEQ_INST_InvalidAlphabet: first gives prot_entry each nucleotide
// Seq-data encoding (iupacna, ncbi2na, ncbi4na, ncbi8na, ncbipna) and expects
// a Critical "InvalidAlphabet" error every time. It then adds a second entry
// in its own scope, gives it each protein encoding (iupacaa, ncbi8aa, ncbieaa,
// ncbipaa, ncbistdaa) and expects the same error code with the
// "protein alphabet on a nucleic acid" message.
// NOTE(review): the construction of prot_entry, entry and objmgr and the rest
// of the options expression are missing from this excerpt (listing lines 1334,
// 1336, 1345-1347 and 1373); confirm in the full file.
1332 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_InvalidAlphabet)
1333 {
1335 
1337  CScope scope(*objmgr);
1338  scope.AddDefaults();
1339  CSeq_entry_Handle prot_seh = scope.AddTopLevelSeqEntry(*prot_entry);
1340 
1341  CValidator validator(*objmgr);
1342 
1343  // Set validator options
1344  unsigned int options = CValidator::eVal_need_isojta
1348 
1349  // list of expected errors
1350  vector<CExpectedError*> expected_errors;
1351  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidAlphabet", "Using a nucleic acid alphabet on a protein sequence"));
1352  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1353  prot_entry->SetSeq().SetInst().SetSeq_data().SetIupacna();
1354  CConstRef<CValidError> eval = validator.Validate(prot_seh, options);
1355  CheckErrors(*eval, expected_errors);
1356 
1357  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi2na();
1358  eval = validator.Validate(prot_seh, options);
1359  CheckErrors(*eval, expected_errors);
1360 
1361  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi4na();
1362  eval = validator.Validate(prot_seh, options);
1363  CheckErrors(*eval, expected_errors);
1364 
1365  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi8na();
1366  eval = validator.Validate(prot_seh, options);
1367  CheckErrors(*eval, expected_errors);
1368 
1369  prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbipna();
1370  eval = validator.Validate(prot_seh, options);
1371  CheckErrors(*eval, expected_errors);
1372 
1374  CScope scope2(*objmgr);
1375  scope2.AddDefaults();
1376  CSeq_entry_Handle seh = scope2.AddTopLevelSeqEntry(*entry);
1377 
1378  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa();
1379  expected_errors[0]->SetErrMsg("Using a protein alphabet on a nucleic acid");
1380 
1381  eval = validator.Validate(seh, options);
1382  CheckErrors(*eval, expected_errors);
1383 
1384  entry->SetSeq().SetInst().SetSeq_data().SetNcbi8aa();
1385  eval = validator.Validate(seh, options);
1386  CheckErrors(*eval, expected_errors);
1387 
1388  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa();
1389  eval = validator.Validate(seh, options);
1390  CheckErrors(*eval, expected_errors);
1391 
1392  entry->SetSeq().SetInst().SetSeq_data().SetNcbipaa();
1393  eval = validator.Validate(seh, options);
1394  CheckErrors(*eval, expected_errors);
1395 
1396  entry->SetSeq().SetInst().SetSeq_data().SetNcbistdaa();
1397  eval = validator.Validate(seh, options);
1398  CheckErrors(*eval, expected_errors);
1399 
1400  CLEAR_ERRORS
1401 }
1402 
1403 
// Test_SEQ_INST_InvalidResidue: checks "InvalidResidue" reporting in stages:
//  1. an IUPACna sequence of two A-Z alphabets followed by 13 bytes
//     0xFB-0xFF, validated with the initial mol setting;
//  2. the same data with mol = rna, where the two 'U' expectations are dropped;
//  3. mol = aa with IUPACaa data and a protein feature, where all per-letter
//     nucleotide expectations are dropped;
//  4. lower-case sequences (nucleotide, then "protein" as IUPACaa);
//  5. a delta sequence with one invalid nucleotide literal;
//  6. a delta sequence with an IUPACaa literal of digits.
// NOTE(review): the initial entry construction and setup, and the entry
// rebuilds before stages 4b and 6, are missing from this excerpt (listing
// lines 1406, 1408, 1527 and 1577); confirm in the full file.
1404 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_InvalidResidue)
1405 {
1407 
1409 
1410  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1411  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1412  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1413  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1414  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1415  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1416  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1417  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1418  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1419  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1420  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFE');
1421  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFE');
1422  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFF');
1423  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFF');
1424  entry->SetSeq().SetInst().SetLength(65); // 52 letters + 13 appended bytes
1425  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'E' at position [5]"));
1426  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'F' at position [6]"));
1427  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'I' at position [9]"));
1428  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'J' at position [10]"));
1429  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'L' at position [12]"));
1430  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'O' at position [15]"));
1431  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'P' at position [16]"));
1432  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Q' at position [17]"));
1433  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'U' at position [21]"));
1434  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'X' at position [24]"));
1435  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Z' at position [26]"));
1436  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'E' at position [31]"));
1437  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'F' at position [32]"));
1438  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'I' at position [35]"));
1439  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'J' at position [36]"));
1440  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'L' at position [38]"));
1441  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'O' at position [41]"));
1442  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'P' at position [42]"));
1443  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Q' at position [43]"));
1444  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'U' at position [47]"));
1445  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'X' at position [50]"));
1446  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Z' at position [52]"));
1447  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [53]"));
1448  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [54]"));
1449  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [55]"));
1450  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [56]"));
1451  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [57]"));
1452  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [58]"));
1453  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [59]"));
1454  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [60]"));
1455  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [61]"));
1456  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [254] at position [62]"));
1457  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "More than 10 invalid residues. Checking stopped"));
1458  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Fatal, "NonAsciiAsn", "Non-ASCII character '251' found in item"));
1459  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1460 
1461  eval = validator.Validate(seh, options);
1462  CheckErrors(*eval, expected_errors);
1463 
1464  // now repeat test, but with mRNA - this time Us should not be reported
1465  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
1466  delete expected_errors[8]; // index 8 is the 'U' at position [21] expectation
1467  expected_errors[8] = nullptr;
1468  delete expected_errors[19]; // index 19 is the 'U' at position [47] expectation
1469  expected_errors[19] = nullptr;
1470  eval = validator.Validate(seh, options);
1471  CheckErrors(*eval, expected_errors);
1472 
1473  // now repeat test, but with protein
1474  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_aa);
1475  for (auto& it : entry->SetSeq().SetDescr().Set()) {
1476  if (it->IsMolinfo()) {
1477  it->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
1478  }
1479  }
1480  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1481  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1482  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1483  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1484  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1485  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1486  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1487  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1488  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1489  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1490  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFE');
1491  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFE');
1492  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFF');
1493  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFF');
1494  entry->SetSeq().SetInst().SetLength(65);
1495  CRef<CSeq_feat> feat(new CSeq_feat());
1496  feat->SetData().SetProt().SetName().push_back("fake protein name");
1497  feat->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
1498  feat->SetLocation().SetInt().SetFrom(0);
1499  feat->SetLocation().SetInt().SetTo(64);
1500  unit_test_util::AddFeat(feat, entry);
1501  scope.RemoveEntry(*entry);
1502  seh = scope.AddTopLevelSeqEntry(*entry);
1503 
1504  for (int j = 0; j < 22; j++) { // entries 0..21 are the per-letter "Invalid nucleotide residue" expectations
1505  if (expected_errors[j]) {
1506  delete expected_errors[j];
1507  expected_errors[j] = nullptr;
1508  }
1509  }
1510  eval = validator.Validate(seh, options);
1511  CheckErrors(*eval, expected_errors);
1512 
1513  CLEAR_ERRORS
1514 
1515  // now look for lowercase characters
1516  scope.RemoveEntry(*entry);
1517  entry = unit_test_util::BuildGoodSeq();
1518  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("abcdefghijklmnopqrstuvwxyz");
1519  entry->SetSeq().SetInst().SetLength(26);
1520  seh = scope.AddTopLevelSeqEntry(*entry);
1521  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Sequence contains lower-case characters"));
1522  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1523  eval = validator.Validate(seh, options);
1524  CheckErrors(*eval, expected_errors);
1525 
1526  scope.RemoveEntry(*entry);
1528  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("protein");
1529  seh = scope.AddTopLevelSeqEntry(*entry);
1530  eval = validator.Validate(seh, options);
1531  CheckErrors(*eval, expected_errors);
1532 
1533  CLEAR_ERRORS
1534 
1535  // now try delta sequence
1536  scope.RemoveEntry(*entry);
1537  entry = unit_test_util::BuildGoodSeq();
1538  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
1539  entry->SetSeq().SetInst().ResetSeq_data();
1540  CRef<CDelta_seq> seg(new CDelta_seq());
1541  seg->SetLiteral().SetSeq_data().SetIupacna().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1542  seg->SetLiteral().SetLength(52);
1543  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(seg);
1544  entry->SetSeq().SetInst().SetLength(52);
1545  seh = scope.AddTopLevelSeqEntry(*entry);
1546 
1547  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [E] at position [5]"));
1548  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [F] at position [6]"));
1549  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [I] at position [9]"));
1550  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [J] at position [10]"));
1551  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [L] at position [12]"));
1552  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [O] at position [15]"));
1553  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [P] at position [16]"));
1554  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Q] at position [17]"));
1555  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [U] at position [21]"));
1556  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [X] at position [24]"));
1557  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Z] at position [26]"));
1558  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [E] at position [31]"));
1559  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [F] at position [32]"));
1560  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [I] at position [35]"));
1561  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [J] at position [36]"));
1562  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [L] at position [38]"));
1563  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [O] at position [41]"));
1564  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [P] at position [42]"));
1565  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Q] at position [43]"));
1566  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [U] at position [47]"));
1567  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [X] at position [50]"));
1568  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Z] at position [52]"));
1569  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1570  eval = validator.Validate(seh, options);
1571  CheckErrors(*eval, expected_errors);
1572 
1573  CLEAR_ERRORS
1574 
1575  // try protein delta sequence
1576  scope.RemoveEntry(*entry);
1578  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
1579  entry->SetSeq().SetInst().ResetSeq_data();
1580  CRef<CDelta_seq> seg2(new CDelta_seq());
1581  seg2->SetLiteral().SetSeq_data().SetIupacaa().Set("1234567");
1582  seg2->SetLiteral().SetLength(7);
1583  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(seg2);
1584  entry->SetSeq().SetInst().SetLength(7);
1585  seh = scope.AddTopLevelSeqEntry(*entry);
1586 
1587  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [1] at position [1]"));
1588  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [2] at position [2]"));
1589  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [3] at position [3]"));
1590  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [4] at position [4]"));
1591  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [5] at position [5]"));
1592  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [6] at position [6]"));
1593  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [7] at position [7]"));
1594  // AddChromosomeNoLocation(expected_errors, "lcl|good");
1595 
1596  eval = validator.Validate(seh, options);
1597  CheckErrors(*eval, expected_errors);
1598 
1599  CLEAR_ERRORS
1600 }
1601 
1602 
1603 /*
1604 static void WriteOutTemp(CRef<CSeq_entry> entry)
1605 {
1606  // construct a temp file name
1607  CNcbiOstrstream oss;
1608  oss << "test.asn";
1609  string filename = CNcbiOstrstreamToString(oss);
1610  string fullPath = CDirEntry::MakePath(".", filename);
1611 
1612  // initialize an output stream (opened with IOS_BASE::out only, i.e. not in binary mode)
1613  unique_ptr<CNcbiOstream> outStream;
1614  outStream.reset(new CNcbiOfstream(
1615  fullPath.c_str(),
1616  IOS_BASE::out));
1617  if (!(*outStream)) {
1618  return;
1619  }
1620 
1621  unique_ptr<CObjectOStream> outObject;
1622  // Associate ASN.1 text serialization methods with the output stream
1623  outObject.reset(new CObjectOStreamAsn(*outStream));
1624 
1625  // write the asn data
1626  try {
1627  *outObject << *entry;
1628  outStream->flush();
1629  } catch (exception&) {
1630  }
1631 }
1632 */
1633 
1634 
// Test case: validator reporting for a coding region whose protein product
// contains stop symbols ('*'). Three validation passes are made: with an
// exception flag and exception text set on the CDS, with the exception
// removed, and after the nucleotide sequence is replaced by one that begins
// with ATG instead of GTG.
// NOTE(review): no declarations of entry, cds, nuc, scope, seh, validator,
// options, eval or expected_errors are visible in this listing (the embedded
// line numbers skip 1637, 1639, 1643 and 1678) -- presumably they come from
// setup lines that are not shown here; confirm against the full file.
1635 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_StopInProtein)
1636 {
1638 
1640 
      // Protein data holds three '*' symbols; the nucleotide data starts with GTG.
1641  entry->SetSet().SetSeq_set().back()->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MP*K*E*N");
1642  entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("GTGCCCTAAAAATAAGAGTAAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
      // Pass 1: flag the CDS with an exception and an exception text.
1644  cds->SetExcept(true);
1645  cds->SetExcept_text("unclassified translation discrepancy");
1646 
1647  BOOST_CHECK_EQUAL(validator::HasStopInProtein(*cds, scope), true);
1648  BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
1649 
1650  // list of expected errors
1651  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein", "[3] termination symbols in protein sequence (gene? - fake protein name)"));
1652  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "ExceptionProblem", "unclassified translation discrepancy is not a legal exception explanation"));
1653  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InternalStop", "3 internal stops (and illegal start codon). Genetic code [0]"));
1654  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
1655  "CDS has unnecessary translated product replaced exception"));
1656  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
1657 
1658  eval = validator.Validate(seh, options);
1659  CheckErrors(*eval, expected_errors);
1660  // WriteOutTemp(entry);
1661 
      // Pass 2: same data with the exception flag and text removed.
1662  CLEAR_ERRORS
1663  cds->ResetExcept();
1664  cds->ResetExcept_text();
1665  BOOST_CHECK_EQUAL(validator::HasStopInProtein(*cds, scope), true);
1666  BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
1667  BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
1668 
1669  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein", "[3] termination symbols in protein sequence (gene? - fake protein name)"));
1670  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StartCodon", "Illegal start codon (and 3 internal stops). Probably wrong genetic code [0]"));
1671  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "InternalStop", "3 internal stops (and illegal start codon). Genetic code [0]"));
1672  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
1673 
1674  eval = validator.Validate(seh, options);
1675  CheckErrors(*eval, expected_errors);
1676  // WriteOutTemp(entry);
1677 
      // Pass 3: same nucleotide string except that it now starts with ATG.
1679  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCTAAAAATAAGAGTAAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
1680 
1681  // write out seq-entry
1682  // WriteOutTemp(entry);
1683 
      // Remove the second expectation and reword the third one.
      // NOTE(review): presumably CLEAR_ERRORS above emptied the list, making
      // index 1 the StartCodon entry and index 2 the InternalStop entry, and
      // CheckErrors tolerates the nullptr slot -- confirm both.
1684  delete expected_errors[1];
1685  expected_errors[1] = nullptr;
1686  expected_errors[2]->SetErrMsg("3 internal stops. Genetic code [0]");
1687  eval = validator.Validate(seh, options);
1688  CheckErrors(*eval, expected_errors);
1689 
1690  CLEAR_ERRORS
1691 }
1692 
1693 
// Test case with no active statements: everything between the braces sits
// inside #if 0 / #endif, so the preprocessor removes it and the test body is
// empty. The in-code comment gives the reason: segmented sets are no longer
// of interest.
// The disabled code builds a segmented sequence from two locations and checks
// PartialInconsistent messages for combinations of partial start/stop flags.
// NOTE(review): the embedded line numbers skip several lines inside the
// disabled region (e.g. 1698, 1700, 1729, 1745, 1761), so the disabled code
// shown here is not complete.
1694 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_PartialInconsistent)
1695 {
1696 #if 0
1697  //We don't care about segmented sets any more
1699 
1701 
1702  entry->SetSeq().SetInst().ResetSeq_data();
1703  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
1704  CRef<CSeq_id> id(new CSeq_id("gb|AY123456"));
1705  CRef<CSeq_loc> loc1(new CSeq_loc(*id, 0, 3));
1706  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(loc1);
1707  CRef<CSeq_id> id2(new CSeq_id("gb|AY123457"));
1708  CRef<CSeq_loc> loc2(new CSeq_loc(*id2, 0, 2));
1709  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(loc2);
1710 
1711  // list of expected errors
1712  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "PartialInconsistent", "Partial segmented sequence without MolInfo partial"));
1713 
1714  // not-set
1715  loc1->SetPartialStart(true, eExtreme_Biological);
1716  loc2->SetPartialStop(true, eExtreme_Biological);
1717  eval = validator.Validate(seh, options);
1718  CheckErrors(*eval, expected_errors);
1719  loc1->SetPartialStart(true, eExtreme_Biological);
1720  loc2->SetPartialStop(false, eExtreme_Biological);
1721  eval = validator.Validate(seh, options);
1722  CheckErrors(*eval, expected_errors);
1723  loc1->SetPartialStart(false, eExtreme_Biological);
1724  loc2->SetPartialStop(true, eExtreme_Biological);
1725  eval = validator.Validate(seh, options);
1726  CheckErrors(*eval, expected_errors);
1727 
1728  // unknown
1730 
1731  loc1->SetPartialStart(true, eExtreme_Biological);
1732  loc2->SetPartialStop(true, eExtreme_Biological);
1733  eval = validator.Validate(seh, options);
1734  CheckErrors(*eval, expected_errors);
1735  loc1->SetPartialStart(true, eExtreme_Biological);
1736  loc2->SetPartialStop(false, eExtreme_Biological);
1737  eval = validator.Validate(seh, options);
1738  CheckErrors(*eval, expected_errors);
1739  loc1->SetPartialStart(false, eExtreme_Biological);
1740  loc2->SetPartialStop(true, eExtreme_Biological);
1741  eval = validator.Validate(seh, options);
1742  CheckErrors(*eval, expected_errors);
1743 
1744  // complete
1746 
1747  loc1->SetPartialStart(true, eExtreme_Biological);
1748  loc2->SetPartialStop(true, eExtreme_Biological);
1749  eval = validator.Validate(seh, options);
1750  CheckErrors(*eval, expected_errors);
1751  loc1->SetPartialStart(true, eExtreme_Biological);
1752  loc2->SetPartialStop(false, eExtreme_Biological);
1753  eval = validator.Validate(seh, options);
1754  CheckErrors(*eval, expected_errors);
1755  loc1->SetPartialStart(false, eExtreme_Biological);
1756  loc2->SetPartialStop(true, eExtreme_Biological);
1757  eval = validator.Validate(seh, options);
1758  CheckErrors(*eval, expected_errors);
1759 
1760  // partial
1762 
1763  loc1->SetPartialStart(false, eExtreme_Biological);
1764  loc2->SetPartialStop(false, eExtreme_Biological);
1765  expected_errors[0]->SetErrMsg("Complete segmented sequence with MolInfo partial");
1766  eval = validator.Validate(seh, options);
1767  CheckErrors(*eval, expected_errors);
1768 
1769  // no-left
1771 
1772  loc1->SetPartialStart(true, eExtreme_Biological);
1773  loc2->SetPartialStop(true, eExtreme_Biological);
1774  expected_errors[0]->SetErrMsg("No-left inconsistent with segmented SeqLoc");
1775  eval = validator.Validate(seh, options);
1776  CheckErrors(*eval, expected_errors);
1777  loc1->SetPartialStart(false, eExtreme_Biological);
1778  loc2->SetPartialStop(true, eExtreme_Biological);
1779  eval = validator.Validate(seh, options);
1780  CheckErrors(*eval, expected_errors);
1781  loc1->SetPartialStart(false, eExtreme_Biological);
1782  loc2->SetPartialStop(false, eExtreme_Biological);
1783  eval = validator.Validate(seh, options);
1784  CheckErrors(*eval, expected_errors);
1785 
1786  // no-right
1788 
1789  loc1->SetPartialStart(true, eExtreme_Biological);
1790  loc2->SetPartialStop(true, eExtreme_Biological);
1791  expected_errors[0]->SetErrMsg("No-right inconsistent with segmented SeqLoc");
1792  eval = validator.Validate(seh, options);
1793  CheckErrors(*eval, expected_errors);
1794  loc1->SetPartialStart(true, eExtreme_Biological);
1795  loc2->SetPartialStop(false, eExtreme_Biological);
1796  eval = validator.Validate(seh, options);
1797  CheckErrors(*eval, expected_errors);
1798  loc1->SetPartialStart(false, eExtreme_Biological);
1799  loc2->SetPartialStop(false, eExtreme_Biological);
1800  eval = validator.Validate(seh, options);
1801  CheckErrors(*eval, expected_errors);
1802 
1803  // no-ends
1805 
1806  expected_errors[0]->SetErrMsg("No-ends inconsistent with segmented SeqLoc");
1807  loc1->SetPartialStart(true, eExtreme_Biological);
1808  loc2->SetPartialStop(false, eExtreme_Biological);
1809  eval = validator.Validate(seh, options);
1810  CheckErrors(*eval, expected_errors);
1811  loc1->SetPartialStart(false, eExtreme_Biological);
1812  loc2->SetPartialStop(true, eExtreme_Biological);
1813  eval = validator.Validate(seh, options);
1814  CheckErrors(*eval, expected_errors);
1815  loc1->SetPartialStart(false, eExtreme_Biological);
1816  loc2->SetPartialStop(false, eExtreme_Biological);
1817  eval = validator.Validate(seh, options);
1818  CheckErrors(*eval, expected_errors);
1819 
1820  CLEAR_ERRORS
1821 #endif
1822 }
1823 
1824 
// Test case for the ShortSeq warning. In order it checks:
//   - a 3-residue protein with PDB ids: nothing is expected;
//   - the same protein with local id "good": only a PartialsInconsistent
//     warning is expected;
//   - after molinfo completeness is reset: "Sequence only 3 residues";
//   - a 9-base nucleotide built by BuildGoodSeq: "Sequence only 9 residues";
//   - that nucleotide with a PDB id: nothing is expected.
// NOTE(review): declarations of entry, scope, seh, validator, options, eval
// and expected_errors are not visible in this listing (embedded lines 1827
// and 1829 are missing) -- presumably they come from setup lines not shown.
1825 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ShortSeq)
1826 {
1828 
1830 
      // Shrink the protein to 3 residues and fit the first feature's interval to it.
1831  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPR");
1832  entry->SetSeq().SetInst().SetLength(3);
1833  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetTo(2);
1834 
1835  // don't report if pdb
1836  CRef<CPDB_seq_id> pdb_id(new CPDB_seq_id());
1837  pdb_id->SetMol().Set("foo");
1838  entry->SetSeq().SetId().front()->SetPdb(*pdb_id);
1839  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().SetPdb(*pdb_id);
      // The entry is removed from and re-added to the scope after its id changes.
1840  scope.RemoveTopLevelSeqEntry(seh);
1841  seh = scope.AddTopLevelSeqEntry(*entry);
1842  eval = validator.Validate(seh, options);
1843  // AddChromosomeNoLocation(expected_errors, "pdb|foo| ");
1844  CheckErrors(*eval, expected_errors);
1845 
1846  // new test if no coding region
1847  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PartialsInconsistent", "Molinfo completeness and protein feature partials conflict"));
1848  expected_errors[0]->SetAccession("lcl|good");
1849  entry->SetSeq().SetId().front()->SetLocal().SetStr("good");
1850  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
1851  scope.RemoveTopLevelSeqEntry(seh);
1852  seh = scope.AddTopLevelSeqEntry(*entry);
1854 
      // NOTE(review): embedded lines 1853, 1857, 1860 and 1863 are missing from
      // this listing; presumably each one changed the entry before the
      // Validate/CheckErrors pair that follows it -- confirm in the full file.
1855  eval = validator.Validate(seh, options);
1856  CheckErrors(*eval, expected_errors);
1858  eval = validator.Validate(seh, options);
1859  CheckErrors(*eval, expected_errors);
1861  eval = validator.Validate(seh, options);
1862  CheckErrors(*eval, expected_errors);
1864  eval = validator.Validate(seh, options);
1865  CheckErrors(*eval, expected_errors);
1866 
1867  CLEAR_ERRORS
1868 
1869  // for all other completeness, report
1870  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortSeq", "Sequence only 3 residues"));
1871  // AddChromosomeNoLocation(expected_errors, "lcl|good");
      // Clear the completeness field on every molinfo descriptor of the sequence.
1872  for (auto& it : entry->SetSeq().SetDescr().Set()) {
1873  if (it->IsMolinfo()) {
1874  it->SetMolinfo().ResetCompleteness();
1875  }
1876  }
1877  eval = validator.Validate(seh, options);
1878  CheckErrors(*eval, expected_errors);
      // NOTE(review): embedded lines 1879, 1882 and 1885 are missing here as well.
1880  eval = validator.Validate(seh, options);
1881  CheckErrors(*eval, expected_errors);
1883  eval = validator.Validate(seh, options);
1884  CheckErrors(*eval, expected_errors);
1886  eval = validator.Validate(seh, options);
1887  CheckErrors(*eval, expected_errors);
1888 
1889  // nucleotide
1890  scope.RemoveTopLevelSeqEntry(seh);
1891  entry = unit_test_util::BuildGoodSeq();
1892  seh = scope.AddTopLevelSeqEntry(*entry);
1893  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCTTT");
1894  entry->SetSeq().SetInst().SetLength(9);
1895  expected_errors[0]->SetErrMsg("Sequence only 9 residues");
1896  eval = validator.Validate(seh, options);
1897  CheckErrors(*eval, expected_errors);
1898 
1899  CLEAR_ERRORS
1900 
1901  // don't report if pdb
1902  entry->SetSeq().SetId().front()->SetPdb(*pdb_id);
1903  scope.RemoveTopLevelSeqEntry(seh);
1904  seh = scope.AddTopLevelSeqEntry(*entry);
1905  eval = validator.Validate(seh, options);
1906  // AddChromosomeNoLocation(expected_errors, "pdb|foo| ");
1907  CheckErrors(*eval, expected_errors);
1908 
1909  CLEAR_ERRORS
1910 }
1911 
1912 
// NOTE(review): the signature line for this body (embedded line 1913) is not
// present in this listing. The body switches on `tech` and returns a bool; a
// later call IsProteinTech(tech) in this file presumably refers to this
// function -- confirm against the full file.
// Returns true when `tech` matches one of the case labels that lead to
// `rval = true`, and false for every other value.
// NOTE(review): embedded lines 1918-1919 and 1921-1923 are missing too, so
// eTech_both is the only label visible here; more labels may exist.
1914 {
1915  bool rval = false;
1916 
1917  switch (tech) {
1920  case CMolInfo::eTech_both:
1924  rval = true;
1925  break;
1926  default:
1927  break;
1928  }
1929  return rval;
1930 }
1931 
1932 
// NOTE(review): the signature line (embedded line 1933) and embedded lines
// 1936-1937 are missing from this listing, so neither the function name nor
// the way `desc` is filled in can be seen here.
// Visible behavior: creates a new CSeqdesc and appends it to the descriptor
// list of `entry` -- the Bioseq's list when the entry is a seq, the
// Bioseq-set's list when it is a set. For any other entry state nothing is
// appended.
1934 {
1935  CRef<CSeqdesc> desc(new CSeqdesc());
1938  if (entry->IsSeq()) {
1939  entry->SetSeq().SetDescr().Set().push_back(desc);
1940  } else if (entry->IsSet()) {
1941  entry->SetSet().SetDescr().Set().push_back(desc);
1942  }
1943 }
1944 
1945 
1946 static void SetRefGeneTrackingStatus(CRef<CSeq_entry> entry, string status)
1947 {
1948  if (entry->IsSeq()) {
1949  for (auto& it : entry->SetSeq().SetDescr().Set()) {
1950  if (it->IsUser() && it->GetUser().IsRefGeneTracking()) {
1951  it->SetUser().SetData().front()->SetData().SetStr(status);
1952  }
1953  }
1954  } else if (entry->IsSet()) {
1955  for (auto& it : entry->SetSet().SetDescr().Set()) {
1956  if (it->IsUser() && it->GetUser().IsRefGeneTracking()) {
1957  it->SetUser().SetData().front()->SetData().SetStr(status);
1958  }
1959  }
1960  }
1961 }
1962 
1963 
1964 static void SetTitle(CRef<CSeq_entry> entry, string title)
1965 {
1966  bool found = false;
1967 
1968  if (entry->IsSetDescr()) {
1969  auto& cont = entry->SetDescr().Set();
1970  for (auto it = cont.begin(); it != cont.end();) {
1971  if ((*it)->IsTitle()) {
1972  found = true;
1973  if (NStr::IsBlank((*it)->GetTitle())) {
1974  it = cont.erase(it);
1975  continue;
1976  } else {
1977  (*it)->SetTitle(title);
1978  }
1979  }
1980  ++it;
1981  }
1982  }
1983 
1984  if (!found && !NStr::IsBlank(title)) {
1985  CRef<CSeqdesc> desc(new CSeqdesc());
1986  desc->SetTitle(title);
1987  entry->SetSeq().SetDescr().Set().push_back(desc);
1988  }
1989 }
1990 
1991 
1992 static void AddGenbankKeyword(CRef<CSeq_entry> entry, string keyword)
1993 {
1994  bool found = false;
1995 
1996  for (auto& it : entry->SetSeq().SetDescr().Set()) {
1997  if (it->IsGenbank()) {
1998  it->SetGenbank().SetKeywords().push_back(keyword);
1999  found = true;
2000  }
2001  }
2002  if (!found) {
2003  CRef<CSeqdesc> desc(new CSeqdesc());
2004  desc->SetGenbank().SetKeywords().push_back(keyword);
2005  entry->SetSeq().SetDescr().Set().push_back(desc);
2006  }
2007 }
2008 
2009 
// NOTE(review): the signature line (embedded line 2010) and embedded lines
// 2012-2013 are missing from this listing, so the function name, its
// parameters and the setup of entry/seh/validator/options/eval/
// expected_errors cannot be seen here.
// Visible behavior: applies `tech` to the entry through SetTech, validates,
// and then builds the expected list depending on `tech`:
//   - barcode: one Info-level NoKeywordHasTechnique entry;
//   - tsa:     four TSA-related entries;
//   - wgs:     one WGSseqGapProblem entry plus whatever
//              AddChromosomeNoLocation adds for "lcl|good";
//   - others:  nothing.
2011 {
2014 
2015  SetTech(entry, tech);
2016  eval = validator.Validate(seh, options);
2017  if (tech == CMolInfo::eTech_barcode) {
2018  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NoKeywordHasTechnique", "Molinfo.tech barcode without BARCODE keyword"));
2019  } else if (tech == CMolInfo::eTech_tsa) {
2020  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TSAseqGapProblem", "TSA Seq_gap NULL"));
2021  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2022  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"genomic\" is not appropriate for sequences that use the TSA technique."));
2023  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAseqGapProblem", "TSA submission includes wrong gap type. Gaps for TSA should be Assembly Gaps with linkage evidence."));
2024  } else if (tech == CMolInfo::eTech_wgs) {
2025  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WGSseqGapProblem", "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
2026  }
      // Second wgs check: same condition as the last branch above.
2027  if (tech == CMolInfo::eTech_wgs) {
2028  AddChromosomeNoLocation(expected_errors, "lcl|good");
2029  }
2030 
2031  CheckErrors(*eval, expected_errors);
2032 
2033  CLEAR_ERRORS
2034 }
2035 
2036 
// NOTE(review): the signature line (embedded line 2037) and embedded lines
// 2039-2040 are missing from this listing, so the function name, its
// parameters and the setup of entry/seh/validator/options/eval/
// expected_errors cannot be seen here.
// Visible behavior: applies `tech` through SetTech and expects a BadDeltaSeq
// error whose message embeds the numeric value of `tech`. Before that entry
// it expects ProteinTechniqueOnNucleotide when IsProteinTech(tech) is true,
// or ESTshouldBemRNA when `tech` is eTech_est.
2038 {
2041 
2042  SetTech(entry, tech);
      // This result is replaced by the second Validate call below before it is checked.
2043  eval = validator.Validate(seh, options);
2044  if (IsProteinTech(tech)) {
2045  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide", "Nucleic acid with protein sequence method"));
2046  } else if (tech == CMolInfo::eTech_est) {
2047  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ESTshouldBemRNA", "EST sequence should be mRNA"));
2048  }
2049  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDeltaSeq", "Delta seq technique should not be [" + NStr::UIntToString(tech) + "]"));
2050  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2051  eval = validator.Validate(seh, options);
2052  CheckErrors(*eval, expected_errors);
2053  CLEAR_ERRORS
2054 }
2055 
2056 
// NOTE(review): the signature line (embedded line 2057) and embedded lines
// 2059 and 2061 are missing from this listing, so the function name, its
// parameters and the setup of entry/seh/validator/options/eval/
// expected_errors cannot be seen here.
// Visible behavior: edits the delta sequence of the entry so that a gap
// literal sits at the very start and another gap literal follows the added
// "AAATTTGGGC" literal, sets the length to 94, applies `tech`, and then
// expects a BadDeltaSeq adjacent-gap error plus TerminalGap warnings for both
// ends. For wgs it additionally expects a WGSseqGapProblem error and whatever
// AddChromosomeNoLocation adds for "lcl|good".
2058 {
2060 
2062 
      // Gap literal of length 10 inserted in front of the existing delta segments.
2063  CRef<CDelta_seq> start_gap_seg(new CDelta_seq());
2064  start_gap_seg->SetLiteral().SetLength(10);
2065  start_gap_seg->SetLiteral().SetSeq_data().SetGap();
2066  entry->SetSeq().SetInst().SetExt().SetDelta().Set().insert(entry->SetSeq().SetInst().SetExt().SetDelta().Set().begin(), start_gap_seg);
2067  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
2068  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
2069  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("AAATTTGGGC", CSeq_inst::eMol_dna);
      // Second explicit gap literal of length 10, appended after the new sequence literal.
2070  CRef<CDelta_seq> end_gap_seg(new CDelta_seq());
2071  end_gap_seg->SetLiteral().SetLength(10);
2072  end_gap_seg->SetLiteral().SetSeq_data().SetGap();
2073  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(end_gap_seg);
2074  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
2075  entry->SetSeq().SetInst().SetLength(94);
2076  SetTech(entry, tech);
2077  if (tech == CMolInfo::eTech_wgs) {
2078  AddChromosomeNoLocation(expected_errors, "lcl|good");
2079  }
2080  // expected_errors.push_back(new CExpectedError("lcl|good", tech == CMolInfo::eTech_wgs ? eDiag_Warning : eDiag_Error, "BadDeltaSeq", "First delta seq component is a gap"));
2081  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDeltaSeq", "There is 1 adjacent gap in delta seq"));
2082  // expected_errors.push_back(new CExpectedError("lcl|good", tech == CMolInfo::eTech_wgs ? eDiag_Warning : eDiag_Error, "BadDeltaSeq", "Last delta seq component is a gap"));
2083  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at beginning of sequence"));
2084  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at end of sequence"));
2085  /*
2086  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
2087  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
2088  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
2089  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
2090  */
2091  if (tech == CMolInfo::eTech_wgs) {
2092  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WGSseqGapProblem", "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
2093  }
2094  eval = validator.Validate(seh, options);
2095  CheckErrors(*eval, expected_errors);
2096 
2097  CLEAR_ERRORS
2098 }
2099 
2100 
// Test case for BadDeltaSeq reporting. The visible part first sets the
// molinfo technique to eTech_derived and checks that nothing is reported for
// NC_ and NT_ accessions, then switches back to local id "good" and walks
// over technique values, separating those in `allowed_list` from the rest.
// NOTE(review): several lines are missing from this listing (embedded lines
// 2103, 2105, 2150, 2160, 2162, 2168 and 2169). In particular the header of
// the loop that defines `i` and the statements inside the allowed /
// not-allowed branches are not visible, nor are the declarations of entry,
// scope, seh, validator, options, eval and expected_errors.
2101 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_BadDeltaSeq)
2102 {
2104 
2106 
2107  for (auto& it : entry->SetSeq().SetDescr().Set()) {
2108  if (it->IsMolinfo()) {
2109  it->SetMolinfo().SetTech(CMolInfo::eTech_derived);
2110  }
2111  }
2112 
2113  // don't report if NT or NC
2114  scope.RemoveTopLevelSeqEntry(seh);
2115  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
2116  seh = scope.AddTopLevelSeqEntry(*entry);
2117  eval = validator.Validate(seh, options);
2118  // AddChromosomeNoLocation(expected_errors, "ref|NC_123456|");
2119  CheckErrors(*eval, expected_errors);
2120  CLEAR_ERRORS
2121 
2122  entry->SetSeq().SetId().front()->SetOther().SetAccession("NT_123456");
2123  scope.RemoveTopLevelSeqEntry(seh);
2124  seh = scope.AddTopLevelSeqEntry(*entry);
2125  eval = validator.Validate(seh, options);
2126  // AddChromosomeNoLocation(expected_errors, "ref|NT_123456|");
2127  CheckErrors(*eval, expected_errors);
2128  CLEAR_ERRORS
2129 
2130  // don't report if gen-prod-set
2131 
2132  entry->SetSeq().SetId().front()->SetLocal().SetStr("good");
2133  scope.RemoveTopLevelSeqEntry(seh);
2134  seh = scope.AddTopLevelSeqEntry(*entry);
2135 
2136  // allowed tech values
2137  vector<CMolInfo::TTech> allowed_list;
2138  allowed_list.push_back(CMolInfo::eTech_htgs_0);
2139  allowed_list.push_back(CMolInfo::eTech_htgs_1);
2140  allowed_list.push_back(CMolInfo::eTech_htgs_2);
2141  allowed_list.push_back(CMolInfo::eTech_htgs_3);
2142  allowed_list.push_back(CMolInfo::eTech_wgs);
2143  allowed_list.push_back(CMolInfo::eTech_composite_wgs_htgs);
2144  allowed_list.push_back(CMolInfo::eTech_unknown);
2145  allowed_list.push_back(CMolInfo::eTech_standard);
2146  allowed_list.push_back(CMolInfo::eTech_htc);
2147  allowed_list.push_back(CMolInfo::eTech_barcode);
2148  allowed_list.push_back(CMolInfo::eTech_tsa);
2149 
      // NOTE(review): the loop header that introduces `i` (embedded line 2150)
      // is missing here; the closing brace at embedded line 2164 presumably
      // ends that loop.
      // Linear search: is the current value `i` one of the allowed techniques?
2151  bool allowed = false;
2152  for (CMolInfo::TTech it : allowed_list) {
2153  if (it == i) {
2154  allowed = true;
2155  break;
2156  }
2157  }
2158  if (allowed) {
2159  // don't report for htgs_0
2161  } else {
2163  }
2164  }
2165 
2166  CLEAR_ERRORS
2167 
2170 
2171  CLEAR_ERRORS
2172 }
2173 
2174 
// Re-initializes `gap` and configures it from the arguments.
//   gap              - the Seq-gap to modify; it is Reset() first
//   gap_type         - value stored with SetType
//   is_linked        - when false the linkage field is reset
//   linkage_evidence - one evidence entry is appended per element, in order
//                      (the vector is taken by value)
// NOTE(review): embedded lines 2180 and 2186 are missing from this listing.
// The statement executed when is_linked is true and the declaration of `ev`
// are therefore not visible; presumably the former sets the linkage to
// "linked" and the latter creates a new linkage-evidence object -- confirm
// against the full file.
2175 void AdjustGap(CSeq_gap& gap, CSeq_gap::EType gap_type, bool is_linked, vector<CLinkage_evidence::EType> linkage_evidence)
2176 {
2177  gap.Reset();
2178  gap.SetType(gap_type);
2179  if (is_linked) {
2181  } else {
2182  gap.ResetLinkage();
2183  }
      // Start from an empty evidence list, then add the requested entries.
2184  gap.ResetLinkage_evidence();
2185  for (auto it : linkage_evidence) {
2187  ev->SetType(it);
2188  gap.SetLinkage_evidence().push_back(ev);
2189  }
2190 }
2191 
2192 
// Test case for SeqGapBadLinkage reporting. Each stage rewrites the gap
// literals of the delta sequence and expects exactly one message:
//   1. short_arm gap with linkage evidence       -> "should not have linkage evidence"
//   2. linkage and type reset on the gaps        -> "must have linkage field set to linked"
//   3. fragment gap, 'align genus' listed twice  -> "appears 2 times"
//   4. fragment gap, align genus + unspecified   -> "unspecified and additional linkage evidence"
//   5. unknown gap, unspecified evidence only    -> "Single Seq-gap has unknown type ..."
//   6. a second such gap appended                -> "All 2 Seq-gaps have unknown type ..."
// NOTE(review): declarations of entry, scope, seh, validator, options, eval
// and expected_errors are not visible in this listing (embedded lines 2195
// and 2207 are missing) -- presumably they come from setup lines not shown.
2193 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_SeqGapBadLinkage)
2194 {
2196 
      // Stage 1: linked short_arm gaps carrying one 'align genus' evidence entry.
2197  vector<CLinkage_evidence::EType> evidence;
2198  evidence.push_back(CLinkage_evidence::eType_align_genus);
2199  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2200  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2201  && it->GetLiteral().GetSeq_data().IsGap()) {
2202  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2203  CSeq_gap::eType_short_arm, true, evidence);
2204  }
2205  }
2206 
2208 
2209  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2210  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
2211  "SeqGapBadLinkage", "Seq-gap of type 3 should not have linkage evidence"));
2212 
2213  eval = validator.Validate(seh, options);
2214  CheckErrors(*eval, expected_errors);
2215 
2216  CLEAR_ERRORS
2217 
      // Stage 2: clear linkage and type on every gap; the evidence list is not touched here.
2218  scope.RemoveTopLevelSeqEntry(seh);
2219  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2220  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2221  && it->GetLiteral().GetSeq_data().IsGap()) {
2222  CSeq_gap& gap = it->SetLiteral().SetSeq_data().SetGap();
2223  gap.ResetLinkage();
2224  gap.ResetType();
2225  }
2226  }
2227  seh = scope.AddTopLevelSeqEntry(*entry);
2228 
2229  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2230  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
2231  "SeqGapBadLinkage", "Seq-gap with linkage evidence must have linkage field set to linked"));
2232 
2233  eval = validator.Validate(seh, options);
2234  CheckErrors(*eval, expected_errors);
2235 
2236  CLEAR_ERRORS
2237 
      // Stage 3: `evidence` now holds 'align genus' twice.
2238  scope.RemoveTopLevelSeqEntry(seh);
2239  evidence.push_back(CLinkage_evidence::eType_align_genus);
2240  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2241  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2242  && it->GetLiteral().GetSeq_data().IsGap()) {
2243  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2244  CSeq_gap::eType_fragment, true, evidence);
2245  }
2246  }
2247  seh = scope.AddTopLevelSeqEntry(*entry);
2248 
2249  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2250  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
2251  "SeqGapBadLinkage", "Linkage evidence 'align genus' appears 2 times"));
2252 
2253  eval = validator.Validate(seh, options);
2254  CheckErrors(*eval, expected_errors);
2255 
2256  CLEAR_ERRORS
2257 
      // Stage 4: replace the duplicate with 'unspecified' (align genus + unspecified).
2258  evidence.pop_back();
2259  evidence.push_back(CLinkage_evidence::eType_unspecified);
2260  scope.RemoveTopLevelSeqEntry(seh);
2261  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2262  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2263  && it->GetLiteral().GetSeq_data().IsGap()) {
2264  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2265  CSeq_gap::eType_fragment, true, evidence);
2266  }
2267  }
2268  seh = scope.AddTopLevelSeqEntry(*entry);
2269 
2270  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2271  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
2272  "SeqGapBadLinkage", "Seq-gap type has unspecified and additional linkage evidence"));
2273 
2274  eval = validator.Validate(seh, options);
2275  CheckErrors(*eval, expected_errors);
2276 
2277  CLEAR_ERRORS
2278 
      // Stage 5: only 'unspecified' evidence, gap type unknown.
2279  scope.RemoveTopLevelSeqEntry(seh);
2280  evidence.clear();
2281  evidence.push_back(CLinkage_evidence::eType_unspecified);
2282  for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2283  if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data()
2284  && it->GetLiteral().GetSeq_data().IsGap()) {
2285  AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2286  CSeq_gap::eType_unknown, true, evidence);
2287  }
2288  }
2289  seh = scope.AddTopLevelSeqEntry(*entry);
2290 
2291  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2292  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
2293  "SeqGapBadLinkage", "Single Seq-gap has unknown type and unspecified linkage"));
2294 
2295  eval = validator.Validate(seh, options);
2296  CheckErrors(*eval, expected_errors);
2297 
2298  CLEAR_ERRORS
2299 
      // Stage 6: append a second gap configured the same way, followed by more sequence.
2300  scope.RemoveTopLevelSeqEntry(seh);
2301  CRef<CDelta_seq> gap_seg(new CDelta_seq());
2302  gap_seg->SetLiteral().SetLength(10);
2303  AdjustGap(gap_seg->SetLiteral().SetSeq_data().SetGap(),
2304  CSeq_gap::eType_unknown, true, evidence);
2305 
2306  // adjust delta to avoid errors about large number of Ns in first and last 50 bp
2307  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("CCCATGATGATGTACCGTACGTTTTCCCATGATGATGTACCGTACGTTTT");
2308  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetLength(50);
2309  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
2310  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGATGTACCGTACGTTTTCCCATGATGATGTACCGTACGTTTT", CSeq_inst::eMol_dna);
2311  entry->SetSeq().SetInst().SetLength(132);
2312 
2313  seh = scope.AddTopLevelSeqEntry(*entry);
2314 
2315  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2316  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
2317  "SeqGapBadLinkage", "All 2 Seq-gaps have unknown type and unspecified linkage"));
2318 
2319  eval = validator.Validate(seh, options);
2320  CheckErrors(*eval, expected_errors);
2321 
2322  CLEAR_ERRORS
2323 }
2324 
2325 
2326 void ChangeErrorAcc(vector<CExpectedError*> expected_errors, const string& acc)
2327 {
2328  for (auto it : expected_errors) {
2329  if (it) {
2330  it->SetAccession(acc);
2331  }
2332  }
2333 }
2334 
2335 
2336 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ConflictingIdsOnBioseq)
2337 {
2339 
2341 
2342  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (lcl|good - lcl|bad)"));
2343  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2344 
2345  // local IDs
2346  scope.RemoveTopLevelSeqEntry(seh);
2347  CRef<CSeq_id> id2(new CSeq_id());
2348  id2->SetLocal().SetStr("bad");
2349  entry->SetSeq().SetId().push_back(id2);
2350  seh = scope.AddTopLevelSeqEntry(*entry);
2351  eval = validator.Validate(seh, options);
2352  CheckErrors(*eval, expected_errors);
2353 
2354  // GIBBSQ
2355  scope.RemoveTopLevelSeqEntry(seh);
2356  CRef<CSeq_id> id1 = entry->SetSeq().SetId().front();
2357  id1->SetGibbsq(1);
2358  id2->SetGibbsq(2);
2359  seh = scope.AddTopLevelSeqEntry(*entry);
2360  ChangeErrorAcc(expected_errors, "bbs|1");
2361  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (bbs|1 - bbs|2)");
2362  eval = validator.Validate(seh, options);
2363  CheckErrors(*eval, expected_errors);
2364 
2365  // GIBBMT
2366  scope.RemoveTopLevelSeqEntry(seh);
2367  id1->SetGibbmt(1);
2368  id2->SetGibbmt(2);
2369  seh = scope.AddTopLevelSeqEntry(*entry);
2370  ChangeErrorAcc(expected_errors, "bbm|1");
2371  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (bbm|1 - bbm|2)");
2372  eval = validator.Validate(seh, options);
2373  CheckErrors(*eval, expected_errors);
2374 
2375  // GI
2376  scope.RemoveTopLevelSeqEntry(seh);
2377  id1->SetGi(GI_CONST(1));
2378  id2->SetGi(GI_CONST(2));
2379  CRef<CSeq_id> id3(new CSeq_id("gb|AY123456.1"));
2380  entry->SetSeq().SetId().push_back(id3);
2381  seh = scope.AddTopLevelSeqEntry(*entry);
2382  ChangeErrorAcc(expected_errors, "gb|AY123456.1|");
2383  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (gi|1 - gi|2)");
2384  eval = validator.Validate(seh, options);
2385  CheckErrors(*eval, expected_errors);
2386  entry->SetSeq().SetId().pop_back();
2387 
2388  // GIIM
2389  scope.RemoveTopLevelSeqEntry(seh);
2390  id1->SetGiim().SetId(1);
2391  id1->SetGiim().SetDb("foo");
2392  id2->SetGiim().SetId(2);
2393  id2->SetGiim().SetDb("foo");
2394  seh = scope.AddTopLevelSeqEntry(*entry);
2395  CLEAR_ERRORS
2396 
2397  expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "IdOnMultipleBioseqs", "BioseqFind (gim|1) unable to find itself - possible internal error"));
2398  expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gim|1 - gim|2)"));
2399  expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "IdOnMultipleBioseqs", "BioseqFind (gim|2) unable to find itself - possible internal error"));
2400  // AddChromosomeNoLocation(expected_errors, "gim|1");
2401  eval = validator.Validate(seh, options);
2402  CheckErrors(*eval, expected_errors);
2403  CLEAR_ERRORS
2404 
2405  // patent
2406  scope.RemoveTopLevelSeqEntry(seh);
2407  id1->SetPatent().SetSeqid(1);
2408  id1->SetPatent().SetCit().SetCountry("USA");
2409  id1->SetPatent().SetCit().SetId().SetNumber("1");
2410  id2->SetPatent().SetSeqid(2);
2411  id2->SetPatent().SetCit().SetCountry("USA");
2412  id2->SetPatent().SetCit().SetId().SetNumber("2");
2413  seh = scope.AddTopLevelSeqEntry(*entry);
2414  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (pat|USA|1|1 - pat|USA|2|2)"));
2415  // AddChromosomeNoLocation(expected_errors, "pat|USA|1|1");
2416  eval = validator.Validate(seh, options);
2417  CheckErrors(*eval, expected_errors);
2418 
2419  // pdb
2420  scope.RemoveTopLevelSeqEntry(seh);
2421  id1->SetPdb().SetMol().Set("good");
2422  id2->SetPdb().SetMol().Set("badd");
2423  seh = scope.AddTopLevelSeqEntry(*entry);
2424  ChangeErrorAcc(expected_errors, "pdb|good| ");
2425  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (pdb|good| - pdb|badd| )");
2426  eval = validator.Validate(seh, options);
2427  CheckErrors(*eval, expected_errors);
2428 
2429  // general
2430  scope.RemoveTopLevelSeqEntry(seh);
2431  id1->SetGeneral().SetDb("a");
2432  id1->SetGeneral().SetTag().SetStr("good");
2433  id2->SetGeneral().SetDb("a");
2434  id2->SetGeneral().SetTag().SetStr("bad");
2435  seh = scope.AddTopLevelSeqEntry(*entry);
2436  ChangeErrorAcc(expected_errors, "gnl|a|good");
2437  expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (gnl|a|good - gnl|a|bad)");
2438  eval = validator.Validate(seh, options);
2439  CheckErrors(*eval, expected_errors);
2440 
2441  CLEAR_ERRORS
2442  // should get no error if db values are different
2443  scope.RemoveTopLevelSeqEntry(seh);
2444  id2->SetGeneral().SetDb("b");
2445  seh = scope.AddTopLevelSeqEntry(*entry);
2446  // AddChromosomeNoLocation(expected_errors, "gnl|a|good");
2447  eval = validator.Validate(seh, options);
2448  CheckErrors(*eval, expected_errors);
2449 
2450  // genbank
2451  scope.RemoveTopLevelSeqEntry(seh);
2452  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gb|AY222222|)"));
2453  id1->SetGenbank().SetAccession("AY123456");
2454  id2->SetGenbank().SetAccession("AY222222");
2455  seh = scope.AddTopLevelSeqEntry(*entry);
2456  eval = validator.Validate(seh, options);
2457  CheckErrors(*eval, expected_errors);
2458 
2459  // try genbank with accession same, versions different
2460  scope.RemoveTopLevelSeqEntry(seh);
2461  id2->SetGenbank().SetAccession("AY123456");
2462  id2->SetGenbank().SetVersion(2);
2463  seh = scope.AddTopLevelSeqEntry(*entry);
2464  CLEAR_ERRORS
2465  // AddChromosomeNoLocation(expected_errors, "gb|AY123456.2|");
2466  expected_errors.push_back(new CExpectedError("gb|AY123456.2|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gb|AY123456.2|)"));
2467  eval = validator.Validate(seh, options);
2468  CheckErrors(*eval, expected_errors);
2469 
2470  // try similar id type
2471  scope.RemoveTopLevelSeqEntry(seh);
2472  id2->SetGpipe().SetAccession("AY123456");
2473  seh = scope.AddTopLevelSeqEntry(*entry);
2474  CLEAR_ERRORS
2475  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gpp|AY123456|)"));
2476  // AddChromosomeNoLocation(expected_errors, "gb|AY123456|");
2477  eval = validator.Validate(seh, options);
2478  CheckErrors(*eval, expected_errors);
2479 
2480  // LRG
2481  scope.RemoveTopLevelSeqEntry(seh);
2482  id1->SetGeneral().SetDb("LRG");
2483  id1->SetGeneral().SetTag().SetStr("good");
2484  seh = scope.AddTopLevelSeqEntry(*entry);
2485  ChangeErrorAcc(expected_errors, "gpp|AY123456|");
2486  expected_errors[0]->SetErrMsg("LRG sequence needs NG_ accession");
2487  expected_errors[0]->SetSeverity(eDiag_Critical);
2488  eval = validator.Validate(seh, options);
2489  CheckErrors(*eval, expected_errors);
2490  // no error if has NG
2491  scope.RemoveTopLevelSeqEntry(seh);
2492  id2->SetOther().SetAccession("NG_123456");
2493  seh = scope.AddTopLevelSeqEntry(*entry);
2494  CLEAR_ERRORS
2495  // AddChromosomeNoLocation(expected_errors, "ref|NG_123456|");
2496  eval = validator.Validate(seh, options);
2497  CheckErrors(*eval, expected_errors);
2498 
2499  CLEAR_ERRORS
2500 }
2501 
2502 
// Checks that setting Bioseq.mol to the generic eMol_na value is reported as
// an Error-level "MolNuclAcid" diagnostic on lcl|good.
// NOTE(review): the embedded listing numbers skip 2505 and 2507, so the
// statements that declare entry/seh/validator/options/expected_errors/eval are
// not visible here -- presumably an entry builder plus a setup macro like the
// STANDARD_SETUP_NAME used elsewhere in this file; confirm against the repo.
2503 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_MolNuclAcid)
2504 {
2506 
2508 
2509  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2510  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MolNuclAcid", "Bioseq.mol is type nucleic acid"));
2511 
  // Switch the molecule type to "na", then validate and compare.
2512  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_na);
2513  eval = validator.Validate(seh, options);
2514  CheckErrors(*eval, expected_errors);
2515 
2516  CLEAR_ERRORS
2517 }
2518 
2519 
// Exercises the diagnostics produced when MolInfo.tech conflicts with the
// molecule type / biomol. For each tech value `i` the sequence is made DNA,
// the tech is set, and the expected error list is built from the tech value
// before validating. For techs in genomic_list the sequence is additionally
// switched to RNA and validated again. The TSA case is then checked on its
// own, both through Validate and through GetTSAConflictingBiomolTechErrors.
// NOTE(review): the embedded listing numbers skip 2522, 2524, 2537, 2547 and
// 2562. Line 2537 is presumably the header of the loop over tech values `i`
// (its closing brace is at 2584); the others presumably hold setup / rebuild
// statements. Confirm against the repository before editing this test.
2520 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ConflictingBiomolTech)
2521 {
2523 
2525 
2526  // allowed tech values
2527  vector<CMolInfo::TTech> genomic_list;
2528  genomic_list.push_back(CMolInfo::eTech_sts);
2529  genomic_list.push_back(CMolInfo::eTech_survey);
2530  genomic_list.push_back(CMolInfo::eTech_wgs);
2531  genomic_list.push_back(CMolInfo::eTech_htgs_0);
2532  genomic_list.push_back(CMolInfo::eTech_htgs_1);
2533  genomic_list.push_back(CMolInfo::eTech_htgs_2);
2534  genomic_list.push_back(CMolInfo::eTech_htgs_3);
2535  genomic_list.push_back(CMolInfo::eTech_composite_wgs_htgs);
2536 
  // Linear membership test: is the current tech `i` one of genomic_list?
2538  bool genomic = false;
2539  for (CMolInfo::TTech it : genomic_list) {
2540  if (it == i) {
2541  genomic = true;
2542  break;
2543  }
2544  }
2545  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
2546  SetTech(entry, i);
  // Expected for every tech in this loop: DNA molecule vs. RNA biomol.
2548  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolType", "Molecule type (DNA) does not match biomol (RNA)"));
  // Tech-specific additional expectations.
2549  if (i == CMolInfo::eTech_wgs) {
2550  AddChromosomeNoLocation(expected_errors, "lcl|good");
2551  }
2552  if (i == CMolInfo::eTech_est) {
2553  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ESTshouldBemRNA", "EST sequence should be mRNA"));
2554  }
2555  if (i == CMolInfo::eTech_htgs_2) {
2556  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 raw seq has no gaps and no graphs"));
2557  }
2558  if (genomic) {
2559  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "HTGS_STS_GSS_WGSshouldBeGenomic", "HTGS/STS/GSS/WGS sequence should be genomic"));
2560  eval = validator.Validate(seh, options);
2561  CheckErrors(*eval, expected_errors);
  // Second pass with an RNA molecule: the first expectation (InconsistentMolType,
  // assuming nothing is pushed on the omitted line 2547) is freed and its slot
  // nulled rather than erased -- presumably CheckErrors skips null entries;
  // confirm. The last expectation is retargeted to the "should not be RNA" code.
2563  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
2564  delete expected_errors[0];
2565  expected_errors[0] = nullptr;
2566  expected_errors.back()->SetErrCode("HTGS_STS_GSS_WGSshouldNotBeRNA");
2567  expected_errors.back()->SetErrMsg("HTGS/STS/GSS/WGS sequence should not be RNA");
2568  eval = validator.Validate(seh, options);
2569  CheckErrors(*eval, expected_errors);
2570  } else {
  // Non-genomic techs: protein techniques, barcode and TSA each add their own
  // expected diagnostics.
2571  if (IsProteinTech(i)) {
2572  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide", "Nucleic acid with protein sequence method"));
2573  }
2574  if (i == CMolInfo::eTech_barcode) {
2575  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NoKeywordHasTechnique", "Molinfo.tech barcode without BARCODE keyword"));
2576  } else if (i == CMolInfo::eTech_tsa) {
2577  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2578  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"cRNA\" is not appropriate for sequences that use the TSA technique."));
2579  }
2580  eval = validator.Validate(seh, options);
2581  CheckErrors(*eval, expected_errors);
2582  }
2583  CLEAR_ERRORS
2584  }
2585 
  // Explicit TSA-on-DNA check through the full validator.
2586  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
2587  SetTech(entry, CMolInfo::eTech_tsa);
2588  // AddChromosomeNoLocation(expected_errors, "lcl|good");
2589  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolType", "Molecule type (DNA) does not match biomol (RNA)"));
2590  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2591  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"cRNA\" is not appropriate for sequences that use the TSA technique."));
2592  eval = validator.Validate(seh, options);
2593  CheckErrors(*eval, expected_errors);
2594 
2595  CLEAR_ERRORS
2596 
  // The dedicated TSA entry point is called with both a Seq-entry handle and a
  // complete Bioseq; only TSAshouldBNotBeDNA is expected from either call.
2597  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2598  eval = validator.GetTSAConflictingBiomolTechErrors(seh);
2599  CheckErrors(*eval, expected_errors);
2600  eval = validator.GetTSAConflictingBiomolTechErrors(*(seh.GetSeq().GetCompleteBioseq()));
2601  CheckErrors(*eval, expected_errors);
2602  CLEAR_ERRORS
2603 }
2604 
2605 
// Checks that a RefSeq ("other") Seq-id whose name contains a space
// ("good one") is reported as a Critical "SeqIdNameHasSpace" diagnostic.
// NOTE(review): the embedded listing numbers skip 2608 and 2612 -- presumably
// the entry construction and the standard setup; confirm.
2606 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_SeqIdNameHasSpace)
2607 {
2609  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
2610  entry->SetSeq().SetId().front()->SetOther().SetName("good one");
2611 
2613 
2614  expected_errors.push_back(new CExpectedError("ref|NC_123456|good one", eDiag_Critical, "SeqIdNameHasSpace", "Seq-id.name 'good one' should be a single word without any spaces"));
2615  // AddChromosomeNoLocation(expected_errors, "ref|NC_123456|good one");
2616 
2617  eval = validator.Validate(seh, options);
2618  CheckErrors(*eval, expected_errors);
2619 
2620  CLEAR_ERRORS
2621 }
2622 
2623 
// Test case kept as an empty shell: the whole body is compiled out with
// `#if 0` (see the "removed per VR-779" note below). The disabled code built
// a segmented Bioseq with two segments pointing at gb|AY123456 and expected
// SeqLocOrder / DuplicateSegmentReferences diagnostics, first with two
// "whole" locations and then with the second segment as an interval.
// NOTE(review): the embedded listing numbers skip 2628, 2639, 2644, 2653-2655
// and 2661, so the disabled body is not complete as shown here.
2624 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_DuplicateSegmentReferences)
2625 {
2626 #if 0
2627  // removed per VR-779
2629  entry->SetSeq().SetInst().ResetSeq_data();
2630  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
2631  CRef<CSeq_loc> seg1(new CSeq_loc());
2632  seg1->SetWhole().SetGenbank().SetAccession("AY123456");
2633  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(seg1);
2634  CRef<CSeq_loc> seg2(new CSeq_loc());
2635  seg2->SetWhole().SetGenbank().SetAccession("AY123456");
2636  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(seg2);
2637  entry->SetSeq().SetInst().SetLength(970);
2638 
2640  // need to call this statement before calling AddDefaults
2641  // to make sure that we can fetch the sequence referenced by the
2642  // delta sequence so that we can detect that the loc in the
2643  // delta sequence is longer than the referenced sequence
2645  CScope scope(*objmgr);
2646  scope.AddDefaults();
2647  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
2648 
2649  CValidator validator(*objmgr);
2650 
2651  // Set validator options
2652  unsigned int options = CValidator::eVal_need_isojta
2656 
2657  // list of expected errors
2658  vector<CExpectedError*> expected_errors;
2659  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLocOrder", "Segmented BioseqIntervals out of order in SeqLoc [[gb|AY123456|, gb|AY123456|]]"));
2660  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "DuplicateSegmentReferences", "Segmented sequence has multiple references to gb|AY123456"));
2662 
2663  eval = validator.Validate(seh, options);
2664  CheckErrors(*eval, expected_errors);
2665 
  // Second scenario: the duplicate reference is an interval, not a whole loc,
  // which downgrades the duplicate-reference expectation to a warning.
2666  seg2->SetInt().SetId().SetGenbank().SetAccession("AY123456");
2667  seg2->SetInt().SetFrom(0);
2668  seg2->SetInt().SetTo(484);
2669  expected_errors[0]->SetErrMsg("Segmented BioseqIntervals out of order in SeqLoc [[gb|AY123456|, 1-485]]");
2670  expected_errors[1]->SetSeverity(eDiag_Warning);
2671  expected_errors[1]->SetErrMsg("Segmented sequence has multiple references to gb|AY123456 that are not SEQLOC_WHOLE");
2672  eval = validator.Validate(seh, options);
2673  CheckErrors(*eval, expected_errors);
2674 
2675  CLEAR_ERRORS
2676 #endif
2677 }
2678 
2679 
// Checks the diagnostics for a nuc-prot set whose nucleotide ends in a run of
// Ns and whose protein ends in two Xs: TerminalNs and HighNpercent3Prime on
// lcl|nuc, TrailingX on lcl|prot.
// NOTE(review): the embedded listing numbers skip 2682, 2686, 2691 and 2698.
// `entry` and `cds_feat` are used below but declared on lines not visible
// here -- presumably the entry builder, the CDS lookup, a location update and
// the standard setup; confirm.
2680 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_TrailingX)
2681 {
2683  CRef<CSeq_entry> nuc = entry->SetSet().SetSeq_set().front();
2684  CRef<CSeq_entry> prot = entry->SetSet().SetSeq_set().back();
2685  CRef<CSeq_feat> prot_feat = prot->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
  // 27-base nucleotide ending in six Ns; 9-residue protein ending in "XX".
2687  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATANNNNNN");
2688  nuc->SetSeq().SetInst().SetLength(27);
2689  prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPRKTEIXX");
2690  prot->SetSeq().SetInst().SetLength(9);
  // Mark both the protein feature and the CDS as partial at the stop end.
2692  prot_feat->SetLocation().SetInt().SetTo(8);
2693  prot_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
2694  prot_feat->SetPartial(true);
2695  cds_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
2696  cds_feat->SetPartial(true);
2697 
2699 
2700  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TerminalNs", "N at end of sequence"));
2701  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "TrailingX", "Sequence ends in 2 trailing Xs"));
2702  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "HighNpercent3Prime",
2703  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
2704  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
2705 
2706  eval = validator.Validate(seh, options);
2707  CheckErrors(*eval, expected_errors);
2708 
2709  CLEAR_ERRORS
2710 }
2711 
2712 
// Helper: puts the GenBank accession `id_str` on the protein of a nuc-prot
// set (the nucleotide gets lcl|nuc) and expects exactly one Error-level
// "BadSeqIdFormat" diagnostic, "Bad accession <id_str>", on gb|<id_str>|.
//
// @param id_str  accession expected to be invalid for a protein
//
// NOTE(review): the embedded listing numbers skip 2716 and 2727 -- presumably
// the entry construction and the standard setup; confirm.
// NOTE(review): good_prot_id is built but not used in the visible lines; it
// looks like a leftover that could be removed once the omitted lines are
// checked.
2713 void TestBadProtId(const string& id_str)
2714 {
2715  // bad for just prots
2717  CRef<CSeq_id> bad_id(new CSeq_id());
2718  bad_id->SetGenbank().SetAccession(id_str);
2719  CRef<CSeq_id> good_nuc_id(new CSeq_id());
2720  good_nuc_id->SetLocal().SetStr("nuc");
2721  CRef<CSeq_id> good_prot_id(new CSeq_id());
2722  good_prot_id->SetLocal().SetStr("prot");
2723 
2724  unit_test_util::ChangeNucId(entry, good_nuc_id);
2725  unit_test_util::ChangeProtId(entry, bad_id);
2726 
2728 
2729  expected_errors.push_back(new CExpectedError("gb|" + id_str + "|", eDiag_Error, "BadSeqIdFormat", "Bad accession " + id_str));
2730  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
2731 
2732  eval = validator.Validate(seh, options);
2733  CheckErrors(*eval, expected_errors);
2734  CLEAR_ERRORS
2735 }
2736 
2737 
// Helper: puts the GenBank accession `id_str` on the protein of a nuc-prot
// set (the nucleotide gets lcl|nuc) and expects validation to report nothing;
// no expectation is added to expected_errors in the visible lines.
//
// @param id_str  accession expected to be valid for a protein
//
// NOTE(review): the embedded listing numbers skip 2740 and 2751 -- presumably
// the entry construction and the standard setup; confirm.
// NOTE(review): despite its name, `bad_id` holds the accession that is
// expected to be accepted here; good_prot_id is built but not used in the
// visible lines.
2738 void TestGoodProtId(const string& id_str)
2739 {
2741  CRef<CSeq_id> bad_id(new CSeq_id());
2742  bad_id->SetGenbank().SetAccession(id_str);
2743  CRef<CSeq_id> good_nuc_id(new CSeq_id());
2744  good_nuc_id->SetLocal().SetStr("nuc");
2745  CRef<CSeq_id> good_prot_id(new CSeq_id());
2746  good_prot_id->SetLocal().SetStr("prot");
2747 
2748  unit_test_util::ChangeNucId(entry, good_nuc_id);
2749  unit_test_util::ChangeProtId(entry, bad_id);
2750 
2752 
2753  eval = validator.Validate(seh, options);
2754  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
2755 
2756  CheckErrors(*eval, expected_errors);
2757  CLEAR_ERRORS
2758 }
2759 
2760 
// Helper: puts the GenBank accession `id_str` on the nucleotide of a nuc-prot
// set (the protein gets lcl|prot) and validates it. Accessions of length
// 12-15 additionally get tech = WGS set on the nucleotide, and in that case
// the expectation added by AddChromosomeNoLocation is the only one
// registered; otherwise no expectation is added in the visible lines.
//
// @param id_str  accession expected to be valid for a nucleotide
//
// NOTE(review): the embedded listing numbers skip 2763 and 2776 -- presumably
// the entry construction and the standard setup; confirm.
// NOTE(review): despite its name, `bad_id` holds the accession that is
// expected to be accepted here.
2761 void TestGoodNucId(const string& id_str)
2762 {
2764  CRef<CSeq_id> bad_id(new CSeq_id());
2765  bad_id->SetGenbank().SetAccession(id_str);
2766  CRef<CSeq_id> good_prot_id(new CSeq_id());
2767  good_prot_id->SetLocal().SetStr("prot");
2768  unit_test_util::ChangeNucId(entry, bad_id);
2769  unit_test_util::ChangeProtId(entry, good_prot_id);
  // Lengths 12..15 are treated as WGS-style accessions by this test.
2770  bool is_wgs = false;
2771  if (id_str.length() == 12 || id_str.length() == 13 || id_str.length() == 14 || id_str.length() == 15) {
2772  SetTech(entry->SetSet().SetSeq_set().front(), CMolInfo::eTech_wgs);
2773  is_wgs = true;
2774  }
2775 
2777 
2778  if (is_wgs) {
2779  AddChromosomeNoLocation(expected_errors, "gb|" + id_str + "|");
2780  }
2781  eval = validator.Validate(seh, options);
2782  CheckErrors(*eval, expected_errors);
2783  CLEAR_ERRORS
2784 }
2785 
2786 
2787 BOOST_FIXTURE_TEST_CASE(Test_SEQ_INST_BadSeqIdFormat, CGenBankFixture)
2788 {
2790  CRef<CSeq_entry> nuc_entry = entry->SetSet().SetSeq_set().front();
2791  CRef<CSeq_entry> prot_entry = entry->SetSet().SetSeq_set().back();
2792  CRef<CSeq_feat> prot_feat = prot_entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
2794 
2796 
2797  expected_errors.push_back(new CExpectedError("",eDiag_Error, "BadSeqIdFormat", "Bad accession"));
2798 
2799  vector<string> bad_ids;
2800  bad_ids.push_back("AY123456ABC"); // can't have letters after digits
2801  bad_ids.push_back("A1234"); // for a single letter, only acceptable number of digits is 5
2802  bad_ids.push_back("A123456");
2803  bad_ids.push_back("AY12345"); // for two letters, only acceptable number of digits is 6
2804  bad_ids.push_back("AY1234567");
2805  bad_ids.push_back("ABC1234"); // three letters bad unless prot and 5 digits
2806  bad_ids.push_back("ABC123456");
2807  bad_ids.push_back("ABCD1234567"); // four letters
2808  bad_ids.push_back("ABCDE123456"); // five letters
2809  bad_ids.push_back("ABCDE12345678");
2810 
2811  vector<string> bad_nuc_ids;
2812  bad_nuc_ids.push_back("ABC12345");
2813 
2814  vector<string> bad_prot_ids;
2815  bad_prot_ids.push_back("AY123456");
2816  bad_prot_ids.push_back("A12345");
2817 
2818  vector<string> good_ids;
2819 
2820  vector<string> good_nuc_ids;
2821  good_nuc_ids.push_back("AY123456");
2822  good_nuc_ids.push_back("A12345");
2823  good_nuc_ids.push_back("ABCD123456789");
2824  good_nuc_ids.push_back("ABCD1234567890");
2825 
2826  vector<string> good_prot_ids;
2827  good_prot_ids.push_back("ABC12345");
2828 
2829 
2830  CRef<CSeq_id> good_nuc_id(new CSeq_id());
2831  good_nuc_id->SetLocal().SetStr("nuc");
2832  CRef<CSeq_id> good_prot_id(new CSeq_id());
2833  good_prot_id->SetLocal().SetStr("prot");
2834 
2835  CRef<CSeq_id> bad_id(new CSeq_id());
2836 
2837  // bad for both
2838  for (const string& id_str : bad_ids) {
2839  const string acc_str = "gb|" + id_str + "|";
2840  ChangeErrorAcc(expected_errors, acc_str);
2841  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2842 
2843  // GenBank
2844  scope.RemoveTopLevelSeqEntry(seh);
2845  scope.ResetDataAndHistory();
2846  bad_id->SetGenbank().SetAccession(id_str);
2847  unit_test_util::ChangeNucId(entry, bad_id);
2848  unit_test_util::ChangeProtId(entry, good_prot_id);
2849  seh = scope.AddTopLevelSeqEntry(*entry);
2850  eval = validator.Validate(seh, options);
2851  CheckErrors(*eval, expected_errors);
2852  scope.RemoveTopLevelSeqEntry(seh);
2853  scope.ResetDataAndHistory();
2854  unit_test_util::ChangeNucId(entry, good_nuc_id);
2855  unit_test_util::ChangeProtId(entry, bad_id);
2856  seh = scope.AddTopLevelSeqEntry(*entry);
2857  eval = validator.Validate(seh, options);
2858  CheckErrors(*eval, expected_errors);
2859  }
2860 
2861  for (const string& id_it : bad_ids) {
2862  const string id_str = "B" + id_it.substr(1);
2863  expected_errors[0]->SetAccession("embl|" + id_str + "|");
2864  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2865 
2866  // EMBL
2867  scope.RemoveTopLevelSeqEntry(seh);
2868  scope.ResetDataAndHistory();
2869  bad_id->SetEmbl().SetAccession(id_str);
2870  unit_test_util::ChangeNucId(entry, bad_id);
2871  unit_test_util::ChangeProtId(entry, good_prot_id);
2872  seh = scope.AddTopLevelSeqEntry(*entry);
2873  eval = validator.Validate(seh, options);
2874  expected_errors[0]->SetAccession("emb|" + id_str + "|");
2875  CheckErrors(*eval, expected_errors);
2876  scope.RemoveTopLevelSeqEntry(seh);
2877  scope.ResetDataAndHistory();
2878  unit_test_util::ChangeNucId(entry, good_nuc_id);
2879  unit_test_util::ChangeProtId(entry, bad_id);
2880  seh = scope.AddTopLevelSeqEntry(*entry);
2881  eval = validator.Validate(seh, options);
2882  CheckErrors(*eval, expected_errors);
2883  }
2884 
2885  for (const string& id_it : bad_ids) {
2886  const string id_str = "C" + id_it.substr(1);
2887  expected_errors[0]->SetAccession("dbj|" + id_str + "|");
2888  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2889 
2890  // DDBJ
2891  scope.RemoveTopLevelSeqEntry(seh);
2892  scope.ResetDataAndHistory();
2893  bad_id->SetDdbj().SetAccession(id_str);
2894  unit_test_util::ChangeNucId(entry, bad_id);
2895  unit_test_util::ChangeProtId(entry, good_prot_id);
2896  seh = scope.AddTopLevelSeqEntry(*entry);
2897  eval = validator.Validate(seh, options);
2898  expected_errors[0]->SetAccession("dbj|" + id_str + "|");
2899  CheckErrors(*eval, expected_errors);
2900  scope.RemoveTopLevelSeqEntry(seh);
2901  scope.ResetDataAndHistory();
2902  unit_test_util::ChangeNucId(entry, good_nuc_id);
2903  unit_test_util::ChangeProtId(entry, bad_id);
2904  seh = scope.AddTopLevelSeqEntry(*entry);
2905  eval = validator.Validate(seh, options);
2906  CheckErrors(*eval, expected_errors);
2907  }
2908 
2909  // bad for just nucs
2910  for (const string& id_str : bad_nuc_ids) {
2911  bad_id->SetGenbank().SetAccession(id_str);
2912  scope.RemoveTopLevelSeqEntry(seh);
2913  unit_test_util::ChangeNucId(entry, bad_id);
2914  unit_test_util::ChangeProtId(entry, good_prot_id);
2915  expected_errors[0]->SetAccession("gb|" + id_str + "|");
2916  expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2917  seh = scope.AddTopLevelSeqEntry(*entry);
2918  eval = validator.Validate(seh, options);
2919  CheckErrors(*eval, expected_errors);
2920  }
2921 
2922  // bad for just prots
2923  for (auto id_it : bad_prot_ids) {
2924  TestBadProtId(id_it);
2925  }
2926 
2927  CLEAR_ERRORS
2928 
2929  // good for both
2930  for (const string& id_str : good_ids) {
2931  bad_id->SetGenbank().SetAccession(id_str);
2932  scope.RemoveTopLevelSeqEntry(seh);
2933  unit_test_util::ChangeNucId(entry, bad_id);
2934  unit_test_util::ChangeProtId(entry, good_prot_id);
2935  seh = scope.AddTopLevelSeqEntry(*entry);
2936  eval = validator.Validate(seh, options);
2937  // AddChromosomeNoLocation(expected_errors, "gb|" + *id_it + "|");
2938  CheckErrors(*eval, expected_errors);
2939  scope.RemoveTopLevelSeqEntry(seh);
2940  unit_test_util::ChangeNucId(entry, good_nuc_id);
2941  unit_test_util::ChangeProtId(entry, bad_id);
2942  seh = scope.AddTopLevelSeqEntry(*entry);
2943  eval = validator.Validate(seh, options);
2944  CheckErrors(*eval, expected_errors);
2945  CLEAR_ERRORS
2946  }
2947 
2948  // good for nucs
2949  for (const string& id_it : good_nuc_ids) {
2950  TestGoodNucId(id_it);
2951  }
2952 
2953  // good for just prots
2954  for (const string& id_it : good_prot_ids) {
2955  TestGoodProtId(id_it);
2956  }
2957 
2958  // if GI, needs version
2959  scope.RemoveTopLevelSeqEntry(seh);
2960  bad_id->SetGenbank().SetAccession("AY123456");
2961  bad_id->SetGenbank().SetVersion(0);
2962  unit_test_util::ChangeNucId(entry, bad_id);
2963  unit_test_util::ChangeProtId(entry, good_prot_id);
2964  CRef<CSeq_id> gi_id(new CSeq_id("gi|21914627"));
2965  nuc_entry->SetSeq().SetId().push_back(gi_id);
2966  seh = scope.AddTopLevelSeqEntry(*entry);
2967  eval = validator.Validate(seh, options);
2968  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Critical, "BadSeqIdFormat",
2969  "Accession AY123456 has 0 version"));
2970  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123456|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
2971  // AddChromosomeNoLocation(expected_errors, "gb|AY123456|");
2972  CheckErrors(*eval, expected_errors);
2973 
2974  CLEAR_ERRORS
2975 
2976  nuc_entry->SetSeq().SetId().pop_back();
2977 
2978  // id that is too long
2979  scope.RemoveTopLevelSeqEntry(seh);
2980  bad_id->SetLocal().SetStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234");
2981  unit_test_util::ChangeNucId(entry, bad_id);
2982  seh = scope.AddTopLevelSeqEntry(*entry);
2983  eval = validator.Validate(seh, options);
2984  // AddChromosomeNoLocation(expected_errors, "lcl|ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234");
2985  CheckErrors(*eval, expected_errors);
2986 
2987  CLEAR_ERRORS
2988 
2989  // shouldn't report if ncbifile ID
2990  scope.RemoveTopLevelSeqEntry(seh);
2991  CRef<CSeq_id> ncbifile(new CSeq_id("gnl|NCBIFILE|ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234"));
2992  unit_test_util::ChangeNucId(entry, good_nuc_id);
2993  nuc_entry->SetSeq().SetId().push_back(ncbifile);
2994  seh = scope.AddTopLevelSeqEntry(*entry);
2995  eval = validator.Validate(seh, options);
2996  // AddChromosomeNoLocation(expected_errors, entry);
2997  CheckErrors(*eval, expected_errors);
2998  nuc_entry->SetSeq().SetId().pop_back();
2999  CLEAR_ERRORS
3000 
3001  // report if database name len too long
3002  scope.RemoveTopLevelSeqEntry(seh);
3003  entry = unit_test_util::BuildGoodSeq();
3004  CRef<CSeq_id> general(new CSeq_id());
3005  general->SetGeneral().SetDb("thisdatabasevalueislong");
3006  general->SetGeneral().SetTag().SetStr("b");
3007  entry->SetSeq().ResetId();
3008  entry->SetSeq().SetId().push_back(general);
3009  seh = scope.AddTopLevelSeqEntry(*entry);
3010  expected_errors.push_back(new CExpectedError("gnl|thisdatabasevalueislong|b", eDiag_Critical, "BadSeqIdFormat",
3011  "General database longer than 20 characters"));
3012 
3013  // AddChromosomeNoLocation(expected_errors, "gnl|thisdatabasevalueislong|b");
3014  eval = validator.Validate(seh, options);
3015  CheckErrors(*eval, expected_errors);
3016 
3017  CLEAR_ERRORS
3018 
3019  // do not report forward slash
3020  scope.RemoveTopLevelSeqEntry(seh);
3021  entry = unit_test_util::BuildGoodSeq();
3022  entry->SetSeq().SetId().front()->SetLocal().SetStr("a/b");
3023  seh = scope.AddTopLevelSeqEntry(*entry);
3024  eval = validator.Validate(seh, options);
3025  // AddChromosomeNoLocation(expected_errors, "lcl|a/b");
3026  CheckErrors(*eval, expected_errors);
3027 
3028  CLEAR_ERRORS
3029 }
3030 
3031 
// Helper: appends a general Seq-id (db, tag) to the test sequence and
// validates it. When `errmsg` is non-empty, one Warning-level
// "BadSeqIdCharacter" diagnostic with that message is expected on lcl|good;
// otherwise no expectation is added in the visible lines.
//
// @param db      database name for the general id
// @param tag     string tag for the general id
// @param errmsg  expected diagnostic text, or empty to expect none
//
// NOTE(review): the embedded listing numbers skip 3034 and 3040 -- presumably
// the entry construction and the standard setup; confirm.
3032 void TestOneGeneralSeqId(const string& db, const string& tag, const string& errmsg)
3033 {
3035  CRef<CSeq_id> id(new CSeq_id());
3036  id->SetGeneral().SetDb(db);
3037  id->SetGeneral().SetTag().SetStr(tag);
3038  entry->SetSeq().SetId().push_back(id);
3039 
3041 
3042  string acc_str = "lcl|good";
3043  if (!errmsg.empty()) {
3044  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Warning, "BadSeqIdCharacter",
3045  errmsg));
3046  }
3047  // AddChromosomeNoLocation(expected_errors, entry);
3048  eval = validator.Validate(seh, options);
3049  CheckErrors(*eval, expected_errors);
3050 
3051  CLEAR_ERRORS
3052 }
3053 
3054 
// Body of a test case that runs TestOneGeneralSeqId for a general id with a
// space in the tag and for one with a space in the database name; both
// expect a "Bad character ' '" message.
// NOTE(review): the embedded listing number 3055 is skipped, so the test
// case's declaration line is not visible here -- presumably a
// BOOST_AUTO_TEST_CASE header; confirm its name against the repository.
3056 {
3057  TestOneGeneralSeqId("PRJNA318798", " CpPA02_0001", "Bad character ' ' in sequence ID 'gnl|PRJNA318798| CpPA02_0001'");
3058  TestOneGeneralSeqId("PRJNA3 18798", "CpPA02_0001", "Bad character ' ' in sequence ID 'gnl|PRJNA3 18798|CpPA02_0001'");
3059 }
3060 
3061 
// Helper: appends a general id (db "lgsi") whose tag is longer than 50
// characters to the test sequence, optionally adds an EMBL accession as
// well, and validates.
//
// @param emb  when true, also add the EMBL id AY123457.1 to the sequence
// @param err  when true, expect one Critical "BadSeqIdLength" diagnostic on
//             lcl|good; when false, no expectation is added
//
// NOTE(review): the embedded listing numbers skip 3064 and 3077 -- presumably
// the entry construction and the standard setup; confirm.
3062 void TestOneLongGeneral(bool emb, bool err)
3063 {
3065  CRef<CSeq_id> id(new CSeq_id());
3066  id->SetGeneral().SetDb("lgsi");
3067  id->SetGeneral().SetTag().SetStr("thisidentifierismorethanfiftycharactersinlengthsoitshouldberejected");
3068  entry->SetSeq().SetId().push_back(id);
3069 
3070  if (emb) {
  // Named emb_id so the local no longer shadows the `emb` parameter.
3071  CRef<CSeq_id> emb_id(new CSeq_id());
3072  emb_id->SetEmbl().SetAccession("AY123457");
3073  emb_id->SetEmbl().SetVersion(1);
3074  entry->SetSeq().SetId().push_back(emb_id);
3075  }
3076 
3078 
3079  if (err) {
3080  string acc_str = "lcl|good";
3081  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Critical, "BadSeqIdLength",
3082  "General identifier longer than 50 characters"));
3083  }
3084 
3085  eval = validator.Validate(seh, options);
3086  CheckErrors(*eval, expected_errors);
3087 
3088  CLEAR_ERRORS
3089 }
3090 
// Runs TestOneLongGeneral twice: without an EMBL id the over-long general id
// is expected to be reported; with an EMBL id present no error is expected.
3091 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_LongGeneralSeqId)
3092 {
3093  TestOneLongGeneral(false, true);
3094  TestOneLongGeneral(true, false);
3095 }
3096 
3097 
// Checks that listing the primary accession (AY123456) as a secondary
// accession is reported as an Error-level "BadSecondaryAccn" diagnostic,
// first through a GenBank descriptor's extra accessions and then through an
// EMBL descriptor's extra accessions.
// NOTE(review): the embedded listing numbers skip 3100 and 3103 -- presumably
// the entry construction and the standard setup; confirm.
3098 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_BadSecondaryAccn)
3099 {
3101  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3102 
3104 
3105  CRef<CSeqdesc> gbdesc(new CSeqdesc());
3106  gbdesc->SetGenbank().SetExtra_accessions().push_back("AY123456");
3107  entry->SetSeq().SetDescr().Set().push_back(gbdesc);
3108 
3109  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Error, "BadSecondaryAccn", "AY123456 used for both primary and secondary accession"));
3110  // AddChromosomeNoLocation(expected_errors, "gb|AY123456|");
3111  eval = validator.Validate(seh, options);
3112  CheckErrors(*eval, expected_errors);
3113 
  // The same descriptor object is re-used with its EMBL variant; presumably
  // this replaces the GenBank content set above -- confirm. The same single
  // diagnostic is expected.
3114  gbdesc->SetEmbl().SetExtra_acc().push_back("AY123456");
3115  eval = validator.Validate(seh, options);
3116  CheckErrors(*eval, expected_errors);
3117 
3118  CLEAR_ERRORS
3119 }
3120 
3121 
// Checks that a sequence whose only id is GI 0 is reported with a Critical
// "ZeroGiNumber" diagnostic and an Error-level "GiWithoutAccession"
// diagnostic, both on gi|0.
// NOTE(review): the embedded listing numbers skip 3124 and 3127 -- presumably
// the entry construction and the standard setup; confirm.
3122 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ZeroGiNumber)
3123 {
3125  entry->SetSeq().SetId().front()->SetGi(ZERO_GI);
3126 
3128 
3129  expected_errors.push_back(new CExpectedError("gi|0", eDiag_Critical, "ZeroGiNumber", "Invalid GI number"));
3130  expected_errors.push_back(new CExpectedError("gi|0", eDiag_Error, "GiWithoutAccession", "No accession on sequence with gi number"));
3131  // AddChromosomeNoLocation(expected_errors, "gi|0");
3132  eval = validator.Validate(seh, options);
3133  CheckErrors(*eval, expected_errors);
3134 
3135  CLEAR_ERRORS
3136 }
3137 
3138 
// Checks the "HistoryGiCollision" diagnostic: a Seq-hist replaced-by or
// replaces entry that names the sequence's own GI (21914627) is an error when
// the history entry carries a date, and is not reported when no date is set.
// NOTE(review): the embedded listing numbers skip 3141 and 3148 -- presumably
// the entry construction and the standard setup; confirm.
3139 BOOST_AUTO_TEST_CASE(Test_HistoryGiCollision)
3140 {
3142  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3143  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3144  CRef<CSeq_id> gi_id(new CSeq_id());
3145  gi_id->SetGi(GI_CONST(21914627));
3146  entry->SetSeq().SetId().push_back(gi_id);
3147 
3149 
  // Dated replaced-by history pointing at the sequence's own GI.
3150  CRef<CSeq_id> hist_id(new CSeq_id());
3151  hist_id->SetGi(GI_CONST(21914627));
3152  entry->SetSeq().SetInst().SetHist().SetReplaced_by().SetIds().push_back(hist_id);
3153  entry->SetSeq().SetInst().SetHist().SetReplaced_by().SetDate().SetStd().SetYear(2008);
3154 
3155  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error, "HistoryGiCollision", "Replaced by gi (21914627) is same as current Bioseq"));
3156  // AddChromosomeNoLocation(expected_errors, "gb|AY123456.1|");
3157  eval = validator.Validate(seh, options);
3158  CheckErrors(*eval, expected_errors);
3159 
  // Same collision through a dated "replaces" history; only the message differs.
3160  entry->SetSeq().SetInst().SetHist().ResetReplaced_by();
3161  entry->SetSeq().SetInst().SetHist().SetReplaces().SetIds().push_back(hist_id);
3162  entry->SetSeq().SetInst().SetHist().SetReplaces().SetDate().SetStd().SetYear(2008);
3163  expected_errors[0]->SetErrMsg("Replaces gi (21914627) is same as current Bioseq");
3164  eval = validator.Validate(seh, options);
3165  CheckErrors(*eval, expected_errors);
3166 
3167  CLEAR_ERRORS
3168 
3169  // should not generate errors if date has not been set
3170  entry->SetSeq().SetInst().SetHist().ResetReplaces();
3171  entry->SetSeq().SetInst().SetHist().SetReplaced_by().SetIds().push_back(hist_id);
3172  eval = validator.Validate(seh, options);
3173  // AddChromosomeNoLocation(expected_errors, entry);
3174  CheckErrors(*eval, expected_errors);
3175 
3176  entry->SetSeq().SetInst().SetHist().ResetReplaced_by();
3177  entry->SetSeq().SetInst().SetHist().SetReplaces().SetIds().push_back(hist_id);
3178  eval = validator.Validate(seh, options);
3179  CheckErrors(*eval, expected_errors);
3180 
3181  CLEAR_ERRORS
3182 }
3183 
3184 
// Checks that a sequence identified only by a GI (123456) is reported with an
// Error-level "GiWithoutAccession" diagnostic.
// NOTE(review): the embedded listing numbers skip 3187 and 3190 -- presumably
// the entry construction and the standard setup; confirm.
3185 BOOST_AUTO_TEST_CASE(Test_GiWithoutAccession)
3186 {
3188  entry->SetSeq().SetId().front()->SetGi(GI_CONST(123456));
3189 
3191 
3192  expected_errors.push_back(new CExpectedError("gi|123456", eDiag_Error, "GiWithoutAccession", "No accession on sequence with gi number"));
3193  // AddChromosomeNoLocation(expected_errors, entry);
3194  eval = validator.Validate(seh, options);
3195  CheckErrors(*eval, expected_errors);
3196 
3197  CLEAR_ERRORS
3198 }
3199 
3200 
// Helper: builds a sequence carrying gb|AY123456.1|, GI 21914627 and the
// extra accession `other_acc`, then validates it. "MultipleAccessions" is
// always expected; the flags add further expectations.
//
// @param other_acc  additional accession id appended to the sequence
// @param id_change  also expect the Warning "UnexpectedIdentifierChange"
// @param conflict   also expect the Error "ConflictingIdsOnBioseq", whose
//                   message embeds other_acc->AsFastaString()
// @param need_hist  also expect the Info "HistAssemblyMissing" (default false)
//
// NOTE(review): the embedded listing numbers skip 3203 and 3212 -- presumably
// the entry construction and the standard setup; confirm.
3201 void TestOneOtherAcc(CRef<CSeq_id> other_acc, bool id_change, bool conflict, bool need_hist = false)
3202 {
3204  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3205  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3206  CRef<CSeq_id> gi_id(new CSeq_id());
3207  gi_id->SetGi(GI_CONST(21914627));
3208  entry->SetSeq().SetId().push_back(gi_id);
3209  entry->SetSeq().SetId().push_back(other_acc);
3210  string acc_str = "gb|AY123456.1|";
3211 
3213 
  // The order in which expectations are pushed is fixed: conflict, multiple
  // accessions, identifier change, missing history.
3214  if (conflict) {
3215  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error, "ConflictingIdsOnBioseq",
3216  "Conflicting ids on a Bioseq: (gb|AY123456.1| - " + other_acc->AsFastaString() + ")"));
3217  }
3218  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "MultipleAccessions", "Multiple accessions on sequence with gi number"));
3219  if (id_change) {
3220  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123457.1|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
3221  }
3222  if (need_hist) {
3223  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Info, "HistAssemblyMissing",
3224  "TPA record gb|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
3225  }
3226  // AddChromosomeNoLocation(expected_errors, acc_str);
3227  eval = validator.Validate(seh, options);
3228  CheckErrors(*eval, expected_errors);
3229 
3230  CLEAR_ERRORS
3231 }
3232 
3233 
// Checks the "MultipleAccessions" diagnostic for each accession-bearing id
// type. A single id object is re-typed in turn and handed to TestOneOtherAcc
// with the flags that apply to that type; the RefSeq ("other") case is
// handled inline at the end because it expects "INSDRefSeqPackaging" instead
// of a conflicting-id error.
// NOTE(review): the embedded listing numbers skip 3285 and 3295 -- presumably
// the entry construction and the standard setup for the inline case; confirm.
3234 BOOST_FIXTURE_TEST_CASE(Test_MultipleAccessions, CGenBankFixture)
3235 {
3236  CRef<CSeq_id> other_acc(new CSeq_id());
3237 
3238  // genbank, ddbj, embl, tpg, tpe, tpd, other, pir, swissprot, and prf all count as accessions
3239  // genbank
3240  other_acc->SetGenbank().SetAccession("AY123457");
3241  other_acc->SetGenbank().SetVersion(1);
3242  TestOneOtherAcc(other_acc, true, true);
3243 
3244  // ddbj
3245  other_acc->SetDdbj().SetAccession("AY123457");
3246  other_acc->SetDdbj().SetVersion(1);
3247  TestOneOtherAcc(other_acc, false, true);
3248 
3249  // embl
3250  other_acc->SetEmbl().SetAccession("AY123457");
3251  other_acc->SetEmbl().SetVersion(1);
3252  TestOneOtherAcc(other_acc, false, true);
3253 
3254  // pir
3255  other_acc->SetPir().SetAccession("AY123457");
3256  other_acc->SetPir().SetVersion(1);
3257  TestOneOtherAcc(other_acc, false, false);
3258 
3259  // swissprot
3260  other_acc->SetSwissprot().SetAccession("AY123457");
3261  other_acc->SetSwissprot().SetVersion(1);
3262  TestOneOtherAcc(other_acc, false, false);
3263 
3264  // prf
3265  other_acc->SetPrf().SetAccession("AY123457");
3266  other_acc->SetPrf().SetVersion(1);
3267  TestOneOtherAcc(other_acc, false, false);
3268 
3269  // tpg
3270  other_acc->SetTpg().SetAccession("AY123457");
3271  other_acc->SetTpg().SetVersion(1);
3272  TestOneOtherAcc(other_acc, false, true, true);
3273 
3274  // tpe
3275  other_acc->SetTpe().SetAccession("AY123457");
3276  other_acc->SetTpe().SetVersion(1);
3277  TestOneOtherAcc(other_acc, false, true, true);
3278 
3279  // tpd
3280  other_acc->SetTpd().SetAccession("AY123457");
3281  other_acc->SetTpd().SetVersion(1);
3282  TestOneOtherAcc(other_acc, false, true, true);
3283 
3284  // other
  // other_acc is appended to the id list before it is re-typed as a RefSeq
  // accession below; presumably the list shares the same object through the
  // CRef, so the later Set calls are what the validator sees -- confirm.
3286  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3287  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3288  CRef<CSeq_id> gi_id(new CSeq_id());
3289  gi_id->SetGi(GI_CONST(21914627));
3290  entry->SetSeq().SetId().push_back(gi_id);
3291  entry->SetSeq().SetId().push_back(other_acc);
3292  other_acc->SetOther().SetAccession("NC_123457");
3293  other_acc->SetOther().SetVersion(1);
3294 
3296 
3297  string acc_str = "gb|AY123456.1|";
3298  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "INSDRefSeqPackaging", "INSD and RefSeq records should not be present in the same set"));
3299  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "MultipleAccessions", "Multiple accessions on sequence with gi number"));
3300  // AddChromosomeNoLocation(expected_errors, acc_str);
3301  eval = validator.Validate(seh, options);
3302  CheckErrors(*eval, expected_errors);
3303 
3304  CLEAR_ERRORS
3305 }
3306 
3307 
// Test_HistAssemblyMissing: validates a tpg, a tpe and a tpd entry (each with accession
// AY123456.1) and expects an Info-level HistAssemblyMissing message for each one; then
// adds a "TPA:reassembly" keyword and expects no messages at all.
// NOTE(review): the declarations of tpg_entry, tpe_entry and tpd_entry are not visible in
// this listing - presumably on the missing lines preceding each SetId() pair; confirm.
3308 BOOST_AUTO_TEST_CASE(Test_HistAssemblyMissing)
3309 {
3311  tpg_entry->SetSeq().SetId().front()->SetTpg().SetAccession("AY123456");
3312  tpg_entry->SetSeq().SetId().front()->SetTpg().SetVersion(1);
3313 
3315  tpe_entry->SetSeq().SetId().front()->SetTpe().SetAccession("AY123456");
3316  tpe_entry->SetSeq().SetId().front()->SetTpe().SetVersion(1);
3317 
3319  tpd_entry->SetSeq().SetId().front()->SetTpd().SetAccession("AY123456");
3320  tpd_entry->SetSeq().SetId().front()->SetTpd().SetVersion(1);
3321 
3322  STANDARD_SETUP_NAME(tpg_entry)
3323 
3324  // tpg
3325  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Info, "HistAssemblyMissing", "TPA record tpg|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
3326  // AddChromosomeNoLocation(expected_errors, tpg_entry);
3327  eval = validator.Validate(seh, options);
3328  CheckErrors(*eval, expected_errors);
3329 
3330  // tpe
// The single expected error is reused: its accession and message are rewritten for the
// entry that is swapped into the scope.
3331  scope.RemoveTopLevelSeqEntry(seh);
3332  seh = scope.AddTopLevelSeqEntry(*tpe_entry);
3333  ChangeErrorAcc(expected_errors, "tpe|AY123456.1|");
3334  expected_errors[0]->SetErrMsg("TPA record tpe|AY123456.1| should have Seq-hist.assembly for PRIMARY block");
3335  eval = validator.Validate(seh, options);
3336  CheckErrors(*eval, expected_errors);
3337 
3338 
3339  // tpd
3340  scope.RemoveTopLevelSeqEntry(seh);
3341  seh = scope.AddTopLevelSeqEntry(*tpd_entry);
3342  ChangeErrorAcc(expected_errors, "tpd|AY123456.1|");
3343  expected_errors[0]->SetErrMsg("TPA record tpd|AY123456.1| should have Seq-hist.assembly for PRIMARY block");
3344  eval = validator.Validate(seh, options);
3345  CheckErrors(*eval, expected_errors);
3346 
3347  CLEAR_ERRORS
3348 
3349  // error suppressed if keyword present
// expected_errors is left empty from here on, so both checks below require that the
// validator reports nothing.
3350  CRef<CSeqdesc> block(new CSeqdesc());
3351  block->SetGenbank().SetKeywords().push_back("TPA:reassembly");
3352  tpg_entry->SetSeq().SetDescr().Set().push_back(block);
3353  scope.RemoveTopLevelSeqEntry(seh);
3354  seh = scope.AddTopLevelSeqEntry(*tpg_entry);
3355  eval = validator.Validate(seh, options);
3356  // AddChromosomeNoLocation(expected_errors, tpg_entry);
3357 
3358  CheckErrors(*eval, expected_errors);
// Same descriptor object, now addressed through SetEmbl() so the keyword lives in an
// EMBL block instead of a GenBank block.
3359  block->SetEmbl().SetKeywords().push_back("TPA:reassembly");
3360  eval = validator.Validate(seh, options);
3361  CheckErrors(*eval, expected_errors);
3362  CLEAR_ERRORS
3363 }
3364 
// Test_TerminalNs: checks the TerminalNs (and related N-content) messages for a raw
// sequence with runs of N at both ends, then for delta sequences, and verifies how the
// expected TerminalNs severity changes with the type of the first Seq-id
// (local, GenBank, RefSeq NC_, patent) and with circular topology.
// NOTE(review): the lines that create `entry` and run the standard setup are missing from
// this listing (the listing numbers skip at those points) - confirm against the full file.
3365 BOOST_AUTO_TEST_CASE(Test_TerminalNs)
3366 {
// 10 Ns + 42 bases + 10 Ns, matching the length of 62 set on the next line.
3368  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("NNNNNNNNNNAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCCAANNNNNNNNNN");
3369  entry->SetSeq().SetInst().SetLength(62);
3370 
3372 
3373  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at beginning of sequence"));
3374  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at end of sequence"));
3375  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
3376  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3377  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
3378  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3379  // AddChromosomeNoLocation(expected_errors, entry);
3380  eval = validator.Validate(seh, options);
3381  CheckErrors(*eval, expected_errors);
3382 
3383  // warning level changes if not local only
// Indices 0 and 1 are the two TerminalNs entries pushed above.
3384  scope.RemoveTopLevelSeqEntry(seh);
3385  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3386  seh = scope.AddTopLevelSeqEntry(*entry);
3387  ChangeErrorAcc(expected_errors, "gb|AY123456|");
3388  expected_errors[0]->SetSeverity(eDiag_Error);
3389  expected_errors[1]->SetSeverity(eDiag_Error);
3390  eval = validator.Validate(seh, options);
3391  CheckErrors(*eval, expected_errors);
3392 
3393  CLEAR_ERRORS
3394 
3395  // also try delta sequence
3396  scope.RemoveTopLevelSeqEntry(seh);
3398  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNNNNNNNCCC");
3399  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("CCCNNNNNNNNN");
3400  seh = scope.AddTopLevelSeqEntry(*entry);
3401 
// In this list index 0 is ContigsTooShort, indices 1 and 2 are the TerminalNs pair and
// the last element is HighNContentPercent; later steps rely on that ordering.
3402  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 3 bases"));
3403  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at beginning of sequence"));
3404  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at end of sequence"));
3405  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 52 percent Ns"));
3406  eval = validator.Validate(seh, options);
3407  CheckErrors(*eval, expected_errors);
3408 
3409  // 10 Ns but just local stays at warning
3410  scope.RemoveTopLevelSeqEntry(seh);
3412  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNNNNNNNNCC");
3413  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("CCNNNNNNNNNN");
3414  seh = scope.AddTopLevelSeqEntry(*entry);
3415  expected_errors[0]->SetErrMsg("Maximum contig length is 2 bases");
3416  expected_errors.back()->SetErrMsg("Sequence contains 58 percent Ns");
3417  eval = validator.Validate(seh, options);
3418  CheckErrors(*eval, expected_errors);
3419 
3420  // 10 Ns but now has non-local ID, error
3421  scope.RemoveTopLevelSeqEntry(seh);
3422  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3423  seh = scope.AddTopLevelSeqEntry(*entry);
3424  ChangeErrorAcc(expected_errors, "gb|AY123456|");
3425  expected_errors[1]->SetSeverity(eDiag_Error);
3426  expected_errors[2]->SetSeverity(eDiag_Error);
3427  eval = validator.Validate(seh, options);
3428  CheckErrors(*eval, expected_errors);
3429 
3430  // NC and patent IDs back to warning
3431  scope.RemoveTopLevelSeqEntry(seh);
3432  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
3433  seh = scope.AddTopLevelSeqEntry(*entry);
3434  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
3435  expected_errors[1]->SetSeverity(eDiag_Warning);
3436  expected_errors[2]->SetSeverity(eDiag_Warning);
3437  eval = validator.Validate(seh, options);
3438  CheckErrors(*eval, expected_errors);
3439 
3440  scope.RemoveTopLevelSeqEntry(seh);
3441  entry->SetSeq().SetId().front()->SetPatent().SetSeqid(1);
3442  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetCountry("USA");
3443  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetId().SetNumber("1");
3444  seh = scope.AddTopLevelSeqEntry(*entry);
3445  ChangeErrorAcc(expected_errors, "pat|USA|1|1");
// For the patent id the last expectation (HighNContentPercent) is dropped; it is deleted
// by hand before pop_back because the vector holds raw pointers.
3446  delete expected_errors.back();
3447  expected_errors.pop_back();
3448  eval = validator.Validate(seh, options);
3449  CheckErrors(*eval, expected_errors);
3450  CLEAR_ERRORS
3451 
3452  // no more TerminalNs warnings if circular
3453  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
3455  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Error, "ContigsTooShort",
3456  "Maximum contig length is 2 bases"));
3457  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Warning, "UnwantedCompleteFlag",
3458  "Suspicious use of complete"));
3459  // AddChromosomeNoLocation(expected_errors, entry);
3460 
3461  eval = validator.Validate(seh, options);
3462  CheckErrors(*eval, expected_errors);
3463 
3464  CLEAR_ERRORS
3465 }
3466 
3467 
// Test_UnexpectedIdentifierChange: pairs gi 21914627 first with accession gb|AY123457.1|
// and then with tpg|AY123456.1|, expecting the "New accession ... does not match" and
// "Loss of accession ..." variants of the UnexpectedIdentifierChange warning.
// NOTE(review): the lines that create `entry` and run the standard setup are missing from
// this listing - confirm against the full file.
3468 BOOST_FIXTURE_TEST_CASE(Test_UnexpectedIdentifierChange, CGenBankFixture)
3469 {
3471  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123457");
3472  entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3473  CRef<CSeq_id> gi_id(new CSeq_id());
3474  gi_id->SetGi(GI_CONST(21914627));
3475  entry->SetSeq().SetId().push_back(gi_id);
3476 
3478 
3479  expected_errors.push_back(new CExpectedError("gb|AY123457.1|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123457.1|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
3480  // AddChromosomeNoLocation(expected_errors, entry);
3481  eval = validator.Validate(seh, options);
3482  CheckErrors(*eval, expected_errors);
3483 
3484  CLEAR_ERRORS
// Second scenario: the first id becomes a TPG accession, so a HistAssemblyMissing info
// message is expected in addition to the identifier-change warning.
3485  scope.RemoveTopLevelSeqEntry(seh);
3486  entry->SetSeq().SetId().front()->SetTpg().SetAccession("AY123456");
3487  entry->SetSeq().SetId().front()->SetTpg().SetVersion(1);
3488  seh = scope.AddTopLevelSeqEntry(*entry);
3489  // AddChromosomeNoLocation(expected_errors, entry);
3490  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Info, "HistAssemblyMissing", "TPA record tpg|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
3491  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Warning, "UnexpectedIdentifierChange", "Loss of accession (gb|AY123456.1|) on gi (21914627) compared to the NCBI sequence repository"));
3492  eval = validator.Validate(seh, options);
3493  CheckErrors(*eval, expected_errors);
3494 
3495  // TODO - try to instigate other errors
3496 
3497  CLEAR_ERRORS
3498 }
3499 
3500 
// Test_InternalNsInSeqLit: appends delta-seq literals containing internal runs of
// 20, 81 and 101 Ns and checks the InternalNsInSeqLit warning text produced for each.
// NOTE(review): several lines are missing from this listing (entry creation, standard
// setup, and the statements between the repeated Validate/CheckErrors pairs - presumably
// technique changes); confirm against the full file.
3501 BOOST_AUTO_TEST_CASE(Test_InternalNsInSeqLit)
3502 {
3504  unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNGG");
3505  SetTech(entry, CMolInfo::eTech_wgs);
3506 
3508 
3509  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqLit", "Run of 20 Ns in delta component 5 that starts at base 45"));
3510  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WGSseqGapProblem", "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
3511  /*
3512  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
3513  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3514  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
3515  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3516  */
// This is the one place in the test where AddChromosomeNoLocation is still active.
3517  AddChromosomeNoLocation(expected_errors, entry);
3518 
3519  eval = validator.Validate(seh, options);
3520  CheckErrors(*eval, expected_errors);
3521 
3522  CLEAR_ERRORS
3523 
3524  unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGG");
3526  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqLit",
3527  "Run of 81 Ns in delta component 7 that starts at base 79"));
3528  /*
3529  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
3530  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3531  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
3532  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3533  */
3534  // AddChromosomeNoLocation(expected_errors, entry);
3535 
3536  eval = validator.Validate(seh, options);
3537  CheckErrors(*eval, expected_errors);
3538 
// The same expectation list is re-checked twice more; whatever changes between the runs
// is on lines not present in this listing.
3540  eval = validator.Validate(seh, options);
3541  CheckErrors(*eval, expected_errors);
3542 
3544  eval = validator.Validate(seh, options);
3545  CheckErrors(*eval, expected_errors);
3546 
3547  unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGG");
// Only the message of the existing expectation is rewritten for the 101-N run.
3549  expected_errors[0]->SetErrMsg("Run of 101 Ns in delta component 9 that starts at base 174");
3550  eval = validator.Validate(seh, options);
3551  CheckErrors(*eval, expected_errors);
3552 
3553  CLEAR_ERRORS
3554 }
3555 
3556 
// Test_SeqLitGapLength0: appends a zero-length literal to the delta chain and checks the
// SeqLitGapLength0 message under different Int-fuzz settings on that literal, first with
// the original id (Error severity) and then with a Swiss-Prot id (Warning severity).
// NOTE(review): the lines that create `entry` and run the standard setup are missing from
// this listing - confirm against the full file.
3557 BOOST_AUTO_TEST_CASE(Test_SeqLitGapLength0)
3558 {
3560  CRef<CDelta_seq> delta_seq(new CDelta_seq());
3561  delta_seq->SetLiteral().SetLength(0);
3562  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(delta_seq);
3563 
3565 
3566  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLitGapLength0", "Gap of length 0 in delta chain"));
3567  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "Last delta seq component is a gap"));
3568  // AddChromosomeNoLocation(expected_errors, entry);
3569  eval = validator.Validate(seh, options);
3570  CheckErrors(*eval, expected_errors);
3571 
3572  // some kinds of fuzz don't trigger other kind of error
// With lim=gt and with p-m fuzz the expected message stays "Gap of length 0 in delta chain".
3573  delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_gt);
3574  eval = validator.Validate(seh, options);
3575  CheckErrors(*eval, expected_errors);
3576 
3577  delta_seq->SetLiteral().SetFuzz().Reset();
3578  delta_seq->SetLiteral().SetFuzz().SetP_m(10);
3579  eval = validator.Validate(seh, options);
3580  CheckErrors(*eval, expected_errors);
3581 
3582  // others will
3583  delta_seq->SetLiteral().SetFuzz().Reset();
3584  delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
3585  expected_errors[0]->SetErrMsg("Gap of length 0 with unknown fuzz in delta chain");
3586  eval = validator.Validate(seh, options);
3587  CheckErrors(*eval, expected_errors);
3588 
3589  // try again with swissprot, error goes to warning
// The fuzz is still lim=unk at this point, so the "unknown fuzz" message set above is
// still the expected text for the first Swiss-Prot check.
3590  scope.RemoveTopLevelSeqEntry(seh);
3591  entry->SetSeq().SetId().front()->SetSwissprot().SetAccession("AY123456");
3592  seh = scope.AddTopLevelSeqEntry(*entry);
3593  expected_errors[0]->SetSeverity(eDiag_Warning);
3594  ChangeErrorAcc(expected_errors, "sp|AY123456|");
3595  eval = validator.Validate(seh, options);
3596  CheckErrors(*eval, expected_errors);
3597 
3598  delta_seq->SetLiteral().SetFuzz().SetP_m(10);
3599  expected_errors[0]->SetErrMsg("Gap of length 0 in delta chain");
3600  eval = validator.Validate(seh, options);
3601  CheckErrors(*eval, expected_errors);
3602 
3603  delta_seq->SetLiteral().SetFuzz().Reset();
3604  delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_gt);
3605  eval = validator.Validate(seh, options);
3606  CheckErrors(*eval, expected_errors);
3607 
// Finally with no fuzz at all on the literal.
3608  delta_seq->SetLiteral().ResetFuzz();
3609  eval = validator.Validate(seh, options);
3610  CheckErrors(*eval, expected_errors);
3611 
3612  CLEAR_ERRORS
3613 }
3614 
3615 
// NOTE(review): the signature line of this helper is missing from this listing; judging
// by the calls in Test_TpaAssemblyProblem it is presumably AddTpaAssemblyUserObject taking
// the Seq-entry as `entry` - confirm against the full file.
// Appends to entry's Bioseq descriptors a user object of type "TpaAssembly" that holds a
// single field labelled "Label" with the string value "Data".
3617 {
3618  CRef<CSeqdesc> desc(new CSeqdesc());
3619  desc->SetUser().SetType().SetStr("TpaAssembly");
3620  entry->SetSeq().SetDescr().Set().push_back(desc);
3621 
// The field is added after the descriptor has been attached; `desc` is a CRef, so the
// attached descriptor is the same object that is modified here.
3622  CRef<CUser_field> field(new CUser_field());
3623  field->SetLabel().SetStr("Label");
3624  field->SetData().SetStr("Data");
3625  desc->SetUser().SetData().push_back(field);
3626 }
3627 
3628 
// Test_TpaAssemblyProblem: builds a set with two members that each carry a TpaAssembly
// user object and checks the TpaAssemblyProblem messages in three situations: neither
// member has history (no messages), one member has a Seq-hist assembly, and that member
// additionally has a TPG accession plus a gi.
// NOTE(review): the declarations of member1 and member2 and the standard setup are on
// lines missing from this listing - confirm against the full file.
3629 BOOST_FIXTURE_TEST_CASE(Test_TpaAssemblyProblem, CGenBankFixture)
3630 {
3631  CRef<CSeq_entry> entry(new CSeq_entry());
3634  member1->SetSeq().SetId().front()->SetLocal().SetStr("good");
3635  AddTpaAssemblyUserObject(member1);
3636  entry->SetSet().SetSeq_set().push_back(member1);
3638  member2->SetSeq().SetId().front()->SetLocal().SetStr("good2");
3639  AddTpaAssemblyUserObject(member2);
3640  entry->SetSet().SetSeq_set().push_back(member2);
3641 
3643 
3644  // two Tpa sequences, but neither has assembly and neither has GI, so no errors expected
3645  // AddChromosomeNoLocation(expected_errors, "lcl|good");
3646  // AddChromosomeNoLocation(expected_errors, "lcl|good2");
3647  eval = validator.Validate(seh, options);
3648  CheckErrors(*eval, expected_errors);
3649 
3650  // now one has hist, other does not
3651  member1->SetSeq().SetInst().SetHist().SetAssembly().push_back(unit_test_util::BuildGoodAlign());
3652  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TpaAssemblyProblem", "There are 1 TPAs with history and 1 without history in this record."));
3653  eval = validator.Validate(seh, options);
3654  CheckErrors(*eval, expected_errors);
3655 
3656  // now one has gi
// member1 is re-identified as tpg|AY123456.1| with gi 21914627; the entry is removed from
// the scope before the ids change and re-added afterwards.
3657  scope.RemoveTopLevelSeqEntry(seh);
3658  member1->SetSeq().SetId().front()->SetTpg().SetAccession("AY123456");
3659  member1->SetSeq().SetId().front()->SetTpg().SetVersion(1);
3660  CRef<CSeq_id> gi_id(new CSeq_id());
3661  gi_id->SetGi(GI_CONST(21914627));
3662  member1->SetSeq().SetId().push_back(gi_id);
3663  seh = scope.AddTopLevelSeqEntry(*entry);
3664 
3665  CLEAR_ERRORS
3666 
3667  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Warning, "UnexpectedIdentifierChange", "Loss of accession (gb|AY123456.1|) on gi (21914627) compared to the NCBI sequence repository"));
3668  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Error, "TpaAssemblyProblem", "There are 1 TPAs with history and 1 without history in this record."));
3669  expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Warning, "TpaAssemblyProblem", "There are 1 TPAs without history in this record, but the record has a gi number assignment."));
3670  // AddChromosomeNoLocation(expected_errors, "tpg|AY123456.1|");
3671  // AddChromosomeNoLocation(expected_errors, "lcl|good2");
3672  eval = validator.Validate(seh, options);
3673  CheckErrors(*eval, expected_errors);
3674 
3675  CLEAR_ERRORS
3676 }
3677 
3678 
// NOTE(review): the test-case declaration line is missing from this listing; the
// commented-out expectation below suggests this is the SeqLocLength test - confirm.
// Replaces the first delta component with an interval on gb|AY123456| of length 10 and
// then of length 11, and in both cases expects the validator to report nothing (the
// former SeqLocLength warning expectation is commented out).
3680 {
3681  // prepare entry
3683  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetGenbank().SetAccession("AY123456");
3684  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
3685  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(9);
3686  entry->SetSeq().SetInst().SetLength(32);
3687 
3689 
3690  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SeqLocLength", "Short length (10) on seq-loc (gb|AY123456|:1-10) of delta seq_ext"));
3691  // AddChromosomeNoLocation(expected_errors, entry);
3692  eval = validator.Validate(seh, options);
3693  CheckErrors(*eval, expected_errors);
3694 
3695  scope.RemoveTopLevelSeqEntry(seh);
3696  // if length 11, should not be a problem
// Interval 0..10 is one base longer than before, hence the Bioseq length of 33 vs 32.
3698  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetGenbank().SetAccession("AY123456");
3699  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
3700  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(10);
3701  entry->SetSeq().SetInst().SetLength(33);
3702  seh = scope.AddTopLevelSeqEntry(*entry);
3703  eval = validator.Validate(seh, options);
3704  CheckErrors(*eval, expected_errors);
3705 
3706  CLEAR_ERRORS
3707 }
3708 
3709 
// Test_MissingGaps: validates a delta sequence whose gaps have been removed and checks
// when the MissingGaps error (and the BadHTGSeq warning) is expected, including the
// severity change to Info once the record has a RefSeq NC_ accession.
// NOTE(review): most of the statements that change state between the Validate calls
// (entry creation, gap removal, setup, and presumably the SetTech calls hinted at by the
// comments) are on lines missing from this listing - confirm against the full file.
3710 BOOST_AUTO_TEST_CASE(Test_MissingGaps)
3711 {
3712  // prepare entry
3714  // remove gaps
3716 
3718 
3719  // AddChromosomeNoLocation(expected_errors, entry);
3720  // only report errors for specific molinfo tech values
3721  eval = validator.Validate(seh, options);
3722  CheckErrors(*eval, expected_errors);
3723  // htgs_3 should not report
3725  eval = validator.Validate(seh, options);
3726  CheckErrors(*eval, expected_errors);
3727 
3729  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingGaps", "HTGS delta seq should have gaps between all sequence runs"));
3730  eval = validator.Validate(seh, options);
3731  CheckErrors(*eval, expected_errors);
3732 
3734  eval = validator.Validate(seh, options);
3735  CheckErrors(*eval, expected_errors);
3736 
3738  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 delta seq has no gaps and no graphs"));
3739  eval = validator.Validate(seh, options);
3740  CheckErrors(*eval, expected_errors);
3741 
3742  // RefGeneTracking changes severity
// Index 0 is the MissingGaps expectation; only its severity is lowered to Info.
3743  scope.RemoveTopLevelSeqEntry(seh);
3744  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
3746  seh = scope.AddTopLevelSeqEntry(*entry);
3747  expected_errors[0]->SetSeverity(eDiag_Info);
3748  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
3749  eval = validator.Validate(seh, options);
3750  CheckErrors(*eval, expected_errors);
// Drops the BadHTGSeq expectation (index 1 is the last element at this point); it is
// deleted by hand because the vector holds raw pointers.
3751  delete expected_errors[1];
3752  expected_errors.pop_back();
3753 
3755  eval = validator.Validate(seh, options);
3756  CheckErrors(*eval, expected_errors);
3757 
3759  eval = validator.Validate(seh, options);
3760  CheckErrors(*eval, expected_errors);
3761 
3762  CLEAR_ERRORS
3763 }
3764 
3765 
// Test_CompleteTitleProblem: gives a viral-lineage GenBank record the title
// "Foo complete genome" and expects the CompleteTitleProblem warning; then expects no
// message once the record is marked complete, and the CompleteGenomeHasGaps warning for a
// delta sequence with the same title.
// NOTE(review): the standard setup and the statements that set completeness are on lines
// missing from this listing - confirm against the full file.
3766 BOOST_AUTO_TEST_CASE(Test_CompleteTitleProblem)
3767 {
3768  // prepare entry
3769  CRef<CSeq_entry> entry = BuildGoodSeq();
3770  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3771  SetLineage(entry, "Viruses; foo");
3772  SetTitle(entry, "Foo complete genome");
3773 
3775 
3776  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning, "CompleteTitleProblem", "Complete genome in title without complete flag set"));
3777  // AddChromosomeNoLocation(expected_errors, entry);
3778 
3779  eval = validator.Validate(seh, options);
3780  CheckErrors(*eval, expected_errors);
3781 
3782  CLEAR_ERRORS
3783 
3784  // should be no error if complete
3786 
3787  eval = validator.Validate(seh, options);
3788  // AddChromosomeNoLocation(expected_errors, entry);
3789  CheckErrors(*eval, expected_errors);
3790 
3791  // different message and code if gaps
// A fresh delta-seq entry replaces the raw sequence, with the same id, lineage and title.
3792  scope.RemoveTopLevelSeqEntry(seh);
3793  entry = BuildGoodDeltaSeq();
3794  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3795  unit_test_util::SetLineage(entry, "Viruses; foo");
3796  SetTitle(entry, "Foo complete genome");
3798  seh = scope.AddTopLevelSeqEntry(*entry);
3799 
3800  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning,
3801  "CompleteGenomeHasGaps", "Title contains 'complete genome' but sequence has gaps"));
3802 
3803  eval = validator.Validate(seh, options);
3804  CheckErrors(*eval, expected_errors);
3805 
3806  CLEAR_ERRORS
3807 }
3808 
3809 
// Test_CompleteCircleProblem: marks a sequence as circular and expects the
// "Circular topology without complete flag set" warning; then, for a GenBank id with a
// plain title, expects the "title should say complete sequence or complete genome"
// variant together with UnwantedCompleteFlag.
// NOTE(review): entry creation, the standard setup and (presumably) the statement that
// sets the complete flag are on lines missing from this listing - confirm.
3810 BOOST_AUTO_TEST_CASE(Test_CompleteCircleProblem)
3811 {
3812  // prepare entry
3814  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
3815 
3817 
3818  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
3819  "CompleteCircleProblem",
3820  "Circular topology without complete flag set"));
3821  // AddChromosomeNoLocation(expected_errors, entry);
3822 
3823  eval = validator.Validate(seh, options);
3824  CheckErrors(*eval, expected_errors);
3825 
3826  CLEAR_ERRORS
3827 
// Second scenario: same circular sequence, now with a GenBank accession and a title that
// does not mention completeness.
3828  scope.RemoveTopLevelSeqEntry(seh);
3829  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3830  SetTitle(entry, "This is just a title");
3832  seh = scope.AddTopLevelSeqEntry(*entry);
3833  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning,
3834  "CompleteCircleProblem",
3835  "Circular topology has complete flag set, but title should say complete sequence or complete genome"));
3836  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning,
3837  "UnwantedCompleteFlag",
3838  "Suspicious use of complete"));
3839  // AddChromosomeNoLocation(expected_errors, entry);
3840 
3841  eval = validator.Validate(seh, options);
3842  CheckErrors(*eval, expected_errors);
3843 
3844  CLEAR_ERRORS
3845 }
3846 
3847 
// Test_BadHTGSeq: checks the BadHTGSeq messages for HTGS 2 delta and raw sequences
// without gaps or graphs, that the HTGS_ACTIVEFIN keyword removes that expectation, and
// that HTGS 3 records are flagged for each of the HTGS_DRAFT, HTGS_PREFIN,
// HTGS_ACTIVEFIN and HTGS_FULLTOP keywords.
// NOTE(review): the declarations of delta_entry and raw_entry (and the gap removal) are
// on lines missing from this listing - confirm against the full file.
3848 BOOST_AUTO_TEST_CASE(Test_BadHTGSeq)
3849 {
3850  // prepare entry
3852  // remove gaps
3854 
3855  STANDARD_SETUP_NAME(delta_entry)
3856 
3857  SetTech(delta_entry, CMolInfo::eTech_htgs_2);
3858  // AddChromosomeNoLocation(expected_errors, delta_entry);
3859  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingGaps", "HTGS delta seq should have gaps between all sequence runs"));
3860  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 delta seq has no gaps and no graphs"));
3861  eval = validator.Validate(seh, options);
3862  CheckErrors(*eval, expected_errors);
3863 
// Remove the BadHTGSeq expectation (index 1, the last element) before adding the keyword;
// MissingGaps stays expected.
3864  delete expected_errors[1];
3865  expected_errors.pop_back();
3866 
3867  // HTGS_ACTIVEFIN keyword disables BadHTGSeq error
3868  AddGenbankKeyword(delta_entry, "HTGS_ACTIVEFIN");
3869  eval = validator.Validate(seh, options);
3870  CheckErrors(*eval, expected_errors);
3871 
3872  CLEAR_ERRORS
3873 
3874  scope.RemoveTopLevelSeqEntry(seh);
3876  SetTech(raw_entry, CMolInfo::eTech_htgs_2);
3877  seh = scope.AddTopLevelSeqEntry(*raw_entry);
3878  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 raw seq has no gaps and no graphs"));
3879  // AddChromosomeNoLocation(expected_errors, raw_entry);
3880  eval = validator.Validate(seh, options);
3881  CheckErrors(*eval, expected_errors);
3882 
3883  CLEAR_ERRORS
3884 
3885  // HTGS_ACTIVEFIN keyword disables error
3886  AddGenbankKeyword(raw_entry, "HTGS_ACTIVEFIN");
3887  // AddChromosomeNoLocation(expected_errors, raw_entry);
3888  eval = validator.Validate(seh, options);
3889  CheckErrors(*eval, expected_errors);
3890 
3891 
3892  // htg3 errors
// HTGS_ACTIVEFIN was already added to raw_entry above, which is why four keyword
// messages are expected although only three keywords are added here.
3893  SetTech(raw_entry, CMolInfo::eTech_htgs_3);
3894  AddGenbankKeyword(raw_entry, "HTGS_DRAFT");
3895  AddGenbankKeyword(raw_entry, "HTGS_PREFIN");
3896  AddGenbankKeyword(raw_entry, "HTGS_FULLTOP");
3897  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_DRAFT keyword"));
3898  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_PREFIN keyword"));
3899  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_ACTIVEFIN keyword"));
3900  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_FULLTOP keyword"));
3901  eval = validator.Validate(seh, options);
3902  CheckErrors(*eval, expected_errors);
3903 
// The same four expectations are then checked against the delta entry, which likewise
// already carries HTGS_ACTIVEFIN from earlier in the test.
3904  scope.RemoveTopLevelSeqEntry(seh);
3905  seh = scope.AddTopLevelSeqEntry(*delta_entry);
3906  SetTech(delta_entry, CMolInfo::eTech_htgs_3);
3907  AddGenbankKeyword(delta_entry, "HTGS_DRAFT");
3908  AddGenbankKeyword(delta_entry, "HTGS_PREFIN");
3909  AddGenbankKeyword(delta_entry, "HTGS_FULLTOP");
3910  eval = validator.Validate(seh, options);
3911  CheckErrors(*eval, expected_errors);
3912 
3913  CLEAR_ERRORS
3914 }
3915 
3916 
// Test_GapInProtein_and_BadProteinStart: sets protein sequences containing '-' symbols
// and checks the messages for an internal gap ("PRK-EIN"), a leading gap ("-RKTEIN") and
// both at once ("-RK-EIN").
// NOTE(review): entry creation and the standard setup are on lines missing from this
// listing - confirm against the full file.
3917 BOOST_AUTO_TEST_CASE(Test_GapInProtein_and_BadProteinStart)
3918 {
3919  // prepare entry
3921  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa().Set("PRK-EIN");
3922 
3924 
3925  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GapInProtein", "[1] internal gap symbols in protein sequence (gene? - fake protein name)"));
3926  // AddChromosomeNoLocation(expected_errors, entry);
3927  eval = validator.Validate(seh, options);
3928  CheckErrors(*eval, expected_errors);
3929 
3930  CLEAR_ERRORS
3931 
3932  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa().Set("-RKTEIN");
3933  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadProteinStart", "gap symbol at start of protein sequence (gene? - fake protein name)"));
3934  // AddChromosomeNoLocation(expected_errors, entry);
3935  eval = validator.Validate(seh, options);
3936  CheckErrors(*eval, expected_errors);
3937 
// No CLEAR_ERRORS here: the BadProteinStart expectation is kept and GapInProtein is
// added to it for the sequence that has both a leading and an internal gap.
3938  entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa().Set("-RK-EIN");
3939  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GapInProtein", "[1] internal gap symbols in protein sequence (gene? - fake protein name)"));
3940  eval = validator.Validate(seh, options);
3941  CheckErrors(*eval, expected_errors);
3942 
3943  CLEAR_ERRORS
3944 }
3945 
3946 
// Test_TerminalGap: adds a gap literal at each end of a delta sequence and expects the
// two TerminalGap warnings for gap lengths 9 and 10 and for local, RefSeq NC_ and patent
// ids; for circular topology only UnwantedCompleteFlag is expected.
// NOTE(review): entry creation, the standard setup and the line before the circular
// expectation are missing from this listing - confirm against the full file.
3947 BOOST_AUTO_TEST_CASE(Test_TerminalGap)
3948 {
3949  // prepare entry
3951  CRef<CDelta_seq> first_seg(new CDelta_seq());
3952  first_seg->SetLiteral().SetLength(9);
3953  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_front(first_seg);
3954  CRef<CDelta_seq> last_seg(new CDelta_seq());
3955  last_seg->SetLiteral().SetLength(9);
3956  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(last_seg);
// Two 9-base gap literals were added, so the Bioseq length grows by 18.
3957  entry->SetSeq().SetInst().SetLength(entry->SetSeq().SetInst().GetLength() + 18);
3958 
3960 
3961  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "First delta seq component is a gap"));
3962  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "Last delta seq component is a gap"));
3963  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at beginning of sequence"));
3964  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at end of sequence"));
3965  /*
3966  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
3967  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3968  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
3969  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3970  */
3971  // AddChromosomeNoLocation(expected_errors, entry);
3972 
3973  eval = validator.Validate(seh, options);
3974  CheckErrors(*eval, expected_errors);
3975 
3976  // if gap length is 10, severity is still warning because still all local IDS
// Each terminal gap grows from 9 to 10, so the total length grows by 2.
3977  scope.RemoveTopLevelSeqEntry(seh);
3978  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetLength(10);
3979  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetLength(10);
3980  entry->SetSeq().SetInst().SetLength(entry->SetSeq().SetInst().GetLength() + 2);
3981  seh = scope.AddTopLevelSeqEntry(*entry);
3982  eval = validator.Validate(seh, options);
3983  CheckErrors(*eval, expected_errors);
3984 
3985 
3986  scope.RemoveTopLevelSeqEntry(seh);
3987  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
3988  seh = scope.AddTopLevelSeqEntry(*entry);
3989  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
3990  /*
3991  expected_errors[2]->SetSeverity(eDiag_Warning);
3992  expected_errors[3]->SetSeverity(eDiag_Warning);
3993  */
3994  eval = validator.Validate(seh, options);
3995  CheckErrors(*eval, expected_errors);
3996 
3997  scope.RemoveTopLevelSeqEntry(seh);
3998  entry->SetSeq().SetId().front()->SetPatent().SetSeqid(1);
3999  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetCountry("USA");
4000  entry->SetSeq().SetId().front()->SetPatent().SetCit().SetId().SetNumber("1");
4001  seh = scope.AddTopLevelSeqEntry(*entry);
4002  ChangeErrorAcc(expected_errors, "pat|USA|1|1");
4003  eval = validator.Validate(seh, options);
4004  CheckErrors(*eval, expected_errors);
4005 
4006  CLEAR_ERRORS
4007 
4008  // no more terminal gap warnings if circular - changed to still show first/last delta component
4009  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
4011  expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Warning, "UnwantedCompleteFlag",
4012  "Suspicious use of complete"));
4013  // AddChromosomeNoLocation(expected_errors, entry);
4014 
4015  eval = validator.Validate(seh, options);
4016  CheckErrors(*eval, expected_errors);
4017  CLEAR_ERRORS
4018 }
4019 
4020 
// Checks that overlapping ranges among delta components are reported.
// Four ranges referencing gb|AY123456| are added: 0-10 and 5-15 overlap, as
// do 20-30 and 25-35, so one OverlappingDeltaRange warning is expected for
// each overlapping pair.
// NOTE(review): listing lines 4024 and 4034 are absent from this view;
// `entry`, `seh`, `validator`, `options`, `eval` and `expected_errors` are
// presumably declared there -- confirm against the full file.
4021 BOOST_FIXTURE_TEST_CASE(Test_OverlappingDeltaRange, CGenBankFixture)
4022 {
4023  // prepare entry
4025  entry->SetSeq().SetInst().ResetExt();
4026  CRef<CSeq_id> seqid(new CSeq_id());
4027  seqid->SetGenbank().SetAccession("AY123456");
4028  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 0, 10);
4029  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 5, 15);
4030  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 20, 30);
4031  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*seqid, 25, 35);
4032  entry->SetSeq().SetInst().SetLength(44);
4033 
4035 
      // The expected messages quote each range with coordinates one higher
      // than the values passed to AddSeqRange above (e.g. 5,15 -> "6-16").
4036  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OverlappingDeltaRange", "Overlapping delta range 6-16 and 1-11 on a Bioseq gb|AY123456|"));
4037  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OverlappingDeltaRange", "Overlapping delta range 26-36 and 21-31 on a Bioseq gb|AY123456|"));
4038  // AddChromosomeNoLocation(expected_errors, entry);
4039  eval = validator.Validate(seh, options);
4040  CheckErrors(*eval, expected_errors);
4041 
4042  CLEAR_ERRORS
4043 }
4044 
4045 
// Checks that a sequence whose IUPAC amino-acid data begins with 'X'
// ("XROTEIN") produces a single LeadingX warning.
// NOTE(review): listing lines 4049 and 4052 are absent from this view; the
// construction of `entry` and the validator setup presumably live there.
4046 BOOST_AUTO_TEST_CASE(Test_LeadingX)
4047 {
4048  // prepare entry
4050  entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("XROTEIN");
4051 
4053 
4054  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LeadingX", "Sequence starts with leading X"));
4055  // AddChromosomeNoLocation(expected_errors, entry);
4056  eval = validator.Validate(seh, options);
4057  CheckErrors(*eval, expected_errors);
4058 
4059  CLEAR_ERRORS
4060 }
4061 
// Checks reporting of long internal runs of Ns in a raw nucleotide sequence.
// Three scenarios are validated in order:
//   1. five As, a long run of Ns and five Ts (length set to 110): the
//      InternalNsInSeqRaw warning is expected along with the N-content errors;
//   2. the same shape with a 20-N run (length 30): InternalNsInSeqRaw is NOT
//      expected, only the N-content errors;
//   3. the 20-N sequence after the technique is set to WGS: InternalNsInSeqRaw
//      is expected again.
// NOTE(review): listing lines 4065 and 4069 are absent from this view; the
// construction of `entry` and the validator setup presumably live there.
4062 BOOST_AUTO_TEST_CASE(Test_InternalNsInSeqRaw)
4063 {
4064  // prepare entry
4066  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTTT");
4067  entry->SetSeq().SetInst().SetLength(110);
4068 
4070 
      // Scenario 1: the expected message reports the run as 100 Ns starting at base 6.
4071  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqRaw", "Run of 100 Ns in raw sequence starting at base 6"));
4072  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 5 bases"));
4073  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 90 percent Ns"));
4074  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4075  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4076  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4077  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4078  // AddChromosomeNoLocation(expected_errors, entry);
4079  eval = validator.Validate(seh, options);
4080  CheckErrors(*eval, expected_errors);
4081 
4082  CLEAR_ERRORS
4083 
4084  // expect no InternalNsInSeqRaw error
      // Scenario 2: the entry is removed from the scope, shortened to a 20-N
      // run, and re-added before validating again.
4085  scope.RemoveTopLevelSeqEntry(seh);
4086  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAANNNNNNNNNNNNNNNNNNNNTTTTT");
4087  entry->SetSeq().SetInst().SetLength(30);
4088  seh = scope.AddTopLevelSeqEntry(*entry);
4089  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 5 bases"));
4090  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 66 percent Ns"));
4091  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4092  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4093  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4094  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4095  // AddChromosomeNoLocation(expected_errors, entry);
4096  eval = validator.Validate(seh, options);
4097  CheckErrors(*eval, expected_errors);
4098 
4099  CLEAR_ERRORS
4100 
4101  // WGS has lower threshold
      // Scenario 3: same 20-N sequence, technique switched to WGS. Unlike the
      // earlier scenarios, the AddChromosomeNoLocation call below is active.
4102  SetTech(entry, CMolInfo::eTech_wgs);
4103  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqRaw", "Run of 20 Ns in raw sequence starting at base 6"));
4104  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 5 bases"));
4105  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 66 percent Ns"));
4106  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4107  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4108  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4109  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4110  AddChromosomeNoLocation(expected_errors, entry);
4111  eval = validator.Validate(seh, options);
4112  CheckErrors(*eval, expected_errors);
4113 
4114  CLEAR_ERRORS
4115 }
4116 
4117 
// Checks that Ns placed directly next to a gap in a delta sequence are
// reported. The first literal is made to end in "NNN" and the last literal to
// begin with "NNN"; one InternalNsAdjacentToGap error is expected for each
// side, plus ContigsTooShort.
// NOTE(review): listing lines 4121 and 4125 are absent from this view; the
// construction of `entry` (presumably a delta sequence with a gap between
// the two literals) and the validator setup presumably live there.
4118 BOOST_AUTO_TEST_CASE(Test_InternalNsAdjacentToGap)
4119 {
4120  // prepare entry
4122  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("ATGATGATGNNN");
4123  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNATGATGATG");
4124 
4126 
4127  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ContigsTooShort", "Maximum contig length is 9 bases"));
4128  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InternalNsAdjacentToGap", "Ambiguous residue N is adjacent to a gap around position 13"));
4129  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InternalNsAdjacentToGap", "Ambiguous residue N is adjacent to a gap around position 23"));
4130 // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
4131 // "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4132 // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
4133 // "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4134  // AddChromosomeNoLocation(expected_errors, entry);
4135 
4136  eval = validator.Validate(seh, options);
4137  CheckErrors(*eval, expected_errors);
4138 
4139  CLEAR_ERRORS
4140 }
4141 
// Checks that a delta component whose location refers to gi 0 is reported.
// The first delta component is turned into an interval 0-11 on ZERO_GI; a
// Critical DeltaComponentIsGi0 and an Error DeltaSeqError are expected.
// NOTE(review): listing lines 4145 and 4150 are absent from this view; the
// construction of `entry` and the validator setup presumably live there.
4142 BOOST_AUTO_TEST_CASE(Test_DeltaComponentIsGi0)
4143 {
4144  // prepare entry
4146  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
4147  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(11);
4148  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetGi(ZERO_GI);
4149 
4151 
4152  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "DeltaComponentIsGi0", "Delta component is gi|0"));
4153  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "DeltaSeqError", "Unable to find far delta sequence component"));
4154  // AddChromosomeNoLocation(expected_errors, entry);
4155 
4156  eval = validator.Validate(seh, options);
4157  CheckErrors(*eval, expected_errors);
4158 
4159  CLEAR_ERRORS
4160 }
4161 
4162 
// Checks that a '-' character inside raw IUPAC nucleotide data is reported.
// The '-' is the 27th character of the sequence string; a Critical
// InvalidResidue (quoting position 27) and a Warning InternalGapsInSeqRaw
// are expected.
// NOTE(review): listing lines 4166 and 4169 are absent from this view; the
// construction of `entry` and the validator setup presumably live there.
4163 BOOST_AUTO_TEST_CASE(Test_InternalGapsInSeqRaw)
4164 {
4165  // prepare entry
4167  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGGCCAAAATTGGCCAAAATTGG-CAAAATTGGCCAAAATTGGCCAAAATTGGCCAA");
4168 
4170 
4171  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue '-' at position [27]"));
4172  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalGapsInSeqRaw", "Raw nucleotide should not contain gap characters"));
4173  // AddChromosomeNoLocation(expected_errors, entry);
4174 
4175  eval = validator.Validate(seh, options);
4176  CheckErrors(*eval, expected_errors);
4177 
4178  CLEAR_ERRORS
4179 }
4180 
4181 
// Checks that a delta component pointing back at the sequence itself is
// reported. The first delta component is turned into an interval 0-11 on
// local id "good", the same id under which the expected errors are reported
// ("lcl|good"). A Critical SelfReferentialSequence and a Warning
// InstantiatedGapMismatch are expected.
// NOTE(review): listing lines 4185 and 4190 are absent from this view; the
// construction of `entry` and the validator setup presumably live there.
4182 BOOST_AUTO_TEST_CASE(Test_SelfReferentialSequence)
4183 {
4184  // prepare entry
4186  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
4187  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(11);
4188  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetLocal().SetStr("good");
4189 
4191 
4192  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "SelfReferentialSequence", "Self-referential delta sequence"));
4193  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InstantiatedGapMismatch", "Exception 4 in GapByGapInst"));
4194  // AddChromosomeNoLocation(expected_errors, entry);
4195 
4196  eval = validator.Validate(seh, options);
4197  CheckErrors(*eval, expected_errors);
4198 
4199  CLEAR_ERRORS
4200 }
4201 
4202 
// Checks that a delta component of type "whole" is reported: the first delta
// component's location is set to the whole of gb|AY123456| and a single
// WholeComponent error is expected.
// NOTE(review): the header of this block (listing line 4203) is missing from
// this view -- presumably a Boost test case declaration for WholeComponent;
// listing lines 4206 and 4210 (entry construction / validator setup,
// presumably) are missing as well. Confirm against the full file.
4204 {
4205  // prepare entry
4207  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetWhole().SetGenbank().SetAccession("AY123456");
4208  entry->SetSeq().SetInst().SetLength(507);
4209 
4211 
4212  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WholeComponent", "Delta seq component should not be of type whole"));
4213  // AddChromosomeNoLocation(expected_errors, entry);
4214 
4215  eval = validator.Validate(seh, options);
4216  CheckErrors(*eval, expected_errors);
4217 
4218  CLEAR_ERRORS
4219 }
4220 
4221 
// Helper body operating on `seq`: replaces the first Seq-id with a general id
// (db "a", tag "b"), appends a second, local id "x", and re-points the
// location of the first feature in the first annotation's feature table at
// the new general id.
// NOTE(review): the signature (listing line 4222) is missing from this view;
// judging by the calls `s_AddGeneralAndLocal(entry->SetSeq())` elsewhere in
// this file, this is presumably that function -- confirm.
4223 {
4224  CRef<CSeq_id> gnl(new CSeq_id());
4225  gnl->SetGeneral().SetDb("a");
4226  gnl->SetGeneral().SetTag().SetStr("b");
      // Overwrite the existing first id in place rather than adding a new one.
4227  seq.SetId().front()->Assign(*gnl);
4228  CRef<CSeq_id> lcl(new CSeq_id());
4229  lcl->SetLocal().SetStr("x");
4230  seq.SetId().push_back(lcl);
      // Keep the first feature's location consistent with the replaced id.
4231  seq.SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().Assign(*gnl);
4232 }
4233 
4234 
// Checks the ProteinsHaveGeneralID report.
// Phase 1: a single sequence given a general id plus a local id is validated
//          with no expected errors.
// Phase 2: the same id change is applied to `prot`, the `cds` product is
//          pointed at the general id (db "a", tag "b"), and an Info-level
//          ProteinsHaveGeneralID is expected on "lcl|nuc".
// NOTE(review): listing lines 4238, 4241, 4250, 4251 and 4254 are absent from
// this view; they presumably construct `entry`, set up the validator, rebuild
// `entry` as a nuc-prot set and obtain `prot` and `cds` -- confirm.
4235 BOOST_AUTO_TEST_CASE(Test_ProteinsHaveGeneralID)
4236 {
4237  // prepare entry
4239  s_AddGeneralAndLocal(entry->SetSeq());
4240 
4242 
4243  // no error unless part of nuc-prot set
4244  // AddChromosomeNoLocation(expected_errors,entry);
4245  eval = validator.Validate(seh, options);
4246  CheckErrors(*eval, expected_errors);
4247  CLEAR_ERRORS
4248 
4249  scope.RemoveTopLevelSeqEntry(seh);
4252  s_AddGeneralAndLocal(prot->SetSeq());
4253 
      // The coding region's product must reference the protein by its new general id.
4255  cds->SetProduct().SetWhole().SetGeneral().SetDb("a");
4256  cds->SetProduct().SetWhole().SetGeneral().SetTag().SetStr("b");
4257  seh = scope.AddTopLevelSeqEntry(*entry);
4258 
4259  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "ProteinsHaveGeneralID", "INDEXER_ONLY - Protein bioseqs have general seq-id."));
4260  // AddChromosomeNoLocation(expected_errors, entry);
4261 
4262  eval = validator.Validate(seh, options);
4263  CheckErrors(*eval, expected_errors);
4264 
4265  CLEAR_ERRORS
4266 }
4267 
4268 
// Checks the N-content reports on a TSA RNA sequence of length 100, both via
// full validation and via CValidator-style GetTSANStretchErrors calls.
// Scenarios, in order:
//   1. a single internal run of Ns: only HighNContentPercent (11 percent);
//   2. a longer internal run: HighNContentPercent (16 percent) plus
//      HighNContentStretch; GetTSANStretchErrors is expected to return only
//      the HighNContentStretch item, for both the handle and the Bioseq;
//   3. runs of Ns near both ends: HighNpercent5Prime/3Prime (Info),
//      HighNContentPercent (20 percent) and HighNcontent5Prime/3Prime;
//      GetTSANStretchErrors is expected to return only the latter two;
//   4. an entry extended with a length-10 gap and a trailing literal: all
//      expectations are commented out.
// NOTE(review): listing lines 4272, 4276, 4279 and 4333 are absent from this
// view; they presumably construct/replace `entry` and set up the validator.
4269 BOOST_AUTO_TEST_CASE(Test_HighNContentPercent_and_HighNContentStretch)
4270 {
4271  // prepare entry
4273  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCNNNNNNNNNNNAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTT");
4274  entry->SetSeq().SetInst().SetLength(100);
4275  SetTech(entry, CMolInfo::eTech_tsa);
4277  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
4278 
4280 
4281  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 11 percent Ns"));
4282  // AddChromosomeNoLocation(expected_errors, entry);
4283  eval = validator.Validate(seh, options);
4284  CheckErrors(*eval, expected_errors);
4285 
      // Scenario 2: reuse the first expectation with an updated message and
      // add the stretch warning.
4286  scope.RemoveTopLevelSeqEntry(seh);
4287  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCNNNNNNNNNNNNNNNNTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTT");
4288  seh = scope.AddTopLevelSeqEntry(*entry);
4289  expected_errors[0]->SetErrMsg("Sequence contains 16 percent Ns");
4290  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentStretch", "Sequence has a stretch of 16 Ns"));
4291  eval = validator.Validate(seh, options);
4292  CheckErrors(*eval, expected_errors);
4293 
4294  CLEAR_ERRORS
4295 
      // The same expectation list is checked against both GetTSANStretchErrors
      // overloads used here (sequence handle and Bioseq).
4296  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentStretch", "Sequence has a stretch of 16 Ns"));
4297  eval = validator.GetTSANStretchErrors(seh);
4298  CheckErrors(*eval, expected_errors);
4299  eval = validator.GetTSANStretchErrors(entry->GetSeq());
4300  CheckErrors(*eval, expected_errors);
4301 
4302  CLEAR_ERRORS
4303 
      // Scenario 3: Ns moved close to the 5' and 3' ends.
4304  scope.RemoveTopLevelSeqEntry(seh);
4305  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AANNNNNNNNNNGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGTTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCNNNNNNNNNNAAA");
4306  seh = scope.AddTopLevelSeqEntry(*entry);
4307  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4308  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4309  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4310  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4311  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent",
4312  "Sequence contains 20 percent Ns"));
4313  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent5Prime",
4314  "Sequence has a stretch of at least 10 Ns within the first 20 bases"));
4315  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent3Prime",
4316  "Sequence has a stretch of at least 10 Ns within the last 20 bases"));
4317  // AddChromosomeNoLocation(expected_errors, entry);
4318  eval = validator.Validate(seh, options);
4319  CheckErrors(*eval, expected_errors);
4320 
4321  CLEAR_ERRORS
4322 
4323  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent5Prime", "Sequence has a stretch of at least 10 Ns within the first 20 bases"));
4324  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent3Prime", "Sequence has a stretch of at least 10 Ns within the last 20 bases"));
4325  eval = validator.GetTSANStretchErrors(seh);
4326  CheckErrors(*eval, expected_errors);
4327  eval = validator.GetTSANStretchErrors(entry->GetSeq());
4328  CheckErrors(*eval, expected_errors);
4329 
4330  CLEAR_ERRORS
4331 
      // Scenario 4: append a length-10 gap segment and a 10-base literal, and
      // grow the declared length by the 20 positions just added.
4332  scope.RemoveTopLevelSeqEntry(seh);
4334  CRef<CDelta_seq> gap_seg(new CDelta_seq());
4335  gap_seg->SetLiteral().SetSeq_data().SetGap();
4336  gap_seg->SetLiteral().SetLength(10);
4337  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
4338  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGA", CSeq_inst::eMol_dna);
4339  entry->SetSeq().SetInst().SetLength(entry->GetSeq().GetInst().GetLength() + 20);
4340  seh = scope.AddTopLevelSeqEntry(*entry);
4341 
4342  /*
4343  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
4344  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4345  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
4346  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4347  */
4348  // AddChromosomeNoLocation(expected_errors, entry);
4349 
4350  eval = validator.Validate(seh, options);
4351  CheckErrors(*eval, expected_errors);
4352 
4353  CLEAR_ERRORS
4354 }
4355 
4356 
// Checks that a zero-length Seq-literal inside a delta chain is reported.
// The second delta segment is given empty IUPAC nucleotide data and length 0,
// the overall length is set to 24, and a single SeqLitDataLength0 error is
// expected.
// NOTE(review): listing lines 4360 and 4369 are absent from this view; the
// construction of `entry` and the validator setup presumably live there.
4357 BOOST_AUTO_TEST_CASE(Test_SeqLitDataLength0)
4358 {
4359  // prepare entry
4361 
4362  CDelta_ext::Tdata::iterator seg_it = entry->SetSeq().SetInst().SetExt().SetDelta().Set().begin();
      // Advance to the second segment of the delta chain.
4363  ++seg_it;
4364  (*seg_it)->SetLiteral().SetSeq_data().SetIupacna().Set();
4365  (*seg_it)->SetLiteral().SetLength(0);
4366 
4367  entry->SetSeq().SetInst().SetLength(24);
4368 
4370 
4371  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLitDataLength0", "Seq-lit of length 0 in delta chain"));
4372  // AddChromosomeNoLocation(expected_errors, entry);
4373  eval = validator.Validate(seh, options);
4374  CheckErrors(*eval, expected_errors);
4375 
4376  CLEAR_ERRORS
4377 }
4378 
4379 
// Helper body that turns `entry` into a delta sequence and returns it:
// a 12-base literal, a gap literal of length 101 whose fuzz is "lim unk",
// and another 12-base literal, for a declared total length of 125
// (12 + 101 + 12).
// NOTE(review): the signature (listing line 4380) and listing line 4382
// (presumably the construction of `entry`) are missing from this view --
// confirm the function name and return type against the full file.
4381 {
4383 
      // Drop the raw sequence data before switching the representation to delta.
4384  entry->SetSeq().SetInst().ResetSeq_data();
4385  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
4386  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("ATGATGATGCCC", CSeq_inst::eMol_dna);
4387  CRef<CDelta_seq> gap_seg(new CDelta_seq());
      // Length 101 combined with unknown-limit fuzz: a gap flagged as unknown
      // length whose length is not 100.
4388  gap_seg->SetLiteral().SetLength(101);
4389  gap_seg->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
4390  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
4391  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGATG", CSeq_inst::eMol_dna);
4392  entry->SetSeq().SetInst().SetLength(125);
4393 
4394  return entry;
4395 }
4396 
4397 
// Checks that the validator emits a single UnknownLengthGapNot100 warning
// ("Gap of unknown length should have length 100") for the entry under test.
// NOTE(review): listing lines 4400 and 4402 are absent from this view; they
// presumably build `entry` (likely a delta sequence with an unknown-length
// gap whose length is not 100) and set up the validator -- confirm.
4398 BOOST_AUTO_TEST_CASE(Test_UnknownLengthGapNot100)
4399 {
4401 
4403 
4404  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnknownLengthGapNot100", "Gap of unknown length should have length 100"));
4405  /*
4406  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
4407  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4408  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
4409  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4410  */
4411  // AddChromosomeNoLocation(expected_errors, entry);
4412  eval = validator.Validate(seh, options);
4413  CheckErrors(*eval, expected_errors);
4414 
4415  CLEAR_ERRORS
4416 }
4417 
4418 
// Checks the mRNAshouldBeSingleStranded report on an RNA sequence.
// With the strand set to double, mixed or other, the same Error is expected
// each time; with the strand set to not_set, reset entirely, or single, the
// validation result is checked against the expectation list as it stands
// after CLEAR_ERRORS (per the in-code note, no errors are expected).
// NOTE(review): the header of this block (listing line 4419) is missing from
// this view, as are listing lines 4422, 4424 and 4427 (presumably entry
// construction, a biomol setting and validator setup) -- confirm.
4420 {
4421  // prepare entry
4423  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
4425  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ds);
4426 
4428 
4429  // double strand
4430  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "mRNAshouldBeSingleStranded", "mRNA should be single stranded not double stranded"));
4431  // AddChromosomeNoLocation(expected_errors, entry);
4432  eval = validator.Validate(seh, options);
4433  CheckErrors(*eval, expected_errors);
4434 
4435  // mixed strand
4436  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_mixed);
4437  eval = validator.Validate(seh, options);
4438  CheckErrors(*eval, expected_errors);
4439 
4440  // other strand
4441  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_other);
4442  eval = validator.Validate(seh, options);
4443  CheckErrors(*eval, expected_errors);
4444 
4445  CLEAR_ERRORS
4446 
4447  // these should not produce errors
4448 
4449  // strand not set
4450  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_not_set);
4451  eval = validator.Validate(seh, options);
4452  // AddChromosomeNoLocation(expected_errors, entry);
4453 
4454  CheckErrors(*eval, expected_errors);
4455 
      // strand field removed altogether
4456  entry->SetSeq().SetInst().ResetStrand();
4457  eval = validator.Validate(seh, options);
4458  CheckErrors(*eval, expected_errors);
4459 
4460  // single strand
4461  entry->SetSeq().SetInst().SetStrand(CSeq_inst::eStrand_ss);
4462  eval = validator.Validate(seh, options);
4463  CheckErrors(*eval, expected_errors);
4464 
4465  CLEAR_ERRORS
4466 }
4467 
4468 
// Checks the BioSourceMissing report on a set-level entry.
// A source is added to the first member of the set only; a Warning
// BioSourceMissing (reported on "lcl|nuc") and a Fatal NoOrgFound (reported
// on "lcl|prot") are expected.
// NOTE(review): listing lines 4472, 4473 and 4476 are absent from this view;
// they presumably build `entry` as a nuc-prot set without a set-level source
// and set up the validator -- confirm against the full file.
4469 BOOST_AUTO_TEST_CASE(Test_BioSourceMissing)
4470 {
4471  // prepare entry
4474  unit_test_util::AddGoodSource(entry->SetSet().SetSeq_set().front());
4475 
4477 
4478  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BioSourceMissing", "Nuc-prot set does not contain expected BioSource descriptor"));
4479  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Fatal, "NoOrgFound", "No organism name included in the source. Other qualifiers may exist."));
4480  // AddChromosomeNoLocation(expected_errors, entry);
4481 
4482  eval = validator.Validate(seh, options);
4483  CheckErrors(*eval, expected_errors);
4484 
4485  CLEAR_ERRORS
4486 }
4487 
4488 
4489 BOOST_AUTO_TEST_CASE(Test_Descr_InvalidForType)
4490 {
4491  // prepare entry
4493  CRef<CSeqdesc> desc;
4494  desc.Reset(new CSeqdesc());
4496  entry->SetDescr().Set().push_back(desc);
4497  desc.Reset(new CSeqdesc());
4498  desc->SetModif().push_back(eGIBB_mod_dna);
4499  entry->SetDescr().Set().push_back(desc);
4500  desc.Reset(new CSeqdesc());
4502  entry->SetDescr().Set().push_back(desc);
4503  desc.Reset(new CSeqdesc());
4504  desc->SetOrg().SetTaxname("Sebaea microphylla");
4505  entry->SetDescr().Set().push_back(desc);
4506  AddTpaAssemblyUserObject(entry);
4507 
4509 
4510  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide",
4511  "Nucleic acid with protein sequence method"));
4512  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4513  "MolType descriptor is obsolete"));
4514  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4515  "Modif descriptor is obsolete"));
4516  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4517  "Method descriptor is obsolete"));
4518  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4519  "OrgRef descriptor is obsolete"));
4520  // AddChromosomeNoLocation(expected_errors, entry);
4521 
4522  // won't complain about TPA assembly if only local ID
4523  eval = validator.Validate(seh, options);
4524  CheckErrors(*eval, expected_errors);
4525 
4526  CLEAR_ERRORS
4527 
4528  scope.RemoveTopLevelSeqEntry(seh);
4529  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
4534  seh = scope.AddTopLevelSeqEntry(*entry);
4535  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TPAassemblyWithoutTPAKeyword",
4536  "Non-TPA record gb|AY123456| should not have TpaAssembly object"));
4537  // AddChromosomeNoLocation(expected_errors, entry);
4538  SetErrorsAccessions(expected_errors, "gb|AY123456|");
4539  eval = validator.Validate(seh, options);
4540  CheckErrors(*eval, expected_errors);
4541 
4542  scope.RemoveTopLevelSeqEntry(seh);
4543  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
4544  seh = scope.AddTopLevelSeqEntry(*entry);
4545  SetErrorsAccessions(expected_errors, "ref|NC_123456|");
4546  expected_errors[0]->SetErrMsg("Non-TPA record ref|NC_123456| should not have TpaAssembly object");
4547  eval = validator.Validate(seh, options);
4548  CheckErrors(*eval, expected_errors);
4549 
4550  desc.Reset(new CSeqdesc());
4552  entry->SetDescr().Set().push_back(desc);
4553  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "InvalidForTypeGIBB",
4554  "Nucleic acid with GIBB-mol = peptide"));
4555  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "InvalidForType",
4556  "MolType descriptor is obsolete"));
4557  eval = validator.Validate(seh, options);
4558  CheckErrors(*eval, expected_errors);
4559 
4561  expected_errors[1]->SetErrMsg("GIBB-mol unknown or other used");
4562  eval = validator.Validate(seh, options);
4563  CheckErrors(*eval, expected_errors);
4564 
4566  eval = validator.Validate(seh, options);
4567  CheckErrors(*eval, expected_errors);
4568 
4569  CLEAR_ERRORS
4570 
4571  scope.RemoveTopLevelSeqEntry(seh);
4573  desc.Reset(new CSeqdesc());
4575  entry->SetDescr().Set().push_back(desc);
4576  seh = scope.AddTopLevelSeqEntry(*entry);
4577  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4578  "GIBB-mol [1] used on protein"));
4579  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4580  "MolType descriptor is obsolete"));
4581  // AddChromosomeNoLocation(expected_errors, entry);
4582  eval = validator.Validate(seh, options);
4583  CheckErrors(*eval, expected_errors);
4584 
4586  expected_errors[0]->SetErrMsg("GIBB-mol [2] used on protein");
4587  eval = validator.Validate(seh, options);
4588  CheckErrors(*eval, expected_errors);
4589 
4590  desc->SetMol_type(eGIBB_mol_mRNA);
4591  expected_errors[0]->SetErrMsg("GIBB-mol [3] used on protein");
4592  eval = validator.Validate(seh, options);
4593  CheckErrors(*eval, expected_errors);
4594 
4595  desc->SetMol_type(eGIBB_mol_rRNA);
4596  expected_errors[0]->SetErrMsg("GIBB-mol [4] used on protein");
4597  eval = validator.Validate(seh, options);
4598  CheckErrors(*eval, expected_errors);
4599 
4600  desc->SetMol_type(eGIBB_mol_tRNA);
4601  expected_errors[0]->SetErrMsg("GIBB-mol [5] used on protein");
4602  eval = validator.Validate(seh, options);
4603  CheckErrors(*eval, expected_errors);
4604 
4606  expected_errors[0]->SetErrMsg("GIBB-mol [6] used on protein");
4607  eval = validator.Validate(seh, options);
4608  CheckErrors(*eval, expected_errors);
4609 
4611  expected_errors[0]->SetErrMsg("GIBB-mol [7] used on protein");
4612  eval = validator.Validate(seh, options);
4613  CheckErrors(*eval, expected_errors);
4614 
4616  expected_errors[0]->SetErrMsg("GIBB-mol [9] used on protein");
4617  eval = validator.Validate(seh, options);
4618  CheckErrors(*eval, expected_errors);
4619 
4621  expected_errors[0]->SetErrMsg("GIBB-mol [10] used on protein");
4622  eval = validator.Validate(seh, options);
4623  CheckErrors(*eval, expected_errors);
4624 
4625  CLEAR_ERRORS
4626 
4627  // invalid modif
4628  desc->SetModif().push_back(eGIBB_mod_dna);
4629  desc->SetModif().push_back(eGIBB_mod_rna);
4630  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4631  "Nucleic acid GIBB-mod [0] on protein"));
4632  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4633  "Nucleic acid GIBB-mod [1] on protein"));
4634  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4635  "Modif descriptor is obsolete"));
4636  // AddChromosomeNoLocation(expected_errors, entry);
4637  eval = validator.Validate(seh, options);
4638  CheckErrors(*eval, expected_errors);
4639 
4640  CLEAR_ERRORS
4641 
4642  scope.RemoveTopLevelSeqEntry(seh);
4643  entry = unit_test_util::BuildGoodSeq();
4644  for (auto& it : entry->SetSeq().SetDescr().Set()) {
4645  if (it->IsSource()) {
4646  it->SetSource().SetOrigin(CBioSource::eOrigin_synthetic);
4647  }
4648  }
4649  seh = scope.AddTopLevelSeqEntry(*entry);
4650  // if biomol not other, should generate error
4651  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InvalidForType",
4652  "Molinfo-biomol other should be used if Biosource-location is synthetic"));
4653  // AddChromosomeNoLocation(expected_errors, entry);
4654  eval = validator.Validate(seh, options);
4655  CheckErrors(*eval, expected_errors);
4656 
4657  CLEAR_ERRORS
4658 
4659  for (auto& it : entry->SetSeq().SetDescr().Set()) {
4660  if (it->IsSource()) {
4661  it->SetSource().ResetOrigin();
4662  }
4663  }
4664 
4666  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidMolInfo",
4667  "Nucleic acid with Molinfo = peptide"));
4668  // AddChromosomeNoLocation(expected_errors, entry);
4669  eval = validator.Validate(seh, options);
4670  CheckErrors(*eval, expected_errors);
4671  CLEAR_ERRORS
4672 
4674  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
4675  "MoltypeOtherGenetic", "Molinfo-biomol = other genetic"));
4676  // AddChromosomeNoLocation(expected_errors, entry);
4677  eval = validator.Validate(seh, options);
4678  CheckErrors(*eval, expected_errors);
4679  CLEAR_ERRORS
4680 
4682  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
4683  "MoltypeUnknown", "Molinfo-biomol unknown used"));
4684  // AddChromosomeNoLocation(expected_errors, entry);
4685  eval = validator.Validate(seh, options);
4686  CheckErrors(*eval, expected_errors);
4687  CLEAR_ERRORS
4688 
4690  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
4691  "MoltypeOther", "Molinfo-biomol other used"));
4692  // AddChromosomeNoLocation(expected_errors, entry);
4693  eval = validator.Validate(seh, options);
4694  CheckErrors(*eval, expected_errors);
4695  CLEAR_ERRORS
4696 
4697  scope.RemoveTopLevelSeqEntry(seh);
4699  seh = scope.AddTopLevelSeqEntry(*entry);
4700 
4701  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
4702  "InvalidForType", "Molinfo-biomol [1] used on protein"));
4703  // AddChromosomeNoLocation(expected_errors, entry);
4705  expected_errors[0]->SetErrMsg("Molinfo-biomol [1] used on protein");
4706  eval = validator.Validate(seh, options);
4707  CheckErrors(*eval, expected_errors);
4708 
4710  expected_errors[0]->SetErrMsg("Molinfo-biomol [2] used on protein");
4711  eval = validator.Validate(seh, options);
4712  CheckErrors(*eval, expected_errors);
4713 
4715  expected_errors[0]->SetErrMsg("Molinfo-biomol [3] used on protein");
4716  eval = validator.Validate(seh, options);
4717  CheckErrors(*eval, expected_errors);
4718 
4720  expected_errors[0]->SetErrMsg("Molinfo-biomol [4] used on protein");
4721  eval = validator.Validate(seh, options);
4722  CheckErrors(*eval, expected_errors);
4723 
4725  expected_errors[0]->SetErrMsg("Molinfo-biomol [5] used on protein");
4726  eval = validator.Validate(seh, options);
4727  CheckErrors(*eval, expected_errors);
4728 
4730  expected_errors[0]->SetErrMsg("Molinfo-biomol [6] used on protein");
4731  eval = validator.Validate(seh, options);
4732  CheckErrors(*eval, expected_errors);
4733 
4735  expected_errors[0]->SetErrMsg("Molinfo-biomol [7] used on protein");
4736  eval = validator.Validate(seh, options);
4737  CheckErrors(*eval, expected_errors);
4738 
4740  expected_errors[0]->SetErrMsg("Molinfo-biomol [10] used on protein");
4741  eval = validator.Validate(seh, options);
4742  CheckErrors(*eval, expected_errors);
4743 
4745  expected_errors[0]->SetErrMsg("Molinfo-biomol [11] used on protein");
4746  eval = validator.Validate(seh, options);
4747  CheckErrors(*eval, expected_errors);
4748 
4750  expected_errors[0]->SetErrMsg("Molinfo-biomol [12] used on protein");
4751  eval = validator.Validate(seh, options);
4752  CheckErrors(*eval, expected_errors);
4753 
4755  expected_errors[0]->SetErrMsg("Molinfo-biomol [13] used on protein");
4756  eval = validator.Validate(seh, options);
4757  CheckErrors(*eval, expected_errors);
4758 
4760  expected_errors[0]->SetErrMsg("Molinfo-biomol [14] used on protein");
4761  eval = validator.Validate(seh, options);
4762  CheckErrors(*eval, expected_errors);
4763 
4765  expected_errors[0]->SetErrMsg("Molinfo-biomol [15] used on protein");
4766  eval = validator.Validate(seh, options);
4767  CheckErrors(*eval, expected_errors);
4768 
4769  CLEAR_ERRORS
4770 
4771  scope.RemoveTopLevelSeqEntry(seh);
4772  entry = unit_test_util::BuildGoodSeq();
4773  seh = scope.AddTopLevelSeqEntry(*entry);
4775  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SyntheticConstructWrongMolType",
4776  "synthetic construct should have other-genetic"));
4777  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SyntheticConstructNeedsArtificial",
4778  "synthetic construct should have artificial origin"));
4779  // AddChromosomeNoLocation(expected_errors, entry);
4780  eval = validator.Validate(seh, options);
4781  CheckErrors(*eval, expected_errors);
4782 
4783  CLEAR_ERRORS
4784 
4786 
4788  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide",
4789  "Nucleic acid with protein sequence method"));
4790  // AddChromosomeNoLocation(expected_errors, entry);
4791  eval = validator.Validate(seh, options);
4792  CheckErrors(*eval, expected_errors);
4793 
4795  eval = validator.Validate(seh, options);
4796  CheckErrors(*eval, expected_errors);
4797 
4798  SetTech(entry, CMolInfo::eTech_both);
4799  eval = validator.Validate(seh, options);
4800  CheckErrors(*eval, expected_errors);
4801 
4803  eval = validator.Validate(seh, options);
4804  CheckErrors(*eval, expected_errors);
4805 
4807  eval = validator.Validate(seh, options);
4808  CheckErrors(*eval, expected_errors);
4809 
4811  eval = validator.Validate(seh, options);
4812  CheckErrors(*eval, expected_errors);
4813 
4814  CLEAR_ERRORS
4815 
4816  scope.RemoveTopLevelSeqEntry(seh);
4818  seh = scope.AddTopLevelSeqEntry(*entry);
4819  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
4820  "NucleotideTechniqueOnProtein", "Protein with nucleic acid sequence method"));
4821  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ESTshouldBemRNA",
4822  "EST sequence should be mRNA"));
4823 
4824  // AddChromosomeNoLocation(expected_errors, entry);
4825  SetTech(entry, CMolInfo::eTech_est);
4826  eval = validator.Validate(seh, options);
4827  CheckErrors(*eval, expected_errors);
4828 
4829  CLEAR_ERRORS
4830 
4831  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NucleotideTechniqueOnProtein",
4832  "Protein with nucleic acid sequence method"));
4833  // AddChromosomeNoLocation(expected_errors, entry);
4835  eval = validator.Validate(seh, options);
4836  CheckErrors(*eval, expected_errors);
4837 
4839  eval = validator.Validate(seh, options);
4840  CheckErrors(*eval, expected_errors);
4841 
4843  eval = validator.Validate(seh, options);
4844  CheckErrors(*eval, expected_errors);
4845 
4846  SetTech(entry, CMolInfo::eTech_htc);
4847  eval = validator.Validate(seh, options);
4848  CheckErrors(*eval, expected_errors);
4849 
4850  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "HTGS_STS_GSS_WGSshouldBeGenomic",
4851  "HTGS/STS/GSS/WGS sequence should be genomic"));
4852  SetTech(entry, CMolInfo::eTech_sts);
4853  eval = validator.Validate(seh, options);
4854  CheckErrors(*eval, expected_errors);
4855 
4857  eval = validator.Validate(seh, options);
4858  CheckErrors(*eval, expected_errors);
4859 
4861  eval = validator.Validate(seh, options);
4862  CheckErrors(*eval, expected_errors);
4863 
4865  eval = validator.Validate(seh, options);
4866  CheckErrors(*eval, expected_errors);
4867 
4869  eval = validator.Validate(seh, options);
4870  CheckErrors(*eval, expected_errors);
4871 
4872  SetTech(entry, CMolInfo::eTech_wgs);
4873  eval = validator.Validate(seh, options);
4874  AddChromosomeNoLocation(expected_errors, entry);
4875  CheckErrors(*eval, expected_errors);
4876 
4877  CLEAR_ERRORS
4878 
4879  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq",
4880  "HTGS 2 raw seq has no gaps and no graphs"));
4881  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NucleotideTechniqueOnProtein",
4882  "Protein with nucleic acid sequence method"));
4883  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "HTGS_STS_GSS_WGSshouldBeGenomic",
4884  "HTGS/STS/GSS/WGS sequence should be genomic"));
4885 
4886  // AddChromosomeNoLocation(expected_errors, entry);
4888  eval = validator.Validate(seh, options);
4889  CheckErrors(*eval, expected_errors);
4890 
4891  CLEAR_ERRORS
4892 
4893  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NoKeywordHasTechnique",
4894  "Molinfo.tech barcode without BARCODE keyword"));
4895  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NucleotideTechniqueOnProtein",
4896  "Protein with nucleic acid sequence method"));
4897 
4898  // AddChromosomeNoLocation(expected_errors, entry);
4900  eval = validator.Validate(seh, options);
4901  CheckErrors(*eval, expected_errors);
4902 
4903  CLEAR_ERRORS
4904 }
4905 
4906 
// Test_Descr_Unknown: attaches a Modif descriptor holding eGIBB_mod_other to
// the entry and expects two Error-level findings on lcl|good: "InvalidForType"
// (the Modif descriptor is obsolete) and "Unknown" (GIBB-mod = other used).
// NOTE(review): listing lines 4910 and 4915 are absent from this extract;
// `entry` and the fixture objects used below (seh, validator, options, eval,
// expected_errors) are presumably set up there - confirm against the repository.
4907 BOOST_AUTO_TEST_CASE(Test_Descr_Unknown)
4908 {
4909  // prepare entry
4911  CRef<CSeqdesc> desc(new CSeqdesc());
4912  desc->SetModif().push_back(eGIBB_mod_other);
4913  entry->SetDescr().Set().push_back(desc);
4914 
4916 
      // One expectation for the obsolete descriptor type, one for the "other" value.
4917  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4918  "Modif descriptor is obsolete"));
4919  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Unknown",
4920  "GIBB-mod = other used"));
4921 
4922  // AddChromosomeNoLocation(expected_errors, entry);
4923  eval = validator.Validate(seh, options);
4924  CheckErrors(*eval, expected_errors);
4925 
4926  CLEAR_ERRORS
4927 }
4928 
4929 
// NOTE(review): the signature (listing line 4930) and the declaration of `set`
// (listing line 4932) are absent from this extract. This is presumably the
// MakeGps helper called from Test_Descr_NoPubFound - confirm against the
// repository. The visible body marks `set` as a gen-prod-set, appends
// `member` to its Seq-set, and returns `set`.
4931 {
4933  set->SetSet().SetClass(CBioseq_set::eClass_gen_prod_set);
4934  set->SetSet().SetSeq_set().push_back(member);
4935  return set;
4936 }
4937 
4938 
// Test_Descr_NoPubFound: checks when "NoPubFound" / "MissingPubRequirement"
// are expected across five scenarios: the initial entry, the same entry with
// a gpipe id, the entry wrapped by MakeGps, an entry where only the first
// member has a publication, and that entry with an NC_ accession plus WGS tech.
// NOTE(review): listing lines 4942-4943, 4945 and 4993-4994 are absent from
// this extract; `entry` and the fixture objects (scope, seh, validator,
// options, eval, expected_errors) are presumably set up there - confirm.
4939 BOOST_AUTO_TEST_CASE(Test_Descr_NoPubFound)
4940 {
4941  // prepare entry
4944 
4946 
      // Scenario 1: both the Error and the Info are expected on lcl|nuc.
4947  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NoPubFound",
4948  "No publications anywhere on this entire record."));
4949  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "MissingPubRequirement",
4950  "No submission citation anywhere on this entire record."));
4951  // AddChromosomeNoLocation(expected_errors, entry);
4952  eval = validator.Validate(seh, options);
4953  CheckErrors(*eval, expected_errors);
4954 
4955  CLEAR_ERRORS
4956 
4957  // make gpipe - should suppress error
4958  scope.RemoveTopLevelSeqEntry(seh);
4959  CRef<CSeq_id> id_suppress(new CSeq_id());
4960  id_suppress->SetGpipe().SetAccession("AY123456");
4961  entry->SetSet().SetSeq_set().front()->SetSeq().SetId().push_back(id_suppress);
4962  seh = scope.AddTopLevelSeqEntry(*entry);
      // Only the Info-level finding is expected now, reported under the gpipe id.
4963  expected_errors.push_back(new CExpectedError("gpp|AY123456|", eDiag_Info, "MissingPubRequirement",
4964  "No submission citation anywhere on this entire record."));
4965  // AddChromosomeNoLocation(expected_errors, "gpp|AY123456|");
4966  eval = validator.Validate(seh, options);
4967  CheckErrors(*eval, expected_errors);
4968 
4969  CLEAR_ERRORS
4970 
4971  // make GPS - will suppress pub errors, although it introduces GPS errors
4972  scope.RemoveTopLevelSeqEntry(seh);
      // Drop the gpipe id added above before wrapping the entry.
4973  entry->SetSet().SetSeq_set().front()->SetSeq().SetId().pop_back();
4974  CRef<CSeq_entry> gps = MakeGps(entry);
4975  seh = scope.AddTopLevelSeqEntry(*gps);
4976  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning,
4977  "GenomicProductPackagingProblem",
4978  "Nucleotide bioseq should be product of mRNA feature on contig, but is not"));
4979  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning,
4980  "GenomicProductPackagingProblem",
4981  "Protein bioseq should be product of CDS feature on contig, but is not"));
4982  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "MissingPubRequirement",
4983  "No submission citation anywhere on this entire record."));
4984 
4985  // AddChromosomeNoLocation(expected_errors, entry);
4986  eval = validator.Validate(seh, options);
4987  CheckErrors(*eval, expected_errors);
4988 
4989  CLEAR_ERRORS
4990 
4991  // only one has pub
4992  scope.RemoveTopLevelSeqEntry(seh);
      // NOTE(review): listing lines 4993-4994 are missing here; `entry` is
      // presumably rebuilt before the publication is added - confirm.
4995  unit_test_util::AddGoodPub(entry->SetSet().SetSeq_set().front());
4996  seh = scope.AddTopLevelSeqEntry(*entry);
4997  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "NoPubFound",
4998  "No publications refer to this Bioseq."));
4999  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Info, "MissingPubRequirement",
5000  "Expected submission citation is missing for this Bioseq"));
5001 
5002  // AddChromosomeNoLocation(expected_errors, entry);
5003  eval = validator.Validate(seh, options);
5004  CheckErrors(*eval, expected_errors);
5005 
5006  CLEAR_ERRORS
5007 
5008  // intermediate wgs should suppress NoPubFound
5009  scope.RemoveTopLevelSeqEntry(seh);
      // id_suppress is reused: it held a gpipe id earlier and is switched to an
      // "other" id here, which the expectations below refer to as ref|NC_123456|.
5010  id_suppress->SetOther().SetAccession("NC_123456");
5011  entry->SetSet().SetSeq_set().front()->SetSeq().SetId().push_back(id_suppress);
5012  SetTech(entry->SetSet().SetSeq_set().front(), CMolInfo::eTech_wgs);
5013  seh = scope.AddTopLevelSeqEntry(*entry);
5014 
5015  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Info, "MissingPubRequirement",
5016  "Expected submission citation is missing for this Bioseq"));
5017  AddChromosomeNoLocation(expected_errors, "ref|NC_123456|");
5018  eval = validator.Validate(seh, options);
5019  CheckErrors(*eval, expected_errors);
5020 
5021  CLEAR_ERRORS
5022 }
5023 
5024 
// Test_Descr_NoOrgFound: checks the expected "BioSourceMissing",
// "NoSourceDescriptor", "NoOrgFound" and "BioSourceOnProtein" findings as
// ids and source descriptors are added to the members of the set.
// NOTE(review): listing lines 5028-5029 and 5031 are absent from this
// extract; `entry` and the fixture objects (scope, seh, validator, options,
// eval, expected_errors) are presumably set up there - confirm.
5025 BOOST_AUTO_TEST_CASE(Test_Descr_NoOrgFound)
5026 {
5027  // prepare entry
5030 
5032 
5033  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BioSourceMissing",
5034  "Nuc-prot set does not contain expected BioSource descriptor"));
5035  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NoSourceDescriptor",
5036  "No source information included on this record."));
5037 
5038  eval = validator.Validate(seh, options);
5039  CheckErrors(*eval, expected_errors);
5040 
5041  CLEAR_ERRORS
5042 
      // From here only the BioSourceMissing warning is expected; the
      // NoSourceDescriptor error is not re-added for the patent/pdb cases.
5043  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BioSourceMissing",
5044  "Nuc-prot set does not contain expected BioSource descriptor"));
5045 
5046  // suppress if patent or pdb
5047  scope.RemoveTopLevelSeqEntry(seh);
5048  CRef<CSeq_id> id2(new CSeq_id());
5049  id2->SetPatent().SetSeqid(1);
5050  id2->SetPatent().SetCit().SetCountry("USA");
5051  id2->SetPatent().SetCit().SetId().SetNumber("1");
5052  entry->SetSet().SetSeq_set().front()->SetSeq().SetId().push_back(id2);
5053  seh = scope.AddTopLevelSeqEntry(*entry);
5054  eval = validator.Validate(seh, options);
5055  CheckErrors(*eval, expected_errors);
5056 
5057  scope.RemoveTopLevelSeqEntry(seh);
      // Turn the same id object (already attached to the first member) into a PDB id.
5058  CRef<CPDB_seq_id> pdb_id(new CPDB_seq_id());
5059  pdb_id->SetMol().Set("foo");
5060  id2->SetPdb(*pdb_id);
5061  seh = scope.AddTopLevelSeqEntry(*entry);
5062  SetErrorsAccessions(expected_errors, "pdb|foo| ");
5063  eval = validator.Validate(seh, options);
5064  CheckErrors(*eval, expected_errors);
5065 
5066  // add one source
5067  scope.RemoveTopLevelSeqEntry(seh);
      // Remove the extra id again so findings are reported under lcl|nuc.
5068  entry->SetSet().SetSeq_set().front()->SetSeq().SetId().pop_back();
5069  unit_test_util::AddGoodSource(entry->SetSet().SetSeq_set().front());
5070  seh = scope.AddTopLevelSeqEntry(*entry);
5071  SetErrorsAccessions(expected_errors, "lcl|nuc");
5072  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Fatal, "NoOrgFound",
5073  "No organism name included in the source. Other qualifiers may exist."));
5074  // AddChromosomeNoLocation(expected_errors, entry);
5075 
5076  eval = validator.Validate(seh, options);
5077  CheckErrors(*eval, expected_errors);
5078 
5079  CLEAR_ERRORS
5080 
5081  // if there is a source descriptor but no tax name, still produce error
5082  unit_test_util::AddGoodSource(entry->SetSet().SetSeq_set().back());
5083  unit_test_util::SetTaxname(entry->SetSet().SetSeq_set().back(), "");
5084  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Fatal, "NoOrgFound",
5085  "No organism name included in the source. Other qualifiers may exist."));
5086  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BioSourceOnProtein",
5087  "Nuc-prot set has 1 protein with a BioSource descriptor"));
5088  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BioSourceMissing",
5089  "Nuc-prot set does not contain expected BioSource descriptor"));
5090  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
5091  // AddChromosomeNoLocation(expected_errors, "lcl|prot");
5092  eval = validator.Validate(seh, options);
5093  CheckErrors(*eval, expected_errors);
5094 
5095  CLEAR_ERRORS
5096 }
5097 
5098 
// Test_Descr_MultipleBioSources: expects a single Error-level
// "MultipleBioSources" finding on lcl|good.
// NOTE(review): listing lines 5102-5103 and 5105 are absent from this
// extract; the entry construction (presumably adding a second source
// descriptor) and the fixture setup are not visible - confirm.
5099 BOOST_AUTO_TEST_CASE(Test_Descr_MultipleBioSources)
5100 {
5101  // prepare entry
5104 
5106 
5107  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MultipleBioSources",
5108  "Undesired multiple source descriptors"));
5109 
5110  // AddChromosomeNoLocation(expected_errors, "lcl|good");
5111  // AddChromosomeNoLocation(expected_errors, "lcl|good");
5112  eval = validator.Validate(seh, options);
5113  CheckErrors(*eval, expected_errors);
5114 
5115  CLEAR_ERRORS
5116 }
5117 
5118 
// Test_Descr_NoMolInfoFound: expects a single Error-level "NoMolInfoFound"
// finding on lcl|good.
// NOTE(review): listing lines 5122-5123 and 5125 are absent from this
// extract; the entry construction (presumably removing the Mol-info
// descriptor) and the fixture setup are not visible - confirm.
5119 BOOST_AUTO_TEST_CASE(Test_Descr_NoMolInfoFound)
5120 {
5121  // prepare entry
5124 
5126 
5127  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoMolInfoFound",
5128  "No Mol-info applies to this Bioseq"));
5129  // AddChromosomeNoLocation(expected_errors, entry);
5130 
5131  eval = validator.Validate(seh, options);
5132  CheckErrors(*eval, expected_errors);
5133 
5134  CLEAR_ERRORS
5135 }
5136 
5137 
// Test_Descr_NoTaxonID: calls SetTaxon(entry, 0) and expects a Warning-level
// "NoTaxonID" finding on lcl|good.
// NOTE(review): listing lines 5141 and 5144 are absent from this extract;
// `entry` and the fixture objects are presumably set up there - confirm.
5138 BOOST_AUTO_TEST_CASE(Test_Descr_NoTaxonID)
5139 {
5140  // prepare entry
5142  unit_test_util::SetTaxon(entry, 0);
5143 
5145 
5146  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NoTaxonID",
5147  "BioSource is missing taxon ID"));
5148  // AddChromosomeNoLocation(expected_errors, entry);
5149  eval = validator.Validate(seh, options);
5150  CheckErrors(*eval, expected_errors);
5151 
5152  CLEAR_ERRORS
5153 }
5154 
5155 
// Test_Descr_InconsistentBiosources: builds a two-member set whose members
// carry different organism names and checks the "InconsistentTaxNameSet"
// finding: Error for unrelated names, Warning for the "sp." and name-subset
// cases, and an empty expectation list in the final case.
// NOTE(review): many listing lines are absent from this extract (5160-5161,
// 5163, 5175, 5186-5187, 5189, 5199-5200, 5202, 5213-5214, 5216, 5221-5222).
// The declarations of `first` and `second`, the set class, the fixture setup
// and several per-scenario steps are not visible - confirm against the repository.
5156 BOOST_AUTO_TEST_CASE(Test_Descr_InconsistentBiosources)
5157 {
5158  // prepare entry
5159  CRef<CSeq_entry> entry(new CSeq_entry());
5162  entry->SetSet().SetSeq_set().push_back(first);
5164  second->SetSeq().SetId().front()->SetLocal().SetStr("good2");
      // The name and taxon are first cleared, then set to the values under test.
5165  unit_test_util::SetTaxname(second, "");
5166  unit_test_util::SetTaxon(second, 0);
5167  unit_test_util::SetTaxname(second, "Trichechus manatus latirostris");
5168  unit_test_util::SetTaxon(second, 127582);
5169  entry->SetSet().SetSeq_set().push_back(second);
5170 
5171  CRef<CSeqdesc> desc(new CSeqdesc());
5172  desc->SetTitle("popset title");
5173  entry->SetSet().SetDescr().Set().push_back(desc);
5174 
5176 
5177  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentTaxNameSet",
5178  "Population set contains inconsistent organism names."));
5179  // AddChromosomeNoLocation(expected_errors, "lcl|good");
5180  // AddChromosomeNoLocation(expected_errors, "lcl|good2");
5181 
5182  eval = validator.Validate(seh, options);
5183  CheckErrors(*eval, expected_errors);
5184 
5185  // warning instead of error if same up to ' sp. '
5188  unit_test_util::SetTaxname(first, "Corynebacterium sp. 979");
5190  unit_test_util::SetTaxname(second, "");
5191  unit_test_util::SetTaxon(second, 0);
5192  unit_test_util::SetTaxname(second, "Corynebacterium sp. DJ1");
5193  unit_test_util::SetTaxon(second, 632939);
5194  expected_errors[0]->SetSeverity(eDiag_Warning);
5195  eval = validator.Validate(seh, options);
5196  CheckErrors(*eval, expected_errors);
5197 
5198  // warning instead of error if one name is subset of the other
5201  unit_test_util::SetTaxname(first, "Trichechus manatus");
5203  unit_test_util::SetTaxname(second, "");
5204  unit_test_util::SetTaxon(second, 0);
5205  unit_test_util::SetTaxname(second, "Trichechus manatus latirostris");
5206  unit_test_util::SetTaxon(second, 127582);
      // Severity stays at Warning from the previous scenario.
5207  eval = validator.Validate(seh, options);
5208  CheckErrors(*eval, expected_errors);
5209 
5210  CLEAR_ERRORS
5211 
5212  // no error if not pop-set
      // NOTE(review): the step that changes the set class is presumably on one
      // of the missing listing lines (5213-5214, 5221-5222) - confirm.
5215  unit_test_util::SetTaxname(first, "Corynebacterium sp. 979");
5217  unit_test_util::SetTaxname(second, "");
5218  unit_test_util::SetTaxon(second, 0);
5219  unit_test_util::SetTaxname(second, "Trichechus manatus latirostris");
5220  unit_test_util::SetTaxon(second, 127582);
5223  // AddChromosomeNoLocation(expected_errors, "lcl|good");
5224  // AddChromosomeNoLocation(expected_errors, "lcl|good2");
5225  eval = validator.Validate(seh, options);
5226  CheckErrors(*eval, expected_errors);
5227 
5228  CLEAR_ERRORS
5229 }
5230 
5231 
// Test_Descr_MissingLineage: checks the "MissingLineage" finding and how its
// expected severity changes with the sequence id type: Error for the local
// id, Warning for EMBL and DDBJ ids, Critical for an NC_ accession with a
// taxon, and Error again (plus "NoTaxonID") once the taxon is cleared.
// NOTE(review): listing lines 5235-5236 and 5238 are absent from this
// extract; `entry` and the fixture objects (scope, seh, validator, options,
// eval, expected_errors) are presumably set up there - confirm.
5232 BOOST_AUTO_TEST_CASE(Test_Descr_MissingLineage)
5233 {
5234  // prepare entry
5237 
5239 
5240  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingLineage",
5241  "No lineage for this BioSource."));
5242  // AddChromosomeNoLocation(expected_errors, entry);
5243 
5244  eval = validator.Validate(seh, options);
5245  CheckErrors(*eval, expected_errors);
5246 
      // An explicitly empty lineage string is expected to give the same result.
5247  unit_test_util::SetLineage(entry, "");
5248  eval = validator.Validate(seh, options);
5249  CheckErrors(*eval, expected_errors);
5250 
5251  // warning if EMBL
5252  scope.RemoveTopLevelSeqEntry(seh);
5253  entry->SetSeq().SetId().front()->SetEmbl().SetAccession("B12345");
5254  seh = scope.AddTopLevelSeqEntry(*entry);
5255  expected_errors[0]->SetSeverity(eDiag_Warning);
5256  ChangeErrorAcc(expected_errors, "emb|B12345|");
5257  eval = validator.Validate(seh, options);
5258  CheckErrors(*eval, expected_errors);
5259 
5260  // warning if DDBJ
5261  scope.RemoveTopLevelSeqEntry(seh);
5262  entry->SetSeq().SetId().front()->SetDdbj().SetAccession("C12345");
5263  seh = scope.AddTopLevelSeqEntry(*entry);
      // Severity is already Warning from the EMBL step; it is set again here.
5264  expected_errors[0]->SetSeverity(eDiag_Warning);
5265  ChangeErrorAcc(expected_errors, "dbj|C12345|");
5266  eval = validator.Validate(seh, options);
5267  CheckErrors(*eval, expected_errors);
5268 
5269 
5270  // critical instead of error if refseq AND has taxon
5271  scope.RemoveTopLevelSeqEntry(seh);
5272  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
5273  seh = scope.AddTopLevelSeqEntry(*entry);
5274  expected_errors[0]->SetSeverity(eDiag_Critical);
5275  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
5276  eval = validator.Validate(seh, options);
5277  CheckErrors(*eval, expected_errors);
5278 
5279  // back to error if no taxon but refseq
5280  unit_test_util::SetTaxon(entry, 0);
5281  expected_errors[0]->SetSeverity(eDiag_Error);
      // NoTaxonID is expected at Error severity here, not the Warning used in
      // Test_Descr_NoTaxonID for a local id.
5282  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "NoTaxonID",
5283  "BioSource is missing taxon ID"));
5284  eval = validator.Validate(seh, options);
5285  CheckErrors(*eval, expected_errors);
5286 
5287  CLEAR_ERRORS
5288 }
5289 
5290 
// Test_Descr_SerialInComment: adds a comment descriptor containing a
// bracketed number ("blah blah [123456]") and expects an Info-level
// "SerialInComment" finding on lcl|good.
// NOTE(review): listing lines 5294 and 5299 are absent from this extract;
// `entry` and the fixture objects are presumably set up there - confirm.
5291 BOOST_AUTO_TEST_CASE(Test_Descr_SerialInComment)
5292 {
5293  // prepare entry
5295  CRef<CSeqdesc> comment(new CSeqdesc());
5296  comment->SetComment("blah blah [123456]");
5297  entry->SetSeq().SetDescr().Set().push_back(comment);
5298 
5300 
5301  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "SerialInComment",
5302  "Comment may refer to reference by serial number - attach reference specific comments to the reference REMARK instead."));
5303  // AddChromosomeNoLocation(expected_errors, entry);
5304 
5305  eval = validator.Validate(seh, options);
5306  CheckErrors(*eval, expected_errors);
5307 
5308  CLEAR_ERRORS
5309 }
5310 
5311 
// Test_Descr_BioSourceNeedsFocus: expects an Error-level
// "BioSourceNeedsFocus" finding on lcl|good, then expects no findings once
// focus is set on the descriptor, and again once it is marked transgenic.
// NOTE(review): listing lines 5315-5316, 5318 and 5337 are absent from this
// extract; the entry construction (presumably adding a source feature with a
// different taxname), the fixture setup and one step before SetTransgenic
// are not visible - confirm.
5312 BOOST_AUTO_TEST_CASE(Test_Descr_BioSourceNeedsFocus)
5313 {
5314  // prepare entry
5317 
5319 
5320  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BioSourceNeedsFocus",
5321  "BioSource descriptor must have focus or transgenic when BioSource feature with different taxname is present."));
5322  // AddChromosomeNoLocation(expected_errors, entry);
5323 
5324  eval = validator.Validate(seh, options);
5325  CheckErrors(*eval, expected_errors);
5326 
5327  CLEAR_ERRORS
5328 
5329  // AddChromosomeNoLocation(expected_errors, entry);
5330 
5331  // error goes away if focus is set on descriptor
5332  unit_test_util::SetFocus(entry);
5333  eval = validator.Validate(seh, options);
5334  CheckErrors(*eval, expected_errors);
5335 
5336  // error goes away if descriptor is transgenic
5338  unit_test_util::SetTransgenic(entry, true);
5339  eval = validator.Validate(seh, options);
5340  CheckErrors(*eval, expected_errors);
5341 
5342  CLEAR_ERRORS
5343 }
5344 
5345 
// Test_Descr_BadOrganelle: checks the expected "BadOrganelleLocation"
// warnings (kinetoplast, nucleomorph, macronuclear) and the related
// "TaxonomyNucleomorphProblem" / "TaxonomyPlastidsProblem" warnings.
// NOTE(review): listing lines 5349-5350, 5352, 5361, 5369, 5378-5379 and
// 5389 are absent from this extract; the steps that change the entry between
// scenarios (presumably the genome/location and organism) and the fixture
// setup are not visible - confirm against the repository.
5346 BOOST_AUTO_TEST_CASE(Test_Descr_BadOrganelle)
5347 {
5348  // prepare entry
5351 
5353 
5354  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadOrganelleLocation",
5355  "Only Kinetoplastida have kinetoplasts"));
5356  // AddChromosomeNoLocation(expected_errors, entry);
5357 
5358  eval = validator.Validate(seh, options);
5359  CheckErrors(*eval, expected_errors);
5360 
      // Reuse the first expectation with the nucleomorph message and add the
      // taxonomy-flag warning expected alongside it.
5362  expected_errors[0]->SetErrMsg("Only Chlorarachniophyceae and Cryptophyceae have nucleomorphs");
5363  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TaxonomyNucleomorphProblem",
5364  "Taxonomy lookup does not have expected nucleomorph flag"));
5365  eval = validator.Validate(seh, options);
5366  CheckErrors(*eval, expected_errors);
5367 
5368  CLEAR_ERRORS
5370  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadOrganelleLocation",
5371  "Only Ciliophora have macronuclear locations"));
5372  // AddChromosomeNoLocation(expected_errors, entry);
5373  eval = validator.Validate(seh, options);
5374  CheckErrors(*eval, expected_errors);
5375 
5376  CLEAR_ERRORS
5377 
5380  // AddChromosomeNoLocation(expected_errors, entry);
5381  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TaxonomyPlastidsProblem",
5382  "Taxonomy lookup does not have expected plastid flag"));
5383  eval = validator.Validate(seh, options);
5384  CheckErrors(*eval, expected_errors);
5385 
5386  CLEAR_ERRORS
5387 
5388  // no plastid error if flag is present
      // No expectations are pushed here, so the check is against an empty list.
5390  eval = validator.Validate(seh, options);
5391  // AddChromosomeNoLocation(expected_errors, entry);
5392  CheckErrors(*eval, expected_errors);
5393 
5394  CLEAR_ERRORS
5395 }
5396 
5397 
// Test_Descr_MultipleChromosomes: expects a Warning-level
// "MultipleSourceQualifiers" finding on lcl|good, first with the
// "identical" message after SetChromosome(entry, "1"), then with the
// "conflicting" message after SetChromosome(entry, "2").
// NOTE(review): listing lines 5401 and 5404 are absent from this extract;
// `entry` (presumably already carrying a chromosome qualifier) and the
// fixture objects are set up off-view - confirm.
5398 BOOST_AUTO_TEST_CASE(Test_Descr_MultipleChromosomes)
5399 {
5400  // prepare entry
5402  unit_test_util::SetChromosome(entry, "1");
5403 
5405 
5406  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleSourceQualifiers",
5407  "Multiple identical chromosome qualifiers present"));
5408  // AddChromosomeNoLocation(expected_errors, entry);
5409 
5410  eval = validator.Validate(seh, options);
5411  CheckErrors(*eval, expected_errors);
5412 
5413  unit_test_util::SetChromosome(entry, "2");
      // Same expectation object, updated message.
5414  expected_errors[0]->SetErrMsg("Multiple conflicting chromosome qualifiers present");
5415  eval = validator.Validate(seh, options);
5416  CheckErrors(*eval, expected_errors);
5417 
5418  CLEAR_ERRORS
5419 }
5420 
5421 
// Test_Descr_BadSubSource: adds a SubSource with subtype 0 and value "foo"
// and expects a Critical-level "BadSubSource" finding on lcl|good.
// NOTE(review): listing lines 5425 and 5428 are absent from this extract;
// `entry` and the fixture objects are presumably set up there - confirm.
5422 BOOST_AUTO_TEST_CASE(Test_Descr_BadSubSource)
5423 {
5424  // prepare entry
5426  unit_test_util::SetSubSource(entry, 0, "foo");
5427 
5429 
5430  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "BadSubSource",
5431  "Unknown subsource subtype 0"));
5432  // AddChromosomeNoLocation(expected_errors, entry);
5433 
5434  eval = validator.Validate(seh, options);
5435  CheckErrors(*eval, expected_errors);
5436 
5437  CLEAR_ERRORS
5438 }
5439 
5440 void ShowOrgRef(const COrg_ref& org)
5441 {
5442  ESerialDataFormat outFormat = eSerial_AsnText;
5443  unique_ptr<CObjectOStream> os;
5444  os.reset(CObjectOStream::Open(outFormat, cout));
5445  *os << org;
5446 }
5447 
5448 
5449 void ShowOrgRef(const CSeq_entry& entry)
5450 {
5451  if (entry.IsSeq()) {
5452  if (entry.GetSeq().IsSetDescr()) {
5453  for (const auto& it : entry.GetSeq().GetDescr().Get()) {
5454  if (it->IsSource() && it->GetSource().IsSetOrg()) {
5455  ShowOrgRef(it->GetSource().GetOrg());
5456  }
5457  }
5458  }
5459  } else if (entry.IsSet()) {
5460  if (entry.GetSet().IsSetDescr()) {
5461  for (const auto& it : entry.GetSet().GetDescr().Get()) {
5462  if (it->IsSource() && it->GetSource().IsSetOrg()) {
5463  ShowOrgRef(it->GetSource().GetOrg());
5464  }
5465  }
5466  }
5467  if (entry.GetSet().IsSetSeq_set()) {
5468  for (const auto& it : entry.GetSet().GetSeq_set()) {
5469  ShowOrgRef(*it);
5470  }
5471  }
5472  }
5473 }
5474 
5475 
// Test_Descr_BadOrgMod: adds several OrgMods (unknown subtypes 0 and 1, a
// nat_host, a common name, a type_material value) to the entry and checks
// the full list of findings expected on lcl|good.
// NOTE(review): listing lines 5479, 5482-5484 and 5491 are absent from this
// extract; `entry`, further modifications (presumably the strain and variety
// OrgMods and the taxname that the expectations below refer to) and the
// fixture setup are not visible - confirm against the repository.
5476 BOOST_AUTO_TEST_CASE(Test_Descr_BadOrgMod)
5477 {
5478  // prepare entry
5480  unit_test_util::SetOrgMod(entry, 0, "foo");
5481  unit_test_util::SetOrgMod(entry, 1, "bar");
5485  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Sebaea microphylla");
5486  unit_test_util::SetCommon(entry, "some common name");
5487  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_common, "some common name");
5488  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_type_material, "invalid type material name");
5489 // ShowOrgRef(*entry);
5490 
5492 
5493  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
5494  "OrganismNotFound", "Organism not found in taxonomy database (suggested:Sebaea microphylla var. c)"));
5495 
      // One Critical finding per unknown OrgMod subtype added above.
5496  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "BadOrgMod",
5497  "Unknown orgmod subtype 0"));
5498  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "BadOrgMod",
5499  "Unknown orgmod subtype 1"));
5500  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleStrains",
5501  "Multiple strain qualifiers on the same BioSource"));
5502  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadTypeMaterial",
5503  "Bad value for type_material"));
5504  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrgModMissingValue",
5505  "Variety value specified is not found in taxname"));
5506  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HostIdenticalToOrganism",
5507  "Specific host is identical to taxname"));
      // The expectation below is disabled in the source; it is kept for reference.
5508  /*
5509  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadOrgMod",
5510  "OrgMod common is identical to Org-ref common"));
5511  */
5512  // AddChromosomeNoLocation(expected_errors, entry);
5513 
5514  eval = validator.Validate(seh, options);
5515  CheckErrors(*eval, expected_errors);
5516 
5517  CLEAR_ERRORS
5518 }
5519 
5520 
// Test_BadVariety: sets a variety OrgMod "x", the taxname
// "Sebaea microphylla var. x" and taxon 0, then expects three Warning-level
// findings on lcl|good: "BadVariety", "NoTaxonID" and "OrganismNotFound".
// NOTE(review): listing lines 5523 and 5528 are absent from this extract;
// `entry` and the fixture objects are presumably set up there - confirm.
5521 BOOST_AUTO_TEST_CASE(Test_BadVariety)
5522 {
5524  SetOrgMod(entry, COrgMod::eSubtype_variety, "x");
5525  SetTaxname(entry, "Sebaea microphylla var. x");
5526  SetTaxon(entry, 0);
5527 
5529 
5530  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
5531  "BadVariety",
5532  "Orgmod variety should only be in plants, fungi, or cyanobacteria"));
5533  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
5534  "NoTaxonID", "BioSource is missing taxon ID"));
5535  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
5536  "OrganismNotFound", "Organism not found in taxonomy database"));
5537  // AddChromosomeNoLocation(expected_errors, entry);
5538  eval = validator.Validate(seh, options);
5539  CheckErrors(*eval, expected_errors);
5540 
5541  CLEAR_ERRORS
5542 }
5543 
5544 
// Test_Descr_InconsistentProteinTitle: adds the title "Not the correct title"
// to the last member of the set and expects a Warning-level
// "InconsistentProteinTitle" finding on lcl|prot.
// NOTE(review): listing lines 5548 and 5553 are absent from this extract;
// `entry` and the fixture objects are presumably set up there - confirm.
5545 BOOST_AUTO_TEST_CASE(Test_Descr_InconsistentProteinTitle)
5546 {
5547  // prepare entry
5549  CRef<CSeqdesc> desc(new CSeqdesc());
5550  desc->SetTitle("Not the correct title");
5551  entry->SetSet().SetSeq_set().back()->SetSeq().SetDescr().Set().push_back(desc);
5552 
5554 
5555  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "InconsistentProteinTitle",
5556  "Instantiated protein title does not match automatically generated title"));
5557  // AddChromosomeNoLocation(expected_errors, entry);
5558 
5559  eval = validator.Validate(seh, options);
5560  CheckErrors(*eval, expected_errors);
5561 
5562  CLEAR_ERRORS
5563 }
5564 
5565 
5566 BOOST_FIXTURE_TEST_CASE(Test_Descr_Inconsistent, CGenBankFixture)
5567 {
5568  // prepare entry
5570  CRef<CSeqdesc> desc1(new CSeqdesc());
5572  entry->SetSeq().SetDescr().Set().push_back(desc1);
5573  CRef<CSeqdesc> desc2(new CSeqdesc());
5575  entry->SetSeq().SetDescr().Set().push_back(desc2);
5576  CRef<CSeqdesc> desc3(new CSeqdesc());
5577  desc3->SetModif().push_back(eGIBB_mod_dna);
5578  desc3->SetModif().push_back(eGIBB_mod_rna);
5579  desc3->SetModif().push_back(eGIBB_mod_mitochondrial);
5580  desc3->SetModif().push_back(eGIBB_mod_cyanelle);
5581  desc3->SetModif().push_back(eGIBB_mod_complete);
5582  desc3->SetModif().push_back(eGIBB_mod_partial);
5583  desc3->SetModif().push_back(eGIBB_mod_no_left);
5584  desc3->SetModif().push_back(eGIBB_mod_no_right);
5585  entry->SetSeq().SetDescr().Set().push_back(desc3);
5586 
5587  CRef<CSeqdesc> desc_gb1(new CSeqdesc());
5588  desc_gb1->SetGenbank().SetKeywords().push_back("TPA:experimental");
5589  desc_gb1->SetGenbank().SetKeywords().push_back("TPA:inferential");
5590  entry->SetSeq().SetDescr().Set().push_back(desc_gb1);
5591  CRef<CSeqdesc> desc_gb2(new CSeqdesc());
5592  desc_gb2->SetGenbank();
5593  entry->SetSeq().SetDescr().Set().push_back(desc_gb2);
5594 
5595  CRef<CSeqdesc> desc_embl1(new CSeqdesc());
5596  desc_embl1->SetEmbl();
5597  entry->SetSeq().SetDescr().Set().push_back(desc_embl1);
5598  CRef<CSeqdesc> desc_embl2(new CSeqdesc());
5599  desc_embl2->SetEmbl();
5600  entry->SetSeq().SetDescr().Set().push_back(desc_embl2);
5601 
5602  CRef<CSeqdesc> desc_pir1(new CSeqdesc());
5603  desc_pir1->SetPir();
5604  entry->SetSeq().SetDescr().Set().push_back(desc_pir1);
5605  CRef<CSeqdesc> desc_pir2(new CSeqdesc());
5606  desc_pir2->SetPir();
5607  entry->SetSeq().SetDescr().Set().push_back(desc_pir2);
5608 
5609  CRef<CSeqdesc> desc_sp1(new CSeqdesc());
5610  desc_sp1->SetSp();
5611  entry->SetSeq().SetDescr().Set().push_back(desc_sp1);
5612  CRef<CSeqdesc> desc_sp2(new CSeqdesc());
5613  desc_sp2->SetSp();
5614  entry->SetSeq().SetDescr().Set().push_back(desc_sp2);
5615 
5616  CRef<CSeqdesc> desc_pdb1(new CSeqdesc());
5617  desc_pdb1->SetPdb();
5618  entry->SetSeq().SetDescr().Set().push_back(desc_pdb1);
5619  CRef<CSeqdesc> desc_pdb2(new CSeqdesc());
5620  desc_pdb2->SetPdb();
5621  entry->SetSeq().SetDescr().Set().push_back(desc_pdb2);
5622 
5623  CRef<CSeqdesc> desc_prf1(new CSeqdesc());
5624  desc_prf1->SetPrf();
5625  entry->SetSeq().SetDescr().Set().push_back(desc_prf1);
5626  CRef<CSeqdesc> desc_prf2(new CSeqdesc());
5627  desc_prf2->SetPrf();
5628  entry->SetSeq().SetDescr().Set().push_back(desc_prf2);
5629 
5630  CRef<CSeqdesc> desc_create1(new CSeqdesc());
5631  desc_create1->SetCreate_date().SetStd().SetYear(2009);
5632  desc_create1->SetCreate_date().SetStd().SetMonth(4);
5633  entry->SetSeq().SetDescr().Set().push_back(desc_create1);
5634  CRef<CSeqdesc> desc_create2(new CSeqdesc());
5635  desc_create2->SetCreate_date().SetStd().SetYear(2009);
5636  desc_create2->SetCreate_date().SetStd().SetMonth(3);
5637  entry->SetSeq().SetDescr().Set().push_back(desc_create2);
5638  CRef<CSeqdesc> desc_update(new CSeqdesc());
5639  desc_update->SetUpdate_date().SetStd().SetYear(2009);
5640  desc_update->SetUpdate_date().SetStd().SetMonth(2);
5641  entry->SetSeq().SetDescr().Set().push_back(desc_update);
5642 
5643  CRef<CSeqdesc> src_desc(new CSeqdesc());
5644  src_desc->SetSource().SetOrg().SetTaxname("Trichechus manatus");
5645  unit_test_util::SetTaxon(src_desc->SetSource(), 9778);
5646  src_desc->SetSource().SetOrg().SetOrgname().SetLineage("some lineage");
5647  entry->SetSeq().SetDescr().Set().push_back(src_desc);
5648 
5651  CRef<CSeqdesc> m_desc(new CSeqdesc());
5655  entry->SetSeq().SetDescr().Set().push_back(m_desc);
5656 
5658 
5659  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentTPA",
5660  "TPA:experimental and TPA:inferential should not both be in the same set of keywords"));
5661  /*
5662  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentDates",
5663  "Inconsistent create_dates [Mar 2009] and [Apr 2009]"));
5664  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentDates",
5665  "Inconsistent create_date [Apr 2009] and update_date [Feb 2009]"));
5666  */
5667  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentTaxName",
5668  "Inconsistent organism names [Trichechus manatus] and [Sebaea microphylla]"));
5669  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolInfo",
5670  "Inconsistent Molinfo-biomol [1] and [11]"));
5671  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolInfoTechnique",
5672  "Inconsistent Molinfo-tech [5] and [17]"));
5673  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolInfo",
5674  "Inconsistent Molinfo-completeness [3] and [4]"));
5675  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentGenBankblocks",
5676  "Multiple GenBank blocks"));
5677  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5678  "Multiple EMBL blocks"));
5679  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5680  "Multiple PIR blocks"));
5681  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5682  "Multiple PDB blocks"));
5683  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5684  "Multiple PRF blocks"));
5685  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5686  "Multiple SWISS-PROT blocks"));
5687  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5688  "Inconsistent GIBB-mod [0] and [1]"));
5689  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5690  "Inconsistent GIBB-mod [4] and [7]"));
5691  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5692  "Inconsistent GIBB-mod [11] and [10]"));
5693  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5694  "Inconsistent GIBB-mod [11] and [16]"));
5695  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5696  "Inconsistent GIBB-mod [11] and [17]"));
5697  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5698  "Inconsistent GIBB-mol [1] and [2]"));
5699  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
5700  "MolType descriptor is obsolete"));
5701  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
5702  "MolType descriptor is obsolete"));
5703  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
5704  "Modif descriptor is obsolete"));
5705  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDate",
5706  "Create date has error - BAD_DAY"));
5707  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDate",
5708  "Create date has error - BAD_DAY"));
5709  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDate",
5710  "Update date has error - BAD_DAY"));
5711  /*
5712  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MultipleBioSources",
5713  "Undesired multiple source descriptors"));
5714  */
5715  // AddChromosomeNoLocation(expected_errors, entry);
5716 
5717  eval = validator.Validate(seh, options);
5718  CheckErrors(*eval, expected_errors);
5719 
5720  CLEAR_ERRORS
5721 
5722  // try different WGS-style accessions, check for wgs_tech
5723  scope.RemoveTopLevelSeqEntry(seh);
5724  entry = unit_test_util::BuildGoodSeq();
5725  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("ABCD12345678");
5726  seh = scope.AddTopLevelSeqEntry(*entry);
5727 
5728  expected_errors.push_back(new CExpectedError("gb|ABCD12345678|", eDiag_Error, "InconsistentMolInfoTechnique",
5729  "WGS accession should have Mol-info.tech of wgs"));
5730  // AddChromosomeNoLocation(expected_errors, entry);
5731  eval = validator.Validate(seh, options);
5732  CheckErrors(*eval, expected_errors);
5733  scope.RemoveTopLevelSeqEntry(seh);
5734  entry->SetSeq().SetId().front()->SetEmbl().SetAccession("ABCE12345678");
5735  ChangeErrorAcc(expected_errors, "emb|ABCE12345678|");
5736  seh = scope.AddTopLevelSeqEntry(*entry);
5737  eval = validator.Validate(seh, options);
5738  CheckErrors(*eval, expected_errors);
5739  scope.RemoveTopLevelSeqEntry(seh);
5740  entry->SetSeq().SetId().front()->SetDdbj().SetAccession("ABCF12345678");
5741  ChangeErrorAcc(expected_errors, "dbj|ABCF12345678|");
5742  seh = scope.AddTopLevelSeqEntry(*entry);
5743  eval = validator.Validate(seh, options);
5744  CheckErrors(*eval, expected_errors);
5745 
5746  CLEAR_ERRORS
5747 
5748  // look for correct accession if WGS tech present
5749  scope.RemoveTopLevelSeqEntry(seh);
5750  entry->SetSeq().SetId().front()->SetEmbl().SetAccession("AA123456");
5751  // AddChromosomeNoLocation(expected_errors, entry);
5752  seh = scope.AddTopLevelSeqEntry(*entry);
5753  eval = validator.Validate(seh, options);
5754  CheckErrors(*eval, expected_errors);
5755 
5756  CLEAR_ERRORS
5757 
5758  scope.RemoveTopLevelSeqEntry(seh);
5759  entry->SetSeq().SetId().front()->SetDdbj().SetAccession("AB123456");
5760  // AddChromosomeNoLocation(expected_errors, entry);
5761  seh = scope.AddTopLevelSeqEntry(*entry);
5762  eval = validator.Validate(seh, options);
5763  CheckErrors(*eval, expected_errors);
5764 
5765  CLEAR_ERRORS
5766 
5767  scope.RemoveTopLevelSeqEntry(seh);
5768  entry = unit_test_util::BuildGoodSeq();
5769  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AC123456");
5770 
5771  SetTech(entry, CMolInfo::eTech_wgs);
5772  seh = scope.AddTopLevelSeqEntry(*entry);
5773  expected_errors.push_back(new CExpectedError("gb|AC123456|", eDiag_Error, "InconsistentWGSFlags",
5774  "Mol-info.tech of wgs should have WGS accession"));
5775  expected_errors.push_back(new CExpectedError("gb|AC123456|", eDiag_Warning, "UnexpectedIdentifierChange",
5776  "Loss of general ID (BCMHGSC: PROJECT_GXOU.BAYLOR) on gi (25008031) compared to the NCBI sequence repository"));
5777  AddChromosomeNoLocation(expected_errors, entry);
5778  eval = validator.Validate(seh, options);
5779  CheckErrors(*eval, expected_errors);
5780 
5781  CLEAR_ERRORS
5782 
5783  scope.RemoveTopLevelSeqEntry(seh);
5784  entry->SetSeq().SetId().front()->SetOther().SetAccession("NM_123456");
5785  seh = scope.AddTopLevelSeqEntry(*entry);
5786  expected_errors.push_back(new CExpectedError("ref|NM_123456|", eDiag_Error, "InconsistentWGSFlags",
5787  "Mol-info.tech of wgs should have WGS accession"));
5788  AddChromosomeNoLocation(expected_errors, entry);
5789 
5790  eval = validator.Validate(seh, options);
5791  CheckErrors(*eval, expected_errors);
5792 
5793  scope.RemoveTopLevelSeqEntry(seh);
5794  entry->SetSeq().SetId().front()->SetOther().SetAccession("NP_123456");
5795  seh = scope.AddTopLevelSeqEntry(*entry);
5796  ChangeErrorAcc(expected_errors, "ref|NP_123456|");
5797  eval = validator.Validate(seh, options);
5798  CheckErrors(*eval, expected_errors);
5799 
5800  scope.RemoveTopLevelSeqEntry(seh);
5801  entry->SetSeq().SetId().front()->SetOther().SetAccession("NG_123456");
5802  seh = scope.AddTopLevelSeqEntry(*entry);
5803  ChangeErrorAcc(expected_errors, "ref|NG_123456|");
5804  eval = validator.Validate(seh, options);
5805  CheckErrors(*eval, expected_errors);
5806 
5807  scope.RemoveTopLevelSeqEntry(seh);
5808  entry->SetSeq().SetId().front()->SetOther().SetAccession("NR_123456");
5809  seh = scope.AddTopLevelSeqEntry(*entry);
5810  ChangeErrorAcc(expected_errors, "ref|NR_123456|");
5811  eval = validator.Validate(seh, options);
5812  CheckErrors(*eval, expected_errors);
5813 
5814  CLEAR_ERRORS
5815 
5816  // no tech warning if other but not one of four starts
5817  scope.RemoveTopLevelSeqEntry(seh);
5818  entry->SetSeq().SetId().front()->SetOther().SetAccession("NX_123456");
5819  seh = scope.AddTopLevelSeqEntry(*entry);
5820  eval = validator.Validate(seh, options);
5821  AddChromosomeNoLocation(expected_errors, entry);
5822  CheckErrors(*eval, expected_errors);
5823 
5824  CLEAR_ERRORS
5825 
5826  // skip warning if segset accession
5827  vector<string> segset_accession_prefixes;
5828  segset_accession_prefixes.push_back("AH");
5829  segset_accession_prefixes.push_back("CH");
5830  segset_accession_prefixes.push_back("CM");
5831  segset_accession_prefixes.push_back("DS");
5832  segset_accession_prefixes.push_back("EM");
5833  segset_accession_prefixes.push_back("EN");
5834  segset_accession_prefixes.push_back("EP");
5835  segset_accession_prefixes.push_back("EQ");
5836  segset_accession_prefixes.push_back("FA");
5837  segset_accession_prefixes.push_back("GG");
5838  segset_accession_prefixes.push_back("GL");
5839 
5840  for (const string& it : segset_accession_prefixes) {
5841  scope.RemoveTopLevelSeqEntry(seh);
5842  entry->SetSeq().SetId().front()->SetOther().SetAccession(it + "_123456");
5843  seh = scope.AddTopLevelSeqEntry(*entry);
5844  eval = validator.Validate(seh, options);
5845  AddChromosomeNoLocation(expected_errors, entry);
5846  CheckErrors(*eval, expected_errors);
5847  CLEAR_ERRORS
5848  }
5849 
5850  // biomol on NC should be genomic or cRNA
5851  scope.RemoveTopLevelSeqEntry(seh);
5852  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
5855  seh = scope.AddTopLevelSeqEntry(*entry);
5856  // no error expected
5857  eval = validator.Validate(seh, options);
5858  // AddChromosomeNoLocation(expected_errors, entry);
5859  CheckErrors(*eval, expected_errors);
5861  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
5862  // no error expected
5863  eval = validator.Validate(seh, options);
5864  CheckErrors(*eval, expected_errors);
5865  // expect errors
5867  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "InconsistentRefSeqMoltype",
5868  "genomic RefSeq accession should use genomic or cRNA moltype"));
5869  eval = validator.Validate(seh, options);
5870  CheckErrors(*eval, expected_errors);
5872  eval = validator.Validate(seh, options);
5873  CheckErrors(*eval, expected_errors);
5875  eval = validator.Validate(seh, options);
5876  CheckErrors(*eval, expected_errors);
5878  eval = validator.Validate(seh, options);
5879  CheckErrors(*eval, expected_errors);
5881  eval = validator.Validate(seh, options);
5882  CheckErrors(*eval, expected_errors);
5884  eval = validator.Validate(seh, options);
5885  CheckErrors(*eval, expected_errors);
5887  eval = validator.Validate(seh, options);
5888  CheckErrors(*eval, expected_errors);
5890  eval = validator.Validate(seh, options);
5891  CheckErrors(*eval, expected_errors);
5893  eval = validator.Validate(seh, options);
5894  CheckErrors(*eval, expected_errors);
5896  eval = validator.Validate(seh, options);
5897  CheckErrors(*eval, expected_errors);
5899  eval = validator.Validate(seh, options);
5900  CheckErrors(*eval, expected_errors);
5902  eval = validator.Validate(seh, options);
5903  CheckErrors(*eval, expected_errors);
5904 
5905  CLEAR_ERRORS
5906 }
5907 
5908 
// Test case: expects exactly one Warning "ObsoleteSourceLocation" on lcl|good
// ("Transposon and insertion sequence are no longer legal locations") and
// checks that expectation on two successive validation passes.
// NOTE(review): the statements that build `entry`, set up `seh`, `validator`,
// `options` and `expected_errors`, and whatever changes the entry between the
// two passes are not visible in this listing - confirm against the repository.
5909 BOOST_AUTO_TEST_CASE(Test_Descr_ObsoleteSourceLocation)
5910 {
5911  // prepare entry
5914 
5916 
5917  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ObsoleteSourceLocation",
5918  "Transposon and insertion sequence are no longer legal locations"));
5919  // AddChromosomeNoLocation(expected_errors, entry);
      // first pass
5920  eval = validator.Validate(seh, options);
5921  CheckErrors(*eval, expected_errors);
5922 
      // second pass: the same single expectation is checked again
5924  eval = validator.Validate(seh, options);
5925  CheckErrors(*eval, expected_errors);
5926 
5927  CLEAR_ERRORS
5928 }
5929 
5930 
// Test case: expects two identical Warning "ObsoleteSourceQual" reports on
// lcl|good ("Transposon name and insertion sequence name are no longer legal
// qualifiers") from a single validation pass.
// NOTE(review): the entry-building and setup statements are not visible in
// this listing; presumably two obsolete qualifiers are added, one per expected
// report - confirm.
5931 BOOST_AUTO_TEST_CASE(Test_Descr_ObsoleteSourceQual)
5932 {
5933  // prepare entry
5937 
5939 
      // the same message is expected twice, so it is pushed twice
5940  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ObsoleteSourceQual",
5941  "Transposon name and insertion sequence name are no longer legal qualifiers"));
5942  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ObsoleteSourceQual",
5943  "Transposon name and insertion sequence name are no longer legal qualifiers"));
5944  // AddChromosomeNoLocation(expected_errors, entry);
5945 
5946  eval = validator.Validate(seh, options);
5947  CheckErrors(*eval, expected_errors);
5948 
5949  CLEAR_ERRORS
5950 }
5951 
5952 
// Test case: for every entry of tag_prefixes, expects one Warning
// "StructuredSourceNote" on lcl|good whose message is
// "Source note has structured tag '<prefix>'", and checks it on two
// validation passes per prefix.
// NOTE(review): the statements that build the entry and that place each
// prefix into a note before each pass are not visible in this listing.
5953 BOOST_AUTO_TEST_CASE(Test_Descr_StructuredSourceNote)
5954 {
5955  // prepare entry
5957 
5959 
      // placeholder message; it is overwritten for each prefix inside the loop
5960  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "StructuredSourceNote",
5961  "Source note has structured tag '"));
5962  // AddChromosomeNoLocation(expected_errors, entry);
5963 
      // NOTE(review): the eight *_primer_name / *_primer_seq entries below have
      // no trailing ':' while every other entry does - presumably deliberate;
      // confirm against the validator's tag list.
5964  vector<string> tag_prefixes;
5965  tag_prefixes.push_back("acronym:");
5966  tag_prefixes.push_back("anamorph:");
5967  tag_prefixes.push_back("authority:");
5968  tag_prefixes.push_back("biotype:");
5969  tag_prefixes.push_back("biovar:");
5970  tag_prefixes.push_back("bio_material:");
5971  tag_prefixes.push_back("breed:");
5972  tag_prefixes.push_back("cell_line:");
5973  tag_prefixes.push_back("cell_type:");
5974  tag_prefixes.push_back("chemovar:");
5975  tag_prefixes.push_back("chromosome:");
5976  tag_prefixes.push_back("clone:");
5977  tag_prefixes.push_back("clone_lib:");
5978  tag_prefixes.push_back("collected_by:");
5979  tag_prefixes.push_back("collection_date:");
5980  tag_prefixes.push_back("common:");
5981  tag_prefixes.push_back("country:");
5982  tag_prefixes.push_back("cultivar:");
5983  tag_prefixes.push_back("culture_collection:");
5984  tag_prefixes.push_back("dev_stage:");
5985  tag_prefixes.push_back("dosage:");
5986  tag_prefixes.push_back("ecotype:");
5987  tag_prefixes.push_back("endogenous_virus_name:");
5988  tag_prefixes.push_back("environmental_sample:");
5989  tag_prefixes.push_back("forma:");
5990  tag_prefixes.push_back("forma_specialis:");
5991  tag_prefixes.push_back("frequency:");
5992  tag_prefixes.push_back("fwd_pcr_primer_name");
5993  tag_prefixes.push_back("fwd_pcr_primer_seq");
5994  tag_prefixes.push_back("fwd_primer_name");
5995  tag_prefixes.push_back("fwd_primer_seq");
5996  tag_prefixes.push_back("genotype:");
5997  tag_prefixes.push_back("germline:");
5998  tag_prefixes.push_back("group:");
5999  tag_prefixes.push_back("haplogroup:");
6000  tag_prefixes.push_back("haplotype:");
6001  tag_prefixes.push_back("identified_by:");
6002  tag_prefixes.push_back("insertion_seq_name:");
6003  tag_prefixes.push_back("isolate:");
6004  tag_prefixes.push_back("isolation_source:");
6005  tag_prefixes.push_back("lab_host:");
6006  tag_prefixes.push_back("lat_lon:");
6007  tag_prefixes.push_back("left_primer:");
6008  tag_prefixes.push_back("linkage_group:");
6009  tag_prefixes.push_back("map:");
6010  tag_prefixes.push_back("mating_type:");
6011  tag_prefixes.push_back("metagenome_source:");
6012  tag_prefixes.push_back("metagenomic:");
6013  tag_prefixes.push_back("nat_host:");
6014  tag_prefixes.push_back("pathovar:");
6015  tag_prefixes.push_back("placement:");
6016  tag_prefixes.push_back("plasmid_name:");
6017  tag_prefixes.push_back("plastid_name:");
6018  tag_prefixes.push_back("pop_variant:");
6019  tag_prefixes.push_back("rearranged:");
6020  tag_prefixes.push_back("rev_pcr_primer_name");
6021  tag_prefixes.push_back("rev_pcr_primer_seq");
6022  tag_prefixes.push_back("rev_primer_name");
6023  tag_prefixes.push_back("rev_primer_seq");
6024  tag_prefixes.push_back("right_primer:");
6025  tag_prefixes.push_back("segment:");
6026  tag_prefixes.push_back("serogroup:");
6027  tag_prefixes.push_back("serotype:");
6028  tag_prefixes.push_back("serovar:");
6029  tag_prefixes.push_back("sex:");
6030  tag_prefixes.push_back("specimen_voucher:");
6031  tag_prefixes.push_back("strain:");
6032  tag_prefixes.push_back("subclone:");
6033  tag_prefixes.push_back("subgroup:");
6034  tag_prefixes.push_back("substrain:");
6035  tag_prefixes.push_back("subtype:");
6036  tag_prefixes.push_back("sub_species:");
6037  tag_prefixes.push_back("synonym:");
6038  tag_prefixes.push_back("taxon:");
6039  tag_prefixes.push_back("teleomorph:");
6040  tag_prefixes.push_back("tissue_lib:");
6041  tag_prefixes.push_back("tissue_type:");
6042  tag_prefixes.push_back("transgenic:");
6043  tag_prefixes.push_back("transposon_name:");
6044  tag_prefixes.push_back("type:");
6045  tag_prefixes.push_back("variety:");
6046 
6047  for (const string& it : tag_prefixes) {
      // the expected message quotes the current prefix verbatim
6048  expected_errors[0]->SetErrMsg("Source note has structured tag '" + it + "'");
      // two passes per prefix; NOTE(review): what differs between them is not
      // visible in this listing
6050  eval = validator.Validate(seh, options);
6051  CheckErrors(*eval, expected_errors);
6054  eval = validator.Validate(seh, options);
6055  CheckErrors(*eval, expected_errors);
6057  }
6058 
6059 
6060  CLEAR_ERRORS
6061 }
6062 
6063 
// Test case: calls unit_test_util::SetFocus on the entry and expects one
// Error "UnnecessaryBioSourceFocus" on lcl|good ("BioSource descriptor has
// focus, but no BioSource feature").
// NOTE(review): the statements that build `entry` and set up the validation
// objects are not visible in this listing.
6064 BOOST_AUTO_TEST_CASE(Test_Descr_UnnecessaryBioSourceFocus)
6065 {
6066  // prepare entry
6068  unit_test_util::SetFocus(entry);
6069 
6071 
6072  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "UnnecessaryBioSourceFocus",
6073  "BioSource descriptor has focus, but no BioSource feature"));
6074  // AddChromosomeNoLocation(expected_errors, entry);
6075 
6076  eval = validator.Validate(seh, options);
6077  CheckErrors(*eval, expected_errors);
6078 
6079  CLEAR_ERRORS
6080 }
6081 
6082 
// Test case: gives the sequence the "other"-type accession NC_123456, adds a
// user-object descriptor holding a single Label/Data string field, and expects
// one Error "RefGeneTrackingWithoutStatus" on ref|NC_123456|.
// NOTE(review): the statement that builds `entry` and the one between the
// `desc` construction and its push_back (presumably naming the user object)
// are not visible in this listing - confirm.
6083 BOOST_AUTO_TEST_CASE(Test_Descr_RefGeneTrackingWithoutStatus)
6084 {
6085  // prepare entry
6087  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
6088  CRef<CSeqdesc> desc(new CSeqdesc());
6090  entry->SetSeq().SetDescr().Set().push_back(desc);
6091 
      // the only field added carries label "Label" and value "Data"; no field
      // labelled Status is added in the visible code
6092  CRef<CUser_field> field(new CUser_field());
6093  field->SetLabel().SetStr("Label");
6094  field->SetData().SetStr("Data");
6095  desc->SetUser().SetData().push_back(field);
6096 
6098 
6099  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "RefGeneTrackingWithoutStatus",
6100  "RefGeneTracking object needs to have Status set"));
6101  // AddChromosomeNoLocation(expected_errors, entry);
6102 
6103  eval = validator.Validate(seh, options);
6104  CheckErrors(*eval, expected_errors);
6105 
6106  CLEAR_ERRORS
6107 }
6108 
6109 
// Test case for the "UnwantedCompleteFlag" warning ("Suspicious use of
// complete") on a GenBank accession AY123456 whose title does not contain the
// word "complete". It then checks that no error is expected once the title is
// "complete sequence", once the lineage is "Viruses", and in a final
// "artificial" case.
// NOTE(review): several statements (entry construction, setup, and the lines
// that change the entry before some passes) are not visible in this listing.
6110 BOOST_AUTO_TEST_CASE(Test_Descr_UnwantedCompleteFlag)
6111 {
6112  // prepare entry
6114  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
6116  SetTitle(entry, "a title without the word");
6117 
6119 
6120  expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Warning, "UnwantedCompleteFlag",
6121  "Suspicious use of complete"));
6122  // AddChromosomeNoLocation(expected_errors, entry);
6123 
6124  eval = validator.Validate(seh, options);
6125  CheckErrors(*eval, expected_errors);
6126 
6127  // tech of HTGS3 lowers to warning
      // NOTE(review): the expectation was already created with eDiag_Warning
      // above, so this SetSeverity call does not change it
6129  expected_errors[0]->SetSeverity(eDiag_Warning);
6130  eval = validator.Validate(seh, options);
6131  CheckErrors(*eval, expected_errors);
6132 
6133  CLEAR_ERRORS
6134 
6135  // suppress if complete sequence or complete genome in title
6136  SetTitle(entry, "complete sequence");
6137  // AddChromosomeNoLocation(expected_errors, entry);
6138  eval = validator.Validate(seh, options);
6139  CheckErrors(*eval, expected_errors);
6140 
6141  CLEAR_ERRORS
6142 
6143  // suppress if viral
      // the id changes (EMBL accession AY123457), so the entry is removed from
      // the scope before the edit and re-added afterwards
6144  scope.RemoveTopLevelSeqEntry(seh);
6145  SetTitle(entry, "a title without the word");
6146  entry->SetSeq().SetId().front()->SetEmbl().SetAccession("AY123457");
6147  unit_test_util::SetLineage(entry, "Viruses");
6148  // AddChromosomeNoLocation(expected_errors, entry);
6149  seh = scope.AddTopLevelSeqEntry(*entry);
6150  eval = validator.Validate(seh, options);
6151  CheckErrors(*eval, expected_errors);
6152 
6153  // suppress if artificial
      // NOTE(review): no Validate call is visible between this SetLineage and
      // the CheckErrors below, so CheckErrors appears to re-check the `eval`
      // produced by the viral case - confirm whether a re-validation is missing
6154  unit_test_util::SetLineage(entry, "Bacteria");
6156  CheckErrors(*eval, expected_errors);
6157 
6158  CLEAR_ERRORS
6159 }
6160 
6161 
// Test case for publication collisions on lcl|good. In order it checks:
//  1. "CollidingPubMedID" with two pub descriptors (the id assignments are
//     not visible in this listing),
//  2. the same expectation after both pubs get muid 2,
//  3. an added "CollidingPublications" warning once both articles share the
//     title "First title",
//  4. conflicting / redundant muids and pmids inside a single publication.
// NOTE(review): the declarations of pub1, pub2, auth1, title1, title2 and the
// setup statements are not visible in this listing.
6162 BOOST_AUTO_TEST_CASE(Test_Descr_CollidingPublications)
6163 {
6164  // prepare entry
6168  CRef<CPub> otherpub1(new CPub());
6169  otherpub1->SetArticle().SetAuthors().SetNames().SetStd().push_back(auth1);
6171  title1->SetName("First title");
6172  otherpub1->SetArticle().SetTitle().Set().push_back(title1);
6173  pub1->SetPub().SetPub().Set().push_back(otherpub1);
6174  entry->SetSeq().SetDescr().Set().push_back(pub1);
6176  CRef<CPub> otherpub2(new CPub());
      // the second article pushes the same auth1 object as the first
6178  otherpub2->SetArticle().SetAuthors().SetNames().SetStd().push_back(auth1);
6180  title2->SetName("Second title");
6181  otherpub2->SetArticle().SetTitle().Set().push_back(title2);
6182  pub2->SetPub().SetPub().Set().push_back(otherpub2);
6183  entry->SetSeq().SetDescr().Set().push_back(pub2);
6184 
6186 
6187  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CollidingPubMedID",
6188  "Multiple publications with identical PubMed ID"));
6189  // AddChromosomeNoLocation(expected_errors, entry);
6190 
6191  eval = validator.Validate(seh, options);
6192  CheckErrors(*eval, expected_errors);
6193 
6194  // should also report muid collisions
6195  pub1->SetPub().SetPub().Set().front()->SetMuid(ENTREZ_ID_CONST(2));
6196  pub2->SetPub().SetPub().Set().front()->SetMuid(ENTREZ_ID_CONST(2));
6197  eval = validator.Validate(seh, options);
6198  CheckErrors(*eval, expected_errors);
6199 
6200  // look for same pub twice
6201  title2->SetName("First title");
6202  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CollidingPublications",
6203  "Multiple equivalent publications annotated on this sequence [Darwin|Ft; Darwin]"));
6204  eval = validator.Validate(seh, options);
6205  CheckErrors(*eval, expected_errors);
6206 
      // drop the CollidingPublications expectation just added (index 1); the
      // vector holds pointers created with new, so the element is deleted
      // before pop_back removes it
6207  delete expected_errors[1];
6208  expected_errors.pop_back();
6209 
6210  // look for multiple IDs on same pub
6211  scope.RemoveTopLevelSeqEntry(seh);
      // removes the last descriptor; the last push visible above is pub2
6212  entry->SetSeq().SetDescr().Set().pop_back();
6213  CRef<CPub> extra_id(new CPub());
6214  extra_id->SetMuid(ENTREZ_ID_CONST(3));
6215  pub1->SetPub().SetPub().Set().push_back(extra_id);
6216  seh = scope.AddTopLevelSeqEntry(*entry);
      // the remaining expectation is reused: its code and message are rewritten
      // for each of the four single-publication cases below
6217  expected_errors[0]->SetErrCode("CollidingPublications");
6218  expected_errors[0]->SetErrMsg("Multiple conflicting muids in a single publication");
6219  eval = validator.Validate(seh, options);
6220  CheckErrors(*eval, expected_errors);
6221  extra_id->SetMuid(ENTREZ_ID_CONST(2));
6222  expected_errors[0]->SetErrMsg("Multiple redundant muids in a single publication");
6223  eval = validator.Validate(seh, options);
6224  CheckErrors(*eval, expected_errors);
6225  pub1->SetPub().SetPub().Set().front()->SetPmid(CPub::TPmid(ENTREZ_ID_CONST(2)));
6226  extra_id->SetPmid(CPub::TPmid(ENTREZ_ID_CONST(3)));
6227  expected_errors[0]->SetErrMsg("Multiple conflicting pmids in a single publication");
6228  eval = validator.Validate(seh, options);
6229  CheckErrors(*eval, expected_errors);
6230  extra_id->SetPmid(CPub::TPmid(ENTREZ_ID_CONST(2)));
6231  expected_errors[0]->SetErrMsg("Multiple redundant pmids in a single publication");
6232  eval = validator.Validate(seh, options);
6233  CheckErrors(*eval, expected_errors);
6234 
6235  CLEAR_ERRORS
6236 }
6237 
6238 
// Test case: first expects one Warning "TransgenicProblem" on lcl|good
// ("Transgenic source descriptor requires presence of source feature"), then
// adds a source feature with AddGoodSourceFeature and expects no errors.
// NOTE(review): the statements that build `entry` and set up the validation
// objects are not visible in this listing.
6239 BOOST_AUTO_TEST_CASE(Test_Descr_TransgenicProblem)
6240 {
6241  // prepare entry
6244 
6246 
6247  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TransgenicProblem",
6248  "Transgenic source descriptor requires presence of source feature"));
6249  // AddChromosomeNoLocation(expected_errors, entry);
6250 
6251  eval = validator.Validate(seh, options);
6252  CheckErrors(*eval, expected_errors);
6253 
6254  CLEAR_ERRORS
6255 
      // the entry is taken out of the scope while the feature is added and put
      // back afterwards
6256  scope.RemoveTopLevelSeqEntry(seh);
6257  // adding source feature turns off warning
6258  AddGoodSourceFeature(entry);
6259  seh = scope.AddTopLevelSeqEntry(*entry);
6260 
6261  // AddChromosomeNoLocation(expected_errors, entry);
6262  eval = validator.Validate(seh, options);
6263  CheckErrors(*eval, expected_errors);
6264 
6265  CLEAR_ERRORS
6266 }
6267 
6268 
// Test case for taxonomy-lookup warnings on lcl|good. SetTaxon(entry, 0) is
// called up front and every round expects "NoTaxonID"; the rounds differ by
// taxname:
//   "Not valid"           -> OrganismNotFound
//   "Poeciliinae"         -> TaxonomyIsSpeciesProblem
//   "Anabaena circinalis" -> TaxonomyConsultRequired
//   "Homo sapiens"        -> BadOrganelleLocation + TaxonomyNucleomorphProblem
// NOTE(review): the entry-building and setup statements, and one statement
// after the "Homo sapiens" SetTaxname (presumably setting a nucleomorph
// location), are not visible in this listing - confirm.
6269 BOOST_AUTO_TEST_CASE(Test_Descr_TaxonomyLookupProblem)
6270 {
6271  // prepare entry
6273  unit_test_util::SetTaxname(entry, "Not valid");
6274  unit_test_util::SetTaxon(entry, 0);
6275 
6277 
6278  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NoTaxonID",
6279  "BioSource is missing taxon ID"));
6280  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrganismNotFound",
6281  "Organism not found in taxonomy database"));
6282  // AddChromosomeNoLocation(expected_errors, entry);
6283 
6284  eval = validator.Validate(seh, options);
6285  CheckErrors(*eval, expected_errors);
6286 
6287  CLEAR_ERRORS
6288 
      // round 2: taxname "Poeciliinae"
6289  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NoTaxonID",
6290  "BioSource is missing taxon ID"));
6291  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TaxonomyIsSpeciesProblem",
6292  "Taxonomy lookup reports is_species_level FALSE"));
6293  // AddChromosomeNoLocation(expected_errors, entry);
6294  unit_test_util::SetTaxname(entry, "Poeciliinae");
6295  eval = validator.Validate(seh, options);
6296  CheckErrors(*eval, expected_errors);
6297 
6298  CLEAR_ERRORS
6299 
      // round 3: taxname "Anabaena circinalis"
6300  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NoTaxonID",
6301  "BioSource is missing taxon ID"));
6302  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TaxonomyConsultRequired",
6303  "Taxonomy lookup reports taxonomy consultation needed"));
6304  // AddChromosomeNoLocation(expected_errors, entry);
6305  unit_test_util::SetTaxname(entry, "Anabaena circinalis");
6306  eval = validator.Validate(seh, options);
6307  CheckErrors(*eval, expected_errors);
6308 
6309  CLEAR_ERRORS
6310 
      // round 4: taxname "Homo sapiens", three warnings expected
6311  unit_test_util::SetTaxname(entry, "Homo sapiens");
6313  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadOrganelleLocation",
6314  "Only Chlorarachniophyceae and Cryptophyceae have nucleomorphs"));
6315  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NoTaxonID",
6316  "BioSource is missing taxon ID"));
6317  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TaxonomyNucleomorphProblem",
6318  "Taxonomy lookup does not have expected nucleomorph flag"));
6319  // AddChromosomeNoLocation(expected_errors, entry);
6320  eval = validator.Validate(seh, options);
6321  CheckErrors(*eval, expected_errors);
6322 
6323 
6324  CLEAR_ERRORS
6325 }
6326 
6327 
// Helper: sets the given taxname on the entry, calls SetTaxon(entry, 0), and
// expects two Warnings on lcl|good from one validation pass: "NoTaxonID" and
// "TaxonomyConsultRequired".
// @param taxname  organism name passed to unit_test_util::SetTaxname.
// NOTE(review): the statements that build `entry` and set up the validation
// objects are not visible in this listing.
6328 void TestConsultRequired(const string& taxname)
6329 {
6331  unit_test_util::SetTaxname(entry, taxname);
6332  unit_test_util::SetTaxon(entry, 0);
6333 
6335 
6336  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NoTaxonID",
6337  "BioSource is missing taxon ID"));
6338  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TaxonomyConsultRequired",
6339  "Taxonomy lookup reports taxonomy consultation needed"));
6340  // AddChromosomeNoLocation(expected_errors, entry);
6341 
6342  eval = validator.Validate(seh, options);
6343  CheckErrors(*eval, expected_errors);
6344 
6345  CLEAR_ERRORS
6346 }
6347 
6348 
// NOTE(review): the header line of the block below is not present in this
// listing. Both calls to TestConsultRequired are commented out, so the visible
// body contains no statements.
6350 {
6351  // TestConsultRequired("Colletotrichum cliviae");
6352 
6353  // TestConsultRequired("Erythrobacter marisflavi");
6354 }
6355 
6356 
// Test case: sets the entry's title to "First title", appends a second title
// descriptor "Second title", and expects one Error "MultipleTitles" on
// lcl|good ("Undesired multiple title descriptors").
// NOTE(review): the statements that build `entry` and set up the validation
// objects are not visible in this listing.
6357 BOOST_AUTO_TEST_CASE(Test_Descr_MultipleTitles)
6358 {
6359  // prepare entry
6361 
6362  SetTitle(entry, "First title");
      // a second, separate title descriptor is appended to the same sequence
6363  CRef<CSeqdesc> desc(new CSeqdesc());
6364  desc->SetTitle("Second title");
6365  entry->SetSeq().SetDescr().Set().push_back(desc);
6366 
6368 
6369  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MultipleTitles",
6370  "Undesired multiple title descriptors"));
6371  // AddChromosomeNoLocation(expected_errors, entry);
6372 
6373  eval = validator.Validate(seh, options);
6374  CheckErrors(*eval, expected_errors);
6375 
6376  CLEAR_ERRORS
6377 }
6378 
6379 
// Test case: builds a set entry holding two sequences, firstseq (given a
// RefGeneTracking user object via AddRefGeneTrackingUserObject) and secondseq
// (local id "good2"). It expects one Error "RefGeneTrackingOnNonRefSeq" on
// lcl|good, then no errors once secondseq is given the "other"-type accession
// NC_123456.
// NOTE(review): the declarations of firstseq and secondseq and the setup
// statements are not visible in this listing.
6380 BOOST_AUTO_TEST_CASE(Test_Descr_RefGeneTrackingOnNonRefSeq)
6381 {
6382  // prepare entry
6383  CRef<CSeq_entry> entry(new CSeq_entry());
6386  AddRefGeneTrackingUserObject(firstseq);
6387  entry->SetSet().SetSeq_set().push_back(firstseq);
6388 
6390  secondseq->SetSeq().SetId().front()->SetLocal().SetStr("good2");
6391  entry->SetSet().SetSeq_set().push_back(secondseq);
6392 
6394 
6395  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "RefGeneTrackingOnNonRefSeq",
6396  "RefGeneTracking object should only be in RefSeq record"));
6397  // AddChromosomeNoLocation(expected_errors, "lcl|good");
6398  // AddChromosomeNoLocation(expected_errors, "lcl|good2");
6399 
6400  eval = validator.Validate(seh, options);
6401  CheckErrors(*eval, expected_errors);
6402 
6403  CLEAR_ERRORS
6404 
6405  // no error if any bioseq in record is RefSeq
      // the id of secondseq changes, so the entry is removed from the scope
      // before the edit and re-added afterwards
6406  scope.RemoveTopLevelSeqEntry(seh);
6407  secondseq->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
6408  seh = scope.AddTopLevelSeqEntry(*entry);
6409  // AddChromosomeNoLocation(expected_errors, entry);
6410  eval = validator.Validate(seh, options);
6411  CheckErrors(*eval, expected_errors);
6412 
6413  CLEAR_ERRORS
6414 }
6415 
6416 
// Test case for "OrgModMissingValue" on lcl|good with taxname
// "Arabidopsis thaliana", taxon 3702 and lineage "Cyanobacteria".
// The Variety, Forma and Subspecies rounds each expect OrgModMissingValue plus
// an OrganismNotFound warning carrying a suggested name. The Forma specialis
// round expects only OrgModMissingValue. The last two passes expect no errors.
// NOTE(review): the entry-building and setup statements, and the statements
// that set the modifier before each pass, are not visible in this listing.
6417 BOOST_AUTO_TEST_CASE(Test_OrgModMissingValue)
6418 {
6419  // prepare entry
6421  unit_test_util::SetTaxname(entry, "Arabidopsis thaliana");
      // SetTaxon is called with 0 and then with 3702
6422  unit_test_util::SetTaxon(entry, 0);
6423  unit_test_util::SetTaxon(entry, 3702);
6424  unit_test_util::SetLineage(entry, "Cyanobacteria");
6426 
6428 
6429  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrgModMissingValue",
6430  "Variety value specified is not found in taxname"));
6431  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrganismNotFound",
6432  "Organism not found in taxonomy database (suggested:Arabidopsis thaliana var. foo)"));
6433  // AddChromosomeNoLocation(expected_errors, entry);
6434 
6435  eval = validator.Validate(seh, options);
6436  CheckErrors(*eval, expected_errors);
6437 
      // Forma round: both existing expectations are reused with new messages
6440  expected_errors[0]->SetErrMsg("Forma value specified is not found in taxname");
6441  expected_errors[1]->SetErrMsg("Organism not found in taxonomy database (suggested:Arabidopsis thaliana f. foo)");
6442  eval = validator.Validate(seh, options);
6443  CheckErrors(*eval, expected_errors);
6444 
      // Subspecies round
6447  expected_errors[0]->SetErrMsg("Subspecies value specified is not found in taxname");
6448  expected_errors[1]->SetErrMsg("Organism not found in taxonomy database (suggested:Arabidopsis thaliana subsp. foo)");
6449  eval = validator.Validate(seh, options);
6450  CheckErrors(*eval, expected_errors);
6451 
6452  CLEAR_ERRORS
6453  // this one does not cause taxname lookup to fail
6456  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
6457  "OrgModMissingValue",
6458  "Forma specialis value specified is not found in taxname"));
6459  // AddChromosomeNoLocation(expected_errors, entry);
6460  eval = validator.Validate(seh, options);
6461  CheckErrors(*eval, expected_errors);
6462 
6463  CLEAR_ERRORS
6464 
6465  // some don't produce errors
6468  // AddChromosomeNoLocation(expected_errors, entry);
6469  eval = validator.Validate(seh, options);
6470  CheckErrors(*eval, expected_errors);
6471 
6474  eval = validator.Validate(seh, options);
6475  CheckErrors(*eval, expected_errors);
6476 
      // no CLEAR_ERRORS here: nothing was pushed onto expected_errors after the
      // previous CLEAR_ERRORS
6477 }
6478 
6479 
// Test case: with taxname "Arabidopsis thaliana", taxon 3702 and a source
// feature added by AddGoodSourceFeature, expects eight Warnings on lcl|good
// from one validation pass: five "BadTextInSourceQualifier" (Germline,
// Rearranged, Transgenic, Environmental_sample, Metagenomic), two
// "BioSourceInconsistency" and one "EnvironSampleMissingQualifier".
// NOTE(review): the entry-building and setup statements, and the statements
// that attach the qualifiers with descriptive text, are not visible in this
// listing.
6480 BOOST_AUTO_TEST_CASE(Test_BadTextInSourceQualifier)
6481 {
6482  // descriptive text in non-text qualifiers
6484  unit_test_util::SetTaxname(entry, "Arabidopsis thaliana");
6485  unit_test_util::SetTaxon(entry, 0);
6486  unit_test_util::SetTaxon(entry, 3702);
6492  AddGoodSourceFeature(entry);
6493 
6495 
      // one BadTextInSourceQualifier warning per qualifier
6496  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadTextInSourceQualifier",
6497  "Germline qualifier should not have descriptive text"));
6498  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadTextInSourceQualifier",
6499  "Rearranged qualifier should not have descriptive text"));
6500  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadTextInSourceQualifier",
6501  "Transgenic qualifier should not have descriptive text"));
6502  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadTextInSourceQualifier",
6503  "Environmental_sample qualifier should not have descriptive text"));
6504  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadTextInSourceQualifier",
6505  "Metagenomic qualifier should not have descriptive text"));
      // further warnings expected from the same pass: two qualifier conflicts
      // and the environmental-sample requirement
6506  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BioSourceInconsistency",
6507  "Germline and rearranged should not both be present"));
6508  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BioSourceInconsistency",
6509  "Transgenic and environmental sample should not both be present"));
6510  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "EnvironSampleMissingQualifier",
6511  "Environmental sample should also have isolation source or specific host annotated"));
6512  // AddChromosomeNoLocation(expected_errors, entry);
6513  eval = validator.Validate(seh, options);
6514  CheckErrors(*eval, expected_errors);
6515 
6516  CLEAR_ERRORS
6517 }
6518 
6519 
// Exercises the /sex and /mating_type qualifier checks across lineages.
// Visible sequence:
//   1. Bacteria / Archaea / Fungi lineages expect the warning
//      "Unexpected use of /sex qualifier".
//   2. An empty lineage expects the error "Invalid value (a) for /sex
//      qualifier" together with MissingLineage.
//   3. Each accepted sex value is validated with only MissingLineage expected.
//   4. Metazoa, three plant lineages and Viruses expect InvalidMatingType;
//      for Fungi the same expectation is checked once per accepted sex value.
//   5. A final validation expects no errors.
// NOTE(review): the statements that set the sex / mating-type subsources and
// the setup are on source lines missing from this listing (e.g. 6522, 6524,
// 6526, 6574-6575, 6582, 6585, 6609-6610, 6617-6618) - confirm there.
6520 BOOST_AUTO_TEST_CASE(Test_InvalidSexQualifier)
6521 {
6523  unit_test_util::SetLineage(entry, "Viruses; foo");
6525  unit_test_util::SetLineage(entry, "Bacteria; foo");
6527 
6528  // unexpected sex qualifier
6529  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InvalidSexQualifier",
6530  "Unexpected use of /sex qualifier"));
6531  // AddChromosomeNoLocation(expected_errors, entry);
6532  eval = validator.Validate(seh, options);
6533  CheckErrors(*eval, expected_errors);
6534 
6535  CLEAR_ERRORS
6536 
      // same warning is expected for the Archaea and Fungi lineages
6537  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InvalidSexQualifier",
6538  "Unexpected use of /sex qualifier"));
6539  // AddChromosomeNoLocation(expected_errors, entry);
6540  unit_test_util::SetLineage(entry, "Archaea; foo");
6541  eval = validator.Validate(seh, options);
6542  CheckErrors(*eval, expected_errors);
6543  unit_test_util::SetLineage(entry, "Eukaryota; Fungi; foo");
6544  eval = validator.Validate(seh, options);
6545  CheckErrors(*eval, expected_errors);
      // with an empty lineage the expectation is rewritten in place to an
      // invalid-value error, and MissingLineage is expected as well
6546  unit_test_util::SetLineage(entry, "");
6547  expected_errors[0]->SetErrMsg("Invalid value (a) for /sex qualifier");
6548  expected_errors[0]->SetSeverity(eDiag_Error);
6549  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingLineage",
6550  "No lineage for this BioSource."));
6551  eval = validator.Validate(seh, options);
6552  CheckErrors(*eval, expected_errors);
6553 
6554  CLEAR_ERRORS
6555 
6556  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingLineage",
6557  "No lineage for this BioSource."));
6558  // AddChromosomeNoLocation(expected_errors, entry);
6559 
6560  // no error if acceptable value
6561  vector<string> ok_sex_vals;
6562  ok_sex_vals.push_back("female");
6563  ok_sex_vals.push_back("male");
6564  ok_sex_vals.push_back("hermaphrodite");
6565  ok_sex_vals.push_back("unisexual");
6566  ok_sex_vals.push_back("bisexual");
6567  ok_sex_vals.push_back("asexual");
6568  ok_sex_vals.push_back("monoecious");
6569  ok_sex_vals.push_back("monecious");
6570  ok_sex_vals.push_back("dioecious");
6571  ok_sex_vals.push_back("diecious");
6572 
      // NOTE(review): the loop statements that apply `it` to the entry are on
      // source lines not shown in this listing (6574-6575).
6573  for (const string& it : ok_sex_vals) {
6576  eval = validator.Validate(seh, options);
6577  CheckErrors(*eval, expected_errors);
6578  }
6579 
6580  CLEAR_ERRORS
6581 
6583  // mating-type error for animal
6584  unit_test_util::SetLineage(entry, "Eukaryota; Metazoa; foo");
6586  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InvalidMatingType",
6587  "Unexpected use of /mating_type qualifier"));
6588  // AddChromosomeNoLocation(expected_errors, entry);
6589  eval = validator.Validate(seh, options);
6590  CheckErrors(*eval, expected_errors);
6591  // mating-type error for 3 plant lineages
6592  unit_test_util::SetLineage(entry, "Eukaryota; Viridiplantae; Streptophyta; Embryophyta; foo");
6593  eval = validator.Validate(seh, options);
6594  CheckErrors(*eval, expected_errors);
6595  unit_test_util::SetLineage(entry, "Eukaryota; Rhodophyta; foo");
6596  eval = validator.Validate(seh, options);
6597  CheckErrors(*eval, expected_errors);
6598  unit_test_util::SetLineage(entry, "Eukaryota; stramenopiles; Phaeophyceae; foo");
6599  eval = validator.Validate(seh, options);
6600  CheckErrors(*eval, expected_errors);
6601  // mating-type error for virus
6602  unit_test_util::SetLineage(entry, "Viruses; foo");
6603  eval = validator.Validate(seh, options);
6604  CheckErrors(*eval, expected_errors);
6605  // for other lineages, error if sex value
6606  unit_test_util::SetLineage(entry, "Eukaryota; Fungi; foo");
6607 
      // NOTE(review): the loop statements that apply `it` to the entry are on
      // source lines not shown in this listing (6609-6610).
6608  for (const string& it : ok_sex_vals) {
6611  eval = validator.Validate(seh, options);
6612  CheckErrors(*eval, expected_errors);
6613  }
6614  CLEAR_ERRORS
6615 
6616  // no error if not valid sex value
6619  // AddChromosomeNoLocation(expected_errors, entry);
6620 
6621  eval = validator.Validate(seh, options);
6622  CheckErrors(*eval, expected_errors);
6623 
6624 }
6625 
6626 
// Checks the InconsistentVirusMoltype diagnostics for an entry whose taxname
// is "Human immunodeficiency virus" (taxon 12721):
//   - warning "HIV with moltype DNA should be proviral",
//   - then, after the instance molecule is switched to RNA, info
//     "HIV with mRNA molecule type is rare".
// NOTE(review): setup and some mutation statements are on source lines absent
// from this listing (6630, 6637, 6640, 6651) - confirm against the original.
6627 BOOST_AUTO_TEST_CASE(Test_HIVMolType)
6628 {
6629  // prepare entry
6631  unit_test_util::SetTaxname(entry, "Human immunodeficiency virus");
      // SetTaxon(0) followed by the real id; presumably the first call clears
      // the existing taxon reference - TODO confirm.
6632  unit_test_util::SetTaxon(entry, 0);
6633  unit_test_util::SetTaxon(entry, 12721);
6634  unit_test_util::SetLineage(entry, "Cyanobacteria");
6635  // unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_variety, "foo");
6636 
6638 
6639  // HIV location problems
6641  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentVirusMoltype",
6642  "HIV with moltype DNA should be proviral"));
6643  eval = validator.Validate(seh, options);
6644  CheckErrors(*eval, expected_errors);
6645 
6646  CLEAR_ERRORS
6647 
      // RNA molecule: only an informational message is expected
6648  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "InconsistentVirusMoltype",
6649  "HIV with mRNA molecule type is rare"));
6650  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
6652  eval = validator.Validate(seh, options);
6653  CheckErrors(*eval, expected_errors);
6654 
6655  CLEAR_ERRORS
6656 
6657 }
6658 
// Checks the plasmid-related BioSource diagnostics in three steps:
//   1. warning MissingPlasmidLocation ("Plasmid subsource but not plasmid
//      location"),
//   2. no errors expected once the genome is plasmid (per the comment below),
//   3. warning MissingPlasmidName when the plasmid name is missing.
// NOTE(review): setup and the statements that change subsource / genome are
// on source lines absent from this listing (6661, 6666, 6668, 6678, 6684).
6659 BOOST_AUTO_TEST_CASE(Test_MissingPlasmid)
6660 {
6662  unit_test_util::SetTaxname(entry, "Arabidopsis thaliana");
      // SetTaxon(0) followed by the real id; presumably the first call clears
      // the existing taxon reference - TODO confirm.
6663  unit_test_util::SetTaxon(entry, 0);
6664  unit_test_util::SetTaxon(entry, 3702);
6665  unit_test_util::SetLineage(entry, "Cyanobacteria");
6667 
6669 
6670  // plasmid
6671  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingPlasmidLocation",
6672  "Plasmid subsource but not plasmid location"));
6673  eval = validator.Validate(seh, options);
6674  CheckErrors(*eval, expected_errors);
6675  // error goes away if plasmid genome
6676  CLEAR_ERRORS
6677 
6679  // AddChromosomeNoLocation(expected_errors, entry);
6680  eval = validator.Validate(seh, options);
6681  CheckErrors(*eval, expected_errors);
6682 
6683  // if plasmid genome, better have plasmid name
6685  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingPlasmidName",
6686  "Plasmid location set but plasmid name missing. Add a plasmid source modifier with the plasmid name. Use unnamed if the name is not known."));
6687  eval = validator.Validate(seh, options);
6688  CheckErrors(*eval, expected_errors);
6689  CLEAR_ERRORS
6690 }
6691 
// Checks the BadPlastidName warning. For every entry of plastid_vals the
// expected message is rewritten to
// "Plastid name subsource <val> but not <val> location" and the entry is
// validated; afterwards the message
// "Plastid name subsource contains unrecognized value" is expected.
// NOTE(review): setup and the statements that set the plastid-name subsource
// are on source lines absent from this listing (6695, 6700, 6702, 6714,
// 6719-6720, 6726-6727) - confirm against the original file.
6692 BOOST_AUTO_TEST_CASE(Test_BadPlastidName)
6693 {
6694  // prepare entry
6696  unit_test_util::SetTaxname(entry, "Arabidopsis thaliana");
      // SetTaxon(0) followed by the real id; presumably the first call clears
      // the existing taxon reference - TODO confirm.
6697  unit_test_util::SetTaxon(entry, 0);
6698  unit_test_util::SetTaxon(entry, 3702);
6699  unit_test_util::SetLineage(entry, "Cyanobacteria");
6701 
6703 
6704  // plastid-name
6705  vector<string> plastid_vals;
6706  plastid_vals.push_back("chloroplast");
6707  plastid_vals.push_back("chromoplast");
6708  plastid_vals.push_back("kinetoplast");
6709  plastid_vals.push_back("plastid");
6710  plastid_vals.push_back("apicoplast");
6711  plastid_vals.push_back("leucoplast");
6712  plastid_vals.push_back("proplastid");
6713 
      // a single expectation is created once and its message is updated for
      // each plastid value inside the loop
6715  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPlastidName",
6716  "Plastid name subsource chloroplast but not chloroplast location"));
6717  // AddChromosomeNoLocation(expected_errors, entry);
6718  for (const string& it : plastid_vals) {
6721  expected_errors[0]->SetErrMsg("Plastid name subsource " + it + " but not " + it + " location");
6722  eval = validator.Validate(seh, options);
6723  CheckErrors(*eval, expected_errors);
6724  }
6725 
      // unrecognized plastid name: same error code, different message
6728  expected_errors[0]->SetErrMsg("Plastid name subsource contains unrecognized value");
6729  eval = validator.Validate(seh, options);
6730  CheckErrors(*eval, expected_errors);
6731 
6732  CLEAR_ERRORS
6733 }
6734 
// Checks BadBioSourceFrequencyValue: first an Info-level
// "bad frequency qualifier value 1", then the same expectation raised to
// Warning with the message "bad frequency qualifier value abc".
// NOTE(review): setup and the statements that set the frequency subsource are
// on source lines absent from this listing (6738, 6743, 6745, 6751-6752,
// 6757) - confirm against the original file.
6735 BOOST_AUTO_TEST_CASE(Test_BadBioSourceFrequencyValue)
6736 {
6737  // prepare entry
6739  unit_test_util::SetTaxname(entry, "Arabidopsis thaliana");
      // SetTaxon(0) followed by the real id; presumably the first call clears
      // the existing taxon reference - TODO confirm.
6740  unit_test_util::SetTaxon(entry, 0);
6741  unit_test_util::SetTaxon(entry, 3702);
6742  unit_test_util::SetLineage(entry, "Cyanobacteria");
6744 
6746  // frequency
6747  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadBioSourceFrequencyValue",
6748  "bad frequency qualifier value 1"));
6749  eval = validator.Validate(seh, options);
6750  CheckErrors(*eval, expected_errors);
      // reuse the same expectation with a higher severity and a new message
6753  expected_errors[0]->SetSeverity(eDiag_Warning);
6754  expected_errors[0]->SetErrMsg("bad frequency qualifier value abc");
6755  eval = validator.Validate(seh, options);
6756  CheckErrors(*eval, expected_errors);
6758 
6759  CLEAR_ERRORS
6760 }
6761 
6762 
// Walks through a long series of BioSource consistency checks on one entry.
// Each section mutates the entry, registers the diagnostics it expects,
// validates, compares with CheckErrors and then resets with CLEAR_ERRORS.
// Sections visible here, in order:
//   - unexpected qualifiers on a virus lineage (sex, cell-line, cell-type,
//     tissue-type, dev-stage, breed, cultivar), germline + rearranged,
//   - transgenic + environmental sample,
//   - metagenomic without environmental sample,
//   - sex + mating type together,
//   - "metagenomes" in lineage without the metagenomic qualifier,
//   - uncultured organism without /environmental_sample,
//   - environmental sample without isolation source / host,
//   - organelle location with BCT / VRL division,
//   - ENV division without environmental sample subsource,
//   - strain in an environmental sample,
//   - metagenome source without metagenomic qualifier,
//   - synonym identical to gb_synonym,
//   - cRNA note versus molecule type, viral lineage with no DNA stage,
//   - bacterial source with a BioSample DBLink but no strain/isolate/
//     environmental sample.
// NOTE(review): many statements that add or remove the qualifiers under test
// are on source lines absent from this listing (the embedded numbers skip) -
// the order of the visible statements matters, so confirm any change against
// the original file.
6763 BOOST_AUTO_TEST_CASE(Test_Descr_BioSourceInconsistency)
6764 {
6765  // prepare entry
6767  unit_test_util::SetTaxname(entry, "Arabidopsis thaliana");
      // SetTaxon(0) followed by the real id; presumably the first call clears
      // the existing taxon reference - TODO confirm.
6768  unit_test_util::SetTaxon(entry, 0);
6769  unit_test_util::SetTaxon(entry, 3702);
6770  unit_test_util::SetLineage(entry, "Cyanobacteria");
6771 // unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_variety, "foo");
6772 
6774 
6775 
6776  // unexpected qualifiers for viruses
6777  unit_test_util::SetLineage(entry, "Viruses; foo");
6780  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InvalidSexQualifier",
6781  "Virus has unexpected Sex qualifier"));
6782  eval = validator.Validate(seh, options);
6783  CheckErrors(*eval, expected_errors);
      // the single expectation is re-targeted (code and/or message) for each
      // following qualifier instead of being recreated
6786  expected_errors[0]->SetErrCode("BioSourceInconsistency");
6787  expected_errors[0]->SetErrMsg("Virus has unexpected Cell-line qualifier");
6788  eval = validator.Validate(seh, options);
6789  CheckErrors(*eval, expected_errors);
6792  expected_errors[0]->SetErrMsg("Virus has unexpected Cell-type qualifier");
6793  eval = validator.Validate(seh, options);
6794  CheckErrors(*eval, expected_errors);
6797  expected_errors[0]->SetErrCode("InvalidTissueType");
6798  expected_errors[0]->SetErrMsg("Virus has unexpected Tissue-type qualifier");
6799  eval = validator.Validate(seh, options);
6800  CheckErrors(*eval, expected_errors);
6803  expected_errors[0]->SetErrCode("BioSourceInconsistency");
6804  expected_errors[0]->SetErrMsg("Virus has unexpected Dev-stage qualifier");
6805  eval = validator.Validate(seh, options);
6806  CheckErrors(*eval, expected_errors);
6809  expected_errors[0]->SetErrMsg("Virus has unexpected Breed qualifier");
6810  eval = validator.Validate(seh, options);
6811  CheckErrors(*eval, expected_errors);
6814  expected_errors[0]->SetErrMsg("Virus has unexpected Cultivar qualifier");
6815  eval = validator.Validate(seh, options);
6816  CheckErrors(*eval, expected_errors);
6818 
6821  expected_errors[0]->SetErrMsg("Germline and rearranged should not both be present");
6822  eval = validator.Validate(seh, options);
6823  CheckErrors(*eval, expected_errors);
6826 
6827  CLEAR_ERRORS
6828 
      // the entry is taken out of the scope while it is modified and is
      // re-added afterwards to obtain a fresh handle
6829  scope.RemoveTopLevelSeqEntry(seh);
6833  unit_test_util::SetFocus(entry);
6835  seh = scope.AddTopLevelSeqEntry(*entry);
6836  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BioSourceInconsistency",
6837  "Transgenic and environmental sample should not both be present"));
6838  // AddChromosomeNoLocation(expected_errors, entry);
6839 
6840  eval = validator.Validate(seh, options);
6841  CheckErrors(*eval, expected_errors);
6842 
6843  CLEAR_ERRORS
6844 
      // metagenomic without environmental sample is reported as Critical
6849  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "MissingEnvironmentalSample",
6850  "Metagenomic should also have environmental sample annotated"));
6851  eval = validator.Validate(seh, options);
6852  CheckErrors(*eval, expected_errors);
6853 
6854  CLEAR_ERRORS
6855 
6857  unit_test_util::SetLineage(entry, "Eukaryota; foo");
6860  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BioSourceInconsistency",
6861  "Sex and mating type should not both be present"));
6862  eval = validator.Validate(seh, options);
6863  CheckErrors(*eval, expected_errors);
6864 
6865  CLEAR_ERRORS
6866 
6869  unit_test_util::SetLineage(entry, "Eukaryota; metagenomes");
6870  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingMetagenomicQualifier",
6871  "If metagenomes appears in lineage, BioSource should have metagenomic qualifier"));
6872  eval = validator.Validate(seh, options);
6873  CheckErrors(*eval, expected_errors);
6874  CLEAR_ERRORS
6875 
6876 
      // uncultured organism without /environmental_sample
6877  unit_test_util::SetTaxname(entry, "uncultured bacterium");
6878  unit_test_util::SetLineage(entry, "Bacteria; foo");
6879  unit_test_util::SetTaxon(entry, 0);
6880  unit_test_util::SetTaxon(entry, 77133);
6881  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnculturedNeedsEnvSample",
6882  "Uncultured should also have /environmental_sample"));
6883  eval = validator.Validate(seh, options);
6884  CheckErrors(*eval, expected_errors);
6885 
6886  CLEAR_ERRORS
6887 
      // start over from a freshly built entry
6888  scope.RemoveTopLevelSeqEntry(seh);
6889  entry = unit_test_util::BuildGoodSeq();
6891  seh = scope.AddTopLevelSeqEntry(*entry);
6892  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
6893  "EnvironSampleMissingQualifier",
6894  "Environmental sample should also have isolation source or specific host annotated"));
6895  // AddChromosomeNoLocation(expected_errors, entry);
6896  eval = validator.Validate(seh, options);
6897  CheckErrors(*eval, expected_errors);
6898 
6899  CLEAR_ERRORS
6900 
      // organelle location is rejected for both the BCT and VRL divisions
6902  unit_test_util::SetDiv(entry, "BCT");
6904  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
6905  "BadOrganelleLocation",
6906  "Bacterial or viral source should not have organelle location"));
6907  eval = validator.Validate(seh, options);
6908  CheckErrors(*eval, expected_errors);
6909  unit_test_util::SetDiv(entry, "VRL");
6910  eval = validator.Validate(seh, options);
6911  CheckErrors(*eval, expected_errors);
6912 
6913  CLEAR_ERRORS
6914 
6915  unit_test_util::SetDiv(entry, "ENV");
6917  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
6918  "MissingEnvironmentalSample",
6919  "BioSource with ENV division is missing environmental sample subsource"));
6920  eval = validator.Validate(seh, options);
6921  CheckErrors(*eval, expected_errors);
6922 
6923  CLEAR_ERRORS
6924 
6925  unit_test_util::SetDiv(entry, "");
6929  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
6930  "StrainWithEnvironSample",
6931  "Strain should not be present in an environmental sample"));
6932  eval = validator.Validate(seh, options);
6933  CheckErrors(*eval, expected_errors);
6934 
6935  CLEAR_ERRORS
6936 
6941  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
6942  "MissingMetagenomicQualifier",
6943  "Metagenome source should also have metagenomic qualifier"));
6944  eval = validator.Validate(seh, options);
6945  CheckErrors(*eval, expected_errors);
6946 
6947  CLEAR_ERRORS
6948 
6950  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_synonym, "synonym value");
6952  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
6953  "OrgModValueInvalid",
6954  "OrgMod synonym is identical to OrgMod gb_synonym"));
6955  eval = validator.Validate(seh, options);
6956  CheckErrors(*eval, expected_errors);
6957 
6958  CLEAR_ERRORS
6959 
      // cRNA note: "conflicts" is expected first, "redundant" after the
      // instance molecule is switched to RNA
6963  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
6964  "InconsistentVirusMoltype",
6965  "cRNA note conflicts with molecule type"));
6966  eval = validator.Validate(seh, options);
6967  CheckErrors(*eval, expected_errors);
6968 
6970  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
6971  expected_errors[0]->SetErrMsg("cRNA note redundant with molecule type");
6972  eval = validator.Validate(seh, options);
6973  CheckErrors(*eval, expected_errors);
6974 
6977  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
6978  unit_test_util::SetLineage(entry, "Viruses; no DNA stage");
6979  expected_errors[0]->SetErrMsg("Genomic DNA viral lineage indicates no DNA stage");
6980  eval = validator.Validate(seh, options);
6981  CheckErrors(*eval, expected_errors);
6982 
6983  unit_test_util::SetLineage(entry, "Bacteria; foo");
6985  expected_errors[0]->SetErrMsg("cRNA note conflicts with molecule type");
6986  eval = validator.Validate(seh, options);
6987  CheckErrors(*eval, expected_errors);
6988 
6990  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
6991  expected_errors[0]->SetErrMsg("cRNA note redundant with molecule type");
6992  eval = validator.Validate(seh, options);
6993  CheckErrors(*eval, expected_errors);
6994 
6995  CLEAR_ERRORS
6996 
6997  scope.RemoveTopLevelSeqEntry(seh);
6998  entry = unit_test_util::BuildGoodSeq();
6999  seh = scope.AddTopLevelSeqEntry(*entry);
7000 
7001  // report missing env_sample/strain/isolate if bacterial and biosample
7002  unit_test_util::SetLineage(entry, "Bacteria; foo");
      // build a DBLink user-object descriptor carrying a BioSample field; the
      // value "PRJNA12345" is also expected to trigger DBLinkBadBioSample below
      // NOTE(review): the declaration of `f` is on a source line (7005) that
      // is absent from this listing.
7003  CRef<CSeqdesc> biosample(new CSeqdesc());
7004  biosample->SetUser().SetType().SetStr("DBLink");
7006  f->SetLabel().SetStr("BioSample");
7007  f->SetData().SetStr("PRJNA12345");
7008  biosample->SetUser().SetData().push_back(f);
7009  entry->SetSeq().SetDescr().Set().push_back(biosample);
7010  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BacteriaMissingSourceQualifier",
7011  "Bacteria should have strain or isolate or environmental sample"));
7012  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "DBLinkBadBioSample",
7013  "Bad BioSample format - PRJNA12345"));
7014  // AddChromosomeNoLocation(expected_errors, entry);
7015  eval = validator.Validate(seh, options);
7016  CheckErrors(*eval, expected_errors);
7017 
7018  CLEAR_ERRORS
7019 
7020  // no error if strain, isolate, or environmental sample set
      // (the DBLinkBadBioSample error is still expected in each case)
7021  scope.RemoveTopLevelSeqEntry(seh);
7023  seh = scope.AddTopLevelSeqEntry(*entry);
7024  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "DBLinkBadBioSample",
7025  "Bad BioSample format - PRJNA12345"));
7026  // AddChromosomeNoLocation(expected_errors, entry);
7027  eval = validator.Validate(seh, options);
7028  CheckErrors(*eval, expected_errors);
7029 
7030  scope.RemoveTopLevelSeqEntry(seh);
7033  seh = scope.AddTopLevelSeqEntry(*entry);
7034  eval = validator.Validate(seh, options);
7035  CheckErrors(*eval, expected_errors);
7036 
7037  scope.RemoveTopLevelSeqEntry(seh);
7040  seh = scope.AddTopLevelSeqEntry(*entry);
7041  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "EnvironSampleMissingQualifier",
7042  "Environmental sample should also have isolation source or specific host annotated"));
7043  eval = validator.Validate(seh, options);
7044  CheckErrors(*eval, expected_errors);
7045 
7046  CLEAR_ERRORS
7047 }
7048 
7049 
// NOTE(review): the BOOST_AUTO_TEST_CASE(...) header of this test (source line
// 7050) and its setup lines (7052, 7054-7056) are absent from this listing, so
// the test name is not visible here - confirm against the original file.
// Visible behavior: with a "Bacteria; foo" lineage the validator is expected
// to report the error InvalidTissueType
// ("Tissue-type is inappropriate for bacteria") on "lcl|good".
7051 {
7053  unit_test_util::SetLineage(entry, "Bacteria; foo");
7057 
7058  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidTissueType",
7059  "Tissue-type is inappropriate for bacteria"));
7060  // AddChromosomeNoLocation(expected_errors, entry);
7061  eval = validator.Validate(seh, options);
7062  CheckErrors(*eval, expected_errors);
7063 
7064  CLEAR_ERRORS
7065 }
7066 
// Checks InconsistentVirusMoltype for negative-strand virus lineages.
// First half: an entry addressed as "lcl|nuc" (accessed through
// entry->SetSet(), i.e. a set whose first member is the nucleotide) with a
// CDS; second half: a single sequence "lcl|good" with a misc_feature whose
// comment is "nonfunctional".
// For each half the visible sequence is: warning expected for the plus-strand
// feature, warning still expected after the molecule is switched to RNA, no
// errors for the listed lineages, and - after RevComp - no error at first and
// then the minus-strand diagnostics once the molecule is RNA again.
// NOTE(review): the setup and the statements that change the biomol / origin
// are on source lines absent from this listing (e.g. 7069, 7071, 7085, 7096,
// 7099, 7113-7114, 7161, 7177, 7186) - confirm against the original file.
7067 BOOST_AUTO_TEST_CASE(Test_InconsistentVirusMoltype)
7068 {
7070 
7072 
7073  unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses");
7074  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InconsistentVirusMoltype",
7075  "Negative-sense single-stranded RNA virus with plus strand CDS should be cRNA"));
7076  expected_errors[0]->SetAccession("lcl|nuc");
7077  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "MolInfoConflictsWithBioSource",
7078  "Taxonomy indicates single-stranded RNA, molecule type (DNA) is conflicting."));
7079  expected_errors[1]->SetAccession("lcl|nuc");
7080  eval = validator.Validate(seh, options);
7081  CheckErrors(*eval, expected_errors);
7082  CLEAR_ERRORS
7083 
7084  // error remains if mRNA
7086  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InconsistentVirusMoltype",
7087  "Negative-sense single-stranded RNA virus with plus strand CDS should be cRNA"));
7088  expected_errors[0]->SetAccession("lcl|nuc");
7089  entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
7090  eval = validator.Validate(seh, options);
7091  // AddChromosomeNoLocation(expected_errors, entry);
7092  CheckErrors(*eval, expected_errors);
7093  // error goes away if cRNA or ambisense or synthetic
7094  CLEAR_ERRORS
7095 
7097  eval = validator.Validate(seh, options);
7098  CheckErrors(*eval, expected_errors);
7100  unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Arenaviridae");
7101  eval = validator.Validate(seh, options);
7102  CheckErrors(*eval, expected_errors);
7103  unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Phlebovirus");
7104  eval = validator.Validate(seh, options);
7105  CheckErrors(*eval, expected_errors);
7106  unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Tospovirus");
7107  eval = validator.Validate(seh, options);
7108  CheckErrors(*eval, expected_errors);
7109  unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Tenuivirus");
7110  eval = validator.Validate(seh, options);
7111  CheckErrors(*eval, expected_errors);
7112  unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses");
7115  eval = validator.Validate(seh, options);
7116  CheckErrors(*eval, expected_errors);
7118  unit_test_util::SetDiv(entry, "VRL");
7120  eval = validator.Validate(seh, options);
7121  CheckErrors(*eval, expected_errors);
7125  eval = validator.Validate(seh, options);
7126  CheckErrors(*eval, expected_errors);
7127 
7131  eval = validator.Validate(seh, options);
7132  CheckErrors(*eval, expected_errors);
7135 
      // reverse-complement the entry while it is out of the scope, then re-add
7136  scope.RemoveTopLevelSeqEntry(seh);
7137  unit_test_util::RevComp(entry);
7138  seh = scope.AddTopLevelSeqEntry(*entry);
7139  // still no error if genomic
7140  eval = validator.Validate(seh, options);
7141  CheckErrors(*eval, expected_errors);
7142 
7143  CLEAR_ERRORS
7144 
7145  // error if not genomic
7147  entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
7148  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "CDSonMinusStrandMRNA",
7149  "CDS should not be on minus strand of mRNA molecule"));
7150  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InconsistentVirusMoltype",
7151  "Negative-sense single-stranded RNA virus with minus strand CDS should be genomic RNA"));
7152  // AddChromosomeNoLocation(expected_errors, entry);
7153  eval = validator.Validate(seh, options);
7154  CheckErrors(*eval, expected_errors);
7155 
7156  CLEAR_ERRORS
7157 
      // second half: same checks on a single sequence with a misc_feature
      // NOTE(review): the declaration of `misc_feat` is on a source line
      // (7161) that is absent from this listing.
7158  scope.RemoveTopLevelSeqEntry(seh);
7159  entry = unit_test_util::BuildGoodSeq();
7160  unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses");
7162  misc_feat->SetComment("nonfunctional");
7163  seh = scope.AddTopLevelSeqEntry(*entry);
7164  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentVirusMoltype",
7165  "Negative-sense single-stranded RNA virus with nonfunctional plus strand misc_feature should be cRNA"));
7166  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MolInfoConflictsWithBioSource",
7167  "Taxonomy indicates single-stranded RNA, molecule type (DNA) is conflicting."));
7168  // AddChromosomeNoLocation(expected_errors, entry);
7169  eval = validator.Validate(seh, options);
7170  CheckErrors(*eval, expected_errors);
7171 
7172  // error stays if mRNA
7173  CLEAR_ERRORS
7174 
7175  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentVirusMoltype",
7176  "Negative-sense single-stranded RNA virus with nonfunctional plus strand misc_feature should be cRNA"));
7178  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
7179  // AddChromosomeNoLocation(expected_errors, entry);
7180  eval = validator.Validate(seh, options);
7181  CheckErrors(*eval, expected_errors);
7182 
7183  // error goes away if cRNA or ambisense or synthetic
7184  CLEAR_ERRORS
7185 
7187  eval = validator.Validate(seh, options);
7188  CheckErrors(*eval, expected_errors);
7190  unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Arenaviridae");
7191  eval = validator.Validate(seh, options);
7192  CheckErrors(*eval, expected_errors);
7193  unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Phlebovirus");
7194  eval = validator.Validate(seh, options);
7195  CheckErrors(*eval, expected_errors);
7196  unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Tospovirus");
7197  eval = validator.Validate(seh, options);
7198  CheckErrors(*eval, expected_errors);
7199  unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Tenuivirus");
7200  eval = validator.Validate(seh, options);
7201  CheckErrors(*eval, expected_errors);
7202 
7203  scope.RemoveTopLevelSeqEntry(seh);
7204  unit_test_util::RevComp(entry);
7205  seh = scope.AddTopLevelSeqEntry(*entry);
7206  // still no error if genomic
7207  eval = validator.Validate(seh, options);
7208  CheckErrors(*eval, expected_errors);
7209 
7210  // error if not genomic
      // (lineage is still the Tenuivirus one set above, hence the ambisense
      // message expected here)
7212  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
7213  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentVirusMoltype",
7214  "Ambisense virus should be genomic RNA or cRNA"));
7215  eval = validator.Validate(seh, options);
7216  CheckErrors(*eval, expected_errors);
7217 
7218  CLEAR_ERRORS
7219 
7221  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
7222  unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses");
7223  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentVirusMoltype",
7224  "Negative-sense single-stranded RNA virus with nonfunctional minus strand misc_feature should be genomic RNA"));
7225  eval = validator.Validate(seh, options);
7226  CheckErrors(*eval, expected_errors);
7227 
7228  CLEAR_ERRORS
7229 }
7230 
7231 
// Checks InconsistentVirusMoltype for ssRNA positive-strand virus lineages on
// the nucleotide "lcl|nuc" (first member of the set, molecule set to RNA):
//   - the warning "Positive-sense single-stranded RNA virus should be genomic
//     RNA" is expected first,
//   - no errors are expected for the Arenaviridae / Phlebovirus / Tospovirus /
//     Tenuivirus sub-lineages,
//   - the later sections expect InvalidForType and
//     SyntheticConstructWrongMolType warnings instead.
// NOTE(review): setup and the statements that change biomol / origin / taxname
// are on source lines absent from this listing (7234, 7236, 7239, 7264, 7271,
// 7275-7276) - confirm against the original file.
7232 BOOST_AUTO_TEST_CASE(Test_SingleStrandViruses)
7233 {
7235  unit_test_util::SetLineage(entry, "Viruses; ssRNA positive-strand viruses");
7237  entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
7238 
7240 
7241  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InconsistentVirusMoltype",
7242  "Positive-sense single-stranded RNA virus should be genomic RNA"));
7243  // AddChromosomeNoLocation(expected_errors, entry);
7244  eval = validator.Validate(seh, options);
7245  CheckErrors(*eval, expected_errors);
7246 
7247  // error goes away if ambisense or synthetic
7248  CLEAR_ERRORS
7249 
7250  // AddChromosomeNoLocation(expected_errors, entry);
7251  unit_test_util::SetLineage(entry, "Viruses; ssRNA positive-strand viruses; Arenaviridae");
7252  eval = validator.Validate(seh, options);
7253  CheckErrors(*eval, expected_errors);
7254  unit_test_util::SetLineage(entry, "Viruses; ssRNA positive-strand viruses; Phlebovirus");
7255  eval = validator.Validate(seh, options);
7256  CheckErrors(*eval, expected_errors);
7257  unit_test_util::SetLineage(entry, "Viruses; ssRNA positive-strand viruses; Tospovirus");
7258  eval = validator.Validate(seh, options);
7259  CheckErrors(*eval, expected_errors);
7260  unit_test_util::SetLineage(entry, "Viruses; ssRNA positive-strand viruses; Tenuivirus");
7261  eval = validator.Validate(seh, options);
7262  CheckErrors(*eval, expected_errors);
7263  unit_test_util::SetLineage(entry, "Viruses; ssRNA positive-strand viruses");
      // the expectation is registered after Validate() but before
      // CheckErrors(), which is where the comparison happens
7265  eval = validator.Validate(seh, options);
7266  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InvalidForType",
7267  "Molinfo-biomol other should be used if Biosource-location is synthetic"));
7268  CheckErrors(*eval, expected_errors);
7269  CLEAR_ERRORS
7270  unit_test_util::SetDiv(entry, "VRL");
7272  // AddChromosomeNoLocation(expected_errors, entry);
7273  eval = validator.Validate(seh, options);
7274  CheckErrors(*eval, expected_errors);
7277  eval = validator.Validate(seh, options);
7278  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InvalidForType",
7279  "artificial origin should have other-genetic"));
7280  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "SyntheticConstructWrongMolType",
7281  "synthetic construct should have other-genetic"));
7282  CheckErrors(*eval, expected_errors);
7283 
7284  CLEAR_ERRORS
7285 }
7286 
7287 
// Checks the FastaBracketTitle warning for a title of "[a=b]", and that no
// errors are expected once the sequence also carries a general Seq-id whose
// database is "TMSMART" or "BankIt".
// NOTE(review): the setup lines (7291, 7294) are absent from this listing -
// confirm against the original file.
7288 BOOST_AUTO_TEST_CASE(Test_Descr_FastaBracketTitle)
7289 {
7290  // prepare entry
7292  SetTitle(entry, "[a=b]");
7293 
7295 
7296  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FastaBracketTitle",
7297  "Title may have unparsed [...=...] construct"));
7298  // AddChromosomeNoLocation(expected_errors, entry);
7299 
7300  eval = validator.Validate(seh, options);
7301  CheckErrors(*eval, expected_errors);
7302 
7303  CLEAR_ERRORS
7304 
7305  // no error if TMSMART or BankIt
      // the extra id is appended while the entry is out of the scope
7306  scope.RemoveTopLevelSeqEntry(seh);
7307  CRef<CSeq_id> other(new CSeq_id());
7308  other->SetGeneral().SetDb("TMSMART");
7309  other->SetGeneral().SetTag().SetStr("good");
7310  entry->SetSeq().SetId().push_back(other);
7311  seh = scope.AddTopLevelSeqEntry(*entry);
7312  // AddChromosomeNoLocation(expected_errors, entry);
7313  eval = validator.Validate(seh, options);
7314  CheckErrors(*eval, expected_errors);
7315  CLEAR_ERRORS
7316 
      // same id object, database switched to BankIt
7317  scope.RemoveTopLevelSeqEntry(seh);
7318  other->SetGeneral().SetDb("BankIt");
7319  seh = scope.AddTopLevelSeqEntry(*entry);
7320  // AddChromosomeNoLocation(expected_errors, entry);
7321  eval = validator.Validate(seh, options);
7322  CheckErrors(*eval, expected_errors);
7323 
7324  CLEAR_ERRORS
7325 }
7326 
7327 
// Checks the "needs text" errors for empty text descriptors. One descriptor
// object is reused: it is set in turn to an empty comment, title, name and
// region, and the matching error (CommentMissingText, TitleMissingText,
// MissingText, RegionMissingText) is expected on "lcl|good" each time.
// NOTE(review): the setup lines (7331, 7336) are absent from this listing -
// confirm against the original file.
7328 BOOST_AUTO_TEST_CASE(Test_Descr_MissingText)
7329 {
7330  // prepare entry
7332  CRef<CSeqdesc> desc(new CSeqdesc());
7333  desc->SetComment();
7334  entry->SetSeq().SetDescr().Set().push_back(desc);
7335 
7337 
7338  // comment
7339  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "CommentMissingText",
7340  "Comment descriptor needs text"));
7341  // AddChromosomeNoLocation(expected_errors, entry);
7342 
7343  eval = validator.Validate(seh, options);
7344  CheckErrors(*eval, expected_errors);
7345  CLEAR_ERRORS
7346 
7347  // title
      // the descriptor is switched to another variant while the entry is out
      // of the scope; the same pattern repeats for name and region below
7348  scope.RemoveTopLevelSeqEntry(seh);
7349  desc->SetTitle();
7350  seh = scope.AddTopLevelSeqEntry(*entry);
7351  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
7352  "TitleMissingText", "Title descriptor needs text"));
7353  // AddChromosomeNoLocation(expected_errors, entry);
7354  eval = validator.Validate(seh, options);
7355  CheckErrors(*eval, expected_errors);
7356  CLEAR_ERRORS
7357 
7358  // name
7359  scope.RemoveTopLevelSeqEntry(seh);
7360  desc->SetName();
7361  seh = scope.AddTopLevelSeqEntry(*entry);
7362  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
7363  "MissingText", "Name descriptor needs text"));
7364  // AddChromosomeNoLocation(expected_errors, entry);
7365  eval = validator.Validate(seh, options);
7366  CheckErrors(*eval, expected_errors);
7367  CLEAR_ERRORS
7368 
7369  // region
7370  scope.RemoveTopLevelSeqEntry(seh);
7371  desc->SetRegion();
7372  seh = scope.AddTopLevelSeqEntry(*entry);
7373  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "RegionMissingText",
7374  "Region descriptor needs text"));
7375  // AddChromosomeNoLocation(expected_errors, entry);
7376  eval = validator.Validate(seh, options);
7377  CheckErrors(*eval, expected_errors);
7378 
7379  CLEAR_ERRORS
7380 }
7381 
7382 
// Checks the BadCollectionDate warning for collection_date subsources:
//   - badly formatted values (including a range written with '-' instead of
//     '/') expect "Collection_date format is not in DD-Mmm-YYYY format",
//   - future dates and ranges expect "Collection_date is in the future",
//   - ISO dates and well-formed ranges expect no errors.
// It finishes with direct calls to CSubSource::IsCorrectDateFormat for
// "29-Feb-2012" and "2014-06", asserting that neither is flagged as badly
// formatted or in the future.
// NOTE(review): setup and several SetSubSource calls are on source lines
// absent from this listing (e.g. 7386-7387, 7389, 7400-7401, 7411-7412,
// 7426-7427, 7433-7434, 7437-7438) - confirm against the original file.
// NOTE(review): the literal "20-Oct-2030" is only "in the future" until that
// date passes.
7383 BOOST_AUTO_TEST_CASE(Test_Descr_BadCollectionDate)
7384 {
7385  // prepare entry
7388 
7390 
7391  // bad format
7392  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadCollectionDate",
7393  "Collection_date format is not in DD-Mmm-YYYY format"));
7394  // AddChromosomeNoLocation(expected_errors, entry);
7395 
7396  eval = validator.Validate(seh, options);
7397  CheckErrors(*eval, expected_errors);
7398 
7399  // still bad format
7402  eval = validator.Validate(seh, options);
7403  CheckErrors(*eval, expected_errors);
7404 
7405  // range has bad format
7407  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_collection_date, "21-Oct-2013-20-Oct-2015");
7408  eval = validator.Validate(seh, options);
7409  CheckErrors(*eval, expected_errors);
7410 
      // date in future: same error code, different message
7413  expected_errors[0]->SetErrMsg("Collection_date is in the future");
7414  eval = validator.Validate(seh, options);
7415  CheckErrors(*eval, expected_errors);
7416 
7417  // range in future
7419  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_collection_date, "21-Oct-2013/20-Oct-2030");
7420  eval = validator.Validate(seh, options);
7421  CheckErrors(*eval, expected_errors);
7422 
7423  CLEAR_ERRORS
7424 
7425  // ISO date should be ok
7428  // AddChromosomeNoLocation(expected_errors, entry);
7429  eval = validator.Validate(seh, options);
7430  CheckErrors(*eval, expected_errors);
7431 
7432  // range of dates should be ok
7435  eval = validator.Validate(seh, options);
7436  CheckErrors(*eval, expected_errors);
7439  eval = validator.Validate(seh, options);
7440  CheckErrors(*eval, expected_errors);
7442  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_collection_date, "06-Aug-2004/07-Jan-2007");
7443  eval = validator.Validate(seh, options);
7444  CheckErrors(*eval, expected_errors);
7445 
      // direct checks of the date parser, bypassing the validator
7446  bool bad_format = false, in_future = false;
7447  CSubSource::IsCorrectDateFormat("29-Feb-2012", bad_format, in_future);
7448  BOOST_CHECK_EQUAL(bad_format, false);
7449  BOOST_CHECK_EQUAL(in_future, false);
7450 
7451  CSubSource::IsCorrectDateFormat("2014-06", bad_format, in_future);
7452  BOOST_CHECK_EQUAL(bad_format, false);
7453  BOOST_CHECK_EQUAL(in_future, false);
7454 
7455  CLEAR_ERRORS
7456 }
7457 
7458 
// Exercises the BadPCRPrimerSequence warning. It starts with a direct
// CPCRPrimerSeq::IsValid call, then compares validator output against
// expected_errors for several primer values, and finally attaches a PCR
// reaction with digit-containing primer sequences to the source descriptor.
// NOTE(review): the embedded listing numbers skip inside this body (e.g.
// 7466-7467, 7469, 7480-7481, 7503-7505), so the entry setup and the
// statements that set the individual primer subsources are not visible here.
7459 BOOST_AUTO_TEST_CASE(Test_Descr_BadPCRPrimerSequence)
7460 {
      // A date string is rejected as a primer sequence; '0' is the first
      // character reported as bad.
7461  char bad_ch;
7462  BOOST_CHECK_EQUAL(CPCRPrimerSeq::IsValid("01-May-2010", bad_ch), false);
7463  BOOST_CHECK_EQUAL(bad_ch, '0');
7464 
7465  // prepare entry
7468 
7470 
7471  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPCRPrimerSequence",
7472  "PCR forward primer sequence format is incorrect, first bad character is '?'"));
7473  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPCRPrimerSequence",
7474  "PCR primer does not have both sequences"));
7475  // AddChromosomeNoLocation(expected_errors, entry);
7476 
7477  eval = validator.Validate(seh, options);
7478  CheckErrors(*eval, expected_errors);
7479 
      // The next three rounds only change the message of the first expected
      // error; the second expected error stays in place.
7482  expected_errors[0]->SetErrMsg("PCR reverse primer sequence format is incorrect, first bad character is '0'");
7483 
7484  eval = validator.Validate(seh, options);
7485  CheckErrors(*eval, expected_errors);
7486 
7489  expected_errors[0]->SetErrMsg("PCR reverse primer sequence format is incorrect, first bad character is 'q'");
7490 
7491  eval = validator.Validate(seh, options);
7492  CheckErrors(*eval, expected_errors);
7493 
7496  expected_errors[0]->SetErrMsg("PCR reverse primer sequence format is incorrect, first bad character is '?'");
7497 
7498  eval = validator.Validate(seh, options);
7499  CheckErrors(*eval, expected_errors);
7500 
7501  CLEAR_ERRORS
7502 
7506 
7507  // AddChromosomeNoLocation(expected_errors, entry);
7508  eval = validator.Validate(seh, options);
7509  CheckErrors(*eval, expected_errors);
7510 
      // Add one PCR reaction to every source descriptor. Both primers get a
      // name made only of base letters and a sequence containing a digit
      // ('4' forward, '5' reverse), which is what the four expected warnings
      // below refer to.
7513  for (auto& it : entry->SetSeq().SetDescr().Set()) {
7514  if (it->IsSource()) {
7515  CRef<CPCRPrimer> fwd(new CPCRPrimer());
7516  fwd->SetName().Set("AATTGGCCAATTGGC");
7517  fwd->SetSeq().Set("AATTGGCCAATTGG4C");
7518  CRef<CPCRReaction> reaction(new CPCRReaction());
7519  reaction->SetForward().Set().push_back(fwd);
7520  CRef<CPCRPrimer> rev(new CPCRPrimer());
7521  rev->SetName().Set("AATTGGCCAATTGGC");
7522  rev->SetSeq().Set("AATTGGCCAATTGG5C");
7523  reaction->SetReverse().Set().push_back(rev);
7524  it->SetSource().SetPcr_primers().Set().push_back(reaction);
7525  }
7526  }
7527 
7528  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPCRPrimerSequence",
7529  "PCR forward primer sequence format is incorrect, first bad character is '4'"));
7530  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPCRPrimerName",
7531  "PCR forward primer name appears to be a sequence"));
7532  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPCRPrimerSequence",
7533  "PCR reverse primer sequence format is incorrect, first bad character is '5'"));
7534  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPCRPrimerName",
7535  "PCR reverse primer name appears to be a sequence"));
7536 
7537  eval = validator.Validate(seh, options);
7538  CheckErrors(*eval, expected_errors);
7539 
7540  CLEAR_ERRORS
7541 }
7542 
7543 BOOST_AUTO_TEST_CASE(Test_Descr_ModifyPCRPrimer)
7544 {
7545  string fwd_seq;
7546  fwd_seq.assign("5-agtctctctc-");
7547  bool modified = CPCRPrimerSeq::TrimJunk(fwd_seq);
7548  BOOST_CHECK_EQUAL(modified, true);
7549  BOOST_CHECK_EQUAL(fwd_seq, string("agtctctctc"));
7550 
7551  fwd_seq.assign("5`aattggccaattg3'");
7552  modified = CPCRPrimerSeq::TrimJunk(fwd_seq);
7553  BOOST_CHECK_EQUAL(modified, true);
7554  BOOST_CHECK_EQUAL(fwd_seq, string("aattggccaattg"));
7555 
7556  fwd_seq.assign("aattggccaacct");
7557  modified = CPCRPrimerSeq::TrimJunk(fwd_seq);
7558  BOOST_CHECK_EQUAL(modified, false);
7559  BOOST_CHECK_EQUAL(fwd_seq, string("aattggccaacct"));
7560 
7561  fwd_seq.assign("agttt<I>tagaga<i>gac");
7562  modified = CPCRPrimerSeq::Fixi(fwd_seq);
7563  BOOST_CHECK_EQUAL(modified, true);
7564  BOOST_CHECK_EQUAL(fwd_seq, string("agttt<i>tagaga<i>gac"));
7565 
7566  fwd_seq.assign("agtccat<iagata>gtct");
7567  modified = CPCRPrimerSeq::Fixi(fwd_seq);
7568  BOOST_CHECK_EQUAL(modified, true);
7569  BOOST_CHECK_EQUAL(fwd_seq, string("agtccat<i>agata>gtct"));
7570 
7571  fwd_seq.assign("agtccat<i>gtctaaa");
7572  modified = CPCRPrimerSeq::Fixi(fwd_seq);
7573  BOOST_CHECK_EQUAL(modified, false);
7574  BOOST_CHECK_EQUAL(fwd_seq, string("agtccat<i>gtctaaa"));
7575 
7576 }
7577 
// Checks that a title descriptor ending in '.', ',', ';' or ':' produces the
// BadPunctuation warning. The same expected_errors entry is reused for all
// four titles; only the title text on the descriptor changes between runs.
// NOTE(review): listing numbers 7581 and 7586 are absent, so the statements
// that declare entry and the validator/scope variables are not visible here.
7578 BOOST_AUTO_TEST_CASE(Test_Descr_BadPunctuation)
7579 {
7580  // prepare entry
7582  CRef<CSeqdesc> desc(new CSeqdesc());
7583  desc->SetTitle("abc.");
7584  entry->SetSeq().SetDescr().Set().push_back(desc);
7585 
7587 
7588  // end with period
7589  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPunctuation",
7590  "Title descriptor ends in bad punctuation"));
7591  // AddChromosomeNoLocation(expected_errors, entry);
7592 
7593  eval = validator.Validate(seh, options);
7594  CheckErrors(*eval, expected_errors);
7595 
7596  // end with comma
7597  desc->SetTitle("abc,");
7598  eval = validator.Validate(seh, options);
7599  CheckErrors(*eval, expected_errors);
7600 
7601  // end with semicolon
7602  desc->SetTitle("abc;");
7603  eval = validator.Validate(seh, options);
7604  CheckErrors(*eval, expected_errors);
7605 
7606  // end with colon
7607  desc->SetTitle("abc:");
7608  eval = validator.Validate(seh, options);
7609  CheckErrors(*eval, expected_errors);
7610 
7611  CLEAR_ERRORS
7612 }
7613 
7614 
// Checks the BadPCRPrimerName warning ("PCR primer name appears to be a
// sequence"). The warning is expected for two configurations, and after
// CLEAR_ERRORS a third configuration is validated for which, per the inline
// comment, no error is expected.
// NOTE(review): listing numbers 7618-7619, 7621, 7630-7631 and 7639-7640 are
// absent, so the entry setup and the primer name/sequence values being
// tested are not visible in this listing.
7615 BOOST_AUTO_TEST_CASE(Test_Descr_BadPCRPrimerName)
7616 {
7617  // prepare entry
7620 
7622 
7623  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPCRPrimerName",
7624  "PCR primer name appears to be a sequence"));
7625  // AddChromosomeNoLocation(expected_errors, entry);
7626 
7627  eval = validator.Validate(seh, options);
7628  CheckErrors(*eval, expected_errors);
7629 
7632 
7633  eval = validator.Validate(seh, options);
7634  CheckErrors(*eval, expected_errors);
7635 
7636  CLEAR_ERRORS
7637 
7638  // no error if invalid sequence
7641 
7642  // AddChromosomeNoLocation(expected_errors, entry);
7643  eval = validator.Validate(seh, options);
7644  CheckErrors(*eval, expected_errors);
7645 
7646  CLEAR_ERRORS
7647 }
7648 
7649 
// Checks the BioSourceOnProtein warning: a source is added, via
// unit_test_util::AddGoodSource, to the last member of the set's Seq_set, and
// a single warning reported against "lcl|nuc" is expected.
// NOTE(review): listing numbers 7653 and 7656 are absent, so the statement
// that builds entry and the validator/scope setup are not visible here.
7650 BOOST_AUTO_TEST_CASE(Test_Descr_BioSourceOnProtein)
7651 {
7652  // prepare entry
7654  unit_test_util::AddGoodSource(entry->SetSet().SetSeq_set().back());
7655 
7657 
7658  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BioSourceOnProtein",
7659  "Nuc-prot set has 1 protein with a BioSource descriptor"));
7660  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
7661  // AddChromosomeNoLocation(expected_errors, "lcl|prot");
7662 
7663  eval = validator.Validate(seh, options);
7664  CheckErrors(*eval, expected_errors);
7665 
7666  CLEAR_ERRORS
7667 }
7668 
7669 
// Checks the BioSourceDbTagConflict warning: SetDbxref is called twice for
// database "AFTOL" with two different ids, and one warning about the db being
// used multiple times is expected.
// NOTE(review): listing numbers 7673 and 7677 are absent, so the statement
// that builds entry and the validator/scope setup are not visible here.
7670 BOOST_AUTO_TEST_CASE(Test_Descr_BioSourceDbTagConflict)
7671 {
7672  // prepare entry
7674  unit_test_util::SetDbxref(entry, "AFTOL", 12345);
7675  unit_test_util::SetDbxref(entry, "AFTOL", 12346);
7676 
7678 
7679  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BioSourceDbTagConflict",
7680  "BioSource uses db AFTOL multiple times"));
7681  // AddChromosomeNoLocation(expected_errors, entry);
7682 
7683  eval = validator.Validate(seh, options);
7684  CheckErrors(*eval, expected_errors);
7685 
7686  CLEAR_ERRORS
7687 }
7688 
7689 
// NOTE(review): the function header (listing line 7690) is missing from this
// listing, as are lines 7692 and 7701. Judging by the calls
// s_ArePrimersUnique(*rset) later in the file, this is presumably the body of
// that helper, taking the reaction set as rset - confirm against the real file.
//
// Visible behaviour: copies rset into the Pcr_primers of the first source
// descriptor found on entry, validates, and returns false if any reported
// error has the code "DuplicatePCRPrimerSequence"; otherwise returns true.
7691 {
7693  CRef<CBioSource> src;
      // Iterate by value over the descriptor list; stop at the first source.
7694  for (auto it : entry->SetSeq().SetDescr().Set()) {
7695  if (it->IsSource()) {
7696  src.Reset(&(it->SetSource()));
7697  break;
7698  }
7699  }
      // src is used here without checking that a source descriptor was found.
7700  src->SetPcr_primers().Assign(rset);
7702  eval = validator.Validate(seh, options);
      // Scan every reported error for the duplicate-primer code.
7703  for (CValidError_CI vit(*eval); vit; ++vit) {
7704  if (NStr::Equal(vit->GetErrCode(), "DuplicatePCRPrimerSequence")) {
7705  return false;
7706  }
7707  }
7708  return true;
7709 }
7710 
7711 
// Checks the DuplicatePCRPrimerSequence warning in two ways: first through
// parenthesised fwd/rev primer subsources that repeat the same sequence, then
// by building a reaction set incrementally and calling s_ArePrimersUnique
// after every change.
// NOTE(review): listing numbers 7715, 7719 and 7737 are absent, so the entry
// setup and the declaration of rset are not visible in this listing.
7712 BOOST_AUTO_TEST_CASE(Test_Descr_DuplicatePCRPrimerSequence)
7713 {
7714  // prepare entry
7716  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_fwd_primer_seq, "(AAATTTGGGCCC,AAATTTGGGCCC)");
7717  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_rev_primer_seq, "(CCCTTTGGGCCC,CCCTTTGGGCCC)");
7718 
7720 
7721  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "DuplicatePCRPrimerSequence",
7722  "PCR primer sequence has duplicates"));
7723  // AddChromosomeNoLocation(expected_errors, entry);
7724 
7725  eval = validator.Validate(seh, options);
7726  CheckErrors(*eval, expected_errors);
7727 
7728  CLEAR_ERRORS
7729 
      // Two forward primers, two reverse primers and two reactions, all
      // initially empty; they are attached and filled in one step at a time.
7730  CRef<CPCRPrimer> f1(new CPCRPrimer());
7731  CRef<CPCRPrimer> f2(new CPCRPrimer());
7732  CRef<CPCRPrimer> rv1(new CPCRPrimer());
7733  CRef<CPCRPrimer> rv2(new CPCRPrimer());
7734  CRef<CPCRReaction> r1(new CPCRReaction());
7735  CRef<CPCRReaction> r2(new CPCRReaction());
7736 
      // After each mutation the expected result is false exactly when the two
      // reactions currently hold the same content (both empty, same forward
      // seq, same reverse name, ...), and true as soon as they differ.
7738  BOOST_CHECK_EQUAL(s_ArePrimersUnique(*rset), true);
7739  rset->Set().push_back(r1);
7740  BOOST_CHECK_EQUAL(s_ArePrimersUnique(*rset), true);
7741  rset->Set().push_back(r2);
7742  BOOST_CHECK_EQUAL(s_ArePrimersUnique(*rset), false);
7743  r1->SetForward().Set().push_back(f1);
7744  BOOST_CHECK_EQUAL(s_ArePrimersUnique(*rset), true);
7745  r2->SetForward().Set().push_back(f2);
7746  BOOST_CHECK_EQUAL(s_ArePrimersUnique(*rset), false);
7747  f1->SetSeq().Set("aa");
7748  BOOST_CHECK_EQUAL(s_ArePrimersUnique(*rset), true);
7749  f2->SetSeq().Set("tt");
7750  BOOST_CHECK_EQUAL(s_ArePrimersUnique(*rset), true);
7751  f2->SetSeq().Set("aa");
7752  BOOST_CHECK_EQUAL(s_ArePrimersUnique(*rset), false);
7753  r1->SetReverse().Set().push_back(rv1);
7754  BOOST_CHECK_EQUAL(s_ArePrimersUnique(*rset), true);
7755  r2->SetReverse().Set().push_back(rv2);
7756  BOOST_CHECK_EQUAL(s_ArePrimersUnique(*rset), false);
7757  rv1->SetName().Set("a name");
7758  BOOST_CHECK_EQUAL(s_ArePrimersUnique(*rset), true);
7759  rv2->SetName().Set("a different name");
7760  BOOST_CHECK_EQUAL(s_ArePrimersUnique(*rset), true);
7761  rv2->SetName().Set("a name");
7762  BOOST_CHECK_EQUAL(s_ArePrimersUnique(*rset), false);
7763 }
7764 
7765 
// Checks the MultipleNames warning: two name descriptors are added to the
// sequence, first with identical text and then with different text. A warning
// is expected in both cases; only its message changes.
// NOTE(review): listing numbers 7769 and 7777 are absent, so the statement
// that builds entry and the validator/scope setup are not visible here.
7766 BOOST_AUTO_TEST_CASE(Test_Descr_MultipleNames)
7767 {
7768  // prepare entry
7770  CRef<CSeqdesc> d1(new CSeqdesc());
7771  d1->SetName("name #1");
7772  entry->SetSeq().SetDescr().Set().push_back(d1);
7773  CRef<CSeqdesc> d2(new CSeqdesc());
7774  d2->SetName("name #1");
7775  entry->SetSeq().SetDescr().Set().push_back(d2);
7776 
7778 
7779  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleNames",
7780  "Undesired multiple name descriptors, identical text"));
7781  // AddChromosomeNoLocation(expected_errors, entry);
7782  eval = validator.Validate(seh, options);
7783  CheckErrors(*eval, expected_errors);
7784 
      // Same two descriptors, now with different text: still a warning, with
      // the "different text" wording.
7785  d2->SetName("name #2");
7786  expected_errors[0]->SetErrMsg("Undesired multiple name descriptors, different text");
7787  eval = validator.Validate(seh, options);
7788  CheckErrors(*eval, expected_errors);
7789 
7790  CLEAR_ERRORS
7791 }
7792 
7793 
// Checks the MultipleComments warning: two comment descriptors with identical
// text are expected to produce the warning. After CLEAR_ERRORS the second
// comment is changed to different text and the result is checked against the
// cleared expectation list (the inline comment says this case is ok).
// NOTE(review): listing numbers 7797 and 7805 are absent, so the statement
// that builds entry and the validator/scope setup are not visible here.
7794 BOOST_AUTO_TEST_CASE(Test_Descr_MultipleComments)
7795 {
7796  // prepare entry
7798  CRef<CSeqdesc> d1(new CSeqdesc());
7799  d1->SetComment("name 1");
7800  entry->SetSeq().SetDescr().Set().push_back(d1);
7801  CRef<CSeqdesc> d2(new CSeqdesc());
7802  d2->SetComment("name 1");
7803  entry->SetSeq().SetDescr().Set().push_back(d2);
7804 
7806 
7807  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleComments",
7808  "Undesired multiple comment descriptors, identical text"));
7809  // AddChromosomeNoLocation(expected_errors, entry);
7810  eval = validator.Validate(seh, options);
7811  CheckErrors(*eval, expected_errors);
7812 
7813  CLEAR_ERRORS
7814 
7815  // ok if different
7816  d2->SetComment("name 2");
7817  eval = validator.Validate(seh, options);
7818  // AddChromosomeNoLocation(expected_errors, entry);
7819 
7820  CheckErrors(*eval, expected_errors);
7821  CLEAR_ERRORS
7822 }
7823 
7824 
// Checks the LatLonFormat diagnostics: first a warning for extra text after
// an otherwise correct lat_lon value, then the same expected entry is changed
// to the "format is incorrect" message with severity raised to error.
// NOTE(review): listing numbers 7828-7829, 7831 and 7840-7841 are absent, so
// the entry setup and the lat_lon values being tested are not visible here.
7825 BOOST_AUTO_TEST_CASE(Test_Descr_LatLonFormat)
7826 {
7827  // prepare entry
7830 
7832 
7833  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonFormat",
7834  "lat_lon format has extra text after correct dd.dd N|S ddd.dd E|W format"));
7835  // AddChromosomeNoLocation(expected_errors, entry);
7836 
7837  eval = validator.Validate(seh, options);
7838  CheckErrors(*eval, expected_errors);
7839 
7842  expected_errors[0]->SetErrMsg("lat_lon format is incorrect - should be dd.dd N|S ddd.dd E|W");
7843  expected_errors[0]->SetSeverity(eDiag_Error);
7844  eval = validator.Validate(seh, options);
7845  CheckErrors(*eval, expected_errors);
7846 
7847  CLEAR_ERRORS
7848 }
7849 
7850 
// Checks the LatLonRange warnings: one for an out-of-range latitude and one
// for an out-of-range longitude. The same pair of expected warnings is
// checked against two successive validation runs.
// NOTE(review): listing numbers 7854-7855, 7857 and 7867-7868 are absent, so
// the entry setup and the lat_lon values being tested are not visible here.
7851 BOOST_AUTO_TEST_CASE(Test_Descr_LatLonRange)
7852 {
7853  // prepare entry
7856 
7858 
7859  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonRange",
7860  "latitude value is out of range - should be between 90.00 N and 90.00 S"));
7861  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonRange",
7862  "longitude value is out of range - should be between 180.00 E and 180.00 W"));
7863  // AddChromosomeNoLocation(expected_errors, entry);
7864  eval = validator.Validate(seh, options);
7865  CheckErrors(*eval, expected_errors);
7866 
7869  eval = validator.Validate(seh, options);
7870  CheckErrors(*eval, expected_errors);
7871 
7872  CLEAR_ERRORS
7873 }
7874 
7875 
// Checks the BadAltitude diagnostic for the values '123' and '123 ft.'. Each
// is expected as a warning with the default options and as an error when
// CValidator::eVal_genome_submission is OR-ed into the options. The test ends
// with a direct check of CSubSource::FixAltitude.
// NOTE(review): listing numbers 7879-7881, 7887-7888 and 7903-7904 are
// absent, so the entry setup and the statements that set the altitude values
// are not visible in this listing.
7876 BOOST_AUTO_TEST_CASE(Test_Descr_BadAltitude)
7877 {
7878  // prepare entry
7882 
      // Nothing has been added to expected_errors in the visible code before
      // this first check.
7883  eval = validator.Validate(seh, options);
7884  // AddChromosomeNoLocation(expected_errors, entry);
7885  CheckErrors(*eval, expected_errors);
7886 
7889  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadAltitude",
7890  "'123' is an invalid altitude value, altitude should be provided in meters"));
7891 
7892  eval = validator.Validate(seh, options);
7893  CheckErrors(*eval, expected_errors);
7894 
7895  // raise to error
7896  expected_errors[0]->SetSeverity(eDiag_Error);
7897  eval = validator.Validate(seh, options | CValidator::eVal_genome_submission);
7898  CheckErrors(*eval, expected_errors);
7899 
7900 
7901  CLEAR_ERRORS
7902 
7905  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadAltitude",
7906  "'123 ft.' is an invalid altitude value, altitude should be provided in meters"));
7907  // AddChromosomeNoLocation(expected_errors, entry);
7908 
7909  eval = validator.Validate(seh, options);
7910  CheckErrors(*eval, expected_errors);
7911 
7912  // raise to error
7913  expected_errors[0]->SetSeverity(eDiag_Error);
7914  eval = validator.Validate(seh, options | CValidator::eVal_genome_submission);
7915  CheckErrors(*eval, expected_errors);
7916 
7917  CLEAR_ERRORS
7918 
      // 123 ft is roughly 37.5 m; the fixed value expected here is "37 m".
7919  BOOST_CHECK_EQUAL(CSubSource::FixAltitude("123 ft."), "37 m");
7920 }
7921 
7922 
7923 BOOST_AUTO_TEST_CASE(Test_IsLikelyTaxname)
7924 {
7925  BOOST_CHECK_EQUAL(IsLikelyTaxname(""), false);
7926  BOOST_CHECK_EQUAL(IsLikelyTaxname(" "), false);
7927  BOOST_CHECK_EQUAL(IsLikelyTaxname("Carassius"), false);
7928  BOOST_CHECK_EQUAL(IsLikelyTaxname("Carassius sp."), true);
7929  BOOST_CHECK_EQUAL(IsLikelyTaxname("Carassius carassius"), true);
7930  BOOST_CHECK_EQUAL(IsLikelyTaxname(" Carassius carassius"), false);
7931  BOOST_CHECK_EQUAL(IsLikelyTaxname("1Carassius carassius"), false);
7932  BOOST_CHECK_EQUAL(IsLikelyTaxname("Homunculus loxodontus"), false);
7933 }
7934 
7935 
// Helper taking a host string: validates with CValidator::eVal_use_entrez
// added to the options and checks the result against expected_errors, to
// which nothing is added in the visible code.
// NOTE(review): listing numbers 7939-7940 and 7942 are absent, so the
// statements that build the entry, presumably apply host to it, and declare
// the validator/scope variables are not visible - confirm against the real
// file.
7936 void TestSpecificHostNoError(const string& host)
7937 {
7938  // prepare entry
7941 
7943  options |= CValidator::eVal_use_entrez;
7944  // AddChromosomeNoLocation(expected_errors, entry);
7945  eval = validator.Validate(seh, options);
7946  CheckErrors(*eval, expected_errors);
7947  CLEAR_ERRORS
7948 }
7949 
7950 
// Checks the BadSpecificHost warning for a misspelled host, an incorrectly
// capitalized host and an unrecognized host. It then checks values that
// should pass, a host/lineage combination expected to be flagged as suspect,
// and finally runs a list of common-name hosts through
// TestSpecificHostNoError.
// NOTE(review): listing numbers 7954, 7957, 7965, 7971, 7979-7980, 7986 and
// 7992 are absent, so the entry setup and whatever happens between the
// individual SetOrgMod calls are not visible in this listing.
7951 BOOST_AUTO_TEST_CASE(Test_Descr_BadSpecificHost)
7952 {
7953  // prepare entry
7955  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Metapone madagascaria");
7956 
7958 
7959  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadSpecificHost",
7960  "Specific host value is misspelled: Metapone madagascaria"));
7961  // AddChromosomeNoLocation(expected_errors, entry);
7962  eval = validator.Validate(seh, options);
7963  CheckErrors(*eval, expected_errors);
7964 
      // Same expected entry, new message: capitalization problem.
7966  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Homo Sapiens");
7967  expected_errors[0]->SetErrMsg("Specific host value is incorrectly capitalized: Homo Sapiens");
7968  eval = validator.Validate(seh, options);
7969  CheckErrors(*eval, expected_errors);
7970 
      // Same expected entry, new message: value not recognized at all.
7972  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Homo nonrecognizedus");
7973  expected_errors[0]->SetErrMsg("Invalid value for specific host: Homo nonrecognizedus");
7974  eval = validator.Validate(seh, options);
7975  CheckErrors(*eval, expected_errors);
7976 
7977  CLEAR_ERRORS
7978  // should not generate an error
7981  // AddChromosomeNoLocation(expected_errors, entry);
7982  eval = validator.Validate(seh, options);
7983  CheckErrors(*eval, expected_errors);
7984 
7985  // also, can ignore text after semicolon
7987  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Homo sapiens; sex: female");
7988  eval = validator.Validate(seh, options);
7989  CheckErrors(*eval, expected_errors);
7990 
7991  // should see errors for bad lineages
7993  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Lentinula edodes");
7994  unit_test_util::SetLineage(entry, "Streptophyta; foo");
7995 
      // Validation runs before the expectation is registered; the comparison
      // itself happens in CheckErrors below.
7996  eval = validator.Validate(seh, options);
7997  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
7998  "BadSpecificHost",
7999  "Suspect Host Value - a prokaryote, fungus or virus is suspect as a host for a plant or animal"));
8000  CheckErrors(*eval, expected_errors);
8001  CLEAR_ERRORS
8002 
8003  // others
8004  TestSpecificHostNoError("Racoon");
8005  TestSpecificHostNoError("SNAKE");
8006  TestSpecificHostNoError("Snake");
8007  TestSpecificHostNoError("Turtle");
8008  TestSpecificHostNoError("mallard duck");
8009  TestSpecificHostNoError("Guinea pig");
8010  TestSpecificHostNoError("sea urchin"); // RW-1364
8011 }
8012 
8013 BOOST_AUTO_TEST_CASE(Test_Validity_SpecificHost)
8014 {
8015  string host, error_msg;
8016 
8017  host = "home sapiens";
8018  BOOST_CHECK_EQUAL(false, IsSpecificHostValid(host, error_msg));
8019  BOOST_CHECK_EQUAL(error_msg, "Specific host value is misspelled: home sapiens");
8020 
8021  host = "Svalbard rock ptarmigan";
8022  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8023  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8024 
8025  host = "Racoon";
8026  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8027  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8028 
8029  host = "SNAKE";
8030  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8031  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8032 
8033  host = "Snake";
8034  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8035  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8036 
8037  host = "Turtle";
8038  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8039  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8040 
8041 
8042  host = "Homo sapiens";
8043  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8044  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8045 
8046  host = "Homo supiens";
8047  BOOST_CHECK_EQUAL(false, IsSpecificHostValid(host, error_msg));
8048  BOOST_CHECK_EQUAL(error_msg, string("Invalid value for specific host: Homo supiens"));
8049 
8050  host = "Pinus sp.";
8051  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8052  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8053 
8054  host = "Gallus Gallus";
8055  BOOST_CHECK_EQUAL(false, IsSpecificHostValid(host, error_msg));
8056  BOOST_CHECK_EQUAL(error_msg, string("Specific host value is incorrectly capitalized: Gallus Gallus"));
8057 
8058  host = "Eschericia coli";
8059  BOOST_CHECK_EQUAL(false, IsSpecificHostValid(host, error_msg));
8060  BOOST_CHECK_EQUAL(error_msg, string("Specific host value is misspelled: Eschericia coli"));
8061 
8062  host = "Avian";
8063  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8064  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8065 
8066  host = "Bovine";
8067  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8068  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8069 
8070  host = "Pig";
8071  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8072  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8073 
8074  host = "Chicken";
8075  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8076  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8077 
8078  host = "turtle";
8079  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8080  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8081 
8082  host = "Homo sapiens; sex: female";
8083  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8084  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8085 
8086  host = "Guinea pig";
8087  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8088  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8089 
8090  host = "Equus sp.";
8091  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8092  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8093 
8094  host = "Ficus sp.";
8095  BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8096  BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8097 }
8098 
8099 
8100 BOOST_AUTO_TEST_CASE(Test_FixSpecificHost)
8101 {
8102  string hostfix, host;
8103 
8104  host = "home sapiens";
8105  hostfix = FixSpecificHost(host);
8106  BOOST_CHECK_EQUAL(hostfix, "Homo sapiens");
8107 
8108  host = "homo sapiens";
8109  hostfix = FixSpecificHost(host);
8110  BOOST_CHECK_EQUAL(hostfix, "Homo sapiens");
8111 
8112  host = "Homo supiens";
8113  hostfix = FixSpecificHost(host);
8114  BOOST_CHECK_EQUAL(hostfix, kEmptyStr);
8115 
8116  host = "Pinus sp.";
8117  hostfix = FixSpecificHost(host);
8118  BOOST_CHECK_EQUAL(hostfix, "Pinus sp.");
8119 
8120  host = "Gallus Gallus";
8121  hostfix = FixSpecificHost(host);
8122  BOOST_CHECK_EQUAL(hostfix, string("Gallus gallus"));
8123 
8124  host = "Eschericia coli";
8125  hostfix = FixSpecificHost(host);
8126  BOOST_CHECK_EQUAL(hostfix, string("Escherichia coli"));
8127 
8128  host = "Avian";
8129  hostfix = FixSpecificHost(host);
8130  BOOST_CHECK_EQUAL(hostfix, host);
8131 
8132  host = "";
8133  hostfix = FixSpecificHost(host);
8134  BOOST_CHECK_EQUAL(hostfix, kEmptyStr);
8135 
8136  host = "Bovine";
8137  hostfix = FixSpecificHost(host);
8138  BOOST_CHECK_EQUAL(hostfix, string("Bovine"));
8139 
8140  host = "Homo sapiens";
8141  hostfix = FixSpecificHost(host);
8142  BOOST_CHECK_EQUAL(hostfix, string("Homo sapiens"));
8143 
8144  host = "Pig";
8145  hostfix = FixSpecificHost(host);
8146  BOOST_CHECK_EQUAL(hostfix, string("Pig"));
8147 
8148  host = " Chicken";
8149  hostfix = FixSpecificHost(host);
8150  BOOST_CHECK_EQUAL(hostfix, string("Chicken"));
8151 
8152  host = "Homo sapiens; sex: female";
8153  hostfix = FixSpecificHost(host);
8154  BOOST_CHECK_EQUAL(hostfix, host);
8155 
8156  host = "HUMAN";
8157  hostfix = FixSpecificHost(host);
8158  BOOST_CHECK_EQUAL(hostfix, "Homo sapiens");
8159 }
8160 
8161 
// Checks the RefGeneTrackingIllegalStatus error: the sequence id is given the
// accession NC_123456, SetRefGeneTrackingStatus is called with "unknown", and
// one error reported against "ref|NC_123456|" is expected.
// NOTE(review): listing numbers 8165, 8167 and 8170 are absent, so the
// statement that builds entry, one further setup statement and the
// validator/scope setup are not visible in this listing.
8162 BOOST_AUTO_TEST_CASE(Test_Descr_RefGeneTrackingIllegalStatus)
8163 {
8164  // prepare entry
8166  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
8168  SetRefGeneTrackingStatus(entry, "unknown");
8169 
8171 
8172  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "RefGeneTrackingIllegalStatus",
8173  "RefGeneTracking object has illegal Status 'unknown'"));
8174  // AddChromosomeNoLocation(expected_errors, entry);
8175  eval = validator.Validate(seh, options);
8176  CheckErrors(*eval, expected_errors);
8177 
8178  CLEAR_ERRORS
8179 }
8180 
8181 
// Checks the informational diagnostic issued for replaced country names: for
// each name in old_countries the expected message is rewritten and the
// validator output is compared against it. Two variants exist, one using the
// ReplacedGeoLocNameCode code with "Replaced geo_loc_name [...]" and one
// using ReplacedCountryCode with "Replaced country name [...]".
// NOTE(review): listing numbers 8185, 8187, 8202, 8210-8211 and 8218 are
// absent. The conditions that choose between the two variants (the `if`
// halves of both `} else {` lines) and the statements that apply each name to
// the entry are therefore not visible in this listing.
8182 BOOST_AUTO_TEST_CASE(Test_Descr_ReplacedCountryCode)
8183 {
8184  // prepare entry
8186 
8188 
      // Names expected to be reported as replaced.
8189  vector<string> old_countries;
8190  old_countries.push_back("Belgian Congo");
8191  old_countries.push_back("British Guiana");
8192  old_countries.push_back("Burma");
8193  old_countries.push_back("Czechoslovakia");
8194  old_countries.push_back("Korea");
8195  old_countries.push_back("Serbia and Montenegro");
8196  old_countries.push_back("Siam");
8197  old_countries.push_back("USSR");
8198  old_countries.push_back("Yugoslavia");
8199  old_countries.push_back("Zaire");
8200  old_countries.push_back("Macedonia");
8201 
      // The message is left empty here and filled in per name inside the loop.
8203  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "ReplacedGeoLocNameCode", ""));
8204  } else {
8205  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "ReplacedCountryCode", ""));
8206  }
8207  // AddChromosomeNoLocation(expected_errors, entry);
8208 
8209  for (const string& it : old_countries) {
8212  expected_errors[0]->SetErrMsg("Replaced geo_loc_name [" + it + "]");
8213  } else {
8214  expected_errors[0]->SetErrMsg("Replaced country name [" + it + "]");
8215  }
8216  eval = validator.Validate(seh, options);
8217  CheckErrors(*eval, expected_errors);
8219  }
8220 
8221  CLEAR_ERRORS
8222 }
8223 
8224 
8225 BOOST_AUTO_TEST_CASE(Test_Descr_BadInstitutionCode)
8226 {
8227  // prepare entry
8229 
8231 
8232  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadInstitutionCode",
8233  "Voucher is missing institution code"));
8234  // AddChromosomeNoLocation(expected_errors, entry);
8236  eval = validator.Validate(seh, options);
8237  CheckErrors(*eval, expected_errors);
8238 
8241  eval = validator.Validate(seh, options);
8242  CheckErrors(*eval, expected_errors);
8243 
8246  eval = validator.Validate(seh, options);
8247  CheckErrors(*eval, expected_errors);
8249 
8250  // codes that need disambiguating country
8251  expected_errors[0]->SetSeverity(eDiag_Warning);
8252  vector<string> ambig;
8253  // specimen voucher codes
8254  ambig.push_back("BAH");
8255  ambig.push_back("ACE");
8256  ambig.push_back("SLU");
8257  ambig.push_back("UAB");
8258  ambig.push_back("CAIM");
8259  ambig.push_back("HER");
8260  ambig.push_back("DSC");
8261  ambig.push_back("DNHM");
8262  ambig.push_back("BNHM");
8263  ambig.push_back("UI");
8264  ambig.push_back("KMK");
8265  ambig.push_back("MT");
8266  ambig.push_back("MP");
8267  ambig.push_back("NASC");
8268  ambig.push_back("IZAC");
8269  ambig.push_back("CCG");
8270  ambig.push_back("PIN");
8271  ambig.push_back("HSU");
8272  ambig.push_back("CAUP");
8273  ambig.push_back("ISU");
8274  ambig.push_back("SDSU");
8275  ambig.push_back("GC");
8276  ambig.push_back("UNL");
8277  ambig.push_back("MZUP");
8278  ambig.push_back("MG");
8279  ambig.push_back("HNHM");
8280  ambig.push_back("PMS");
8281  ambig.push_back("LE");
8282  ambig.push_back("GCM");
8283  ambig.push_back("TMP");
8284  ambig.push_back("DMNH");
8285  ambig.push_back("ZMUH");
8286  ambig.push_back("SMF");
8287  ambig.push_back("ZSP");
8288  ambig.push_back("TAU");
8289  ambig.push_back("MJG");
8290  ambig.push_back("DUM");
8291  ambig.push_back("ANU");
8292  ambig.push_back("CPAP");
8293  ambig.push_back("CSU");
8294  ambig.push_back("WACA");
8295  ambig.push_back("MMNH");
8296  ambig.push_back("ALA");
8297  ambig.push_back("RV");
8298  ambig.push_back("ABS");
8299  ambig.push_back("FM");
8300  ambig.push_back("HNU");
8301  ambig.push_back("PO");
8302  ambig.push_back("GAM");
8303  ambig.push_back("MCM");
8304  ambig.push_back("LU");
8305  ambig.push_back("SDM");
8306  ambig.push_back("PMK");
8307  ambig.push_back("VI");
8308  ambig.push_back("IMM");
8309  ambig.push_back("R");
8310  ambig.push_back("CHM");
8311  ambig.push_back("CMC");
8312  ambig.push_back("JSPC");
8313  ambig.push_back("YU");
8314  ambig.push_back("STM");
8315  ambig.push_back("RSM");
8316  ambig.push_back("BB");
8317  ambig.push_back("BHM");
8318  ambig.push_back("CBU");
8319  ambig.push_back("MCCM");
8320  ambig.push_back("NMSU");
8321  ambig.push_back("OTM");
8322  ambig.push_back("LP");
8323  ambig.push_back("SME");
8324  ambig.push_back("PEM");
8325  ambig.push_back("UMF");
8326  ambig.push_back("CIS");
8327  ambig.push_back("LBG");
8328  ambig.push_back("CCAC");
8329  ambig.push_back("SNP");
8330  ambig.push_back("UT");
8331  ambig.push_back("IBA");
8332  ambig.push_back("UNCC");
8333  ambig.push_back("NHMC");
8334  ambig.push_back("BAC");
8335  ambig.push_back("PMG");
8336  ambig.push_back("MRC");
8337  ambig.push_back("ETH");
8338  ambig.push_back("OMC");
8339  ambig.push_back("NMV");
8340  ambig.push_back("MLS");
8341  ambig.push_back("NJM");
8342  ambig.push_back("INA");
8343  ambig.push_back("BCM");
8344  ambig.push_back("YM");
8345  ambig.push_back("CAM");
8346  ambig.push_back("UA");
8347  ambig.push_back("OSM");
8348  ambig.push_back("CPS");
8349  ambig.push_back("POKM");
8350  ambig.push_back("VSM");
8351  ambig.push_back("ZMG");
8352  ambig.push_back("IO");
8353  ambig.push_back("USM");
8354  ambig.push_back("UCS");
8355  ambig.push_back("CN");
8356  ambig.push_back("PCM");
8357  ambig.push_back("MU");
8358  ambig.push_back("ISC");
8359  ambig.push_back("CIB");
8360  ambig.push_back("GML");
8361  ambig.push_back("NU");
8362  ambig.push_back("NCSC");
8363  ambig.push_back("MHNN");
8364  ambig.push_back("NCC");
8365  ambig.push_back("MSM");
8366  ambig.push_back("RM");
8367  ambig.push_back("MBM");
8368  ambig.push_back("UPM");
8369  ambig.push_back("MSU");
8370  ambig.push_back("PI");
8371  ambig.push_back("CENA");
8372  ambig.push_back("IBRP");
8373  ambig.push_back("CRE");
8374  ambig.push_back("FSC");
8375  ambig.push_back("ENCB");
8376  ambig.push_back("BAS");
8377  ambig.push_back("GOE");
8378  ambig.push_back("PSS");
8379  ambig.push_back("CCB");
8380  ambig.push_back("SUM");
8381  ambig.push_back("NMPG");
8382  ambig.push_back("USP");
8383  ambig.push_back("IPB");
8384  ambig.push_back("BCC");
8385  ambig.push_back("FNU");
8386  ambig.push_back("SHM");
8387  ambig.push_back("TNSC");
8388  ambig.push_back("LS");
8389  ambig.push_back("TMC");
8390  ambig.push_back("HUT");
8391  ambig.push_back("ZMUO");
8392  ambig.push_back("ALM");
8393  ambig.push_back("ITCC");
8394  ambig.push_back("TM");
8395  ambig.push_back("WB");
8396  ambig.push_back("ZMK");
8397  ambig.push_back("LBM");
8398  ambig.push_back("NI");
8399  ambig.push_back("CB");
8400  // ambig.push_back("AMP");
8401  ambig.push_back("MM");
8402  ambig.push_back("PMU");
8403  ambig.push_back("DM");
8404  ambig.push_back("RIVE");
8405  ambig.push_back("TARI");
8406  ambig.push_back("CSCS");
8407  ambig.push_back("PSU");
8408  ambig.push_back("IMT");
8409  ambig.push_back("MZV");
8410  ambig.push_back("SZE");
8411  ambig.push_back("CUVC");
8412  ambig.push_back("LMJ");
8413  ambig.push_back("UC");
8414  ambig.push_back("ZIUS");
8415  ambig.push_back("FRI");
8416  ambig.push_back("CDA");
8417  ambig.push_back("ZMUA");
8418  ambig.push_back("MZUC");
8419  ambig.push_back("BR");
8420  ambig.push_back("UG");
8421  ambig.push_back("MDH");
8422  ambig.push_back("USD");
8423  ambig.push_back("MNHM");
8424  ambig.push_back("MAD");
8425  ambig.push_back("PMA");
8426  ambig.push_back("ICN");
8427  ambig.push_back("TU");
8428  ambig.push_back("PMNH");
8429  ambig.push_back("SAU");
8430  ambig.push_back("KM");
8431  ambig.push_back("GMNH");
8432  ambig.push_back("SSM");
8433  ambig.push_back("MZ");
8434  ambig.push_back("WSU");
8435  ambig.push_back("CIAN");
8436  ambig.push_back("ZMT");
8437  ambig.push_back("IMS");
8438  ambig.push_back("TCDU");
8439  ambig.push_back("SIAC");
8440  ambig.push_back("DFEC");
8441  ambig.push_back("CBD");
8442  ambig.push_back("SWC");
8443  ambig.push_back("MD");
8444  ambig.push_back("FU");
8445  ambig.push_back("UV");
8446  ambig.push_back("URM");
8447  ambig.push_back("JNU");
8448  ambig.push_back("IZ");
8449  ambig.push_back("UAIC");
8450  ambig.push_back("LEB");
8451  ambig.push_back("MCSN");
8452  ambig.push_back("UU");
8453  ambig.push_back("PUC");
8454  ambig.push_back("SNM");
8455  ambig.push_back("AKU");
8456  ambig.push_back("MH");
8457  ambig.push_back("MOR");
8458  ambig.push_back("IM");
8459  ambig.push_back("MSNT");
8460  ambig.push_back("IGM");
8461  ambig.push_back("NAP");
8462  ambig.push_back("NHMR");
8463  ambig.push_back("MW");
8464  ambig.push_back("PPCC");
8465  ambig.push_back("CNHM");
8466  ambig.push_back("IAL");
8467  ambig.push_back("PCU");
8468  ambig.push_back("HM");
8469 
8470  for (const string& it : ambig) {
8471  expected_errors[0]->SetErrMsg("Institution code " + it + " needs to be qualified with a <COUNTRY> designation");
8473  eval = validator.Validate(seh, options);
8474  CheckErrors(*eval, expected_errors);
8476  }
8477 
8478  // bio-material
8479  ambig.clear();
8480  ambig.push_back("NASC");
8481  ambig.push_back("TCDU");
8482 
8483  for (const string& it : ambig) {
8484  expected_errors[0]->SetErrMsg("Institution code " + it + " needs to be qualified with a <COUNTRY> designation");
8486  eval = validator.Validate(seh, options);
8487  CheckErrors(*eval, expected_errors);
8489  }
8490 
8491  // culture-collection
8492  ambig.clear();
8493  ambig.push_back("CAIM");
8494  ambig.push_back("STM");
8495  ambig.push_back("HER");
8496  ambig.push_back("FSC");
8497  ambig.push_back("MDH");
8498  ambig.push_back("DSC");
8499  ambig.push_back("IFM");
8500  ambig.push_back("MCCM");
8501  ambig.push_back("CCB");
8502  ambig.push_back("LBG");
8503  ambig.push_back("BCC");
8504  ambig.push_back("CCAC");
8505  ambig.push_back("CCF");
8506  ambig.push_back("IBA");
8507  ambig.push_back("CAUP");
8508  ambig.push_back("MRC");
8509  ambig.push_back("ETH");
8510  ambig.push_back("TMC");
8511  ambig.push_back("CBD");
8512  ambig.push_back("HUT");
8513  ambig.push_back("URM");
8514  ambig.push_back("NJM");
8515  ambig.push_back("INA");
8516  ambig.push_back("BTCC");
8517  ambig.push_back("YM");
8518  ambig.push_back("IZ");
8519  ambig.push_back("ITCC");
8520  ambig.push_back("WB");
8521  ambig.push_back("LE");
8522  ambig.push_back("LCC");
8523  ambig.push_back("LBM");
8524  ambig.push_back("NI");
8525  ambig.push_back("CB");
8526  // ambig.push_back("AMP");
8527  ambig.push_back("RIVE");
8528  ambig.push_back("DUM");
8529  ambig.push_back("AKU");
8530  ambig.push_back("CN");
8531  ambig.push_back("CCDM");
8532  ambig.push_back("PCM");
8533  ambig.push_back("MU");
8534  ambig.push_back("ISC");
8535  ambig.push_back("IMT");
8536  ambig.push_back("NU");
8537  ambig.push_back("RV");
8538  ambig.push_back("UC");
8539  ambig.push_back("NCSC");
8540  ambig.push_back("CCY");
8541  ambig.push_back("NCC");
8542  ambig.push_back("FRI");
8543  ambig.push_back("GAM");
8544  ambig.push_back("RM");
8545  ambig.push_back("MCM");
8546  ambig.push_back("PPCC");
8547  ambig.push_back("CDA");
8548  ambig.push_back("IAL");
8549  ambig.push_back("VI");
8550  ambig.push_back("PCU");
8551  ambig.push_back("CVCC");
8552  ambig.push_back("BR");
8553  ambig.push_back("MSU");
8554  for (const string& it : ambig) {
8555  expected_errors[0]->SetErrMsg("Institution code " + it + " needs to be qualified with a <COUNTRY> designation");
8557  eval = validator.Validate(seh, options);
8558  CheckErrors(*eval, expected_errors);
8560  }
8561 
8562  expected_errors[0]->SetErrMsg("Institution code zzz is not in list");
8564  eval = validator.Validate(seh, options);
8565  CheckErrors(*eval, expected_errors);
8568  eval = validator.Validate(seh, options);
8569  CheckErrors(*eval, expected_errors);
8572  eval = validator.Validate(seh, options);
8573  CheckErrors(*eval, expected_errors);
8575 
8576  expected_errors[0]->SetErrMsg("Institution code abrc exists, but correct capitalization is ABRC");
8578  eval = validator.Validate(seh, options);
8579  CheckErrors(*eval, expected_errors);
8581 
8582  expected_errors[0]->SetErrMsg("Institution code a exists, but correct capitalization is A");
8584  eval = validator.Validate(seh, options);
8585  CheckErrors(*eval, expected_errors);
8587 
8588  expected_errors[0]->SetErrMsg("Institution code abkmi exists, but correct capitalization is ABKMI");
8590  eval = validator.Validate(seh, options);
8591  CheckErrors(*eval, expected_errors);
8593 
8594  CLEAR_ERRORS
8595 
8596  // should be ok
8598  // AddChromosomeNoLocation(expected_errors, entry);
8599  eval = validator.Validate(seh, options);
8600  CheckErrors(*eval, expected_errors);
8601 
8602  CLEAR_ERRORS
8603 }
8604 
8605 
// Test_Descr_BadCollectionCode: expects one BadCollectionCode warning on
// "lcl|good" and validates three times, changing only the expected message
// (institution codes ABRC, A and ABKMI, each with collection "bar").
// A last validation is run after CLEAR_ERRORS for the "DNA" collection case.
// NOTE(review): the embedded listing numbers skip lines in this view (e.g. 8609,
// 8611, 8616, 8622, 8628, 8636); the entry construction, test setup and the
// calls that set the voucher values are presumably there -- confirm in the
// full file.
8606 BOOST_AUTO_TEST_CASE(Test_Descr_BadCollectionCode)
8607 {
8608  // prepare entry
8610 
8612 
      // a single expected warning; its message is rewritten for each case below
8613  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadCollectionCode",
8614  "Institution code ABRC exists, but collection ABRC:bar is not in list"));
8615  // AddChromosomeNoLocation(expected_errors, entry);
8617  eval = validator.Validate(seh, options);
8618  CheckErrors(*eval, expected_errors);
8620 
8621  expected_errors[0]->SetErrMsg("Institution code A exists, but collection A:bar is not in list");
8623  eval = validator.Validate(seh, options);
8624  CheckErrors(*eval, expected_errors);
8626 
8627  expected_errors[0]->SetErrMsg("Institution code ABKMI exists, but collection ABKMI:bar is not in list");
8629  eval = validator.Validate(seh, options);
8630  CheckErrors(*eval, expected_errors);
8632 
8633  CLEAR_ERRORS
8634 
8635  // DNA is ok for biomaterial
      // no expectation is pushed before this validation
8637  // AddChromosomeNoLocation(expected_errors, entry);
8638  eval = validator.Validate(seh, options);
8639  CheckErrors(*eval, expected_errors);
8641 
8642  CLEAR_ERRORS
8643 }
8644 
8645 
// Test_Descr_IncorrectlyFormattedVoucherID: expects the same single
// IncorrectlyFormattedVoucherID warning ("Voucher is missing specific
// identifier") from three consecutive validations.
// NOTE(review): the statements that change the entry between the three
// validations are not visible here (embedded numbering skips e.g. 8656, 8661,
// 8666) -- presumably each sets a different voucher qualifier; confirm.
8646 BOOST_AUTO_TEST_CASE(Test_Descr_IncorrectlyFormattedVoucherID)
8647 {
8648  // prepare entry
8650 
8652 
8653  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "IncorrectlyFormattedVoucherID",
8654  "Voucher is missing specific identifier"));
8655  // AddChromosomeNoLocation(expected_errors, entry);
8657  eval = validator.Validate(seh, options);
8658  CheckErrors(*eval, expected_errors);
8660 
      // second case: the expected error list is unchanged
8662  eval = validator.Validate(seh, options);
8663  CheckErrors(*eval, expected_errors);
8665 
      // third case: the expected error list is unchanged
8667  eval = validator.Validate(seh, options);
8668  CheckErrors(*eval, expected_errors);
8670 
8671  CLEAR_ERRORS
8672 }
8673 
8674 
// Test_Descr_UnstructuredVoucher: expects one UnstructuredVoucher error
// ("Culture_collection should be structured, but is not") on "lcl|good".
// NOTE(review): the entry construction and the call that sets the
// culture_collection value are not visible in this view (embedded numbering
// skips 8678, 8680, 8685, 8688).
8675 BOOST_AUTO_TEST_CASE(Test_Descr_UnstructuredVoucher)
8676 {
8677  // prepare entry
8679 
8681 
8682  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "UnstructuredVoucher",
8683  "Culture_collection should be structured, but is not"));
8684  // AddChromosomeNoLocation(expected_errors, entry);
8686  eval = validator.Validate(seh, options);
8687  CheckErrors(*eval, expected_errors);
8689 
8690  CLEAR_ERRORS
8691 }
8692 
8693 
// Test_Descr_ChromosomeLocation: expects one Info-level ChromosomeLocation
// message ("INDEXER_ONLY - BioSource location is chromosome") on "lcl|good".
// NOTE(review): the entry construction and whatever sets the BioSource
// location are not visible in this view (embedded numbering skips 8697, 8699,
// 8703).
8694 BOOST_AUTO_TEST_CASE(Test_Descr_ChromosomeLocation)
8695 {
8696  // prepare entry
8698 
8700 
8701  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "ChromosomeLocation",
8702  "INDEXER_ONLY - BioSource location is chromosome"));
8704  eval = validator.Validate(seh, options);
8705  CheckErrors(*eval, expected_errors);
8706 
8707  CLEAR_ERRORS
8708 }
8709 
8710 
// Test_Descr_MultipleSourceQualifiers: checks the MultipleSourceQualifiers
// warning in three scenarios, identified by the expected messages: repeated
// country, repeated lat_lon, and repeated primer qualifiers (fwd/rev sequence
// and name).
// NOTE(review): the statements that add the repeated qualifiers are not
// visible in this view (embedded numbering skips 8721-8722, 8728-8729 and
// 8738-8745).
8711 BOOST_AUTO_TEST_CASE(Test_Descr_MultipleSourceQualifiers)
8712 {
8713  // prepare entry
8715 
8717 
8718  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleSourceQualifiers",
8719  "Multiple country qualifiers present"));
8720  // AddChromosomeNoLocation(expected_errors, entry);
8723  eval = validator.Validate(seh, options);
8724  CheckErrors(*eval, expected_errors);
8726 
8727  expected_errors[0]->SetErrMsg("Multiple lat_lon qualifiers present");
8730  eval = validator.Validate(seh, options);
8731  CheckErrors(*eval, expected_errors);
8733 
      // primer case: entry 0 is reused for fwd_primer_seq and three more
      // warnings are appended, so four warnings are expected in this order
8734  expected_errors[0]->SetErrMsg("Multiple fwd_primer_seq qualifiers present");
8735  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleSourceQualifiers", "Multiple rev_primer_seq qualifiers present"));
8736  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleSourceQualifiers", "Multiple fwd_primer_name qualifiers present"));
8737  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleSourceQualifiers", "Multiple rev_primer_name qualifiers present"));
8746  eval = validator.Validate(seh, options);
8747  CheckErrors(*eval, expected_errors);
8748 
8749  CLEAR_ERRORS
8750 }
8751 
8752 
// Returns true when `subtype` equals one of the CSubSource subtypes listed in
// the condition below, false otherwise. The unbalanced-parentheses test skips
// subtypes for which this returns true.
// NOTE(review): the signature line (8753) and some alternatives of the
// condition (8758-8759, 8763-8767) are missing from this view; the function
// name is presumably SubSourceHasOtherRules, taken from its call site.
// NOTE(review): eSubtype_country is tested twice (lines 8761 and 8768); the
// second test is redundant.
8754 {
8755  if (subtype == CSubSource::eSubtype_sex
8756  || subtype == CSubSource::eSubtype_frequency
8757  || subtype == CSubSource::eSubtype_plasmid_name
8760  || subtype == CSubSource::eSubtype_plastid_name
8761  || subtype == CSubSource::eSubtype_country
8762  || subtype == CSubSource::eSubtype_lat_lon
8768  || subtype == CSubSource::eSubtype_country) {
8769  return true;
8770  } else {
8771  return false;
8772  }
8773 }
8774 
8775 
// Returns true when `subtype` equals one of the COrgMod subtypes listed in the
// condition below, false otherwise. The unbalanced-parentheses test skips
// subtypes for which this returns true.
// NOTE(review): the signature line (8776) and some alternatives of the
// condition (8783, 8785) are missing from this view; the function name is
// presumably OrgModHasOtherRules, taken from its call site.
8777 {
8778  if (subtype == COrgMod::eSubtype_variety
8779  || subtype == COrgMod::eSubtype_strain
8780  || subtype == COrgMod::eSubtype_sub_species
8781  || subtype == COrgMod::eSubtype_forma
8782  || subtype == COrgMod::eSubtype_forma_specialis
8784  || subtype == COrgMod::eSubtype_bio_material
8786  || subtype == COrgMod::eSubtype_metagenome_source) {
8787  return true;
8788  } else {
8789  return false;
8790  }
8791 }
8792 
8793 
// Sets the SubSource qualifier `subtype` to `val` on the test entry, validates,
// and expects an UnbalancedParentheses error that quotes `val`.
// NOTE(review): the signature line (8794), the entry construction (8796) and
// the test setup (8800) are missing from this view; the function name is
// presumably CheckUnbalancedParenthesesSubSource, taken from its call sites.
8795 {
      // presumably the empty value removes an existing qualifier of this
      // subtype before the real value is set -- confirm in unit_test_util
8797  unit_test_util::SetSubSource(entry, subtype, "");
8798  unit_test_util::SetSubSource(entry, subtype, val);
8799 
8801 
      // a segment qualifier gets an additional NonViralSegment warning, which
      // is expected ahead of the parentheses error
8802  if (subtype == CSubSource::eSubtype_segment) {
8803  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NonViralSegment",
8804  "Non-viral source feature should not have a segment qualifier"));
8805  }
8806 
8807  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "UnbalancedParentheses",
8808  "Unbalanced parentheses in subsource '" + val + "'"));
8809  eval = validator.Validate(seh, options);
8810  CheckErrors(*eval, expected_errors);
8811  CLEAR_ERRORS
8812 }
8813 
8814 
// Sets the OrgMod qualifier `subtype` to `val` on the test entry, validates,
// and expects a single UnbalancedParentheses error that quotes `val`.
//   subtype - OrgMod subtype to set
//   val     - qualifier value, also embedded in the expected message
// NOTE(review): the entry construction (8817) and the test setup (8821) are
// missing from this view.
8815 void CheckUnbalancedParenthesesOrgMod(COrgMod::TSubtype subtype, const string& val)
8816 {
      // presumably the empty value removes an existing modifier of this
      // subtype before the real value is set -- confirm in unit_test_util
8818  unit_test_util::SetOrgMod(entry, subtype, "");
8819  unit_test_util::SetOrgMod(entry, subtype, val);
8820 
8822 
8823  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "UnbalancedParentheses",
8824  "Unbalanced parentheses in orgmod '" + val + "'"));
8825  // AddChromosomeNoLocation(expected_errors, entry);
8826 
8827  eval = validator.Validate(seh, options);
8828  CheckErrors(*eval, expected_errors);
8829  CLEAR_ERRORS
8830 }
8831 
8832 
// Test_Descr_UnbalancedParentheses: checks the UnbalancedParentheses error for
// the taxname, for SubSource qualifiers, and for OrgMod qualifiers, and
// finally checks that an old_name modifier with an unbalanced parenthesis is
// validated with no expectation pushed.
// NOTE(review): both `for` headers and several statements are missing from
// this view (embedded numbering skips e.g. 8857-8858, 8875-8878, 8880-8881,
// 8894, 8900, 8905-8906); the loop ranges cannot be read from here.
8833 BOOST_AUTO_TEST_CASE(Test_Descr_UnbalancedParentheses)
8834 {
8835  // prepare entry
8837 
8839 
      // the taxname used below also produces an OrganismNotFound warning
8840  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "UnbalancedParentheses",
8841  "Unbalanced parentheses in taxname 'Malio malefi (abc'"));
8842  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrganismNotFound",
8843  "Organism not found in taxonomy database"));
8844  // AddChromosomeNoLocation(expected_errors, entry);
8845  unit_test_util::SetTaxname(entry, "Malio malefi (abc");
8846  eval = validator.Validate(seh, options);
8847  CheckErrors(*eval, expected_errors);
8848 
8849  expected_errors[0]->SetErrMsg("Unbalanced parentheses in taxname 'Malio malefi )abc'");
8850  unit_test_util::SetTaxname(entry, "Malio malefi )abc");
8851  eval = validator.Validate(seh, options);
8852  CheckErrors(*eval, expected_errors);
8854 
8855  CLEAR_ERRORS
8856 
      // SubSource loop (header not visible): the subtypes excluded in the
      // condition below, and those reported by SubSourceHasOtherRules, are
      // skipped
8859  subtype++) {
8860  if (subtype != CSubSource::eSubtype_germline
8861  && subtype != CSubSource::eSubtype_rearranged
8862  && subtype != CSubSource::eSubtype_transgenic
8864  && subtype != CSubSource::eSubtype_metagenomic) {
8865  if (SubSourceHasOtherRules(subtype)) {
8866  continue;
8867  }
8868  CheckUnbalancedParenthesesSubSource(subtype, "no left (abc");
8869  CheckUnbalancedParenthesesSubSource(subtype, "no right )abc");
8870  CheckUnbalancedParenthesesSubSource(subtype, "no left ( parentheses");
8871  CheckUnbalancedParenthesesSubSource(subtype, "no right ) parentheses");
8872  }
8873  }
8874  // also check other
8879 
      // OrgMod loop (header not visible): subtypes reported by
      // OrgModHasOtherRules are skipped
8882  subtype++) {
8883  if (OrgModHasOtherRules(subtype)) {
8884  continue;
8885  }
8886  CheckUnbalancedParenthesesOrgMod(subtype, "no left (abc");
8887  CheckUnbalancedParenthesesOrgMod(subtype, "no right )abc");
8888  }
8889  // also check old_lineage and other
      // the taxname message pushed here is replaced before each validation
8890  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "UnbalancedParentheses",
8891  "Unbalanced parentheses in taxname 'Malio malefi (abc'"));
8892  // AddChromosomeNoLocation(expected_errors, entry);
8893 
8895  expected_errors[0]->SetErrMsg("Unbalanced parentheses in orgmod 'no left (abc'");
8896  eval = validator.Validate(seh, options);
8897  CheckErrors(*eval, expected_errors);
8899  expected_errors[0]->SetErrMsg("Unbalanced parentheses in orgmod 'no right )abc'");
8901  eval = validator.Validate(seh, options);
8902  CheckErrors(*eval, expected_errors);
8904 
8907 
8908  CLEAR_ERRORS
8909  // should get no error for unbalanced parentheses in old name
8910  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_old_name, "no left (abc");
8911  // AddChromosomeNoLocation(expected_errors, entry);
8912  eval = validator.Validate(seh, options);
8913  CheckErrors(*eval, expected_errors);
8914 
8915  CLEAR_ERRORS
8916 }
8917 
8918 
// Test_Descr_IdenticalInstitutionCode: validates four voucher combinations.
// No expectation is pushed before the first two; the last two expect an
// IdenticalInstitutionCode warning, first with the "same
// institution:collection" message and then with the "same institution" one.
// NOTE(review): the statements that set the voucher qualifiers are not
// visible in this view (embedded numbering skips 8927-8928, 8933-8935,
// 8942-8944 and 8950-8952).
8919 BOOST_AUTO_TEST_CASE(Test_Descr_IdenticalInstitutionCode)
8920 {
8921  // prepare entry
8923 
8925 
8926  // no errors if different institutions
8929  // AddChromosomeNoLocation(expected_errors, entry);
8930  eval = validator.Validate(seh, options);
8931  CheckErrors(*eval, expected_errors);
8932  // no errors if collection is DNA
8936  eval = validator.Validate(seh, options);
8937  CheckErrors(*eval, expected_errors);
8938 
8939  // warning if same institution:collection
8940  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "IdenticalInstitutionCode",
8941  "Multiple vouchers with same institution:collection"));
8945  eval = validator.Validate(seh, options);
8946  CheckErrors(*eval, expected_errors);
8947 
8948  // warning if same institution (expected message does not mention the collection)
8949  expected_errors[0]->SetErrMsg("Multiple vouchers with same institution");
8953  eval = validator.Validate(seh, options);
8954  CheckErrors(*eval, expected_errors);
8955 
8956  CLEAR_ERRORS
8957 }
8958 
8959 
// Test_Descr_BadCountryCapitalization: sets the country qualifier to the
// all-lowercase "saint pierre and miquelon" and expects one capitalization
// warning, named either BadGeoLocNameCapitalization or
// BadCountryCapitalization depending on a condition.
// NOTE(review): the `if (...) {` header that chooses between the two
// expectations (8967) is missing from this view, as are the entry construction
// and test setup (8963, 8965); the switch condition cannot be read from here.
8960 BOOST_AUTO_TEST_CASE(Test_Descr_BadCountryCapitalization)
8961 {
8962  // prepare entry
8964 
8966 
8968  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadGeoLocNameCapitalization",
8969  "Bad geo_loc_name capitalization [saint pierre and miquelon]"));
8970  } else {
8971  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadCountryCapitalization",
8972  "Bad country capitalization [saint pierre and miquelon]"));
8973  }
8974  // AddChromosomeNoLocation(expected_errors, entry);
8975  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, "saint pierre and miquelon");
8976  eval = validator.Validate(seh, options);
8977  CheckErrors(*eval, expected_errors);
8978 
8979  CLEAR_ERRORS
8980 }
8981 
8982 
// Test_Descr_WrongVoucherType: expects one Info-level WrongVoucherType message
// and validates in three groups of two, one group per expected message:
// ABRC -> bio_material, ABKMI -> culture_collection, AA -> specimen_voucher.
// NOTE(review): the statements that set the voucher qualifier before each
// validation are not visible in this view (embedded numbering skips e.g. 8993,
// 8996-8997, 9003, 9006-9007, 9013, 9016-9017) -- presumably each group tries
// the two voucher types that do not match the institution; confirm.
8983 BOOST_AUTO_TEST_CASE(Test_Descr_WrongVoucherType)
8984 {
8985  // prepare entry
8987 
8989 
8990  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "WrongVoucherType",
8991  "Institution code ABRC should be bio_material"));
8992  // AddChromosomeNoLocation(expected_errors, entry);
8994  eval = validator.Validate(seh, options);
8995  CheckErrors(*eval, expected_errors);
8998  eval = validator.Validate(seh, options);
8999  CheckErrors(*eval, expected_errors);
9001 
9002  expected_errors[0]->SetErrMsg("Institution code ABKMI should be culture_collection");
9004  eval = validator.Validate(seh, options);
9005  CheckErrors(*eval, expected_errors);
9008  eval = validator.Validate(seh, options);
9009  CheckErrors(*eval, expected_errors);
9011 
9012  expected_errors[0]->SetErrMsg("Institution code AA should be specimen_voucher");
9014  eval = validator.Validate(seh, options);
9015  CheckErrors(*eval, expected_errors);
9018  eval = validator.Validate(seh, options);
9019  CheckErrors(*eval, expected_errors);
9021 
9022  CLEAR_ERRORS
9023 }
9024 
9025 
// Test_Descr_TitleHasPMID: gives the entry a title containing "(PMID 1)" and
// expects one TitleHasPMID warning on "lcl|good".
// NOTE(review): the entry construction (9029) and test setup (9032) are
// missing from this view.
9026 BOOST_AUTO_TEST_CASE(Test_Descr_TitleHasPMID)
9027 {
9028  // prepare entry
9030  SetTitle(entry, "foo bar something something (PMID 1)");
9031 
9033 
9034  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TitleHasPMID",
9035  "Title descriptor has internal PMID"));
9036  // AddChromosomeNoLocation(expected_errors, entry);
9037  eval = validator.Validate(seh, options);
9038  CheckErrors(*eval, expected_errors);
9039 
9040  CLEAR_ERRORS
9041 }
9042 
9043 
// Test_Descr_BadKeyword: covers both directions of the BARCODE keyword check.
// First a GenBank descriptor with the keyword "BARCODE" is added and a
// BadKeywordNoTechnique warning is expected; then the last descriptor is
// removed and a NoKeywordHasTechnique Info message is expected.
// NOTE(review): the entry construction (9047), test setup (9052) and the
// statement on line 9062 -- presumably the one that sets MolInfo tech to
// barcode -- are missing from this view.
9044 BOOST_AUTO_TEST_CASE(Test_Descr_BadKeyword)
9045 {
9046  // prepare entry
9048  CRef<CSeqdesc> desc(new CSeqdesc());
9049  desc->SetGenbank().SetKeywords().push_back("BARCODE");
9050  entry->SetSeq().SetDescr().Set().push_back(desc);
9051 
9053 
9054  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadKeywordNoTechnique",
9055  "BARCODE keyword without Molinfo.tech barcode"));
9056  // AddChromosomeNoLocation(expected_errors, entry);
9057  eval = validator.Validate(seh, options);
9058  CheckErrors(*eval, expected_errors);
9059  CLEAR_ERRORS
9060 
      // removes the last descriptor; presumably still the keyword descriptor
      // pushed above, provided nothing was appended on the lines not visible
9061  entry->SetSeq().SetDescr().Set().pop_back();
9063  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info,
9064  "NoKeywordHasTechnique", "Molinfo.tech barcode without BARCODE keyword"));
9065  // AddChromosomeNoLocation(expected_errors, entry);
9066  eval = validator.Validate(seh, options);
9067  CheckErrors(*eval, expected_errors);
9068 
9069  CLEAR_ERRORS
9070 }
9071 
9072 
// Test_Descr_NoOrganismInTitle: checks the NoOrganismInTitle error for a
// nucleotide record (accession NC_123458) and then for a protein record
// (accession NP_123456). The protein case also expects an
// InconsistentProteinTitle warning.
// NOTE(review): the entry construction (9076), test setup (9080) and the
// statement on line 9091 are missing from this view; line 9091 presumably
// reassigns `entry` to a nuc-prot set, since the code after it uses
// entry->SetSet() -- confirm.
9073 BOOST_AUTO_TEST_CASE(Test_Descr_NoOrganismInTitle)
9074 {
9075  // prepare entry
9077  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123458");
9078  SetTitle(entry, "Something that does not start with organism");
9079 
9081 
9082  expected_errors.push_back(new CExpectedError("ref|NC_123458|", eDiag_Error, "NoOrganismInTitle",
9083  "RefSeq nucleotide title does not start with organism name"));
9084  // AddChromosomeNoLocation(expected_errors, entry);
9085  eval = validator.Validate(seh, options);
9086  CheckErrors(*eval, expected_errors);
9087 
9088  CLEAR_ERRORS
9089 
      // the entry is taken out of the scope, modified, and added back
9090  scope.RemoveTopLevelSeqEntry(seh);
9092  CRef<CSeq_id> other_id(new CSeq_id());
9093  other_id->SetOther().SetAccession("NP_123456");
9094  unit_test_util::ChangeProtId(entry, other_id);
      // the title is set on the last member of the set
9095  SetTitle(entry->SetSet().SetSeq_set().back(), "Something that does not end with organism");
9096  seh = scope.AddTopLevelSeqEntry(*entry);
9097 
9098  expected_errors.push_back(new CExpectedError("ref|NP_123456|", eDiag_Error, "NoOrganismInTitle",
9099  "RefSeq protein title does not end with organism name"));
9100  expected_errors.push_back(new CExpectedError("ref|NP_123456|", eDiag_Warning, "InconsistentProteinTitle",
9101  "Instantiated protein title does not match automatically generated title"));
9102  // AddChromosomeNoLocation(expected_errors, entry);
9103  eval = validator.Validate(seh, options);
9104  CheckErrors(*eval, expected_errors);
9105 
9106  CLEAR_ERRORS
9107 }
9108 
9109 
// Test_Descr_MissingChromosome: gives the record the accession NC_123456 and
// expects a MissingChromosome error; then re-validates under a series of
// lineage, division and other changes with no expectation pushed after
// CLEAR_ERRORS.
// NOTE(review): many statements are missing from this view (embedded numbering
// skips e.g. 9113, 9115, 9117, 9147, 9153, 9156, 9163, 9166, 9178, 9181, 9184,
// 9187); in particular the changes made before the linkage-group, organelle
// and final validations cannot be read from here.
9110 BOOST_AUTO_TEST_CASE(Test_Descr_MissingChromosome)
9111 {
9112  // prepare entry
9114  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
9116 
9118 
9119  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "MissingChromosome",
9120  "Missing chromosome qualifier on NC or AC RefSeq record"));
9121  eval = validator.Validate(seh, options);
9122  CheckErrors(*eval, expected_errors);
9123 
9124  CLEAR_ERRORS
9125 
9126  // error is suppressed for virus, bacteria and archaea lineages and for the BCT and VRL divisions
9127  unit_test_util::SetLineage(entry, "Viruses; foo");
9128  eval = validator.Validate(seh, options);
9129  CheckErrors(*eval, expected_errors);
9130  unit_test_util::SetLineage(entry, "Bacteria; foo");
9131  eval = validator.Validate(seh, options);
9132  CheckErrors(*eval, expected_errors);
9133  CLEAR_ERRORS
9134  unit_test_util::SetLineage(entry, "Archaea; foo");
9135  eval = validator.Validate(seh, options);
9136  CheckErrors(*eval, expected_errors);
      // neutral lineage, so the division checks below stand on their own
9137  unit_test_util::SetLineage(entry, "some lineage");
9138  unit_test_util::SetDiv(entry, "BCT");
9139  eval = validator.Validate(seh, options);
9140  CheckErrors(*eval, expected_errors);
9141  unit_test_util::SetDiv(entry, "VRL");
9142  eval = validator.Validate(seh, options);
9143  CheckErrors(*eval, expected_errors);
      // clear the division again before the remaining cases
9144  unit_test_util::SetDiv(entry, "");
9145 
9146  // error is suppressed if linkage group
9148  eval = validator.Validate(seh, options);
9149  CheckErrors(*eval, expected_errors);
9151 
9152  // error is suppressed if organelle
9154  eval = validator.Validate(seh, options);
9155  CheckErrors(*eval, expected_errors);
9157  eval = validator.Validate(seh, options);
9158  CheckErrors(*eval, expected_errors);
9160  unit_test_util::SetLineage(entry, "some lineage; Kinetoplastida");
9161  eval = validator.Validate(seh, options);
9162  CheckErrors(*eval, expected_errors);
9164  eval = validator.Validate(seh, options);
9165  CheckErrors(*eval, expected_errors);
9167  eval = validator.Validate(seh, options);
9168  CheckErrors(*eval, expected_errors);
      // the taxon is set to 0 and then to 227086 -- presumably the first call
      // removes the existing taxon reference; confirm in unit_test_util
9170  unit_test_util::SetTaxname(entry, "Bigelowiella natans");
9171  unit_test_util::SetTaxon(entry, 0);
9172  unit_test_util::SetTaxon(entry, 227086);
9173  unit_test_util::SetLineage(entry, "some lineage; Chlorarachniophyceae");
9174  eval = validator.Validate(seh, options);
9175  CheckErrors(*eval, expected_errors);
9176  CLEAR_ERRORS
9177 
      // four more validations; the changes made before each one are not visible
9179  eval = validator.Validate(seh, options);
9180  CheckErrors(*eval, expected_errors);
9182  eval = validator.Validate(seh, options);
9183  CheckErrors(*eval, expected_errors);
9185  eval = validator.Validate(seh, options);
9186  CheckErrors(*eval, expected_errors);
9188  eval = validator.Validate(seh, options);
9189  CheckErrors(*eval, expected_errors);
9190 
9191 }
9192 
9193 
// Test_Descr_BadStructuredCommentFormat: exercises structured-comment
// validation on a "StructuredComment" user-object descriptor. In order it
// covers: an empty comment, an unrecognized prefix, missing required fields
// for the Genome-Assembly, MIGS and MIGS:4.0 prefixes, required fields added
// in reverse order with bad values, the Assembly-Data rule tied to the
// Sequencing Technology value, and the HumanSTR prefix. After most cases it
// also checks that validator.IsValidStructuredComment(*desc) returns false.
// NOTE(review): the entry construction (9197), test setup (9202) and the
// declaration of `levels` (9244) are missing from this view; `levels` is
// indexed once per required field, so it presumably holds one severity per
// field -- confirm.
9194 BOOST_AUTO_TEST_CASE(Test_Descr_BadStructuredCommentFormat)
9195 {
9196  // prepare entry
9198  CRef<CSeqdesc> desc(new CSeqdesc());
9199  desc->SetUser().SetType().SetStr("StructuredComment");
9200  entry->SetSeq().SetDescr().Set().push_back(desc);
9201 
9203 
9204  // no prefix only empty errors
9205  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "StrucCommMissingUserObject",
9206  "Structured Comment user object descriptor is empty"));
9207  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "UserObjectNoData",
9208  "User object with no data"));
9209  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "StrucCommMissingPrefixOrSuffix",
9210  "Structured Comment lacks prefix and/or suffix"));
9211  // AddChromosomeNoLocation(expected_errors, entry);
9212  eval = validator.Validate(seh, options);
9213  CheckErrors(*eval, expected_errors);
9214 
9215  CLEAR_ERRORS
9216 
9217  // unrecognized prefix
      // prefix_field is kept and its value is rewritten for each later case
9218  CRef<CUser_field> prefix_field(new CUser_field());
9219  prefix_field->SetLabel().SetStr("StructuredCommentPrefix");
9220  prefix_field->SetData().SetStr("Unknown prefix");
9221  desc->SetUser().SetData().push_back(prefix_field);
      // validation runs before the expectations are built; they are compared afterwards
9222  eval = validator.Validate(seh, options);
9223  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadStrucCommInvalidPrefix",
9224  "Unknown prefix is not a valid value for StructuredCommentPrefix"));
9225  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9226  // AddChromosomeNoLocation(expected_errors, entry);
9227  CheckErrors(*eval, expected_errors);
9228 
9229  BOOST_CHECK_EQUAL(validator.IsValidStructuredComment(*desc), false);
9230 
9231  CLEAR_ERRORS
9232 
9233  // should complain about missing required fields
9234  prefix_field->SetData().SetStr("##Genome-Assembly-Data-START##");
9235  vector<string> required_fields;
9236  /*
9237  required_fields.push_back("Finishing Goal");
9238  required_fields.push_back("Current Finishing Status");
9239  */
9240  required_fields.push_back("Assembly Method");
9241  required_fields.push_back("Genome Coverage");
9242  required_fields.push_back("Sequencing Technology");
9243 
9245 
      // one expected missing-field message per required field, at severity levels[i]
9246  int i = 0;
9247  for (const string& it : required_fields) {
9248  expected_errors.push_back(new CExpectedError("lcl|good", levels[i], "BadStrucCommMissingField",
9249  "Required field " + it + " is missing"));
9250  i++;
9251  }
9252  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9253  // AddChromosomeNoLocation(expected_errors, entry);
9254 
9255  eval = validator.Validate(seh, options);
9256  CheckErrors(*eval, expected_errors);
9257 
9258  BOOST_CHECK_EQUAL(validator.IsValidStructuredComment(*desc), false);
9259 
9260  CLEAR_ERRORS
9261 
9262  // add fields in wrong order, with bad values where appropriate
      // the reverse iterators append the fields in the opposite order to required_fields
9263  for (auto it = required_fields.crbegin(); it != required_fields.crend(); ++it) {
9264  CRef<CUser_field> field(new CUser_field());
9265  field->SetLabel().SetStr(*it);
9266  field->SetData().SetStr("bad value");
9267  desc->SetUser().SetData().push_back(field);
9268  }
9269 
      // expectations: an out-of-order message for every field except the last
      // in required_fields, and an invalid-value message for every field
      // except "Genome Coverage" and "Sequencing Technology"
9270  size_t pos = 0;
9271  for (const string& it : required_fields) {
9272  if (pos < required_fields.size() - 1) {
9273  expected_errors.push_back(new CExpectedError("lcl|good", levels[pos], "BadStrucCommFieldOutOfOrder",
9274  it + " field is out of order"));
9275  }
9276  if (!NStr::Equal(it, "Genome Coverage") && !NStr::Equal(it, "Sequencing Technology")) {
9277  expected_errors.push_back(new CExpectedError("lcl|good", levels[pos], "BadStrucCommInvalidFieldValue",
9278  "bad value is not a valid value for " + it));
9279  }
9280  ++pos;
9281  }
9282  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9283  // AddChromosomeNoLocation(expected_errors, entry);
9284 
9285  eval = validator.Validate(seh, options);
9286  CheckErrors(*eval, expected_errors);
9287 
9288  BOOST_CHECK_EQUAL(validator.IsValidStructuredComment(*desc), false);
9289 
9290  CLEAR_ERRORS
9291 
      // MIGS prefix: every listed field is expected to be reported missing at Info level
9292  prefix_field->SetData().SetStr("##MIGS-Data-START##");
9293  required_fields.clear();
9294  required_fields.push_back("alt_elev");
9295  required_fields.push_back("assembly");
9296  required_fields.push_back("collection_date");
9297  required_fields.push_back("country");
9298  required_fields.push_back("depth");
9299  required_fields.push_back("environment");
9300  required_fields.push_back("investigation_type");
9301  required_fields.push_back("isol_growth_condt");
9302  required_fields.push_back("lat_lon");
9303  required_fields.push_back("project_name");
9304  required_fields.push_back("sequencing_meth");
9305 
9306  for (const string& it : required_fields) {
9307  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommMissingField",
9308  "Required field " + it + " is missing"));
9309  }
9310  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9311  // AddChromosomeNoLocation(expected_errors, entry);
9312 
9313  eval = validator.Validate(seh, options);
9314  CheckErrors(*eval, expected_errors);
9315 
9316  BOOST_CHECK_EQUAL(validator.IsValidStructuredComment(*desc), false);
9317 
9318  CLEAR_ERRORS
9319 
      // MIGS:4.0 prefix: same pattern with that version's field list
9320  prefix_field->SetData().SetStr("##MIGS:4.0-Data-START##");
9321  required_fields.clear();
9322  required_fields.push_back("assembly");
9323  required_fields.push_back("collection_date");
9324  required_fields.push_back("env_biome");
9325  required_fields.push_back("env_feature");
9326  required_fields.push_back("env_material");
9327  required_fields.push_back("env_package");
9328  required_fields.push_back("geo_loc_name");
9329  required_fields.push_back("investigation_type");
9330  required_fields.push_back("isol_growth_condt");
9331  required_fields.push_back("lat_lon");
9332  required_fields.push_back("project_name");
9333  required_fields.push_back("seq_meth");
9334 
9335  for (const string& it : required_fields) {
9336  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommMissingField",
9337  "Required field " + it + " is missing"));
9338  }
9339  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9340  // AddChromosomeNoLocation(expected_errors, entry);
9341 
9342  eval = validator.Validate(seh, options);
9343  CheckErrors(*eval, expected_errors);
9344 
9345  BOOST_CHECK_EQUAL(validator.IsValidStructuredComment(*desc), false);
9346 
9347  CLEAR_ERRORS
9348 
9349  // should complain about missing required field for specific values of sequencing technology
9350  prefix_field->SetData().SetStr("##Assembly-Data-START##");
      // drop the fields added so far and start again with only the prefix
9351  desc->SetUser().ResetData();
9352  desc->SetUser().SetData().push_back(prefix_field);
9353 
9354  CRef<CUser_field> field(new CUser_field());
9355  field->SetLabel().SetStr("Sequencing Technology");
9356  field->SetData().SetStr("Singer");
9357  desc->SetUser().SetData().push_back(field);
9358 
9359  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadStrucCommMissingField",
9360  "Required field Assembly Method is missing when Sequencing Technology has value 'Singer'"));
9361  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9362  // AddChromosomeNoLocation(expected_errors, entry);
9363 
9364  eval = validator.Validate(seh, options);
9365  CheckErrors(*eval, expected_errors);
9366 
9367  BOOST_CHECK_EQUAL(validator.IsValidStructuredComment(*desc), false);
9368 
9369  CLEAR_ERRORS
9370 
      // same check with a different Sequencing Technology value
9371  field->SetData().SetStr("something else");
9372  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadStrucCommMissingField",
9373  "Required field Assembly Method is missing when Sequencing Technology has value 'something else'"));
9374  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9375  // AddChromosomeNoLocation(expected_errors, entry);
9376 
9377  eval = validator.Validate(seh, options);
9378  CheckErrors(*eval, expected_errors);
9379 
9380  BOOST_CHECK_EQUAL(validator.IsValidStructuredComment(*desc), false);
9381 
9382  CLEAR_ERRORS
9383 
      // HumanSTR prefix: the leftover "Sequencing Technology" field is expected
      // to be rejected as a field name, and three fields reported missing
9384  prefix_field->SetData().SetStr("##HumanSTR-START##");
9385 
9386  eval = validator.Validate(seh, options);
9387 
9388  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadStrucCommInvalidFieldName",
9389  "Sequencing Technology is not a valid field name"));
9390  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadStrucCommMissingField",
9391  "Required field STR locus name is missing"));
9392  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadStrucCommMissingField",
9393  "Required field Length-based allele is missing"));
9394  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadStrucCommMissingField",
9395  "Required field Bracketed repeat is missing"));
9396  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9397  // AddChromosomeNoLocation(expected_errors, entry);
9398  CheckErrors(*eval, expected_errors);
9399 
9400  CLEAR_ERRORS
9401 }
9402 
9403 
// Helper used by the structured-comment test below: returns a user field
// whose label is the string `label` and whose data is the string `val`.
9404 CRef<CUser_field> MkField(const string& label, const string& val)
9405 {
      // NOTE(review): `f` is used below but its declaration is not visible in
      // this excerpt (the embedded numbering skips 9406) -- confirm against
      // the original file.
9407  f->SetLabel().SetStr(label);
9408  f->SetData().SetStr(val);
9409  return f;
9410 }
9411 
9412 
// Body of a test whose header line is not visible in this excerpt (the
// embedded numbering skips 9413). It attaches a "StructuredComment" user
// object with a Genome-Assembly-Data prefix and an "Assembly Name" of
// "NCBI1234", then expects BadAssemblyName and BadStrucCommInvalidFieldValue.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines (9416 and 9429 are
// missing) -- presumably provided by shared setup code; confirm.
9414 {
9415  // prepare entry
      // Build the structured comment field by field with MkField.
9417  CRef<CUser_object> user(new CUser_object());
9418  user->SetType().SetStr("StructuredComment");
9419  user->SetData().push_back(MkField("StructuredCommentPrefix", "##Genome-Assembly-Data-START##"));
9420  user->SetData().push_back(MkField("Assembly Method", "a v. b"));
9421  user->SetData().push_back(MkField("Assembly Name", "NCBI1234"));
9422  user->SetData().push_back(MkField("Genome Coverage", "1"));
9423  user->SetData().push_back(MkField("Sequencing Technology", "2"));
9424 
      // The descriptor receives a copy of `user` (Assign), and is added to the
      // sequence's descriptor list.
9425  CRef<CSeqdesc> desc(new CSeqdesc());
9426  desc->SetUser().Assign(*user);
9427  entry->SetSeq().SetDescr().Set().push_back(desc);
9428 
9430 
      // Both messages are expected at Info severity on "lcl|good".
9431  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadAssemblyName",
9432  "Assembly Name should not start with 'NCBI' or 'GenBank' in structured comment"));
9433  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9434  // AddChromosomeNoLocation(expected_errors, entry);
9435 
9436  eval = validator.Validate(seh, options);
9437 
9438  CheckErrors(*eval, expected_errors);
9439 
      // CLEAR_ERRORS is a macro defined elsewhere; presumably it releases and
      // empties expected_errors -- confirm.
9440  CLEAR_ERRORS
9441 }
9442 
9443 
// Checks the BioSourceNeedsChromosome warning for an entry whose title ends in
// "complete genome.", then re-validates after several changes (viral lineage,
// RNA molecule, different title ending) using the error list left by
// CLEAR_ERRORS, and finally expects only the ChromosomeLocation info message.
// NOTE(review): the setup lines (9447-9449, 9452) are missing from this
// excerpt, so the declarations of `entry`, `seh`, `validator`, `options`,
// `eval` and `expected_errors` are not visible here.
9444 BOOST_AUTO_TEST_CASE(Test_Descr_BioSourceNeedsChromosome)
9445 {
9446  // prepare entry
9450  SetTitle(entry, "Sebaea microphylla, complete genome.");
9451 
      // Baseline: the warning is expected for the title set above.
9453  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BioSourceNeedsChromosome",
9454  "Non-viral complete genome not labeled as chromosome"));
9455  // AddChromosomeNoLocation(expected_errors, entry);
9456 
9457  eval = validator.Validate(seh, options);
9458  CheckErrors(*eval, expected_errors);
9459 
      // Nothing is pushed onto expected_errors again until the final section,
      // so the next three checks compare against whatever CLEAR_ERRORS leaves
      // behind (presumably an empty list -- confirm the macro).
9460  CLEAR_ERRORS
9461 
9462  // AddChromosomeNoLocation(expected_errors, entry);
9463 
9464  // error goes away if viruses in lineage
9465  unit_test_util::SetLineage(entry, "Viruses; ");
9466  eval = validator.Validate(seh, options);
9467  CheckErrors(*eval, expected_errors);
      // Put a non-viral lineage back before the next variation.
9468  unit_test_util::SetLineage(entry, "some lineage");
9469 
9470  // error goes away if not genomic
      // NOTE(review): lines 9471 and 9475 are missing from this excerpt; they
      // presumably change and then restore state around this check -- confirm.
9472  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
9473  eval = validator.Validate(seh, options);
9474  CheckErrors(*eval, expected_errors);
9476 
9477  // error goes away if the title does not end with "complete genome"
9478  SetTitle(entry, "Sebaea microphylla, complete sequence.");
9479  eval = validator.Validate(seh, options);
9480  CheckErrors(*eval, expected_errors);
      // Restore the original title for the final section.
9481  SetTitle(entry, "Sebaea microphylla, complete genome.");
9482 
9483  // if source location is chromosome, only the ChromosomeLocation info is expected
      // NOTE(review): the statement that changes the source location (line
      // 9485) is missing from this excerpt.
9484  CLEAR_ERRORS
9486  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "ChromosomeLocation",
9487  "INDEXER_ONLY - BioSource location is chromosome"));
9488  eval = validator.Validate(seh, options);
9489  CheckErrors(*eval, expected_errors);
9490 
9491  CLEAR_ERRORS
9492 }
9493 
9494 
// Exercises the MolInfoConflictsWithBioSource warning by switching the
// organism lineage between several viral groups and flipping the sequence's
// molecule type between DNA and RNA; each combination is validated against
// the expected message (or against the list left by CLEAR_ERRORS).
// NOTE(review): setup lines 9498 and 9502 are missing from this excerpt, so
// the declarations of `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not visible here.
9495 BOOST_AUTO_TEST_CASE(Test_Descr_MolInfoConflictsWithBioSource)
9496 {
9497  // prepare entry
9499  // test for single-strand RNA viruses
9500  unit_test_util::SetLineage(entry, "Viruses; Avsunviroidae; foo");
9501 
9503  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MolInfoConflictsWithBioSource",
9504  "Taxonomy indicates single-stranded RNA, molecule type (DNA) is conflicting."));
9505  // AddChromosomeNoLocation(expected_errors, entry);
9506 
9507  eval = validator.Validate(seh, options);
9508  CheckErrors(*eval, expected_errors);
9509 
      // Same expected warning for two more lineages.
9510  unit_test_util::SetLineage(entry, "Viruses; Deltavirus; foo");
9511  eval = validator.Validate(seh, options);
9512  CheckErrors(*eval, expected_errors);
9513 
9514  unit_test_util::SetLineage(entry, "Viruses; Arenaviridae; foo");
9515  eval = validator.Validate(seh, options);
9516  CheckErrors(*eval, expected_errors);
9517 
      // The list is rebuilt with the same single warning; the
      // InconsistentVirusMoltype expectation below is left commented out.
9518  CLEAR_ERRORS
9519  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentVirusMoltype",
9520  // "Genomic DNA viral lineage indicates no DNA stage"));
9521  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MolInfoConflictsWithBioSource",
9522  "Taxonomy indicates single-stranded RNA, molecule type (DNA) is conflicting."));
9523  // AddChromosomeNoLocation(expected_errors, entry);
9524 
9525  unit_test_util::SetLineage(entry, "Viruses; Albetovirus; foo");
9526  eval = validator.Validate(seh, options);
9527  CheckErrors(*eval, expected_errors);
9528 
9529  // error should go away if mol is rna
9530  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
9531  CLEAR_ERRORS
9532  // AddChromosomeNoLocation(expected_errors, entry);
9533  eval = validator.Validate(seh, options);
9534  CheckErrors(*eval, expected_errors);
9535 
9536  // tests for double-stranded RNA viruses
9537  unit_test_util::SetLineage(entry, "Viruses; Amalgaviridae; foo");
9538  // should be no error because rna
9539  eval = validator.Validate(seh, options);
9540  CheckErrors(*eval, expected_errors);
9541  // error if not rna
9542  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
9543  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MolInfoConflictsWithBioSource",
9544  "Taxonomy indicates double-stranded RNA, molecule type (DNA) is conflicting."));
9545  eval = validator.Validate(seh, options);
9546  CheckErrors(*eval, expected_errors);
9547 
9548  // test for single-stranded DNA viruses
9549  unit_test_util::SetLineage(entry, "Viruses; Alphasatellitidae; foo");
9550  // no errors because is dna
9551  CLEAR_ERRORS
9552  // AddChromosomeNoLocation(expected_errors, entry);
9553  eval = validator.Validate(seh, options);
9554  CheckErrors(*eval, expected_errors);
9555  // error if not dna
9556  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
9557  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MolInfoConflictsWithBioSource",
9558  "Taxonomy indicates single-stranded DNA, molecule type (RNA) is conflicting."));
9559  eval = validator.Validate(seh, options);
9560  CheckErrors(*eval, expected_errors);
9561 
9562  // test for double-stranded DNA viruses
9563  unit_test_util::SetLineage(entry, "Viruses; Hepadnaviridae; foo");
9564  // error because not dna
      // Reuse the expected error pushed above, changing only its message.
9565  expected_errors.back()->SetErrMsg("Taxonomy indicates double-stranded DNA, molecule type (RNA) is conflicting.");
9566  eval = validator.Validate(seh, options);
9567  CheckErrors(*eval, expected_errors);
9568  //no error if dna
9569  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
9570  CLEAR_ERRORS
9571  // AddChromosomeNoLocation(expected_errors, entry);
9572  eval = validator.Validate(seh, options);
9573  CheckErrors(*eval, expected_errors);
9574  CLEAR_ERRORS
9575 }
9576 
9577 
// Adds a plain comment descriptor whose text contains "::" and expects the
// FakeStructuredComment info message on "lcl|good".
// NOTE(review): setup lines 9581 and 9586 are missing from this excerpt, so
// the declarations of `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not visible here.
9578 BOOST_AUTO_TEST_CASE(Test_Descr_FakeStructuredComment)
9579 {
9580  // prepare entry
9582  CRef<CSeqdesc> sdesc(new CSeqdesc());
9583  sdesc->SetComment("This comment contains ::");
9584  entry->SetSeq().SetDescr().Set().push_back(sdesc);
9585 
9587 
9588  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "FakeStructuredComment",
9589  "Comment may be formatted to look like a structured comment."));
9590  // AddChromosomeNoLocation(expected_errors, entry);
9591  eval = validator.Validate(seh, options);
9592  CheckErrors(*eval, expected_errors);
9593 
9594  CLEAR_ERRORS
9595 }
9596 
9597 
// Adds a "StructuredComment" user object that carries a single ordinary field
// ("OneField") and no prefix or suffix field, and expects the
// StrucCommMissingPrefixOrSuffix info message on "lcl|good".
// NOTE(review): setup lines 9601 and 9607 are missing from this excerpt, so
// the declarations of `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not visible here.
9598 BOOST_AUTO_TEST_CASE(Test_Descr_StructuredCommentPrefixOrSuffixMissing)
9599 {
9600  // prepare entry
9602  CRef<CSeqdesc> sdesc(new CSeqdesc());
9603  sdesc->SetUser().SetType().SetStr("StructuredComment");
9604  entry->SetSeq().SetDescr().Set().push_back(sdesc);
9605 
      // The field is added after the descriptor has been attached to the
      // entry; the descriptor is held by CRef, so the entry's list refers to
      // the same object.
9606  sdesc->SetUser().AddField("OneField", "some value", CUser_object::eParse_String);
9608 
9609  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "StrucCommMissingPrefixOrSuffix",
9610  "Structured Comment lacks prefix and/or suffix"));
9611  // AddChromosomeNoLocation(expected_errors, entry);
9612  eval = validator.Validate(seh, options);
9613  CheckErrors(*eval, expected_errors);
9614  CLEAR_ERRORS
9615 }
9616 
9617 
// Expects a single Fatal NonAsciiAsn error, first reported against
// "lcl|good" and then, after the entry in the scope is replaced, against
// "lcl|nuc". Unlike the neighbouring tests, the scope, validator, options and
// expected-error vector are declared explicitly in this body.
// NOTE(review): several lines are missing from this excerpt (9621, 9623,
// 9627, 9630-9633, 9644): the creation of `entry` and `objmgr`, the
// declaration of `eval`, the remainder of the `options` expression, and the
// statement that prepares the second entry are not visible here.
9618 BOOST_AUTO_TEST_CASE(Test_Generic_NonAsciiAsn)
9619 {
9620  // prepare entry
9622 
9624  CScope scope(*objmgr);
9625  scope.AddDefaults();
9626  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
9628  CValidator validator(*objmgr);
      // NOTE(review): this expression continues on lines not shown here.
9629  unsigned int options = CValidator::eVal_need_isojta
9634  vector<CExpectedError*> expected_errors;
9635 
9636  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Fatal, "NonAsciiAsn",
9637  "Non-ascii chars in input ASN.1 strings"));
9638  // AddChromosomeNoLocation(expected_errors, entry);
9639  eval = validator.Validate(seh, options);
9640  CheckErrors(*eval, expected_errors);
9641 
9642  // error should only appear once
      // The entry is removed from the scope and an entry is added again; the
      // same single expected error is kept, with its accession changed to
      // "lcl|nuc".
9643  scope.RemoveTopLevelSeqEntry(seh);
9645  seh = scope.AddTopLevelSeqEntry(*entry);
9646  ChangeErrorAcc(expected_errors, "lcl|nuc");
9647  eval = validator.Validate(seh, options);
9648  CheckErrors(*eval, expected_errors);
9649 
9650  CLEAR_ERRORS
9651 }
9652 
9653 
// Expects the MissingPersonalCollectionName warning on "lcl|good".
// NOTE(review): the lines that build the entry and trigger the warning
// (9656, 9657, 9659) are missing from this excerpt, so the condition under
// test and the declarations of `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not visible here.
9654 BOOST_AUTO_TEST_CASE(Test_SEQ_DESCR_MissingPersonalCollectionName)
9655 {
9658 
9660  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingPersonalCollectionName",
9661  "Personal collection does not have name of collector"));
9662  // AddChromosomeNoLocation(expected_errors, entry);
9663  eval = validator.Validate(seh, options);
9664  CheckErrors(*eval, expected_errors);
9665 
9666  CLEAR_ERRORS
9667 }
9668 
9669 
// Checks the AuthorListHasEtAl warning in four passes over the same `pub`
// object:
//   1. pub in a descriptor, author list consisting of "et al."  -> "ends in"
//   2. the same pub in a pub feature                             -> "ends in"
//   3. pub in a descriptor, "et al." followed by `author2`       -> "contains"
//   4. the same pub in a pub feature                             -> "contains"
// In each pass the pub is filled in turn through SetArticle(), SetMan(),
// SetBook(), SetProc(), SetGen() and SetSub(), and validated after each one.
// NOTE(review): lines 9673, 9678, 9685, 9694 and 9782 are missing from this
// excerpt, so the declarations of `entry`, `art_title`, `book_title`,
// `author2`, `scope`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not visible here.
9670 BOOST_AUTO_TEST_CASE(Test_Generic_AuthorListHasEtAl)
9671 {
9672  // prepare entry
9674  CRef<CAuthor> author(new CAuthor());
9675  author->SetName().SetName().SetLast("et al.");
9676  CRef<CPub> pub(new CPub());
9677  pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author);
9679  art_title->SetName("article title");
9680  pub->SetArticle().SetTitle().Set().push_back(art_title);
9681  CRef<CSeqdesc> desc(new CSeqdesc());
9682  desc->SetPub().SetPub().Set().push_back(pub);
9683  entry->SetDescr().Set().push_back(desc);
9684 
9686 
      // Pass 1: pub descriptor, single "et al." author.
9687  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "AuthorListHasEtAl",
9688  "Author list ends in et al."));
9689  // AddChromosomeNoLocation(expected_errors, entry);
9690  eval = validator.Validate(seh, options);
9691  CheckErrors(*eval, expected_errors);
9692 
9693  pub->SetMan().SetCit().SetAuthors().SetNames().SetStd().push_back(author);
9695  book_title->SetName("book title");
9696  pub->SetMan().SetCit().SetTitle().Set().push_back(book_title);
9697  eval = validator.Validate(seh, options);
9698  CheckErrors(*eval, expected_errors);
9699 
9700  pub->SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9701  pub->SetBook().SetTitle().Set().push_back(book_title);
9702  eval = validator.Validate(seh, options);
9703  CheckErrors(*eval, expected_errors);
9704 
9705  pub->SetProc().SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9706  pub->SetProc().SetBook().SetTitle().Set().push_back(book_title);
9707  eval = validator.Validate(seh, options);
9708  CheckErrors(*eval, expected_errors);
9709 
9710  pub->SetGen().SetAuthors().SetNames().SetStd().push_back(author);
9711  pub->SetGen().SetTitle("gen title");
9712  pub->SetGen().SetDate().SetStd().SetYear(2009);
9713  eval = validator.Validate(seh, options);
9714  CheckErrors(*eval, expected_errors);
9715 
      // The submission citation gets an affiliation string and a full date in
      // addition to the author.
9716  pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author);
9717  pub->SetSub().SetAuthors().SetAffil().SetStr("some affiliation");
9718 
9719  pub->SetSub().SetDate().SetStd().SetYear(2009);
9720  pub->SetSub().SetDate().SetStd().SetMonth(12);
9721  pub->SetSub().SetDate().SetStd().SetDay(31);
9722 
9723  eval = validator.Validate(seh, options);
9724  CheckErrors(*eval, expected_errors);
9725 
9726  // try as pub feature
      // Pass 2: the pub descriptor is removed and the same pub is placed in a
      // pub feature on interval 0..10 of "good"; the entry is taken out of
      // the scope while it is modified and re-added afterwards.
9727  scope.RemoveTopLevelSeqEntry(seh);
9728  entry->SetDescr().Set().pop_back();
9729  CRef<CSeq_feat> feat(new CSeq_feat());
9730  feat->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
9731  feat->SetLocation().SetInt().SetFrom(0);
9732  feat->SetLocation().SetInt().SetTo(10);
9733  feat->SetData().SetPub().SetPub().Set().push_back(pub);
9734  CRef<CSeq_annot> annot(new CSeq_annot());
9735  annot->SetData().SetFtable().push_back(feat);
9736  entry->SetSeq().SetAnnot().push_back(annot);
9737  seh = scope.AddTopLevelSeqEntry(*entry);
9738 
9739  pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author);
9740  pub->SetArticle().SetTitle().Set().push_back(art_title);
9741  eval = validator.Validate(seh, options);
9742  CheckErrors(*eval, expected_errors);
9743 
9744  pub->SetMan().SetCit().SetAuthors().SetNames().SetStd().push_back(author);
9745  pub->SetMan().SetCit().SetTitle().Set().push_back(book_title);
9746  eval = validator.Validate(seh, options);
9747  CheckErrors(*eval, expected_errors);
9748 
9749  pub->SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9750  pub->SetBook().SetTitle().Set().push_back(book_title);
9751  eval = validator.Validate(seh, options);
9752  CheckErrors(*eval, expected_errors);
9753 
9754  pub->SetProc().SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9755  pub->SetProc().SetBook().SetTitle().Set().push_back(book_title);
9756  eval = validator.Validate(seh, options);
9757  CheckErrors(*eval, expected_errors);
9758 
9759  pub->SetGen().SetAuthors().SetNames().SetStd().push_back(author);
9760  pub->SetGen().SetTitle("gen title");
9761  pub->SetGen().SetDate().SetStd().SetYear(2009);
9762  eval = validator.Validate(seh, options);
9763  CheckErrors(*eval, expected_errors);
9764 
9765  pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author);
9766  pub->SetSub().SetAuthors().SetAffil().SetStr("some affiliation");
9767 
9768  pub->SetSub().SetDate().SetStd().SetYear(2009);
9769  pub->SetSub().SetDate().SetStd().SetMonth(12);
9770  pub->SetSub().SetDate().SetStd().SetDay(31);
9771 
9772  eval = validator.Validate(seh, options);
9773  CheckErrors(*eval, expected_errors);
9774 
9775  // look for contains instead of ends with
      // Pass 3: back to the pub descriptor (annotation removed, `desc`
      // re-added). `author2` is appended after the "et al." author so that
      // "et al." is no longer last; the expected message changes accordingly.
9776  scope.RemoveTopLevelSeqEntry(seh);
9777  entry->SetSeq().SetAnnot().pop_back();
9778  entry->SetDescr().Set().push_back(desc);
9779  seh = scope.AddTopLevelSeqEntry(*entry);
9780 
9781  expected_errors[0]->SetErrMsg("Author list contains et al.");
9783 
9784  pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author);
9785  pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author2);
9786  pub->SetArticle().SetTitle().Set().push_back(art_title);
9787  eval = validator.Validate(seh, options);
9788  CheckErrors(*eval, expected_errors);
9789 
9790  pub->SetMan().SetCit().SetAuthors().SetNames().SetStd().push_back(author);
9791  pub->SetMan().SetCit().SetAuthors().SetNames().SetStd().push_back(author2);
9792  pub->SetMan().SetCit().SetTitle().Set().push_back(book_title);
9793  eval = validator.Validate(seh, options);
9794  CheckErrors(*eval, expected_errors);
9795 
9796  pub->SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9797  pub->SetBook().SetAuthors().SetNames().SetStd().push_back(author2);
9798  pub->SetBook().SetTitle().Set().push_back(book_title);
9799  eval = validator.Validate(seh, options);
9800  CheckErrors(*eval, expected_errors);
9801 
9802  pub->SetProc().SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9803  pub->SetProc().SetBook().SetAuthors().SetNames().SetStd().push_back(author2);
9804  pub->SetProc().SetBook().SetTitle().Set().push_back(book_title);
9805  eval = validator.Validate(seh, options);
9806  CheckErrors(*eval, expected_errors);
9807 
9808  pub->SetGen().SetAuthors().SetNames().SetStd().push_back(author);
9809  pub->SetGen().SetAuthors().SetNames().SetStd().push_back(author2);
9810  pub->SetGen().SetTitle("gen title");
9811  pub->SetGen().SetDate().SetStd().SetYear(2009);
9812  eval = validator.Validate(seh, options);
9813  CheckErrors(*eval, expected_errors);
9814 
9815  pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author);
9816  pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author2);
9817  pub->SetSub().SetAuthors().SetAffil().SetStr("some affiliation");
9818 
9819  pub->SetSub().SetDate().SetStd().SetYear(2009);
9820  pub->SetSub().SetDate().SetStd().SetMonth(12);
9821  pub->SetSub().SetDate().SetStd().SetDay(31);
9822 
9823  eval = validator.Validate(seh, options);
9824  CheckErrors(*eval, expected_errors);
9825 
9826  // try as pub feature
      // Pass 4: same "contains" checks with the pub feature; the annotation
      // built in pass 2 is reused.
9827  scope.RemoveTopLevelSeqEntry(seh);
9828  entry->SetDescr().Set().pop_back();
9829  entry->SetSeq().SetAnnot().push_back(annot);
9830  seh = scope.AddTopLevelSeqEntry(*entry);
9831 
9832  pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author);
9833  pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author2);
9834  pub->SetArticle().SetTitle().Set().push_back(art_title);
9835  eval = validator.Validate(seh, options);
9836  CheckErrors(*eval, expected_errors);
9837 
9838  pub->SetMan().SetCit().SetAuthors().SetNames().SetStd().push_back(author);
9839  pub->SetMan().SetCit().SetAuthors().SetNames().SetStd().push_back(author2);
9840  pub->SetMan().SetCit().SetTitle().Set().push_back(book_title);
9841  eval = validator.Validate(seh, options);
9842  CheckErrors(*eval, expected_errors);
9843 
9844  pub->SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9845  pub->SetBook().SetAuthors().SetNames().SetStd().push_back(author2);
9846  pub->SetBook().SetTitle().Set().push_back(book_title);
9847  eval = validator.Validate(seh, options);
9848  CheckErrors(*eval, expected_errors);
9849 
9850  pub->SetProc().SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9851  pub->SetProc().SetBook().SetAuthors().SetNames().SetStd().push_back(author2);
9852  pub->SetProc().SetBook().SetTitle().Set().push_back(book_title);
9853  eval = validator.Validate(seh, options);
9854  CheckErrors(*eval, expected_errors);
9855 
9856  pub->SetGen().SetAuthors().SetNames().SetStd().push_back(author);
9857  pub->SetGen().SetAuthors().SetNames().SetStd().push_back(author2);
9858  pub->SetGen().SetTitle("gen title");
9859  pub->SetGen().SetDate().SetStd().SetYear(2009);
9860  eval = validator.Validate(seh, options);
9861  CheckErrors(*eval, expected_errors);
9862 
9863  pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author);
9864  pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author2);
9865  pub->SetSub().SetAuthors().SetAffil().SetStr("some affiliation");
9866 
9867  pub->SetSub().SetDate().SetStd().SetYear(2009);
9868  pub->SetSub().SetDate().SetStd().SetMonth(12);
9869  pub->SetSub().SetDate().SetStd().SetDay(31);
9870 
9871  eval = validator.Validate(seh, options);
9872  CheckErrors(*eval, expected_errors);
9873 
9874  CLEAR_ERRORS
9875 }
9876 
9877 
// Walks through the MissingPubRequirement family of messages (plus
// MissingISOJTA, MissingVolume, MissingPages and PublicationInconsistency)
// for a Seq-submit citation and for Cit-sub, Cit-gen and Cit-art
// publications. The whole sequence is run twice: once with the local id
// "good" and once with the accession "NC_123456"; several expected
// severities differ between the two.
// NOTE(review): lines 9883, 9885, 9893, 9996, 10000, 10003, 10006, 10052,
// 10073 and 10087 are missing from this excerpt, so the declarations of
// `entry`, `author`, `scope`, `seh`, `validator`, `options`, `eval`,
// `expected_errors`, `art_title`, `journal_title` and `iso_jta` are not
// visible here.
9878 BOOST_AUTO_TEST_CASE(Test_Generic_MissingPubRequirement)
9879 {
9880  // validate cit-sub
9881  CRef<CSeq_submit> submit(new CSeq_submit());
9882 
9884  submit->SetData().SetEntrys().push_back(entry);
9886  submit->SetSub().SetCit().SetAuthors().SetNames().SetStd().push_back(author);
9887  submit->SetSub().SetCit().SetAuthors().SetAffil().SetStd().SetAffil("some affiliation");
9888 
9889  submit->SetSub().SetCit().SetDate().SetStd().SetYear(2009);
9890  submit->SetSub().SetCit().SetDate().SetStd().SetMonth(12);
9891  submit->SetSub().SetCit().SetDate().SetStd().SetDay(31);
9892 
9894 
9895  vector<string> ids;
9896  ids.push_back("good");
9897  ids.push_back("NC_123456");
9898 
9899  for (const string& id_it : ids) {
      // `sev` stays Warning for the "NC_" accession and becomes Critical for
      // the local id. The entry is taken out of the scope while its first
      // Seq-id is rewritten.
9900  EDiagSev sev = eDiag_Warning;
9901  scope.RemoveTopLevelSeqEntry(seh);
9902  if (NStr::StartsWith(id_it, "NC_")) {
9903  entry->SetSeq().SetId().front()->SetOther().SetAccession(id_it);
9904  } else {
9905  entry->SetSeq().SetId().front()->SetLocal().SetStr(id_it);
9906  sev = eDiag_Critical;
9907  }
9908  seh = scope.AddTopLevelSeqEntry(*entry);
9909 
      // Reset the submit-block citation affiliation and contact so that each
      // loop iteration starts from the same state.
9910  submit->SetSub().SetCit().SetAuthors().ResetAffil();
9911  submit->SetSub().SetCit().SetAuthors().SetAffil().SetStd().SetAffil("some affiliation");
9912  submit->SetSub().ResetContact();
      // Accession text used in expected errors: "ref|<acc>|" or "lcl|<id>".
      // NOTE(review): this test uses the prefix "NC" while every other check
      // in this loop uses "NC_"; the two agree for the ids used here.
9913  string msg_acc = NStr::StartsWith(id_it, "NC") ? "ref|" + id_it + "|" : "lcl|" + id_it;
9914  expected_errors.push_back(new CExpectedError(msg_acc,
9915  sev, "MissingPubRequirement",
9916  "Submission citation affiliation has no country"));
9917  // AddChromosomeNoLocation(expected_errors, entry);
9918  eval = validator.Validate(*submit, &scope, options);
9919  CheckErrors(*eval, expected_errors);
9920 
      // With a country but no state, a Warning about the state is expected.
9921  submit->SetSub().SetCit().SetAuthors().SetAffil().SetStd().SetCountry("USA");
9922  expected_errors[0]->SetErrMsg("Submission citation affiliation has no state");
9923  expected_errors[0]->SetSeverity(eDiag_Warning);
9924  eval = validator.Validate(*submit, &scope, options);
9925  CheckErrors(*eval, expected_errors);
9926  CLEAR_ERRORS
9927 
      // Citation affiliation is now complete; the contact affiliation lacks a
      // country. The expected error carries an empty accession and Warning
      // severity.
9928  submit->SetSub().SetCit().SetAuthors().SetAffil().SetStd().SetSub("VA");
9929  submit->SetSub().SetContact().SetContact().SetAffil().SetStd().SetAffil("some affiliation");
9930  expected_errors.push_back(new CExpectedError(msg_acc, sev, "MissingPubRequirement",
9931  "Submission citation affiliation has no country"));
9932  expected_errors[0]->SetAccession("");
9933  expected_errors[0]->SetSeverity(eDiag_Warning);
9934  // AddChromosomeNoLocation(expected_errors, entry);
9935  eval = validator.Validate(*submit, &scope, options);
9936  CheckErrors(*eval, expected_errors);
9937 
9938  submit->SetSub().SetContact().SetContact().SetAffil().SetStd().SetCountry("USA");
9939  expected_errors[0]->SetErrMsg("Submission citation affiliation has no state");
9940  expected_errors[0]->SetSeverity(eDiag_Warning);
9941  eval = validator.Validate(*submit, &scope, options);
9942  CheckErrors(*eval, expected_errors);
9943  CLEAR_ERRORS
9944 
      // Cit-sub as a pub descriptor on the entry; from here on the entry
      // handle is validated instead of the Seq-submit.
9945  scope.RemoveTopLevelSeqEntry(seh);
9946  CRef<CPub> pub(new CPub());
9947  CRef<CSeqdesc> desc(new CSeqdesc());
9948  desc->SetPub().SetPub().Set().push_back(pub);
9949  entry->SetDescr().Set().push_back(desc);
9950  pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author);
9951  pub->SetSub().SetAuthors().SetAffil().SetStd().SetAffil("some affiliation");
9952 
9953  pub->SetSub().SetDate().SetStd().SetYear(2009);
9954  pub->SetSub().SetDate().SetStd().SetMonth(12);
9955  pub->SetSub().SetDate().SetStd().SetDay(31);
9956 
9957  seh = scope.AddTopLevelSeqEntry(*entry);
9958 
9959  expected_errors.push_back(new CExpectedError(msg_acc, sev, "MissingPubRequirement",
9960  "Submission citation affiliation has no country"));
9961  // AddChromosomeNoLocation(expected_errors, entry);
9962  eval = validator.Validate(seh, options);
9963  CheckErrors(*eval, expected_errors);
9964 
9965  pub->SetSub().SetAuthors().SetAffil().SetStd().SetCountry("USA");
9966  expected_errors[0]->SetErrMsg("Submission citation affiliation has no state");
9967  expected_errors[0]->SetSeverity(eDiag_Warning);
9968  eval = validator.Validate(seh, options);
9969  CheckErrors(*eval, expected_errors);
9970 
      // Complete affiliation but no authors: Critical for both ids.
9971  pub->SetSub().SetAuthors().SetAffil().SetStd().SetSub("VA");
9972  pub->SetSub().SetAuthors().SetNames().SetStd().pop_back();
9973 
9974  expected_errors[0]->SetErrMsg("Submission citation has no author names");
9975  expected_errors[0]->SetSeverity(eDiag_Critical);
9976  eval = validator.Validate(seh, options);
9977  CheckErrors(*eval, expected_errors);
9978  CLEAR_ERRORS
9979 
      // Author restored, all affiliation sub-fields reset: "no affiliation"
      // is expected, both with an emptied affiliation and with none at all.
9980  pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author);
9981  pub->SetSub().SetAuthors().SetAffil().SetStd().ResetCountry();
9982  pub->SetSub().SetAuthors().SetAffil().SetStd().ResetSub();
9983  pub->SetSub().SetAuthors().SetAffil().SetStd().ResetAffil();
9984  expected_errors.push_back(new CExpectedError(msg_acc,
9985  NStr::StartsWith(id_it, "NC_") ? eDiag_Warning : eDiag_Critical,
9986  "MissingPubRequirement",
9987  "Submission citation has no affiliation"));
9988  // AddChromosomeNoLocation(expected_errors, entry);
9989  eval = validator.Validate(seh, options);
9990  CheckErrors(*eval, expected_errors);
9991 
9992  pub->SetSub().SetAuthors().ResetAffil();
9993  eval = validator.Validate(seh, options);
9994  CheckErrors(*eval, expected_errors);
9995 
      // NOTE(review): lines 9996, 10000, 10003 and 10006 are missing from
      // this excerpt; each presumably changes state before the validation
      // that follows it -- confirm against the original file.
9997  expected_errors[0]->SetSeverity(eDiag_Warning);
9998  eval = validator.Validate(seh, options);
9999  CheckErrors(*eval, expected_errors);
10001  eval = validator.Validate(seh, options);
10002  CheckErrors(*eval, expected_errors);
10004  eval = validator.Validate(seh, options);
10005  CheckErrors(*eval, expected_errors);
10007 
10008  CLEAR_ERRORS
10009 
       // Cit-gen checks: citation text and publication date.
10010  pub->SetGen().SetAuthors().SetNames().SetStd().push_back(author);
10011  pub->SetGen().SetCit("Does not start with expected text");
10012 
10013  expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Error, "MissingPubRequirement",
10014  "Unpublished citation text invalid"));
10015  expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "MissingPubRequirement",
10016  "Publication date missing"));
10017  // AddChromosomeNoLocation(expected_errors, entry);
10018 
10019  eval = validator.Validate(seh, options);
10020  CheckErrors(*eval, expected_errors);
10021 
       // The second expectation is dropped by freeing it and leaving a null
       // slot; presumably CheckErrors ignores null entries -- confirm.
10022  delete expected_errors[1];
10023  expected_errors[1] = nullptr;
10024 
10025  pub->SetGen().SetCit("submitted starts with expected text");
10026  pub->SetGen().SetDate().SetStr("?");
10027  expected_errors[0]->SetErrMsg("Publication date marked as '?'");
10028  expected_errors[0]->SetSeverity(eDiag_Warning);
10029  eval = validator.Validate(seh, options);
10030  CheckErrors(*eval, expected_errors);
10031 
10032  pub->SetGen().SetDate().SetStd().SetYear(0);
10033  expected_errors[0]->SetErrMsg("Publication date not set");
10034  eval = validator.Validate(seh, options);
10035  CheckErrors(*eval, expected_errors);
10036 
       // No date and no authors: Error for the local id, while the severity
       // is left at Warning for the "NC_" accession.
10037  pub->SetGen().ResetDate();
10038  pub->SetGen().SetAuthors().SetNames().SetStd().pop_back();
10039  if (!NStr::StartsWith(id_it, "NC_")) {
10040  expected_errors[0]->SetSeverity(eDiag_Error);
10041  }
10042  expected_errors[0]->SetErrMsg("Publication has no author names");
10043  eval = validator.Validate(seh, options);
10044  CheckErrors(*eval, expected_errors);
10045 
       // Cit-art checks: title, authors, journal title, ISO-JTA, volume,
       // pages and date.
10046  pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author);
10047  expected_errors[0]->SetSeverity(eDiag_Error);
10048  expected_errors[0]->SetErrMsg("Publication has no title");
10049  eval = validator.Validate(seh, options);
10050  CheckErrors(*eval, expected_errors);
10051 
10053  art_title->SetName("article title");
10054  pub->SetArticle().SetTitle().Set().push_back(art_title);
10055  pub->SetArticle().SetAuthors().SetNames().SetStd().pop_back();
10056  expected_errors[0]->SetErrMsg("Publication has no author names");
10057  if (NStr::StartsWith(id_it, "NC_")) {
10058  expected_errors[0]->SetSeverity(eDiag_Warning);
10059  }
10060  eval = validator.Validate(seh, options);
10061  CheckErrors(*eval, expected_errors);
10062 
10063  pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author);
10064  pub->SetArticle().SetFrom().SetJournal().SetImp().SetVolume("vol 1");
10065  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPages("14-32");
10066  pub->SetArticle().SetFrom().SetJournal().SetImp().SetDate().SetStd().SetYear(2009);
10067  expected_errors[0]->SetSeverity(eDiag_Error);
10068  expected_errors[0]->SetErrMsg("Journal title missing");
10069  expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "MissingISOJTA",
10070  "ISO journal title abbreviation missing"));
10071  eval = validator.Validate(seh, options);
10072  CheckErrors(*eval, expected_errors);
       // Once a journal title is present, the "Journal title missing"
       // expectation is freed and nulled, leaving only MissingISOJTA.
10074  journal_title->SetName("journal_title");
10075  pub->SetArticle().SetFrom().SetJournal().SetTitle().Set().push_back(journal_title);
10076  delete expected_errors[0];
10077  expected_errors[0] = nullptr;
10078  eval = validator.Validate(seh, options);
10079  CheckErrors(*eval, expected_errors);
10080  CLEAR_ERRORS
10081 
       // With an ISO-JTA title added: both volume and pages missing, then
       // pages only, then volume only.
10082  expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "MissingVolume",
10083  "Journal volume missing"));
10084  expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "MissingPages",
10085  "Journal pages missing"));
10086  // AddChromosomeNoLocation(expected_errors, entry);
10088  iso_jta->SetIso_jta("abbr");
10089  pub->SetArticle().SetFrom().SetJournal().SetTitle().Set().push_back(iso_jta);
10090  pub->SetArticle().SetFrom().SetJournal().SetImp().ResetVolume();
10091  pub->SetArticle().SetFrom().SetJournal().SetImp().ResetPages();
10092  eval = validator.Validate(seh, options);
10093  CheckErrors(*eval, expected_errors);
10094  CLEAR_ERRORS
10095  expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "MissingPages",
10096  "Journal pages missing"));
10097  // AddChromosomeNoLocation(expected_errors, entry);
10098  pub->SetArticle().SetFrom().SetJournal().SetImp().SetVolume("vol 1");
10099  eval = validator.Validate(seh, options);
10100  CheckErrors(*eval, expected_errors);
10101  CLEAR_ERRORS
10102  expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "MissingVolume",
10103  "Journal volume missing"));
10104  // AddChromosomeNoLocation(expected_errors, entry);
10105  pub->SetArticle().SetFrom().SetJournal().SetImp().ResetVolume();
10106  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPages("14-32");
       // NOTE(review): restates the message already given when the expected
       // error was constructed just above.
10107  expected_errors[0]->SetErrMsg("Journal volume missing");
10108  eval = validator.Validate(seh, options);
10109  CheckErrors(*eval, expected_errors);
10110  CLEAR_ERRORS
       // Imprint date variations: missing, marked "?", and year 0.
10111  expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "MissingPubRequirement",
10112  "Publication date missing"));
10113  // AddChromosomeNoLocation(expected_errors, entry);
10114  pub->SetArticle().SetFrom().SetJournal().SetImp().SetVolume("vol 1");
10115  pub->SetArticle().SetFrom().SetJournal().SetImp().ResetDate();
10116  expected_errors[0]->SetErrMsg("Publication date missing");
10117  expected_errors[0]->SetSeverity(eDiag_Warning);
10118  eval = validator.Validate(seh, options);
10119  CheckErrors(*eval, expected_errors);
10120  pub->SetArticle().SetFrom().SetJournal().SetImp().SetDate().SetStr("?");
10121  expected_errors[0]->SetErrMsg("Publication date marked as '?'");
10122  eval = validator.Validate(seh, options);
10123  CheckErrors(*eval, expected_errors);
10124  pub->SetArticle().SetFrom().SetJournal().SetImp().SetDate().SetStd().SetYear(0);
10125  expected_errors[0]->SetErrMsg("Publication date not set");
10126  eval = validator.Validate(seh, options);
10127  CheckErrors(*eval, expected_errors);
10128 
10129  CLEAR_ERRORS
10130  // AddChromosomeNoLocation(expected_errors, entry);
10131  //suppress ISOJTA warning if electronic journal
       // The last journal title entry (the ISO-JTA pushed above) is removed;
       // no MissingISOJTA expectation is added for the "(er)" title or for
       // the e-publish status.
10132  pub->SetArticle().SetFrom().SetJournal().SetImp().SetDate().SetStd().SetYear(2009);
10133  pub->SetArticle().SetFrom().SetJournal().SetTitle().Set().pop_back();
10134  journal_title->SetName("(er) Journal Title");
10135  eval = validator.Validate(seh, options);
10136  CheckErrors(*eval, expected_errors);
10137  journal_title->SetName("(journal title");
10138  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPubstatus(ePubStatus_epublish);
10139  eval = validator.Validate(seh, options);
10140  CheckErrors(*eval, expected_errors);
       // Ahead-of-print plus in-press, with pages still set to "14-32".
10141  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPubstatus(ePubStatus_aheadofprint);
10142  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPrepub(CImprint::ePrepub_in_press);
10143  expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "PublicationInconsistency",
10144  "In-press is not expected to have page numbers"));
10145  eval = validator.Validate(seh, options);
10146  CheckErrors(*eval, expected_errors);
10147  CLEAR_ERRORS
10148 
       // Remove the pub descriptor so the next id starts without it.
10149  entry->SetDescr().Set().pop_back();
10150  }
10151 }
10152 
10153 
// Test_Generic_UnnecessaryPubEquiv: puts a CPub whose choice is "equiv" inside a
// pub descriptor's own pub list and expects the UnnecessaryPubEquiv warning.
// NOTE(review): entry, seh, validator, options, eval and expected_errors are not
// declared in the visible text -- presumably set up on lines not shown here.
10154 BOOST_AUTO_TEST_CASE(Test_Generic_UnnecessaryPubEquiv)
10155 {
10157 
       // Publication with the equiv variant selected and nothing inside it.
10158  CRef<CPub> pub(new CPub());
10159  pub->SetEquiv();
       // Wrap it in a pub descriptor and attach the descriptor to the entry.
10160  CRef<CSeqdesc> desc(new CSeqdesc());
10161  desc->SetPub().SetPub().Set().push_back(pub);
10162  entry->SetDescr().Set().push_back(desc);
10163 
10165 
       // Exactly one warning is expected, reported against lcl|good.
10166  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnnecessaryPubEquiv",
10167  "Publication has unexpected internal Pub-equiv"));
10168  // AddChromosomeNoLocation(expected_errors, entry);
10169  eval = validator.Validate(seh, options);
10170  CheckErrors(*eval, expected_errors);
10171 
10172  CLEAR_ERRORS
10173 }
10174 
10175 
// Test_Generic_BadPageNumbering: feeds a series of malformed page strings to a
// journal imprint and expects one BadPageNumbering warning per validation, with
// the message changing for each kind of malformation.
// NOTE(review): the declaration of `pub` (and the usual setup of entry/seh/eval)
// is not visible in this text -- presumably on lines not shown here.
10176 BOOST_AUTO_TEST_CASE(Test_Generic_BadPageNumbering)
10177 {
10179 
       // Attach the publication to the entry through a pub descriptor.
10181  CRef<CSeqdesc> desc(new CSeqdesc());
10182  desc->SetPub().SetPub().Set().push_back(pub);
10183  entry->SetDescr().Set().push_back(desc);
10184 
10186 
       // Zero as the start page.
10187  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPages("0-32");
10188  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPageNumbering",
10189  "Page numbering has zero value"));
10190  // AddChromosomeNoLocation(expected_errors, entry);
10191  eval = validator.Validate(seh, options);
10192  CheckErrors(*eval, expected_errors);
       // Zero as the stop page: same message is expected.
10193  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPages("14-0");
10194  eval = validator.Validate(seh, options);
10195  CheckErrors(*eval, expected_errors);
10196 
       // Doubled dash before the stop page.
10197  expected_errors[0]->SetErrMsg("Page numbering has negative value");
10198  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPages("14--32");
10199  eval = validator.Validate(seh, options);
10200  CheckErrors(*eval, expected_errors);
10201 
       // Start page larger than stop page.
10202  expected_errors[0]->SetErrMsg("Page numbering out of order");
10203  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPages("32-14");
10204  eval = validator.Validate(seh, options);
10205  CheckErrors(*eval, expected_errors);
10206 
       // Range 14-65 spans more than 50 pages.
10207  expected_errors[0]->SetErrMsg("Page numbering greater than 50");
10208  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPages("14-65");
10209  eval = validator.Validate(seh, options);
10210  CheckErrors(*eval, expected_errors);
10211 
       // Non-numeric stop page.
10212  expected_errors[0]->SetErrMsg("Page numbering stop looks strange");
10213  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPages("14-A");
10214  eval = validator.Validate(seh, options);
10215  CheckErrors(*eval, expected_errors);
10216 
       // Start page with a leading period.
10217  expected_errors[0]->SetErrMsg("Page numbering start looks strange");
10218  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPages(".14-32");
10219  eval = validator.Validate(seh, options);
10220  CheckErrors(*eval, expected_errors);
10221 
10222  CLEAR_ERRORS
10223 }
10224 
10225 
// Test_Generic_MedlineEntryPub: adds a pub descriptor whose publication is a
// Medline entry and expects the MedlineEntryPub error on lcl|good.
// NOTE(review): entry, seh, validator, options, eval and expected_errors are not
// declared in the visible text -- presumably set up on lines not shown here.
10226 BOOST_AUTO_TEST_CASE(Test_Generic_MedlineEntryPub)
10227 {
10229 
       // Publication with the medline variant selected.
10230  CRef<CPub> pub(new CPub());
10231  pub->SetMedline();
10232  CRef<CSeqdesc> desc(new CSeqdesc());
10233  desc->SetPub().SetPub().Set().push_back(pub);
10234  entry->SetDescr().Set().push_back(desc);
10235 
10237 
       // Reported at Error severity, unlike most of the neighbouring pub tests.
10238  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MedlineEntryPub",
10239  "Publication is medline entry"));
10240  // AddChromosomeNoLocation(expected_errors, entry);
10241  eval = validator.Validate(seh, options);
10242  CheckErrors(*eval, expected_errors);
10243 
10244  CLEAR_ERRORS
10245 }
10246 
10247 
// Fills `date` with the structured date 2009-12-31 and additionally sets the
// season field to "1". Test_Generic_BadDate uses dates built this way where it
// expects the validator to report BAD_SEASON.
10248 static void MakeBadSeasonDate(CDate& date)
10249 {
10250  date.SetStd().SetYear(2009);
10251  date.SetStd().SetMonth(12);
10252  date.SetStd().SetDay(31);
       // Year/month/day above are in range; the season value is what the tests rely on.
10253  date.SetStd().SetSeason("1");
10254 }
10255 
10256 
// Test_Generic_BadDate: walks through the BadDate error variants (BAD_STR,
// BAD_YEAR, BAD_MONTH, BAD_DAY, BAD_SEASON) on the submission citation, then
// checks BAD_SEASON on other publication dates and on create/update dates.
// NOTE(review): several lines are absent from the visible text (setup of
// entry/seh/eval, the declaration of `author`, and the statements that give the
// new descriptor its create/update date) -- confirm against the full file.
10257 BOOST_AUTO_TEST_CASE(Test_Generic_BadDate)
10258 {
10260 
10261  // find sub pub and other pub
10262  CRef<CPub> subpub;
10263  CRef<CPub> otherpub;
       // Only the first pub of each pub descriptor is inspected; later matches overwrite earlier ones.
10264  for (auto& it : entry->SetSeq().SetDescr().Set()) {
10265  if (it->IsPub()) {
10266  if (it->GetPub().GetPub().Get().front()->IsSub()) {
10267  subpub = it->SetPub().SetPub().Set().front();
10268  } else {
10269  otherpub = it->SetPub().SetPub().Set().front();
10270  }
10271  }
10272  }
10273 
10275 
       // String date "?" on the submission citation.
10276  subpub->SetSub().SetDate().SetStr("?");
10277  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDate",
10278  "Submission citation date has error - BAD_STR"));
10279  // AddChromosomeNoLocation(expected_errors, entry);
10280  eval = validator.Validate(seh, options);
10281  CheckErrors(*eval, expected_errors);
10282 
       // Structured date with year 0.
10283  subpub->SetSub().SetDate().SetStd().SetYear(0);
10284  expected_errors[0]->SetErrMsg("Submission citation date has error - BAD_YEAR");
10285  eval = validator.Validate(seh, options);
10286  CheckErrors(*eval, expected_errors);
10287 
       // Valid year, month 13.
10288  subpub->SetSub().SetDate().SetStd().SetYear(2009);
10289  subpub->SetSub().SetDate().SetStd().SetMonth(13);
10290  expected_errors[0]->SetErrMsg("Submission citation date has error - BAD_MONTH");
10291  eval = validator.Validate(seh, options);
10292  CheckErrors(*eval, expected_errors);
10293 
       // Valid year and month, day 32.
10294  subpub->SetSub().SetDate().SetStd().SetYear(2009);
10295  subpub->SetSub().SetDate().SetStd().SetMonth(12);
10296  subpub->SetSub().SetDate().SetStd().SetDay(32);
10297  expected_errors[0]->SetErrMsg("Submission citation date has error - BAD_DAY");
10298  eval = validator.Validate(seh, options);
10299  CheckErrors(*eval, expected_errors);
10300 
       // Valid year/month/day with the season set by the helper.
10301  MakeBadSeasonDate(subpub->SetSub().SetDate());
10302  expected_errors[0]->SetErrMsg("Submission citation date has error - BAD_SEASON");
10303  eval = validator.Validate(seh, options);
10304  CheckErrors(*eval, expected_errors);
10306 
       // Replace the non-submission pub with a Cit-gen carrying the same kind of date.
       // NOTE(review): `author` is declared on a line not visible here.
10308  CRef<CPub> gen(new CPub());
10309  gen->SetGen().SetAuthors().SetNames().SetStd().push_back(author);
10310  gen->SetGen().SetTitle("gen title");
10311  MakeBadSeasonDate(gen->SetGen().SetDate());
10312  otherpub->Assign(*gen);
10313  expected_errors[0]->SetErrMsg("Publication date has error - BAD_SEASON");
10314  eval = validator.Validate(seh, options);
10315  CheckErrors(*eval, expected_errors);
10316 
       // Same expectation when the date lives on a journal article imprint.
10318  MakeBadSeasonDate(otherpub->SetArticle().SetFrom().SetJournal().SetImp().SetDate());
10319  eval = validator.Validate(seh, options);
10320  CheckErrors(*eval, expected_errors);
10322 
       // A new descriptor is appended to the entry; presumably it is given a create
       // date (and later an update date) on the lines missing from this text.
10323  CRef<CSeqdesc> desc(new CSeqdesc());
10324  entry->SetDescr().Set().push_back(desc);
10326  expected_errors[0]->SetErrMsg("Create date has error - BAD_SEASON");
10327  eval = validator.Validate(seh, options);
10328  CheckErrors(*eval, expected_errors);
10329 
10331  expected_errors[0]->SetErrMsg("Update date has error - BAD_SEASON");
10332  eval = validator.Validate(seh, options);
10333  CheckErrors(*eval, expected_errors);
10334 
10335  CLEAR_ERRORS
10336 }
10337 
10338 
// Test_Generic_StructuredCitGenCit: a Cit-gen whose free-text `cit` field contains
// "Title=" or "Journal=" should produce the StructuredCitGenCit error.
// NOTE(review): entry, seh, validator, options, eval and expected_errors are not
// declared in the visible text -- presumably set up on lines not shown here.
10339 BOOST_AUTO_TEST_CASE(Test_Generic_StructuredCitGenCit)
10340 {
10342 
       // Cit-gen with an author, title and date, plus a cit string embedding "Title=".
10343  CRef<CPub> pub(new CPub());
10344  pub->SetGen().SetAuthors().SetNames().SetStd().push_back(unit_test_util::BuildGoodAuthor());
10345  pub->SetGen().SetTitle("gen title");
10346  pub->SetGen().SetDate().SetStd().SetYear(2009);
10347  pub->SetGen().SetCit("submitted something Title=foo");
10348  CRef<CSeqdesc> desc(new CSeqdesc());
10349  desc->SetPub().SetPub().Set().push_back(pub);
10350  entry->SetDescr().Set().push_back(desc);
10351 
10353 
10354  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "StructuredCitGenCit",
10355  "Unpublished citation has embedded Title"));
10356  // AddChromosomeNoLocation(expected_errors, entry);
10357  eval = validator.Validate(seh, options);
10358  CheckErrors(*eval, expected_errors);
10359 
       // Same check with "Journal=" embedded instead.
10360  pub->SetGen().SetCit("submitted something Journal=bar");
10361  expected_errors[0]->SetErrMsg("Unpublished citation has embedded Journal");
10362  eval = validator.Validate(seh, options);
10363  CheckErrors(*eval, expected_errors);
10364 
10365  CLEAR_ERRORS
10366 }
10367 
10368 
// Test_Generic_CollidingSerialNumbers: attaches one publication as a descriptor
// and a second one as a pub feature, and expects the warning that multiple
// publications share serial number 1234.
// NOTE(review): the declaration of `pub` is on a line not visible here; it is
// presumably built with the same serial number as the feature's pub below.
10369 BOOST_AUTO_TEST_CASE(Test_Generic_CollidingSerialNumbers)
10370 {
       // Null author reference passed to the Cit-gen builder.
10372  CRef<CAuthor> blank;
10373 
10375  CRef<CSeqdesc> desc(new CSeqdesc());
10376  desc->SetPub().SetPub().Set().push_back(pub);
10377  entry->SetDescr().Set().push_back(desc);
10378 
       // Pub feature on lcl|good, positions 0..15, carrying a Cit-gen with serial number 1234.
10379  CRef<CSeq_feat> feat(new CSeq_feat());
10380  feat->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
10381  feat->SetLocation().SetInt().SetFrom(0);
10382  feat->SetLocation().SetInt().SetTo(15);
10383  feat->SetData().SetPub().SetPub().Set().push_back(unit_test_util::BuildGoodCitGenPub(blank, 1234));
10384  unit_test_util::AddFeat(feat, entry);
10386 
10387  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CollidingSerialNumbers",
10388  "Multiple publications have serial number 1234"));
10389  // AddChromosomeNoLocation(expected_errors, entry);
10390  eval = validator.Validate(seh, options);
10391  CheckErrors(*eval, expected_errors);
10392 
10393  CLEAR_ERRORS
10394 }
10395 
10396 
// Test_Generic_EmbeddedScript: places script-like text ("<script", "<object",
// "<applet", "javascript:", "vbscript:") into various fields and expects the
// EmbeddedScript error each time.
// NOTE(review): declarations of `author`, `pub`, `feat` and the usual
// entry/seh/eval setup are on lines not visible in this text, as are the
// mutations preceding two of the validation runs below.
10397 BOOST_AUTO_TEST_CASE(Test_Generic_EmbeddedScript)
10398 {
       // Script tag fragment in an author's last name.
10401  author->SetName().SetName().SetLast("foo<script");
10402 
10404  CRef<CSeqdesc> desc(new CSeqdesc());
10405  desc->SetPub().SetPub().Set().push_back(pub);
10406  entry->SetDescr().Set().push_back(desc);
10407 
10409 
10411 
       // The bad last name triggers both a bad-character warning and the script error.
10412  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadCharInAuthorLastName",
10413  "Bad characters in author foo<script"));
10414  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "EmbeddedScript",
10415  "Script tag found in item"));
10416  // AddChromosomeNoLocation(expected_errors, entry);
10417  eval = validator.Validate(seh, options);
10418  CheckErrors(*eval, expected_errors);
10419 
       // Restore the name and drop the author-name expectation; slot 0 is left as a
       // null pointer so EmbeddedScript stays at index 1.
       // NOTE(review): relies on CheckErrors tolerating null entries -- confirm.
10420  author->SetName().SetName().SetLast("Last");
10421  delete expected_errors[0];
10422  expected_errors[0] = nullptr;
10423 
       // Script-like text in a feature comment; a plain comment is restored afterwards.
10424  feat->SetComment("<object");
10425  eval = validator.Validate(seh, options);
10426  CheckErrors(*eval, expected_errors);
10427  feat->ResetComment();
10428  feat->SetComment("misc_feature needs a comment");
10429 
       // Script-like text in a feature title.
10430  feat->SetTitle("<applet");
10431  eval = validator.Validate(seh, options);
10432  CheckErrors(*eval, expected_errors);
10433  feat->ResetTitle();
10434 
       // Two further runs whose setup lines are missing from this text.
10436  eval = validator.Validate(seh, options);
10437  CheckErrors(*eval, expected_errors);
10439 
10441  eval = validator.Validate(seh, options);
10442  CheckErrors(*eval, expected_errors);
10444 
       // Script URL scheme in a Cit-gen title.
10445  pub->SetGen().SetTitle("javascript:");
10446  eval = validator.Validate(seh, options);
10447  CheckErrors(*eval, expected_errors);
10448  pub->SetGen().SetTitle("good title");
10449 
       // Script URL scheme in the organism lineage.
10450  unit_test_util::SetLineage(entry, "vbscript:");
10451  eval = validator.Validate(seh, options);
10452  CheckErrors(*eval, expected_errors);
10453  unit_test_util::SetLineage(entry, "");
10454 
10455  CLEAR_ERRORS
10456 }
10457 
10458 
// Test_Generic_PublicationInconsistency: exercises the PublicationInconsistency
// warnings for imprint status combinations (ahead-of-print, in-press, epublish),
// empty and duplicate consortium authors, and in-press with page numbers.
// NOTE(review): declarations of `pub` and the entry/seh/eval setup are on lines
// not visible in this text.
10459 BOOST_AUTO_TEST_CASE(Test_Generic_PublicationInconsistency)
10460 {
10462  CRef<CSeqdesc> desc(new CSeqdesc());
       // Ahead-of-print status with no prepub flag set.
10464  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPubstatus(ePubStatus_aheadofprint);
10465  desc->SetPub().SetPub().Set().push_back(pub);
10466  entry->SetSeq().SetDescr().Set().push_back(desc);
10467 
10469  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PublicationInconsistency",
10470  "Ahead-of-print without in-press"));
10471  // AddChromosomeNoLocation(expected_errors, entry);
10472  eval = validator.Validate(seh, options);
10473  CheckErrors(*eval, expected_errors);
10474 
10475  CLEAR_ERRORS
       // Electronic publication that is also flagged in-press: two warnings expected.
10476  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPubstatus(ePubStatus_epublish);
10477  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPrepub(CImprint::ePrepub_in_press);
10478  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PublicationInconsistency",
10479  "In-press is not expected to have page numbers"));
10480  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PublicationInconsistency",
10481  "Electronic-only publication should not also be in-press"));
10482  // AddChromosomeNoLocation(expected_errors, entry);
10483  eval = validator.Validate(seh, options);
10484  CheckErrors(*eval, expected_errors);
       // Clear both flags before the consortium checks.
10485  pub->SetArticle().SetFrom().SetJournal().SetImp().ResetPubstatus();
10486  pub->SetArticle().SetFrom().SetJournal().SetImp().ResetPrepub();
10487 
10488  CLEAR_ERRORS
       // Author entry whose consortium name is the empty string.
10489  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PublicationInconsistency",
10490  "Empty consortium"));
10491  // AddChromosomeNoLocation(expected_errors, entry);
10492  CRef<CAuthor> consortium(new CAuthor());
10493  consortium->SetName().SetConsortium("");
10494  pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(consortium);
10495  eval = validator.Validate(seh, options);
10496  CheckErrors(*eval, expected_errors);
10497 
       // Two author entries with the same consortium name.
10498  consortium->SetName().SetConsortium("duplicate");
10499  CRef<CAuthor> consortium2(new CAuthor());
10500  consortium2->SetName().SetConsortium("duplicate");
10501  pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(consortium2);
10502  expected_errors[0]->SetErrMsg("Duplicate consortium 'duplicate'");
10503  eval = validator.Validate(seh, options);
10504  CheckErrors(*eval, expected_errors);
10505 
10506  CLEAR_ERRORS
       // In-press with explicit pages; the duplicate consortium authors are still
       // present, so that warning is expected again as well.
10507  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PublicationInconsistency",
10508  "In-press is not expected to have page numbers"));
10509  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PublicationInconsistency",
10510  "Duplicate consortium 'duplicate'"));
10511  // AddChromosomeNoLocation(expected_errors, entry);
10512  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPrepub(CImprint::ePrepub_in_press);
10513  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPages("75-84");
10514  eval = validator.Validate(seh, options);
10515  CheckErrors(*eval, expected_errors);
10516  pub->SetArticle().SetFrom().SetJournal().SetImp().ResetPrepub();
10517  pub->SetArticle().SetFrom().SetJournal().SetImp().ResetPages();
10518 
10519  CLEAR_ERRORS
10520 }
10521 
10522 
10524  vector<CExpectedError*>& expected_errors,
10525  const string& valtype,
10526  const string& val)
10527 {
       // Appends one expected SgmlPresentInText warning on lcl|good whose message is
       // "<valtype> <val> has SGML". The vector receives a raw pointer created with new.
       // NOTE(review): the first line of this function's signature is not visible
       // in this text; call sites in Test_Generic_SgmlPresentInText name it AddSgmlError.
10528  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SgmlPresentInText", valtype + " " + val + " has SGML"));
10529 }
10530 
10531 BOOST_AUTO_TEST_CASE(Test_Generic_SgmlPresentInText)
10532 {
10534 
10536 
10537  vector<string> sgml_tags;
10538 
10539  sgml_tags.push_back("&gt;");
10540  sgml_tags.push_back("&lt;");
10541  sgml_tags.push_back("&amp;");
10542  sgml_tags.push_back("&agr;");
10543  sgml_tags.push_back("&Agr;");
10544  sgml_tags.push_back("&bgr;");
10545  sgml_tags.push_back("&Bgr;");
10546  sgml_tags.push_back("&ggr;");
10547  sgml_tags.push_back("&Ggr;");
10548  sgml_tags.push_back("&dgr;");
10549  sgml_tags.push_back("&Dgr;");
10550  sgml_tags.push_back("&egr;");
10551  sgml_tags.push_back("&Egr;");
10552  sgml_tags.push_back("&zgr;");
10553  sgml_tags.push_back("&Zgr;");
10554  sgml_tags.push_back("&eegr;");
10555  sgml_tags.push_back("&EEgr;");
10556  sgml_tags.push_back("&thgr;");
10557  sgml_tags.push_back("&THgr;");
10558  sgml_tags.push_back("&igr;");
10559  sgml_tags.push_back("&Igr;");
10560  sgml_tags.push_back("&kgr;");
10561  sgml_tags.push_back("&Kgr;");
10562  sgml_tags.push_back("&lgr;");
10563  sgml_tags.push_back("&Lgr;");
10564  sgml_tags.push_back("&mgr;");
10565  sgml_tags.push_back("&Mgr;");
10566  sgml_tags.push_back("&ngr;");
10567  sgml_tags.push_back("&Ngr;");
10568  sgml_tags.push_back("&xgr;");
10569  sgml_tags.push_back("&Xgr;");
10570  sgml_tags.push_back("&ogr;");
10571  sgml_tags.push_back("&Ogr;");
10572  sgml_tags.push_back("&pgr;");
10573  sgml_tags.push_back("&Pgr;");
10574  sgml_tags.push_back("&rgr;");
10575  sgml_tags.push_back("&Rgr;");
10576  sgml_tags.push_back("&sgr;");
10577  sgml_tags.push_back("&Sgr;");
10578  sgml_tags.push_back("&sfgr;");
10579  sgml_tags.push_back("&tgr;");
10580  sgml_tags.push_back("&Tgr;");
10581  sgml_tags.push_back("&ugr;");
10582  sgml_tags.push_back("&Ugr;");
10583  sgml_tags.push_back("&phgr;");
10584  sgml_tags.push_back("&PHgr;");
10585  sgml_tags.push_back("&khgr;");
10586  sgml_tags.push_back("&KHgr;");
10587  sgml_tags.push_back("&psgr;");
10588  sgml_tags.push_back("&PSgr;");
10589  sgml_tags.push_back("&ohgr;");
10590  sgml_tags.push_back("&OHgr;");
10591 
10592  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SgmlPresentInText",
10593  "taxname %s has SGML"));
10594  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrganismNotFound",
10595  "Organism not found in taxonomy database"));
10596  // AddChromosomeNoLocation(expected_errors, entry);
10597  for (const string& it : sgml_tags) {
10598  string taxname = "a" + it + "b";
10599  unit_test_util::SetTaxname(entry, taxname);
10600  expected_errors[0]->SetErrMsg("taxname " + taxname + " has SGML");
10601  eval = validator.Validate(seh, options);
10602  CheckErrors(*eval, expected_errors);
10603  }
10604 
10606  delete expected_errors[1];
10607  expected_errors[1] = nullptr;
10608 
10609  size_t tag_num = 0;
10610 
10612  expected_errors[0]->SetErrMsg("subsource " + sgml_tags[tag_num] + " has SGML");
10613  eval = validator.Validate(seh, options);
10614  CheckErrors(*eval, expected_errors);
10616 
10617  ++tag_num;
10618  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_acronym, sgml_tags[tag_num]);
10619  expected_errors[0]->SetErrMsg("orgmod " + sgml_tags[tag_num] + " has SGML");
10620  eval = validator.Validate(seh, options);
10621  CheckErrors(*eval, expected_errors);
10623 
10624  CLEAR_ERRORS
10625  tag_num++;
10626  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SgmlPresentInText",
10627  "dbxref database " + sgml_tags[tag_num] + " has SGML"));
10628  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "IllegalDbXref",
10629  "Illegal db_xref type " + sgml_tags[tag_num] + " (1234)"));
10630  // AddChromosomeNoLocation(expected_errors, entry);
10631 
10632  unit_test_util::SetDbxref(entry, sgml_tags[tag_num], 1234);
10633  eval = validator.Validate(seh, options);
10634  CheckErrors(*eval, expected_errors);
10635  unit_test_util::RemoveDbxref(entry, sgml_tags[tag_num], 1234);
10636 
10637  CLEAR_ERRORS
10638 
10639  tag_num++;
10640  AddSgmlError(expected_errors, "dbxref value", sgml_tags[tag_num]);
10641  // AddChromosomeNoLocation(expected_errors, entry);
10642  unit_test_util::SetDbxref(entry, "AFTOL", sgml_tags[tag_num]);
10643  eval = validator.Validate(seh, options);
10644  CheckErrors(*eval, expected_errors);
10645  unit_test_util::RemoveDbxref(entry, "AFTOL", 0);
10646 
10647  CLEAR_ERRORS
10648  ++tag_num;
10649  scope.RemoveTopLevelSeqEntry(seh);
10651  seh = scope.AddTopLevelSeqEntry(*entry);
10652  AddSgmlError(expected_errors, "dbxref database", sgml_tags[tag_num]);
10653  // AddChromosomeNoLocation(expected_errors, entry);
10654  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "IllegalDbXref",
10655  "Illegal db_xref type " + sgml_tags[tag_num] + " (1234)"));
10656  unit_test_util::SetDbxref(feat, sgml_tags[tag_num], 1234);
10657  eval = validator.Validate(seh, options);
10658  CheckErrors(*eval, expected_errors);
10659  unit_test_util::RemoveDbxref(feat, sgml_tags[tag_num], 1234);
10660 
10661  CLEAR_ERRORS
10662 
10663  tag_num++;
10664  unit_test_util::SetDbxref(feat, "AFTOL", sgml_tags[tag_num]);
10665  AddSgmlError(expected_errors, "dbxref value", sgml_tags[tag_num]);
10666  // AddChromosomeNoLocation(expected_errors, entry);
10667 
10668  eval = validator.Validate(seh, options);
10669  CheckErrors(*eval, expected_errors);
10670  unit_test_util::RemoveDbxref(feat, "AFTOL", 0);
10671 
10672  CLEAR_ERRORS
10673 
10674  tag_num++;
10675  scope.RemoveTopLevelSeqEntry(seh);
10676  string foo = sgml_tags[tag_num] + "foo";
10677  feat->SetData().SetGene().SetLocus(foo);
10678  seh = scope.AddTopLevelSeqEntry(*entry);
10679  AddSgmlError(expected_errors, "gene locus", foo);
10680  // AddChromosomeNoLocation(expected_errors, entry);
10681  eval = validator.Validate(seh, options);
10682  CheckErrors(*eval, expected_errors);
10683  feat->SetData().SetGene().SetLocus("good locus");
10684 
10685  CLEAR_ERRORS
10686  tag_num++;
10687  feat->SetData().SetGene().SetLocus_tag(sgml_tags[tag_num]);
10688  AddSgmlError(expected_errors, "gene locus_tag", sgml_tags[tag_num]);
10689  // AddChromosomeNoLocation(expected_errors, entry);
10690  eval = validator.Validate(seh, options);
10691  CheckErrors(*eval, expected_errors);
10692  feat->SetData().SetGene().ResetLocus_tag();
10693 
10694  CLEAR_ERRORS
10695  tag_num++;
10696  feat->SetData().SetGene().SetDesc(sgml_tags[tag_num]);
10697  AddSgmlError(expected_errors, "gene description", sgml_tags[tag_num]);
10698  // AddChromosomeNoLocation(expected_errors, entry);
10699  eval = validator.Validate(seh, options);
10700  CheckErrors(*eval, expected_errors);
10701  feat->SetData().SetGene().ResetDesc();
10702 
10703  CLEAR_ERRORS
10704  tag_num++;
10705  feat->SetData().SetGene().SetSyn().push_back(sgml_tags[tag_num]);
10706  AddSgmlError(expected_errors, "gene synonym", sgml_tags[tag_num]);
10707  // AddChromosomeNoLocation(expected_errors, entry);
10708  eval = validator.Validate(seh, options);
10709  CheckErrors(*eval, expected_errors);
10710  feat->SetData().SetGene().ResetDesc();
10711 
10712  CLEAR_ERRORS
10713 
10714  tag_num++;
10715  scope.RemoveTopLevelSeqEntry(seh);
10716  feat->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
10717  foo = sgml_tags[tag_num] + "foo";
10718  feat->SetData().SetRna().SetExt().SetName(foo);
10719  seh = scope.AddTopLevelSeqEntry(*entry);
10720  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
10721  "No CDS location match for 1 mRNA"));
10722 
10723  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SgmlPresentInText",
10724  "mRNA name " + foo + " has SGML"));
10725  // AddChromosomeNoLocation(expected_errors, entry);
10726  eval = validator.Validate(seh, options);
10727  CheckErrors(*eval, expected_errors);
10728 
10729  CLEAR_ERRORS;
10730 
10731  tag_num++;
10732  scope.RemoveTopLevelSeqEntry(seh);
10733  feat->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
10734  foo = sgml_tags[tag_num] + "foo";
10735  feat->SetData().SetRna().SetExt().SetName(foo);
10736  seh = scope.AddTopLevelSeqEntry(*entry);
10737  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SgmlPresentInText",
10738  "rRNA name " + foo + " has SGML"));
10739  // AddChromosomeNoLocation(expected_errors, entry);
10740  eval = validator.Validate(seh, options);
10741  CheckErrors(*eval, expected_errors);
10742  feat->SetData().SetRna().SetExt().SetName("good name");
10743 
10744  tag_num++;
10745  feat->SetComment(sgml_tags[tag_num]);
10746  expected_errors[0]->SetErrMsg("feature comment " + sgml_tags[tag_num] + " has SGML");
10747  eval = validator.Validate(seh, options);
10748  CheckErrors(*eval, expected_errors);
10749  feat->ResetComment();
10750 
10751  tag_num++;
10752  CRef<CGb_qual> qual(new CGb_qual());
10753  qual->SetQual("standard_name");
10754  qual->SetVal(sgml_tags[tag_num]);
10755  feat->SetQual().push_back(qual);
10756  expected_errors[0]->SetErrMsg("feature qualifier " + sgml_tags[tag_num] + " has SGML");
10757  eval = validator.Validate(seh, options);
10758  CheckErrors(*eval, expected_errors);
10759  feat->SetQual().pop_back();
10760 
10761  tag_num++;
10762  scope.RemoveTopLevelSeqEntry(seh);
10764  feat = entry->SetSet().SetSeq_set().back()->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
10765  foo = sgml_tags[tag_num] + "foo";
10766  feat->SetData().SetProt().SetName().front().assign(foo);
10767  seh = scope.AddTopLevelSeqEntry(*entry);
10768  expected_errors[0]->SetAccession("lcl|prot");
10769  expected_errors[0]->SetErrMsg("protein name " + foo + " has SGML");
10770  eval = validator.Validate(seh, options);
10771  CheckErrors(*eval, expected_errors);
10772  feat->SetData().SetProt().SetName().pop_back();
10773  feat->SetData().SetProt().SetName().push_back("bar");
10774 
10775 
10776  tag_num++;
10777  feat->SetData().SetProt().SetDesc(sgml_tags[tag_num]);
10778  expected_errors[0]->SetErrMsg("protein description " + sgml_tags[tag_num] + " has SGML");
10779  eval = validator.Validate(seh, options);
10780  CheckErrors(*eval, expected_errors);
10781  feat->SetData().SetProt().ResetDesc();
10782  CLEAR_ERRORS
10783 }
10784 
10785 
// Test_Generic_UnexpectedPubStatusComment: a pub descriptor whose comment contains
// "Publication Status" (also with '-' or '_' as separator) should produce the
// UnexpectedPubStatusComment warning for several imprint pubstatus values.
// NOTE(review): declarations of `pub` and the entry/seh/eval setup are on lines
// not visible in this text.
10786 BOOST_AUTO_TEST_CASE(Test_Generic_UnexpectedPubStatusComment)
10787 {
       // Electronic publication with the status text in the descriptor comment.
10790  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPubstatus(ePubStatus_epublish);
10791  CRef<CSeqdesc> desc(new CSeqdesc());
10792  desc->SetPub().SetPub().Set().push_back(pub);
10793  desc->SetPub().SetComment("Publication Status");
10794  entry->SetSeq().SetDescr().Set().push_back(desc);
10795 
10797  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnexpectedPubStatusComment",
10798  "Publication status is in comment for pmid 0"));
10799  // AddChromosomeNoLocation(expected_errors, entry);
10800  eval = validator.Validate(seh, options);
10801  CheckErrors(*eval, expected_errors);
10802 
       // Same expectation for print publication status.
10803  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPubstatus(ePubStatus_ppublish);
10804  eval = validator.Validate(seh, options);
10805  CheckErrors(*eval, expected_errors);
10806 
10807  CLEAR_ERRORS
       // Ahead-of-print plus in-press: the in-press/page-number warning is expected
       // in addition to the comment warning.
10808  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PublicationInconsistency",
10809  "In-press is not expected to have page numbers"));
10810  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnexpectedPubStatusComment",
10811  "Publication status is in comment for pmid 0"));
10812  // AddChromosomeNoLocation(expected_errors, entry);
10813 
10814  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPubstatus(ePubStatus_aheadofprint);
10815  pub->SetArticle().SetFrom().SetJournal().SetImp().SetPrepub(CImprint::ePrepub_in_press);
10816  eval = validator.Validate(seh, options);
10817  CheckErrors(*eval, expected_errors);
10818 
       // Hyphen-separated spelling of the comment text.
10819  desc->SetPub().SetComment("Publication-Status");
10820  eval = validator.Validate(seh, options);
10821  CheckErrors(*eval, expected_errors);
10822 
       // Underscore-separated spelling of the comment text.
10823  desc->SetPub().SetComment("Publication_Status");
10824  eval = validator.Validate(seh, options);
10825  CheckErrors(*eval, expected_errors);
10826 
10827  CLEAR_ERRORS
10828 }
10829 
10830 
// Test_PKG_NoCdRegionPtr: adds an extra protein sequence, stripped of its source
// and pub descriptors, to the set and expects the NoCdRegionPtr error, while
// ListOrphanProteins() is expected to return nothing.
// NOTE(review): declarations of `entry`, `pentry` and the seh/eval setup are on
// lines not visible in this text.
10831 BOOST_AUTO_TEST_CASE(Test_PKG_NoCdRegionPtr)
10832 {
       // Remove source and pub descriptors from the extra sequence, if it has any descriptors.
10835  if (pentry->SetSeq().IsSetDescr()) {
10836  auto& cont = pentry->SetSeq().SetDescr().Set();
10837  cont.remove_if(
10838  [](CSeqdesc* it) { return (it->IsSource() || it->IsPub()); });
10839  }
10840  entry->SetSet().SetSeq_set().push_back(pentry);
10841 
10843 
       // The orphan-protein listing is expected to be empty for this entry.
10844  auto orphans = validator::ListOrphanProteins(seh);
10845  BOOST_CHECK(orphans.empty());
10846 
       // The validator itself is still expected to report the missing coding region.
10847  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoCdRegionPtr",
10848  "No CdRegion in nuc-prot set points to this protein"));
10849  // AddChromosomeNoLocation(expected_errors, entry);
10850  eval = validator.Validate(seh, options);
10851  CheckErrors(*eval, expected_errors);
10852 
10853 
10854  CLEAR_ERRORS
10855 }
10856 
10857 
// Test_PKG_NucProtProblem: checks the three NucProtProblem variants by
// rearranging the members of a nuc-prot set: no nucleotide, no protein, and
// two nucleotides.
// NOTE(review): declarations of `entry`, `cds`, `nentry2` and the seh/eval setup
// are on lines not visible in this text.
10858 BOOST_AUTO_TEST_CASE(Test_PKG_NucProtProblem)
10859 {
       // Case 1: take the first member (kept in nentry) out of the set and drop the
       // first feature of the set-level annotation.
10861  CRef<CSeq_entry> nentry = entry->SetSet().SetSeq_set().front();
10862  entry->SetSet().SetSeq_set().pop_front();
10864  entry->SetSet().SetAnnot().front()->SetData().SetFtable().pop_front();
10865 
10867 
10868  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "NoCdRegionPtr",
10869  "No CdRegion in nuc-prot set points to this protein"));
10870  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "NucProtProblem",
10871  "No nucleotides in nuc-prot set"));
10872  // AddChromosomeNoLocation(expected_errors, entry);
10873  eval = validator.Validate(seh, options);
10874  CheckErrors(*eval, expected_errors);
10875 
       // Case 2: remove the remaining member (kept in pentry), put nentry and the cds
       // feature back, so the set has a nucleotide but no protein.
10876  scope.RemoveTopLevelSeqEntry(seh);
10877  CRef<CSeq_entry> pentry = entry->SetSet().SetSeq_set().front();
10878  entry->SetSet().SetSeq_set().pop_front();
10879  entry->SetSet().SetSeq_set().push_back(nentry);
10880  entry->SetSet().SetAnnot().front()->SetData().SetFtable().push_back(cds);
10881  seh = scope.AddTopLevelSeqEntry(*entry);
       // NoCdRegionPtr no longer applies; its slot is left as a null pointer.
       // NOTE(review): relies on CheckErrors tolerating null entries -- confirm.
10882  delete expected_errors[0];
10883  expected_errors[0] = nullptr;
10884  expected_errors[1]->SetErrMsg("No proteins in nuc-prot set");
10885  expected_errors[1]->SetAccession("lcl|nuc");
10886  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "MissingCDSproduct",
10887  "Unable to find product Bioseq from CDS feature"));
10888  eval = validator.Validate(seh, options);
10889  CheckErrors(*eval, expected_errors);
10890 
       // Case 3: add a second nucleotide (source/pub descriptors stripped) and restore
       // the protein, so the set holds two nucleotides.
10891  scope.RemoveTopLevelSeqEntry(seh);
10893  if (nentry2->SetSeq().IsSetDescr()) {
10894  auto& cont = nentry2->SetSeq().SetDescr().Set();
10895  cont.remove_if(
10896  [](CSeqdesc* it) { return (it->IsSource() || it->IsPub()); });
10897  }
10898  entry->SetSet().SetSeq_set().push_back(nentry2);
10899  entry->SetSet().SetSeq_set().push_back(pentry);
10900  seh = scope.AddTopLevelSeqEntry(*entry);
       // This variant is Critical; the MissingCDSproduct expectation is removed
       // because the protein is back in the set.
10901  expected_errors[1]->SetSeverity(eDiag_Critical);
10902  expected_errors[1]->SetErrMsg("Multiple unsegmented nucleotides in nuc-prot set");
10903  delete expected_errors[2];
10904  expected_errors.pop_back();
10905  eval = validator.Validate(seh, options);
10906  CheckErrors(*eval, expected_errors);
10907 
10908  CLEAR_ERRORS
10909 }
10910 
10911 
// Test_PKG_SegSetProblem: builds a set containing two ordinary sequences
// (lcl|good and lcl|good2) and expects the SegSetProblem error saying the
// segset has no segmented Bioseq.
// NOTE(review): the statement that sets the set's class and the seh/eval setup
// are on lines not visible in this text.
10912 BOOST_AUTO_TEST_CASE(Test_PKG_SegSetProblem)
10913 {
10914  CRef<CSeq_entry> entry(new CSeq_entry());
       // Two copies of the standard good sequence; the second is renamed to avoid an ID clash.
10916  entry->SetSet().SetSeq_set().push_back(unit_test_util::BuildGoodSeq());
10917  entry->SetSet().SetSeq_set().push_back(unit_test_util::BuildGoodSeq());
10918  entry->SetSet().SetSeq_set().back()->SetSeq().SetId().front()->SetLocal().SetStr("good2");
10919 
10921 
10922  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SegSetProblem",
10923  "No segmented Bioseq in segset"));
10924  // AddChromosomeNoLocation(expected_errors, entry);
10925  eval = validator.Validate(seh, options);
10926  CheckErrors(*eval, expected_errors);
10927 
10928  CLEAR_ERRORS
10929 }
10930 
10931 
// Test_PKG_EmptySet: a set that contains one good sequence plus a nested entry
// `centry` should produce the EmptySet warning ("No Bioseqs in this set"); the
// same expectation is then re-checked six more times.
// NOTE(review): the statements configuring `entry`/`centry` and the mutation
// before each repeated validation are on lines not visible in this text --
// presumably each one changes a set class; confirm against the full file.
10932 BOOST_AUTO_TEST_CASE(Test_PKG_EmptySet)
10933 {
10934  CRef<CSeq_entry> entry(new CSeq_entry());
10936  entry->SetSet().SetSeq_set().push_back(unit_test_util::BuildGoodSeq());
10937  CRef<CSeq_entry> centry(new CSeq_entry());
10939  entry->SetSet().SetSeq_set().push_back(centry);
10940 
10942 
       // The expected error has an empty accession string.
10943  expected_errors.push_back(new CExpectedError("", eDiag_Warning, "EmptySet",
10944  "No Bioseqs in this set"));
10945  // AddChromosomeNoLocation(expected_errors, entry);
10946  eval = validator.Validate(seh, options);
10947  CheckErrors(*eval, expected_errors);
10948 
       // Repeated runs with the same expectation.
10950  eval = validator.Validate(seh, options);
10951  CheckErrors(*eval, expected_errors);
10952 
10954  eval = validator.Validate(seh, options);
10955  CheckErrors(*eval, expected_errors);
10956 
10958  eval = validator.Validate(seh, options);
10959  CheckErrors(*eval, expected_errors);
10960 
10962  eval = validator.Validate(seh, options);
10963  CheckErrors(*eval, expected_errors);
10964 
10966  eval = validator.Validate(seh, options);
10967  CheckErrors(*eval, expected_errors);
10968 
10970  eval = validator.Validate(seh, options);
10971  CheckErrors(*eval, expected_errors);
10972 
10973  CLEAR_ERRORS
10974 }
10975 
10976 
// Test_PKG_NucProtNotSegSet: nests a separately created Seq-entry (centry)
// inside `entry` and expects two reports: a Warning-level EmptySet
// ("Pop/Phy/Mut/Eco set has no components") and a Critical-level
// NucProtNotSegSet naming the nested set's class as "eco-set".
// NOTE(review): the statements that build `entry`, set centry's class and
// declare seh/validator/options/expected_errors are missing from this listing.
10977 BOOST_AUTO_TEST_CASE(Test_PKG_NucProtNotSegSet)
10978 {
10980  CRef<CSeq_entry> centry(new CSeq_entry());
10982  entry->SetSet().SetSeq_set().push_back(centry);
10983 
10985  expected_errors.push_back(new CExpectedError("", eDiag_Warning, "EmptySet",
10986  "Pop/Phy/Mut/Eco set has no components"));
10987  expected_errors.push_back(new CExpectedError("", eDiag_Critical, "NucProtNotSegSet",
10988  "Nuc-prot Bioseq-set contains wrong Bioseq-set, its class is \"eco-set\"."));
10989  // AddChromosomeNoLocation(expected_errors, entry);
10990  eval = validator.Validate(seh, options);
10991  CheckErrors(*eval, expected_errors);
10992 
10993  CLEAR_ERRORS
10994 }
10995 
10996 
// Test_PKG_GenomicProductPackagingProblem: exercises several
// GenomicProductPackagingProblem reports by repeatedly editing the features
// on the contig (the first member of `entry`) and re-validating. Each stage
// removes the entry from the scope, edits it, re-adds it, and compares the
// validator output with a freshly built expected_errors list.
// NOTE(review): the statements that build `entry` and `stray` and that declare
// seh/scope/validator/options/expected_errors are missing from this listing.
10997 BOOST_AUTO_TEST_CASE(Test_PKG_GenomicProductPackagingProblem)
10998 {
11000  CRef<CSeq_entry> contig = entry->SetSet().SetSeq_set().front();
11001 
       // Give the first member of `stray` a 27-base sequence and mark it RNA.
11003  CRef<CSeq_entry> nuc = stray->SetSet().SetSeq_set().front();
11004  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATAAACTAA");
11005  nuc->SetSeq().SetInst().SetLength(27);
11006  nuc->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
11008 
       // Presumably suffixes the ids in `stray` with "2" (the expectations below
       // refer to lcl|nuc2 and lcl|prot2) - confirm against unit_test_util.
11009  unit_test_util::ChangeId(stray, "2");
11010  entry->SetSet().SetSeq_set().push_back(stray);
       // Stage 1: a CDS on the contig (30..56) whose product is prot2, with no
       // mRNA feature present.
11011  CRef<CSeq_feat> cds(new CSeq_feat());
11012  cds->SetData().SetCdregion();
11013  cds->SetLocation().SetInt().SetFrom(30);
11014  cds->SetLocation().SetInt().SetTo(56);
11015  cds->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
11016  cds->SetProduct().SetWhole().SetLocal().SetStr("prot2");
11017  unit_test_util::AddFeat(cds, contig);
11018 
11020 
11021  expected_errors.push_back(new CExpectedError("lcl|nuc2", eDiag_Warning, "GenomicProductPackagingProblem",
11022  "Nucleotide bioseq should be product of mRNA feature on contig, but is not"));
11023 
11024  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSwithNoMRNA",
11025  "Unmatched CDS"));
11026  // AddChromosomeNoLocation(expected_errors, entry);
11027 
11028  eval = validator.Validate(seh, options);
11029  CheckErrors(*eval, expected_errors);
11030  CLEAR_ERRORS
11031 
       // Stage 2: swap the CDS for an mRNA feature whose product is nuc2.
11032  scope.RemoveTopLevelSeqEntry(seh);
11033  // take CDS away and add mrna - that way protein is orphan, nucleotide is product
11034  contig->SetSeq().SetAnnot().front()->SetData().SetFtable().pop_back();
11035 
11036  CRef<CSeq_feat> mrna(new CSeq_feat());
11037  mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
11038  mrna->SetData().SetRna().SetExt().SetName("fake protein name");
11039  mrna->SetLocation().SetInt().SetFrom(30);
11040  mrna->SetLocation().SetInt().SetTo(56);
11041  mrna->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
11042  mrna->SetProduct().SetWhole().SetLocal().SetStr("nuc2");
11043  unit_test_util::AddFeat(mrna, contig);
11044  seh = scope.AddTopLevelSeqEntry(*entry);
11045  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
11046  "No CDS location match for 1 mRNA"));
11047  expected_errors.push_back(new CExpectedError("lcl|prot2", eDiag_Warning, "GenomicProductPackagingProblem",
11048  "Protein bioseq should be product of CDS feature on contig, but is not"));
11049  // AddChromosomeNoLocation(expected_errors, entry);
11050 
11051  eval = validator.Validate(seh, options);
11052  CheckErrors(*eval, expected_errors);
11053 
11054  CLEAR_ERRORS
11055 
       // Stage 3: restore the CDS and attach a gene feature to `entry` itself
       // (the set) rather than to the contig.
11056  // put CDS back, move annotation to gen-prod-set
11057  scope.RemoveTopLevelSeqEntry(seh);
11058  contig->SetSeq().SetAnnot().front()->SetData().SetFtable().push_back(cds);
11059  CRef<CSeq_feat> gene(new CSeq_feat());
11060  gene->SetLocation().SetInt().SetFrom(30);
11061  gene->SetLocation().SetInt().SetTo(56);
11062  gene->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
11063  gene->SetData().SetGene().SetLocus("gene locus");
11064  unit_test_util::AddFeat(gene, entry);
11065  seh = scope.AddTopLevelSeqEntry(*entry);
11066 
11067  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "GenomicProductPackagingProblem",
11068  "Seq-annot packaged directly on genomic product set"));
11069  // AddChromosomeNoLocation(expected_errors, entry);
11070  eval = validator.Validate(seh, options);
11071  CheckErrors(*eval, expected_errors);
11072 
11073 
       // Stage 4: drop the set-level annotation and add a second mRNA (27..29)
       // whose product id, nuc3, is never added to the entry in the visible code.
11074  scope.RemoveTopLevelSeqEntry(seh);
11075  entry->SetSet().ResetAnnot();
11076  CRef<CSeq_feat> mrna2(new CSeq_feat());
11077  mrna2->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
11078  mrna2->SetData().SetRna().SetExt().SetName("second protein name");
11079  mrna2->SetLocation().SetInt().SetFrom(27);
11080  mrna2->SetLocation().SetInt().SetTo(29);
11081  mrna2->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
11082  mrna2->SetProduct().SetWhole().SetLocal().SetStr("nuc3");
11083  unit_test_util::AddFeat(mrna2, contig);
11084  seh = scope.AddTopLevelSeqEntry(*entry);
11085 
       // Discards the stage-3 expectation before building the stage-4 list.
11086  CLEAR_ERRORS
11087 
11088  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProductFetchFailure",
11089  "Unable to fetch mRNA transcript 'lcl|nuc3'"));
11090  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingMRNAproduct",
11091  "Product Bioseq of mRNA feature is not packaged in the record"));
11092  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GenomicProductPackagingProblem",
11093  "Product of mRNA feature (lcl|nuc3) not packaged in genomic product set"));
11094  // AddChromosomeNoLocation(expected_errors, entry);
11095  eval = validator.Validate(seh, options);
11096  CheckErrors(*eval, expected_errors);
11097  CLEAR_ERRORS
11098 
       // Stage 5: validate first, then build the expected list for that result.
11099  scope.RemoveTopLevelSeqEntry(seh);
11100  // remove product from first mRNA
11101  mrna->ResetProduct();
11102  // remove second mRNA
11103  contig->SetSeq().SetAnnot().front()->SetData().SetFtable().pop_back();
11104  seh = scope.AddTopLevelSeqEntry(*entry);
11105  eval = validator.Validate(seh, options);
11106  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatureProductInconsistency",
11107  "2 mRNA features have 1 product references"));
11108  expected_errors.push_back(new CExpectedError("lcl|nuc2", eDiag_Warning, "GenomicProductPackagingProblem",
11109  "Nucleotide bioseq should be product of mRNA feature on contig, but is not"));
11110  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GenomicProductPackagingProblem",
11111  "Product of mRNA feature (?) not packaged in genomic product set"));
11112  // AddChromosomeNoLocation(expected_errors, entry);
11113  CheckErrors(*eval, expected_errors);
11114 
       // Stage 6: mark the first mRNA as pseudo; only the nuc2 warning is
       // expected afterwards.
11115  scope.RemoveTopLevelSeqEntry(seh);
11116  mrna->SetPseudo(true);
11117  seh = scope.AddTopLevelSeqEntry(*entry);
11118  eval = validator.Validate(seh, options);
11119 
11120  CLEAR_ERRORS
11121  expected_errors.push_back(new CExpectedError("lcl|nuc2", eDiag_Warning, "GenomicProductPackagingProblem",
11122  "Nucleotide bioseq should be product of mRNA feature on contig, but is not"));
11123  // AddChromosomeNoLocation(expected_errors, entry);
11124  CheckErrors(*eval, expected_errors);
11125 
11126  CLEAR_ERRORS
11127 }
11128 
11129 
// TESTPOPPHYMUTECO(seh, entry): validates `seh` against expected_errors with
// the class of `entry`'s set switched in turn to pop_set, phy_set, mut_set and
// eco_set. It then switches to small_genome_set, removes the entry from the
// scope, strips its Title descriptors, re-adds it (reassigning `seh`) and
// validates once more.
// Relies on eval, validator, options, scope and expected_errors being in scope
// where the macro is expanded. It expands to a bare sequence of statements
// (not wrapped in a block), so it must not be used as the body of an unbraced
// if/for/while.
11130 #define TESTPOPPHYMUTECO(seh, entry) \
11131  entry->SetSet().SetClass(CBioseq_set::eClass_pop_set); \
11132  eval = validator.Validate(seh, options); \
11133  CheckErrors(*eval, expected_errors); \
11134  entry->SetSet().SetClass(CBioseq_set::eClass_phy_set); \
11135  eval = validator.Validate(seh, options); \
11136  CheckErrors(*eval, expected_errors); \
11137  entry->SetSet().SetClass(CBioseq_set::eClass_mut_set); \
11138  eval = validator.Validate(seh, options); \
11139  CheckErrors(*eval, expected_errors); \
11140  entry->SetSet().SetClass(CBioseq_set::eClass_eco_set); \
11141  eval = validator.Validate(seh, options); \
11142  CheckErrors(*eval, expected_errors); \
11143  entry->SetSet().SetClass(CBioseq_set::eClass_small_genome_set); \
11144  scope.RemoveTopLevelSeqEntry(seh); \
11145  unit_test_util::RemoveDescriptorType(entry, CSeqdesc::e_Title); \
11146  seh = scope.AddTopLevelSeqEntry(*entry); \
11147  eval = validator.Validate(seh, options); \
11148  CheckErrors(*eval, expected_errors);
11149 
// TESTWGS(seh, entry): switches the class of `entry`'s set to wgs_set and
// validates `seh` against expected_errors. Relies on eval, validator, options
// and expected_errors being in scope where the macro is expanded; expands to
// a bare sequence of statements (not wrapped in a block).
11150 #define TESTWGS(seh, entry) \
11151  entry->SetSet().SetClass(CBioseq_set::eClass_wgs_set); \
11152  eval = validator.Validate(seh, options); \
11153  CheckErrors(*eval, expected_errors);
11154 
11155 
// Test_PKG_InconsistentMolInfoBiomols: expects an Error-level
// InconsistentMolType ("Molecule type (DNA) does not match biomol (RNA)") and
// a Warning-level InconsistentMoltypeSet on lcl|good1. The expectation is
// checked for each set class covered by TESTPOPPHYMUTECO and then, after the
// entry is removed from and re-added to the scope, for the class set by
// TESTWGS.
// NOTE(review): the statements that build and modify `entry` (including the
// one between RemoveTopLevelSeqEntry and AddTopLevelSeqEntry) and the test
// setup are missing from this listing.
11156 BOOST_AUTO_TEST_CASE(Test_PKG_InconsistentMolInfoBiomols)
11157 {
11159 
11161 
11163  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "InconsistentMolType",
11164  "Molecule type (DNA) does not match biomol (RNA)"));
11165  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "InconsistentMoltypeSet",
11166  "Pop/phy/mut/eco set contains inconsistent moltype"));
11167  // AddChromosomeNoLocation(expected_errors, entry);
11168 
11169  TESTPOPPHYMUTECO(seh, entry)
11170 
11171  scope.RemoveTopLevelSeqEntry(seh);
11173  seh = scope.AddTopLevelSeqEntry(*entry);
11174 
11175  TESTWGS(seh, entry);
11176 
11177  CLEAR_ERRORS
11178 }
11179 
11180 
// Test_PKG_GraphPackagingProblem: attaches a graph annotation built for the
// id "notgood" to the sequence and expects a Critical-level
// GraphPackagingProblem counting 1 mispackaged graph; a second graph
// annotation ("alsonotgood") changes the expected message to the plural form
// counting 2.
// NOTE(review): the statements that build `entry` and perform the test setup
// are missing from this listing.
11181 BOOST_AUTO_TEST_CASE(Test_PKG_GraphPackagingProblem)
11182 {
11184  entry->SetSeq().SetAnnot().push_back(unit_test_util::BuildGoodGraphAnnot("notgood"));
11185 
11187 
11188  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "GraphPackagingProblem",
11189  "There is 1 mispackaged graph in this record."));
11190  // AddChromosomeNoLocation(expected_errors, entry);
11191  eval = validator.Validate(seh, options);
11192  CheckErrors(*eval, expected_errors);
11193 
       // The second annotation is added without removing/re-adding the entry
       // in the scope; only the message of the existing expectation changes.
11194  entry->SetSeq().SetAnnot().push_back(unit_test_util::BuildGoodGraphAnnot("alsonotgood"));
11195  expected_errors[0]->SetErrMsg("There are 2 mispackaged graphs in this record.");
11196  eval = validator.Validate(seh, options);
11197  CheckErrors(*eval, expected_errors);
11198 
11199  CLEAR_ERRORS
11200 }
11201 
11202 
// Test_PKG_InternalGenBankSet: nests a set of class genbank inside `entry`
// and expects a Warning-level ImproperlyNestedSets for every set class covered
// by TESTPOPPHYMUTECO and, after rebuilding the expectation and re-adding the
// entry to the scope, for the class set by TESTWGS.
// NOTE(review): the statements that build `entry` and `set`, the test setup,
// and the statement between RemoveTopLevelSeqEntry and AddTopLevelSeqEntry
// are missing from this listing.
11203 BOOST_AUTO_TEST_CASE(Test_PKG_InternalGenBankSet)
11204 {
11207  set->SetSet().SetClass(CBioseq_set::eClass_genbank);
11208  entry->SetSet().SetSeq_set().push_back(set);
11209 
11211 
11212  expected_errors.push_back(new CExpectedError("", eDiag_Warning, "ImproperlyNestedSets",
11213  "Nested sets within Pop/Phy/Mut/Eco/Wgs set"));
11214  // AddChromosomeNoLocation(expected_errors, entry);
11215 
11216  TESTPOPPHYMUTECO(seh, entry)
11217 
       // Rebuild the same single expectation for the WGS check.
11218  CLEAR_ERRORS
11219  expected_errors.push_back(new CExpectedError("", eDiag_Warning, "ImproperlyNestedSets",
11220  "Nested sets within Pop/Phy/Mut/Eco/Wgs set"));
11221  // AddChromosomeNoLocation(expected_errors, entry);
11222  scope.RemoveTopLevelSeqEntry(seh);
11224  seh = scope.AddTopLevelSeqEntry(*entry);
11225 
11226  TESTWGS(seh, entry);
11227 
11228  CLEAR_ERRORS
11229 }
11230 
11231 
// Test_PKG_ConSetProblem: expects a single Error-level ConSetProblem,
// "Set class should not be conset", reported on lcl|good1.
// NOTE(review): the statements that build `entry`, assign its class and
// perform the test setup are missing from this listing.
11232 BOOST_AUTO_TEST_CASE(Test_PKG_ConSetProblem)
11233 {
11237 
11239 
11240  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "ConSetProblem",
11241  "Set class should not be conset"));
11242  // AddChromosomeNoLocation(expected_errors, entry);
11243  eval = validator.Validate(seh, options);
11244  CheckErrors(*eval, expected_errors);
11245 
11246  CLEAR_ERRORS
11247 }
11248 
11249 
// Test_PKG_NoBioseqFound: starts from a newly constructed CSeq_entry and
// expects a single Error-level NoBioseqFound, "No Bioseqs in this entire
// record.", with an empty accession.
// NOTE(review): the statement that configures `entry` after construction and
// the test setup are missing from this listing.
11250 BOOST_AUTO_TEST_CASE(Test_PKG_NoBioseqFound)
11251 {
11252  CRef<CSeq_entry> entry(new CSeq_entry());
11254 
11256 
11257  expected_errors.push_back(new CExpectedError("", eDiag_Error, "NoBioseqFound",
11258  "No Bioseqs in this entire record."));
11259  // AddChromosomeNoLocation(expected_errors, entry);
11260  eval = validator.Validate(seh, options);
11261  CheckErrors(*eval, expected_errors);
11262 
11263  CLEAR_ERRORS
11264 }
11265 
11266 
// Test_PKG_INSDRefSeqPackaging: gives the first set member an EMBL accession
// (EA123456) and the last member a RefSeq ("other") accession (NC_123456),
// then expects an Error-level INSDRefSeqPackaging, an Error-level
// NoOrganismInTitle on the RefSeq member, and a Warning-level
// ComponentMissingTitle for each of the three ids listed below.
// NOTE(review): the statements that build `entry` and perform the test setup
// are missing from this listing.
11267 BOOST_AUTO_TEST_CASE(Test_PKG_INSDRefSeqPackaging)
11268 {
11270  entry->SetSet().SetSeq_set().front()->SetSeq().SetId().front()->SetEmbl().SetAccession("EA123456");
11271  entry->SetSet().SetSeq_set().back()->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
11272 
11274 
11275  expected_errors.push_back(new CExpectedError("emb|EA123456|", eDiag_Error, "INSDRefSeqPackaging",
11276  "INSD and RefSeq records should not be present in the same set"));
11277  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "NoOrganismInTitle",
11278  "RefSeq nucleotide title does not start with organism name"));
11279  expected_errors.push_back(new CExpectedError("emb|EA123456|", eDiag_Warning, "ComponentMissingTitle",
11280  "Nucleotide component of pop/phy/mut/eco/wgs set is missing its title"));
11281  expected_errors.push_back(new CExpectedError("lcl|good2", eDiag_Warning, "ComponentMissingTitle",
11282  "Nucleotide component of pop/phy/mut/eco/wgs set is missing its title"));
11283  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Warning, "ComponentMissingTitle",
11284  "Nucleotide component of pop/phy/mut/eco/wgs set is missing its title"));
11285  // AddChromosomeNoLocation(expected_errors, entry);
11286 
11287  eval = validator.Validate(seh, options);
11288  CheckErrors(*eval, expected_errors);
11289 
11290  CLEAR_ERRORS
11291 }
11292 
11293 
// Test_PKG_GPSnonGPSPackaging: places an eco set (BuildGoodEcoSet) and a
// genomic product set (BuildGoodGenProdSet) side by side in one outer set and
// expects an Error-level GPSnonGPSPackaging, a Warning-level
// InconsistentMoltypeSet and two Warning-level ImproperlyNestedSets. The same
// four expectations are checked for every class covered by TESTPOPPHYMUTECO
// and, after being rebuilt, for the class set by TESTWGS.
// NOTE(review): the statement that sets the outer set's initial class and the
// test setup are missing from this listing.
11294 BOOST_AUTO_TEST_CASE(Test_PKG_GPSnonGPSPackaging)
11295 {
11296  CRef<CSeq_entry> entry(new CSeq_entry());
11298  entry->SetSet().SetSeq_set().push_back(unit_test_util::BuildGoodEcoSet());
11299  entry->SetSet().SetSeq_set().push_back(unit_test_util::BuildGoodGenProdSet());
11300 
11301  // WriteOutTemp(entry);
11303 
11304  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "GPSnonGPSPackaging",
11305  "Genomic product set and mut/pop/phy/eco set records should not be present in the same set"));
11306  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "InconsistentMoltypeSet",
11307  "Pop/phy/mut/eco set contains inconsistent moltype"));
11308  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "ImproperlyNestedSets",
11309  "Nested sets within Pop/Phy/Mut/Eco/Wgs set"));
11310  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ImproperlyNestedSets",
11311  "Nested sets within Pop/Phy/Mut/Eco/Wgs set"));
11312  // AddChromosomeNoLocation(expected_errors, entry);
11313 
11314 
11315  TESTPOPPHYMUTECO(seh, entry)
11316 
       // Rebuild the identical expectation list for the WGS check.
11317  CLEAR_ERRORS
11318  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "GPSnonGPSPackaging",
11319  "Genomic product set and mut/pop/phy/eco set records should not be present in the same set"));
11320  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "InconsistentMoltypeSet",
11321  "Pop/phy/mut/eco set contains inconsistent moltype"));
11322  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "ImproperlyNestedSets",
11323  "Nested sets within Pop/Phy/Mut/Eco/Wgs set"));
11324  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ImproperlyNestedSets",
11325  "Nested sets within Pop/Phy/Mut/Eco/Wgs set"));
11326  // AddChromosomeNoLocation(expected_errors, entry);
11327 
11328  TESTWGS(seh, entry);
11329 
11330  CLEAR_ERRORS
11331 }
11332 
11333 
// Test_PKG_RefSeqPopSet: gives the first set member a RefSeq ("other")
// accession (NC_123456) and expects an Error-level NoOrganismInTitle, a
// Critical-level RefSeqPopSet ("RefSeq record should not be a Pop-set"), and a
// Warning-level ComponentMissingTitle for each of the three members.
// NOTE(review): the statements that build `entry`, assign its class and
// perform the test setup are missing from this listing.
11334 BOOST_AUTO_TEST_CASE(Test_PKG_RefSeqPopSet)
11335 {
11338  entry->SetSet().SetSeq_set().front()->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
11339 
11341 
11342  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "NoOrganismInTitle",
11343  "RefSeq nucleotide title does not start with organism name"));
11344  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Critical, "RefSeqPopSet",
11345  "RefSeq record should not be a Pop-set"));
11346  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Warning, "ComponentMissingTitle",
11347  "Nucleotide component of pop/phy/mut/eco/wgs set is missing its title"));
11348  expected_errors.push_back(new CExpectedError("lcl|good2", eDiag_Warning, "ComponentMissingTitle",
11349  "Nucleotide component of pop/phy/mut/eco/wgs set is missing its title"));
11350  expected_errors.push_back(new CExpectedError("lcl|good3", eDiag_Warning, "ComponentMissingTitle",
11351  "Nucleotide component of pop/phy/mut/eco/wgs set is missing its title"));
11352  // AddChromosomeNoLocation(expected_errors, entry);
11353  eval = validator.Validate(seh, options);
11354  CheckErrors(*eval, expected_errors);
11355 
11356  CLEAR_ERRORS
11357 }
11358 
11359 
// Test_PKG_BioseqSetClassNotSet: expects a single Warning-level
// BioseqSetClassNotSet, "Bioseq_set class not set", reported on lcl|good1.
// NOTE(review): the statements that build `entry`, alter its class and
// perform the test setup are missing from this listing.
11360 BOOST_AUTO_TEST_CASE(Test_PKG_BioseqSetClassNotSet)
11361 {
11365 
11367 
11368  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "BioseqSetClassNotSet",
11369  "Bioseq_set class not set"));
11370  // AddChromosomeNoLocation(expected_errors, entry);
11371  eval = validator.Validate(seh, options);
11372  CheckErrors(*eval, expected_errors);
11373 
11374  CLEAR_ERRORS
11375 }
11376 
11377 
11378 BOOST_AUTO_TEST_CASE(Test_PKG_OrphanedProtein)
11379 {
11381 
11383  set<CBioseq_Handle> orphans;
11384 
11385  eval = validator.Validate(seh, options);
11386  CheckErrors(*eval, expected_errors);
11387  orphans = validator::ListOrphanProteins(seh);
11388  BOOST_CHECK_EQUAL(orphans.size(), 0);
11389 
11390  scope.RemoveTopLevelSeqEntry(seh);
11391  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AYZ12345");
11392  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().SetGenbank().SetAccession("AYZ12345");
11393 
11394  expected_errors.push_back(new CExpectedError("gb|AYZ12345|", eDiag_Error, "OrphanedProtein",
11395  "Orphaned stand-alone protein"));
11396  seh = scope.AddTopLevelSeqEntry(*entry);
11397  // AddChromosomeNoLocation(expected_errors, entry);
11398  eval = validator.Validate(seh, options);
11399  CheckErrors(*eval, expected_errors);
11400  orphans = validator::ListOrphanProteins(seh);
11401  BOOST_CHECK_EQUAL(orphans.size(), 1);
11402 
11403  scope.RemoveTopLevelSeqEntry(seh);
11404  entry->SetSeq().SetId().front()->SetEmbl().SetAccession("AQZ12345");
11405  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().SetEmbl().SetAccession("AQZ12345");
11406  seh = scope.AddTopLevelSeqEntry(*entry);
11407  eval = validator.Validate(seh, options);
11408  ChangeErrorAcc(expected_errors, "emb|AQZ12345|");
11409  CheckErrors(*eval, expected_errors);
11410 
11411  scope.RemoveTopLevelSeqEntry(seh);
11412  entry->SetSeq().SetId().front()->SetDdbj().SetAccession("ARZ12345");
11413  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().SetDdbj().SetAccession("ARZ12345");
11414  seh = scope.AddTopLevelSeqEntry(*entry);
11415  eval = validator.Validate(seh, options);
11416  ChangeErrorAcc(expected_errors, "dbj|ARZ12345|");
11417  CheckErrors(*eval, expected_errors);
11418 
11419  scope.RemoveTopLevelSeqEntry(seh);
11420  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
11421  entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().SetOther().SetAccession("NC_123456");
11422  seh = scope.AddTopLevelSeqEntry(*entry);
11423  eval = validator.Validate(seh, options);
11424  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
11425  CheckErrors(*eval, expected_errors);
11426 
11427  scope.RemoveTopLevelSeqEntry(seh);
11428  auto& idlist = entry->SetSeq().SetId();
11429  CRef<CSeq_id> id1(new CSeq_id);
11430  id1->SetGibbmt(12345);
11431  idlist.push_back(id1);
11432  CRef<CSeq_id> id2(new CSeq_id);
11433  id2->SetGibbsq(23456);
11434  idlist.push_back(id2);
11435  CRef<CSeq_id> id_pat(new CSeq_id);
11436  id_pat->SetPatent().SetSeqid(1);
11437  id_pat->SetPatent().SetCit().SetCountry("USA");
11438  id_pat->SetPatent().SetCit().SetId().SetNumber("1");
11439  idlist.push_back(id_pat);
11440  seh = scope.AddTopLevelSeqEntry(*entry);
11441  eval = validator.Validate(seh, options);
11442  expected_errors.clear();
11443  CheckErrors(*eval, expected_errors);
11444 
11445  CLEAR_ERRORS
11446 }
11447 
11448 
// Test_PKG_MisplacedMolInfo: attaches a MolInfo descriptor to the set itself
// and expects a Warning-level MisplacedMolInfo ("Nuc-prot set has MolInfo on
// set") plus the five Error-level reports listed below on lcl|nuc and
// lcl|prot.
// NOTE(review): the statements that build `entry`, fill in `molinfo` and
// perform the test setup are missing from this listing.
11449 BOOST_AUTO_TEST_CASE(Test_PKG_MisplacedMolInfo)
11450 {
11452  CRef<CSeqdesc> molinfo(new CSeqdesc());
11454  entry->SetSet().SetDescr().Set().push_back(molinfo);
11455 
11457 
11458  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "MisplacedMolInfo",
11459  "Nuc-prot set has MolInfo on set"));
11460  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "HTGS_STS_GSS_WGSshouldBeGenomic",
11461  "HTGS/STS/GSS/WGS sequence should be genomic"));
11462  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "NucleotideTechniqueOnProtein",
11463  "Protein with nucleic acid sequence method"));
11464  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "HTGS_STS_GSS_WGSshouldBeGenomic",
11465  "HTGS/STS/GSS/WGS sequence should be genomic"));
11466  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "InconsistentMolInfo",
11467  "Inconsistent Molinfo-completeness [1] and [0]"));
11468  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "MoltypeUnknown",
11469  "Molinfo-biomol unknown used"));
11470  // AddChromosomeNoLocation(expected_errors, entry);
11471 
11472  eval = validator.Validate(seh, options);
11473  CheckErrors(*eval, expected_errors);
11474 
11475  CLEAR_ERRORS
11476 }
11477 
11478 
// Test_PKG_ImproperlyNestedSets: first validates the entry as built, with
// nothing added to expected_errors. It then replaces the set's members with a
// single nested eco set (BuildGoodEcoSet) and expects a Warning-level
// SingleItemSet and a Warning-level ImproperlyNestedSets on lcl|good1.
// NOTE(review): the statements that build `entry` and perform the test setup
// are missing from this listing.
11479 BOOST_AUTO_TEST_CASE(Test_PKG_ImproperlyNestedSets)
11480 {
11482 
11484 
11485  // no error first
11486 
11487  // AddChromosomeNoLocation(expected_errors, entry);
11488  eval = validator.Validate(seh, options);
11489  CheckErrors(*eval, expected_errors);
11490 
11491  // insert nested set
11492  scope.RemoveTopLevelSeqEntry(seh);
11493  entry->SetSet().SetSeq_set().clear();
11494  entry->SetSet().SetSeq_set().push_back(unit_test_util::BuildGoodEcoSet());
11495  seh = scope.AddTopLevelSeqEntry(*entry);
11496 
11497  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "SingleItemSet",
11498  "Pop/Phy/Mut/Eco set has only one component and no alignments"));
11499  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "ImproperlyNestedSets",
11500  "Nested sets within Pop/Phy/Mut/Eco/Wgs set"));
11501  eval = validator.Validate(seh, options);
11502  CheckErrors(*eval, expected_errors);
11503 
11504  CLEAR_ERRORS
11505 }
11506 
11507 
// Test_FEAT_InvalidForType: walks through a series of features placed on the
// wrong kind of Bioseq and checks the report the validator gives for each.
// Every stage removes the entry from the scope, edits the feature or entry,
// re-adds the entry and compares the validator output with expected_errors.
// Stages, in order:
//   1. CDS / RNA / Rsite / Txinit / Gene features located on the protein;
//   2. Prot and Psec_str features located on the nucleotide;
//   3. a multi-interval CDS on an RNA-molecule sequence (local id, then RefSeq id);
//   4. the same feature turned into an mRNA, then into an intron import feature;
//   5. peptide import features on a protein (local id vs. RefSeq id);
//   6. RNA import features;
//   7. processed Prot features on a nucleotide (local id vs. RefSeq id).
// NOTE(review): several statements (the initial construction of `entry`, the
// test setup, and the declarations of `rna` and `prot`, among others) are
// missing from this listing.
11508 BOOST_AUTO_TEST_CASE(Test_FEAT_InvalidForType)
11509 {
       // Stage 1: a pseudo CDS located on "prot" (0..5), added to the last
       // member of the set.
11511  CRef<CSeq_feat> feat(new CSeq_feat());
11512  feat->SetLocation().SetInt().SetFrom(0);
11513  feat->SetLocation().SetInt().SetTo(5);
11514  feat->SetLocation().SetInt().SetId().SetLocal().SetStr("prot");
11515  feat->SetData().SetCdregion();
11516  feat->SetPseudo(true);
11517  unit_test_util::AddFeat(feat, entry->SetSet().SetSeq_set().back());
11518 
11520 
11521  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "InvalidFeatureForProtein",
11522  "Invalid feature for a protein Bioseq."));
11523  // AddChromosomeNoLocation(expected_errors, entry);
11524  eval = validator.Validate(seh, options);
11525  CheckErrors(*eval, expected_errors);
11526 
       // The same expectation is reused as the feature data is switched to
       // misc RNA, Rsite, Txinit and Gene in turn.
11527  scope.RemoveTopLevelSeqEntry(seh);
11528  feat->SetData().SetRna();
11529  feat->SetData().SetRna().SetType(CRNA_ref::eType_miscRNA);
11530  seh = scope.AddTopLevelSeqEntry(*entry);
11531  eval = validator.Validate(seh, options);
11532  CheckErrors(*eval, expected_errors);
11533 
11534  scope.RemoveTopLevelSeqEntry(seh);
11535  feat->SetData().SetRsite();
11536  seh = scope.AddTopLevelSeqEntry(*entry);
11537  eval = validator.Validate(seh, options);
11538  CheckErrors(*eval, expected_errors);
11539 
11540  scope.RemoveTopLevelSeqEntry(seh);
11541  feat->SetData().SetTxinit();
11542  seh = scope.AddTopLevelSeqEntry(*entry);
11543  eval = validator.Validate(seh, options);
11544  CheckErrors(*eval, expected_errors);
11545 
11546  scope.RemoveTopLevelSeqEntry(seh);
11547  feat->SetData().SetGene().SetLocus("good locus");
11548  seh = scope.AddTopLevelSeqEntry(*entry);
11549  eval = validator.Validate(seh, options);
11550  CheckErrors(*eval, expected_errors);
11551  CLEAR_ERRORS
11552 
       // Stage 2: take the feature off the last member, relocate it to "nuc"
       // as a Prot feature and add it to the first member.
11553  scope.RemoveTopLevelSeqEntry(seh);
11554  entry->SetSet().SetSeq_set().back()->SetSeq().SetAnnot().front()->SetData().SetFtable().pop_back();
11555  feat->SetLocation().SetInt().SetId().SetLocal().SetStr("nuc");
11556  feat->SetData().SetProt().SetName().push_back("prot name");
11557  unit_test_util::AddFeat(feat, entry->SetSet().SetSeq_set().front());
11558  seh = scope.AddTopLevelSeqEntry(*entry);
11559  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error,
11560  "InvalidFeatureForNucleotide", "Invalid feature for a nucleotide Bioseq."));
11561  // AddChromosomeNoLocation(expected_errors, entry);
11562  eval = validator.Validate(seh, options);
11563  CheckErrors(*eval, expected_errors);
11564 
11565  scope.RemoveTopLevelSeqEntry(seh);
11566  feat->SetData().SetPsec_str();
11567  seh = scope.AddTopLevelSeqEntry(*entry);
11568  eval = validator.Validate(seh, options);
11569  CheckErrors(*eval, expected_errors);
11570  CLEAR_ERRORS
11571 
       // Stage 3: fresh single sequence with molecule type RNA and a pseudo
       // CDS made of two intervals (0..10 and 21..35).
11572  scope.RemoveTopLevelSeqEntry(seh);
11573  entry = unit_test_util::BuildGoodSeq();
11574  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
11576  CRef<CSeq_loc> loc1(new CSeq_loc());
11577  loc1->SetInt().SetFrom(0);
11578  loc1->SetInt().SetTo(10);
11579  loc1->SetInt().SetId().Assign(*(entry->SetSeq().GetId().front()));
11580  CRef<CSeq_loc> loc2(new CSeq_loc());
11581  loc2->SetInt().SetFrom(21);
11582  loc2->SetInt().SetTo(35);
11583  loc2->SetInt().SetId().Assign(*(entry->SetSeq().GetId().front()));
11584  CRef<CSeq_feat> cds(new CSeq_feat());
11585  cds->SetLocation().SetMix().Set().push_back(loc1);
11586  cds->SetLocation().SetMix().Set().push_back(loc2);
11587  cds->SetData().SetCdregion();
11588  cds->SetPseudo(true);
11589  unit_test_util::AddFeat(cds, entry);
11590  seh = scope.AddTopLevelSeqEntry(*entry);
11591  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
11592  "InvalidForType", "Multi-interval CDS feature is invalid on an mRNA (cDNA) Bioseq."));
11593  // AddChromosomeNoLocation(expected_errors, entry);
11594  eval = validator.Validate(seh, options);
11595  CheckErrors(*eval, expected_errors);
11596 
11597  // different warning level if RefSeq
11598  scope.RemoveTopLevelSeqEntry(seh);
11599  CRef<CSeq_id> rsid(new CSeq_id());
11600  rsid->SetOther().SetAccession("NY_123456");
11601  unit_test_util::ChangeId(entry, rsid);
11602  seh = scope.AddTopLevelSeqEntry(*entry);
11603  expected_errors[0]->SetSeverity(eDiag_Warning);
11604  ChangeErrorAcc(expected_errors, "ref|NY_123456|");
11605  eval = validator.Validate(seh, options);
11606  CheckErrors(*eval, expected_errors);
11607 
       // Stage 4a: back to the local id; the `cds` feature object now carries
       // mRNA data, and the existing expectation is rewritten in place.
11608  scope.RemoveTopLevelSeqEntry(seh);
11609  CRef<CSeq_id> good_id(new CSeq_id());
11610  good_id->SetLocal().SetStr("good");
11611  unit_test_util::ChangeId(entry, good_id);
11612  cds->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
11613  seh = scope.AddTopLevelSeqEntry(*entry);
11614  ChangeErrorAcc(expected_errors, "lcl|good");
11615  expected_errors[0]->SetErrCode("CDSmRNAMismatchLocation");
11616  expected_errors[0]->SetSeverity(eDiag_Warning);
11617  expected_errors[0]->SetErrMsg("No CDS location match for 1 mRNA");
11618 
11619  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFeatureForMRNA",
11620  "mRNA feature is invalid on an mRNA (cDNA) Bioseq."));
11621  eval = validator.Validate(seh, options);
11622  CheckErrors(*eval, expected_errors);
11623  CLEAR_ERRORS
11624 
       // Stage 4b: the same feature becomes an "intron" import feature
       // spanning the whole sequence (0 .. length-1).
11625  scope.RemoveTopLevelSeqEntry(seh);
11626  cds->SetData().SetImp().SetKey("intron");
11627  cds->SetLocation().SetInt().SetFrom(0);
11628  cds->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength() - 1);
11629  cds->SetLocation().SetInt().SetId().Assign(*(entry->GetSeq().GetId().front()));
11630  seh = scope.AddTopLevelSeqEntry(*entry);
11631  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
11632  "Invalid feature for an mRNA Bioseq."));
11633  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info,
11634  "NotSpliceConsensusDonorTerminalIntron",
11635  "Splice donor consensus (GT) not found at start of terminal intron, position 1 of lcl|good"));
11636  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info,
11637  "NotSpliceConsensusAcceptorTerminalIntron",
11638  "Splice acceptor consensus (AG) not found at end of terminal intron, position 60 of lcl|good, but at end of sequence"));
11639  // AddChromosomeNoLocation(expected_errors, entry);
11640  eval = validator.Validate(seh, options);
11641  CheckErrors(*eval, expected_errors);
11642  CLEAR_ERRORS
11643 
       // Stage 5: import-feature keys that name peptide features.
11644  vector<string> peptide_feat;
11645  peptide_feat.push_back("mat_peptide");
11646  peptide_feat.push_back("sig_peptide");
11647  peptide_feat.push_back("transit_peptide");
11648  peptide_feat.push_back("preprotein");
11649  peptide_feat.push_back("proprotein");
11650 
11651  scope.RemoveTopLevelSeqEntry(seh);
11653  CRef<CSeq_feat> imp(new CSeq_feat());
11654  imp->SetLocation().SetInt().SetFrom(0);
11655  imp->SetLocation().SetInt().SetTo(5);
11656  imp->SetLocation().SetInt().SetId().SetLocal().SetStr("prot");
11657  unit_test_util::AddFeat(imp, entry->SetSet().SetSeq_set().back());
11658  seh = scope.AddTopLevelSeqEntry(*entry);
11659 
11660  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "PeptideFeatureLacksCDS",
11661  "Peptide processing feature should be converted to the appropriate protein feature subtype"));
11662  // AddChromosomeNoLocation(expected_errors, entry);
11663  CRef<CSeq_id> local_id(new CSeq_id());
11664  local_id->SetLocal().SetStr("good");
       // For each key: Warning when the protein has the local id, Error when
       // it has the RefSeq id.
11665  for (const string& key : peptide_feat) {
11666  scope.RemoveTopLevelSeqEntry(seh);
11667  unit_test_util::ChangeProtId(entry, local_id);
11668  imp->SetData().SetImp().SetKey(key);
11669  seh = scope.AddTopLevelSeqEntry(*entry);
11670  expected_errors[0]->SetAccession("lcl|good");
11671  expected_errors[0]->SetSeverity(eDiag_Warning);
11672  eval = validator.Validate(seh, options);
11673  CheckErrors(*eval, expected_errors);
11674 
11675  scope.RemoveTopLevelSeqEntry(seh);
11676  unit_test_util::ChangeProtId(entry, rsid);
11677  imp->SetData().SetImp().SetKey(key);
11678  seh = scope.AddTopLevelSeqEntry(*entry);
11679  expected_errors[0]->SetAccession("ref|NY_123456|");
11680  expected_errors[0]->SetSeverity(eDiag_Error);
11681  eval = validator.Validate(seh, options);
11682  CheckErrors(*eval, expected_errors);
11683  }
11684 
       // Stage 6: import-feature keys that name RNA features.
11685  vector<string> rna_feat;
11686  rna_feat.push_back("mRNA");
11687  rna_feat.push_back("tRNA");
11688  rna_feat.push_back("rRNA");
11689  rna_feat.push_back("snRNA");
11690  rna_feat.push_back("scRNA");
11691  rna_feat.push_back("snoRNA");
11692  rna_feat.push_back("misc_RNA");
11693  rna_feat.push_back("precursor_RNA");
11694 
11695  scope.RemoveTopLevelSeqEntry(seh);
11696  entry = unit_test_util::BuildGoodSeq();
11697  seh = scope.AddTopLevelSeqEntry(*entry);
11698 
       // Reuse the single remaining expectation, rewritten for the RNA case.
11699  expected_errors[0]->SetErrCode("InvalidRNAFeature");
11700  expected_errors[0]->SetErrMsg("RNA feature should be converted to the appropriate RNA feature subtype, location should be converted manually");
11701  expected_errors[0]->SetSeverity(eDiag_Error);
11702  ChangeErrorAcc(expected_errors, "lcl|good");
11703  for (const string& key : rna_feat) {
11704  scope.RemoveTopLevelSeqEntry(seh);
11705  entry->SetSeq().ResetAnnot();
       // NOTE(review): the declaration of `rna` is missing from this listing.
11707  rna->SetData().SetImp().SetKey(key);
11708  seh = scope.AddTopLevelSeqEntry(*entry);
11709  eval = validator.Validate(seh, options);
11710  CheckErrors(*eval, expected_errors);
11711  }
11712 
       // Stage 7: processed Prot features located on the nucleotide.
11713  vector<CProt_ref::TProcessed> prot_types;
11714  prot_types.push_back(CProt_ref::eProcessed_mature);
11715  prot_types.push_back(CProt_ref::eProcessed_transit_peptide);
11716  prot_types.push_back(CProt_ref::eProcessed_signal_peptide);
11717  prot_types.push_back(CProt_ref::eProcessed_preprotein);
11718 
11719  entry->SetSeq().ResetAnnot();
       // NOTE(review): the declaration of `prot` is missing from this listing.
11721  prot->SetLocation().SetInt().SetFrom(0);
11722  prot->SetLocation().SetInt().SetTo(10);
11723  prot->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
11724  prot->SetData().SetProt().SetName().push_back("unnamed");
11725  unit_test_util::AddFeat(prot, entry);
11726 
11727  CLEAR_ERRORS
11728  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
11729  "InvalidFeatureForNucleotide", "Invalid feature for a nucleotide Bioseq."));
11730  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InvalidForType",
11731  "Peptide processing feature should be remapped to the appropriate protein bioseq"));
11732  // AddChromosomeNoLocation(expected_errors, entry);
11733  for (CProt_ref::EProcessed key : prot_types) {
11734  scope.RemoveTopLevelSeqEntry(seh);
11735  unit_test_util::ChangeId(entry, local_id);
11736  prot->SetData().SetProt().SetProcessed(key);
11737  seh = scope.AddTopLevelSeqEntry(*entry);
11738  ChangeErrorAcc(expected_errors, "lcl|good");
11739  expected_errors[1]->SetSeverity(eDiag_Warning);
11740  eval = validator.Validate(seh, options);
11741  CheckErrors(*eval, expected_errors);
11742 
11743  scope.RemoveTopLevelSeqEntry(seh);
11744  unit_test_util::ChangeId(entry, rsid);
11745  prot->SetData().SetProt().SetProcessed(key);
11746  seh = scope.AddTopLevelSeqEntry(*entry);
11747  ChangeErrorAcc(expected_errors, "ref|NY_123456|");
11748  expected_errors[1]->SetSeverity(eDiag_Error);
       // With the RefSeq id a third report is expected; it is removed again
       // (deleted and popped) after the check so the next iteration starts
       // with two entries.
11749  expected_errors.push_back(new CExpectedError("ref|NY_123456|", eDiag_Warning, "UndesiredProteinName",
11750  "Uninformative protein name 'unnamed'"));
11751  eval = validator.Validate(seh, options);
11752  CheckErrors(*eval, expected_errors);
11753  delete expected_errors[2];
11754  expected_errors.pop_back();
11755  }
11756 
11757  CLEAR_ERRORS
11758 }
11759 
11760 
11762 {
       // Builds a new CUser_field whose label string is `label` and whose
       // data string is `value`, and returns it wrapped in a CRef.
       // NOTE(review): the signature line is absent from this listing; this is
       // presumably MakeStructuredCommentField(label, value), which the tests
       // below call with two string arguments -- confirm against the real file.
11763  CRef<CUser_field> field(new CUser_field());
11764  field->SetLabel().SetStr(label);
11765  field->SetData().SetStr(value);
11766  return field;
11767 }
11768 
11769 
11771 {
       // NOTE(review): the test-case header and the lines that create `entry`,
       // `prot`, `seh`, `validator`, `options`, `eval` and `expected_errors`
       // are missing from this listing; only the visible steps are described.
       //
       // Adds a feature to `prot` (second argument: sequence length - 1) and
       // turns it into a gene that carries a locus ("x").
11774  CRef<CSeq_feat> gene = AddMiscFeature(prot, prot->GetSeq().GetLength() - 1);
11775  gene->SetData().SetGene().SetLocus("x");
11776 
       // Attach a user-object descriptor with a Genome-Annotation-Data prefix
       // field and "Annotation Provider" = "NCBI" to the set-level descriptors.
11777  CRef<CSeqdesc> pgap(new CSeqdesc());
11779  pgap->SetUser().SetData().push_back(MakeStructuredCommentField("StructuredCommentPrefix", "##Genome-Annotation-Data-START##"));
11780  pgap->SetUser().SetData().push_back(MakeStructuredCommentField("Annotation Provider", "NCBI"));
11781  entry->SetSet().SetDescr().Set().push_back(pgap);
11782 
11784 
11785  // AddChromosomeNoLocation(expected_errors, entry);
       // No expected error is added in the visible lines, so this presumably
       // checks that a gene with only a locus produces no diagnostics.
11786  eval = validator.Validate(seh, options);
11787  CheckErrors(*eval, expected_errors);
11788 
11789  CLEAR_ERRORS
11790 }
11791 
11792 
11794 {
       // NOTE(review): the test-case header and the lines that create `entry`,
       // `prot`, `seh`, `validator`, `options`, `eval` and `expected_errors`
       // are missing from this listing; only the visible steps are described.
       //
       // Adds a feature to `prot` (second argument: sequence length - 1) and
       // turns it into a gene that carries a locus_tag ("x").
11797  CRef<CSeq_feat> gene = AddMiscFeature(prot, prot->GetSeq().GetLength() - 1);
11798  gene->SetData().SetGene().SetLocus_tag("x");
11799 
       // Attach a user-object descriptor with a Genome-Annotation-Data prefix
       // field and "Annotation Provider" = "NCBI" to the set-level descriptors.
11800  CRef<CSeqdesc> pgap(new CSeqdesc());
11802  pgap->SetUser().SetData().push_back(MakeStructuredCommentField("StructuredCommentPrefix", "##Genome-Annotation-Data-START##"));
11803  pgap->SetUser().SetData().push_back(MakeStructuredCommentField("Annotation Provider", "NCBI"));
11804  entry->SetSet().SetDescr().Set().push_back(pgap);
11805 
11807 
11808  eval = validator.Validate(seh, options);
11809 
       // The locus_tag on the protein gene is expected to be reported as an error.
11810  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "LocusTagProblem",
11811  "Genes on protein sequences with PGAP annotation should not have locus tags."));
11812  // AddChromosomeNoLocation(expected_errors, entry);
11813  CheckErrors(*eval, expected_errors);
11814  CLEAR_ERRORS
11815 }
11816 
11817 
11819 {
       // NOTE(review): the signature and the lines that create `entry` and `cds`
       // are missing from this listing. This is presumably
       // BuildGoodSpliceNucProtSet(), which Test_FEAT_PartialProblem calls with
       // no arguments and assigns to a CRef<CSeq_entry> -- confirm.
       //
       // The first member of the set is treated as the nucleotide entry and the
       // last as the protein entry.
11821  CRef<CSeq_entry> nseq = entry->SetSet().SetSeq_set().front();
11822  CRef<CSeq_entry> pseq = entry->SetSet().SetSeq_set().back();
11824  CRef<CSeq_feat> prot = pseq->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
11825 
       // Replace the nucleotide residues with a 60-base IUPACna string.
11826  nseq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGGTATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
11827 
       // Give the coding region the location produced by MakeMixLoc for the
       // nucleotide's first Seq-id.
11828  cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nseq->SetSeq().SetId().front()));
       // Disabled code below: a hand-built two-interval location (0..15 and
       // 46..56 on "nuc"), kept for reference only.
11829 #if 0
11830  CRef<CSeq_loc> loc1(new CSeq_loc());
11831  loc1->SetInt().SetId().SetLocal().SetStr("nuc");
11832  loc1->SetInt().SetFrom(0);
11833  loc1->SetInt().SetTo(15);
11834 
11835  CRef<CSeq_loc> loc2(new CSeq_loc());
11836  loc2->SetInt().SetId().SetLocal().SetStr("nuc");
11837  loc2->SetInt().SetFrom(46);
11838  loc2->SetInt().SetTo(56);
11839 
11840  cds->SetLocation().SetMix().Set().push_back(loc1);
11841  cds->SetLocation().SetMix().Set().push_back(loc2);
11842 #endif
11843 
11844  return entry;
11845 }
11846 
11847 
// Walks through a series of partial-flag scenarios on coding regions, genes and
// misc features, validating after each change and comparing the diagnostics
// with an explicit list of expected errors.
//
// Each scenario follows the same shape: adjust the feature/sequence, fill
// `expected_errors`, call validator.Validate(seh, options), CheckErrors, then
// CLEAR_ERRORS. Where expected_errors is NOT cleared between two validations,
// the second scenario deliberately reuses and edits the existing entries.
//
// NOTE(review): this listing omits a number of source lines (for example the
// ones that create `entry`, `cds`, `seh`, `validator`, `options`, `eval` and
// `expected_errors`, and most of the lines that change the protein's
// completeness between scenarios). Comments below describe only what the
// visible lines establish.
11848 BOOST_AUTO_TEST_CASE(Test_FEAT_PartialProblem)
11849 {
11852  cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
11853  cds->SetPartial(true);
11855  CRef<CSeq_entry> nuc_seq = entry->SetSet().SetSeq_set().front();
11856  CRef<CSeq_entry> prot_seq = entry->SetSet().SetSeq_set().back();
11857 
11859 
11860  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
11861  "Coding region and protein feature partials conflict"));
11862  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
11863  "Inconsistent: Product= complete, Location= partial, Feature.partial= TRUE"));
11864  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
11865  "CDS is partial but protein is complete"));
11866  // AddChromosomeNoLocation(expected_errors, entry);
11867  // cds 5' partial, protein complete
11868  eval = validator.Validate(seh, options);
11869  CheckErrors(*eval, expected_errors);
11870  CLEAR_ERRORS
11871 
11872  // cds 5' complete, protein 5' partial
11873  cds->SetLocation().SetPartialStart(false, eExtreme_Biological);
11874  cds->SetPartial(false);
11876  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
11877  "Inconsistent: Product= partial, Location= complete, Feature.partial= FALSE"));
11878  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
11879  "CDS is 5' complete but protein is NH2 partial"));
11880  // AddChromosomeNoLocation(expected_errors, entry);
11881  eval = validator.Validate(seh, options);
11882  CheckErrors(*eval, expected_errors);
11883  CLEAR_ERRORS
11884 
11885  // cds 5' partial, protein 3' partial
11886  cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
11887  cds->SetPartial(true);
11889  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
11890  "Coding region and protein feature partials conflict"));
11891  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
11892  "Got stop codon, but 3'end is labeled partial"));
11893  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
11894  "CDS is 3' complete but protein is CO2 partial"));
11895  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblem",
11896  "CDS is 5' partial but protein is CO2 partial"));
11897  // AddChromosomeNoLocation(expected_errors, entry);
11898  eval = validator.Validate(seh, options);
11899  CheckErrors(*eval, expected_errors);
11900  CLEAR_ERRORS
11901 
11902  // cds 3' partial, protein 5' partial
11903  cds->SetLocation().SetPartialStart(false, eExtreme_Biological);
11904  cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
11906  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
11907  "Coding region and protein feature partials conflict"));
11908  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus3Prime",
11909  "3' partial is not at end of sequence, gap, or consensus splice site"));
11910  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
11911  "Got stop codon, but 3'end is labeled partial"));
11912  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
11913  "CDS is 5' complete but protein is NH2 partial"));
11914  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
11915  "CDS is 3' partial but protein is NH2 partial"));
11916  // AddChromosomeNoLocation(expected_errors, entry);
11917  eval = validator.Validate(seh, options);
11918  CheckErrors(*eval, expected_errors);
11919  CLEAR_ERRORS
11920 
11921  // cds 5' partial, protein no ends
11922  cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
11923  cds->SetLocation().SetPartialStop(false, eExtreme_Biological);
11925  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
11926  "Coding region and protein feature partials conflict"));
11927  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
11928  "Got stop codon, but 3'end is labeled partial"));
11929  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblem",
11930  "CDS is 5' partial but protein has neither end"));
11931  // AddChromosomeNoLocation(expected_errors, entry);
11932  eval = validator.Validate(seh, options);
11933  CheckErrors(*eval, expected_errors);
11934  CLEAR_ERRORS
11935 
11936  // cds 3' partial, protein no ends
11937  cds->SetLocation().SetPartialStart(false, eExtreme_Biological);
11938  cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
11940  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
11941  "Coding region and protein feature partials conflict"));
11942  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus3Prime",
11943  "3' partial is not at end of sequence, gap, or consensus splice site"));
11944  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
11945  "Got stop codon, but 3'end is labeled partial"));
11946  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
11947  "CDS is 3' partial but protein has neither end"));
11948  // AddChromosomeNoLocation(expected_errors, entry);
11949  eval = validator.Validate(seh, options);
11950  CheckErrors(*eval, expected_errors);
11951  CLEAR_ERRORS
11952 
11953  // cds complete, protein no ends
11954  cds->SetLocation().SetPartialStart(false, eExtreme_Biological);
11955  cds->SetLocation().SetPartialStop(false, eExtreme_Biological);
11956  cds->SetPartial(false);
11958  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
11959  "Inconsistent: Product= partial, Location= complete, Feature.partial= FALSE"));
11960  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
11961  "Got stop codon, but 3'end is labeled partial"));
11962  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
11963  "CDS is complete but protein has neither end"));
11964  // AddChromosomeNoLocation(expected_errors, entry);
11965  eval = validator.Validate(seh, options);
11966  CheckErrors(*eval, expected_errors);
11967  CLEAR_ERRORS
11968 
11969  // misc feature with location whole but not marked partial
11970  scope.RemoveTopLevelSeqEntry(seh);
11973  CRef<CSeq_feat> misc_feat = unit_test_util::AddMiscFeature(nuc_seq);
11974  misc_feat->SetLocation().SetWhole().SetLocal().SetStr("nuc");
11975  seh = scope.AddTopLevelSeqEntry(*entry);
11976  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "StrandOther",
11977  "Strand 'other' in location"));
11978  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "WholeLocation",
11979  "Feature may not have whole location"));
11980  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "PartialProblem",
11981  "On partial Bioseq, SeqFeat.partial should be TRUE"));
11982  // AddChromosomeNoLocation(expected_errors, entry);
11983  eval = validator.Validate(seh, options);
11984  CheckErrors(*eval, expected_errors);
11985  CLEAR_ERRORS
11986 
       // 5'-partial cds plus a fresh misc feature that is flagged partial and
       // given "prot" as its product; the last feature of the first annot is
       // dropped before the new misc feature is added.
11987  scope.RemoveTopLevelSeqEntry(seh);
11990  cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
11991  cds->SetLocation().SetPartialStop(false, eExtreme_Biological);
11992  cds->SetPartial(true);
11993  nuc_seq->SetSeq().SetAnnot().front()->SetData().SetFtable().pop_back();
11994  misc_feat = unit_test_util::AddMiscFeature(nuc_seq);
11995  misc_feat->SetPartial(true);
11996  misc_feat->SetProduct().SetWhole().SetLocal().SetStr("prot");
11997  seh = scope.AddTopLevelSeqEntry(*entry);
11998  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
11999  "Coding region and protein feature partials conflict"));
12000  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblem",
12001  "When SeqFeat.product is a partial Bioseq, SeqFeat.location should also be partial"));
12002  // AddChromosomeNoLocation(expected_errors, entry);
12003  eval = validator.Validate(seh, options);
12004  CheckErrors(*eval, expected_errors);
12005  CLEAR_ERRORS
12006 
       // gene whose mix location is interval / null / interval, added without
       // the partial flag; the nucleotide's annotations are reset first.
12007  scope.RemoveTopLevelSeqEntry(seh);
12008  nuc_seq->SetSeq().ResetAnnot();
12009  CRef<CSeq_loc> first(new CSeq_loc());
12010  first->SetInt().SetId().SetLocal().SetStr("nuc");
12011  first->SetInt().SetFrom(0);
12012  first->SetInt().SetTo(5);
12013  CRef<CSeq_loc> middle(new CSeq_loc());
12014  middle->SetNull();
12015  CRef<CSeq_loc> last(new CSeq_loc());
12016  last->SetInt().SetId().SetLocal().SetStr("nuc");
12017  last->SetInt().SetFrom(7);
12018  last->SetInt().SetTo(10);
12019 
12020  CRef<CSeq_feat> gene_feat(new CSeq_feat());
12021  gene_feat->SetData().SetGene().SetLocus("locus value");
12022  gene_feat->SetLocation().SetMix().Set().push_back(first);
12023  gene_feat->SetLocation().SetMix().Set().push_back(middle);
12024  gene_feat->SetLocation().SetMix().Set().push_back(last);
12025  unit_test_util::AddFeat(gene_feat, nuc_seq);
12026  seh = scope.AddTopLevelSeqEntry(*entry);
12027  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "MultiIntervalGene",
12028  "Gene feature on non-segmented sequence should not have multiple intervals"));
12029  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSgeneRange",
12030  "gene [locus value:[lcl|nuc:1-6, ~, 8-11]] overlaps CDS but does not completely contain it"));
12031  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
12032  "Coding region and protein feature partials conflict"));
12033  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblem",
12034  "Gene of 'order' with otherwise complete location should have partial flag set"));
12035  // AddChromosomeNoLocation(expected_errors, entry);
12036  eval = validator.Validate(seh, options);
12037  CheckErrors(*eval, expected_errors);
12038  CLEAR_ERRORS
12039 
       // 5'-partial cds; the expected warning concerns an unclassified partial
       // in the product's molinfo descriptor.
       // NOTE(review): the line that changes the product's completeness is not
       // visible in this listing (12045 is missing) -- confirm.
12040  scope.RemoveTopLevelSeqEntry(seh);
12041  nuc_seq->SetSeq().ResetAnnot();
12042  cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
12043  cds->SetLocation().SetPartialStop(false, eExtreme_Biological);
12044  cds->SetPartial(true);
12046  seh = scope.AddTopLevelSeqEntry(*entry);
12047 
12048  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
12049  "Coding region and protein feature partials conflict"));
12050  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblem",
12051  "5' or 3' partial location should not have unclassified partial in product molinfo descriptor"));
12052  // AddChromosomeNoLocation(expected_errors, entry);
12053  eval = validator.Validate(seh, options);
12054  CheckErrors(*eval, expected_errors);
12055  CLEAR_ERRORS
12056 
       // fresh splice nuc-prot set; misc feature on the nucleotide with a
       // partial stop
12057  scope.RemoveTopLevelSeqEntry(seh);
12058  entry = BuildGoodSpliceNucProtSet();
12059  misc_feat = unit_test_util::AddMiscFeature(entry->SetSet().SetSeq_set().front(), 15);
12060  misc_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
12061  misc_feat->SetPartial(true);
12062  seh = scope.AddTopLevelSeqEntry(*entry);
12063 
12064  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning,
12065  "PartialProblem3Prime",
12066  "Stop does not include first/last residue of sequence"));
12067  // AddChromosomeNoLocation(expected_errors, entry);
12068  eval = validator.Validate(seh, options);
12069  CheckErrors(*eval, expected_errors);
12070  CLEAR_ERRORS
12071 
       // same misc feature moved to 46..56 and switched to a partial start
12072  scope.RemoveTopLevelSeqEntry(seh);
12073  misc_feat->SetLocation().SetInt().SetFrom(46);
12074  misc_feat->SetLocation().SetInt().SetTo(56);
12075  misc_feat->SetLocation().SetPartialStart(true, eExtreme_Biological);
12076  misc_feat->SetLocation().SetPartialStop(false, eExtreme_Biological);
12077  seh = scope.AddTopLevelSeqEntry(*entry);
12078  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning,
12079  "PartialProblem5Prime",
12080  "Start does not include first/last residue of sequence"));
12081  // AddChromosomeNoLocation(expected_errors, entry);
12082  eval = validator.Validate(seh, options);
12083  CheckErrors(*eval, expected_errors);
12084  CLEAR_ERRORS
12085 
12086  scope.RemoveTopLevelSeqEntry(seh);
12087  // take misc_feat away
12088  entry->SetSet().SetSeq_set().front()->SetSeq().ResetAnnot();
12089  // cds, but splicing not expected
12090  // do not report, per V-763
12091  unit_test_util::SetDiv(entry, "BCT");
12092  entry->SetSet().ResetAnnot();
12093  cds.Reset(new CSeq_feat());
12094  cds->SetData().SetCdregion();
12095  cds->SetProduct().SetWhole().SetLocal().SetStr("prot");
12096  cds->SetLocation().SetInt().SetId().SetLocal().SetStr("nuc");
12097  cds->SetLocation().SetInt().SetFrom(0);
12098  cds->SetLocation().SetInt().SetTo(15);
12099  cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
12100  cds->SetPartial(true);
12101  unit_test_util::AddFeat(cds, entry->SetSet().SetSeq_set().front());
12102  prot_seq = entry->SetSet().SetSeq_set().back();
12103  prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPRKT");
12104  prot_seq->SetSeq().SetInst().SetLength(5);
12105  prot_seq->SetSeq().ResetAnnot();
12106  CRef<CSeq_feat> prot_feat = unit_test_util::AddProtFeat(prot_seq);
12107  prot_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
12108  prot_feat->SetPartial(true);
12110  seh = scope.AddTopLevelSeqEntry(*entry);
12111  // AddChromosomeNoLocation(expected_errors, entry);
12112  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "PartialProblem3Prime",
12113  "Stop does not include first/last residue of sequence (but is at consensus splice site)"));
12114  eval = validator.Validate(seh, options);
12115  CheckErrors(*eval, expected_errors);
12116  CLEAR_ERRORS
12117 
12118  // splicing expected but on mRNA
12119  unit_test_util::SetDiv(entry, "PRI");
12120  entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
12122  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemmRNASequence3Prime",
12123  "Stop does not include first/last residue of mRNA sequence"));
12124  eval = validator.Validate(seh, options);
12125  CheckErrors(*eval, expected_errors);
12126 
12127  CLEAR_ERRORS
12128 
       // partial start at position 3 with an invalid residue ('#') written at
       // nucleotide index 2
       // NOTE(review): the lines that rebuild `entry`/`cds` here (12130-12131)
       // are missing from this listing.
12129  scope.RemoveTopLevelSeqEntry(seh);
12132  cds->SetLocation().SetInt().SetFrom(3);
12133  cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
12134  cds->SetPartial(true);
12135  nuc_seq = entry->SetSet().SetSeq_set().front();
12136  nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[2] = '#';
12137  prot_seq = entry->SetSet().SetSeq_set().back();
12138  prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("PRKTEIN");
12139  prot_seq->SetSeq().SetInst().SetLength(7);
12140  prot_feat = prot_seq->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
12141  prot_feat->SetLocation().SetInt().SetTo(6);
12143  seh = scope.AddTopLevelSeqEntry(*entry);
12144  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue", "Invalid residue '#' at position [3]"));
12145  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
12146  "Coding region and protein feature partials conflict"));
12147  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "PartialProblem",
12148  "PartialLocation: Start does not include first/last residue of sequence (and is at bad sequence)"));
12149  // AddChromosomeNoLocation(expected_errors, entry);
12150  eval = validator.Validate(seh, options);
12151  CheckErrors(*eval, expected_errors);
12152 
       // mirror case for the stop: partial stop at 23, '#' at index 24.
       // expected_errors is intentionally not cleared; entries [0] and [2]
       // from the previous scenario are edited in place.
12153  scope.RemoveTopLevelSeqEntry(seh);
12156  cds->SetLocation().SetInt().SetTo(23);
12157  cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
12158  cds->SetPartial(true);
12159  nuc_seq = entry->SetSet().SetSeq_set().front();
12160  nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[24] = '#';
12161  prot_seq = entry->SetSet().SetSeq_set().back();
12163  seh = scope.AddTopLevelSeqEntry(*entry);
12164  expected_errors[0]->SetErrMsg("Invalid residue '#' at position [25]");
12165  expected_errors[2]->SetErrMsg("PartialLocation: Stop does not include first/last residue of sequence (and is at bad sequence)");
12166  eval = validator.Validate(seh, options);
12167  CheckErrors(*eval, expected_errors);
12168 
12169  CLEAR_ERRORS
12170 
       // partial start at position 3, no invalid residue this time
12171  scope.RemoveTopLevelSeqEntry(seh);
12174  cds->SetLocation().SetInt().SetFrom(3);
12175  cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
12176  cds->SetPartial(true);
12177  prot_seq = entry->SetSet().SetSeq_set().back();
12178  prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("PRKTEIN");
12179  prot_seq->SetSeq().SetInst().SetLength(7);
12180  prot_feat = prot_seq->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
12181  prot_feat->SetLocation().SetInt().SetTo(6);
12183  seh = scope.AddTopLevelSeqEntry(*entry);
12184 
12185  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
12186  "Coding region and protein feature partials conflict"));
12187  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus5Prime",
12188  "5' partial is not at beginning of sequence, gap, or consensus splice site"));
12189  // AddChromosomeNoLocation(expected_errors, entry);
12190  eval = validator.Validate(seh, options);
12191  CheckErrors(*eval, expected_errors);
12192 
12193  CLEAR_ERRORS
12194 
       // partial stop at position 23, no invalid residue
12195  scope.RemoveTopLevelSeqEntry(seh);
12198  cds->SetLocation().SetInt().SetTo(23);
12199  cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
12200  cds->SetPartial(true);
12201  prot_seq = entry->SetSet().SetSeq_set().back();
12203  seh = scope.AddTopLevelSeqEntry(*entry);
12204 
12205  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
12206  "Coding region and protein feature partials conflict"));
12207  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus3Prime",
12208  "3' partial is not at end of sequence, gap, or consensus splice site"));
12209  // AddChromosomeNoLocation(expected_errors, entry);
12210  eval = validator.Validate(seh, options);
12211  CheckErrors(*eval, expected_errors);
12212 
12213  CLEAR_ERRORS
12214 
       // single good sequence with a misc feature: partial start at position 3
12215  scope.RemoveTopLevelSeqEntry(seh);
12216  entry = unit_test_util::BuildGoodSeq();
12217  misc_feat = unit_test_util::AddMiscFeature(entry);
12218  misc_feat->SetLocation().SetInt().SetFrom(3);
12219  misc_feat->SetLocation().SetPartialStart(true, eExtreme_Biological);
12220  misc_feat->SetPartial(true);
12221  seh = scope.AddTopLevelSeqEntry(*entry);
12222  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PartialProblem5Prime",
12223  "Start does not include first/last residue of sequence"));
12224  // AddChromosomeNoLocation(expected_errors, entry);
12225  eval = validator.Validate(seh, options);
12226  CheckErrors(*eval, expected_errors);
12227 
       // flip to a partial stop; the single expected entry is rewritten in place
12228  misc_feat->SetLocation().SetPartialStart(false, eExtreme_Biological);
12229  misc_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
12230  expected_errors[0]->SetErrCode("PartialProblem3Prime");
12231  expected_errors[0]->SetErrMsg("Stop does not include first/last residue of sequence");
12232  eval = validator.Validate(seh, options);
12233  CheckErrors(*eval, expected_errors);
12234 
       // mix location whose first interval is marked partial at its stop
12235  CLEAR_ERRORS
12236  misc_feat->SetLocation().Assign(*unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front()));
12237  misc_feat->SetLocation().SetMix().Set().front()->SetPartialStop(true, eExtreme_Biological);
12238  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "PartialProblem",
12239  "PartialLocation: Internal partial intervals do not include first/last residue of sequence"));
12240  // AddChromosomeNoLocation(expected_errors, entry);
12241  eval = validator.Validate(seh, options);
12242  CheckErrors(*eval, expected_errors);
12243 
12244  // suppress for RefSeq
12245  CLEAR_ERRORS
12246  scope.RemoveTopLevelSeqEntry(seh);
12247  CRef<CSeq_id> refseq_id(new CSeq_id());
12248  refseq_id->SetOther().SetAccession("NC_123456");
12249  entry->SetSeq().SetId().push_back(refseq_id);
12250  seh = scope.AddTopLevelSeqEntry(*entry);
12251  // AddChromosomeNoLocation(expected_errors, "ref|NC_123456|");
12252  eval = validator.Validate(seh, options);
12253  CheckErrors(*eval, expected_errors);
12254 
12255  CLEAR_ERRORS
12256 
       // cds flagged partial while its location has no partial ends; the
       // protein sequence is replaced with "KPRKTEIN"
       // NOTE(review): the lines that rebuild `entry`/`cds` here (12258-12259)
       // are missing from this listing.
12257  scope.RemoveTopLevelSeqEntry(seh);
12260  cds->SetPartial(true);
12261  prot_seq = entry->SetSet().SetSeq_set().back();
12262  prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("KPRKTEIN");
12263  seh = scope.AddTopLevelSeqEntry(*entry);
12264 
12265  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
12266  "Inconsistent: Product= complete, Location= complete, Feature.partial= TRUE"));
12267  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
12268  "Start of location should probably be partial"));
12269  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
12270  "This SeqFeat should not be partial"));
12271  // AddChromosomeNoLocation(expected_errors, entry);
12272 
12273  eval = validator.Validate(seh, options);
12274  CheckErrors(*eval, expected_errors);
12275 
12276  CLEAR_ERRORS
12277 
       // cds flagged partial, location shortened to end at 23
12278  scope.RemoveTopLevelSeqEntry(seh);
12281  cds->SetPartial(true);
12282  cds->SetLocation().SetInt().SetTo(23);
12283  seh = scope.AddTopLevelSeqEntry(*entry);
12284 
12285  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
12286  "Inconsistent: Product= complete, Location= complete, Feature.partial= TRUE"));
12287  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
12288  "End of location should probably be partial"));
12289  // AddChromosomeNoLocation(expected_errors, entry);
12290  eval = validator.Validate(seh, options);
12291  CheckErrors(*eval, expected_errors);
12292 
       // expected_errors is reused: only the message of entry [1] changes
12293  scope.RemoveTopLevelSeqEntry(seh);
12296  cds->SetPartial(true);
12297  seh = scope.AddTopLevelSeqEntry(*entry);
12298 
12299  expected_errors[1]->SetErrMsg("This SeqFeat should not be partial");
12300  eval = validator.Validate(seh, options);
12301  CheckErrors(*eval, expected_errors);
12302 
12303  CLEAR_ERRORS
12304 
       // finally mark the cds stop as partial and validate again
12305  cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
12306  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
12307  "Coding region and protein feature partials conflict"));
12308  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus3Prime",
12309  "3' partial is not at end of sequence, gap, or consensus splice site"));
12310  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
12311  "Inconsistent: Product= complete, Location= partial, Feature.partial= TRUE"));
12312  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
12313  "Got stop codon, but 3'end is labeled partial"));
12314  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
12315  "CDS is partial but protein is complete"));
12316  // AddChromosomeNoLocation(expected_errors, entry);
12317  eval = validator.Validate(seh, options);
12318  CheckErrors(*eval, expected_errors);
12319 
12320  CLEAR_ERRORS
12321 }
12322 
12323 
12324 void SetUpMiscForPartialTest(CSeq_feat& feat, TSeqPos start, TSeqPos stop, bool pseudo)
12325 {
12326  feat.SetLocation().SetInt().SetFrom(start);
12327  feat.SetLocation().SetInt().SetTo(stop);
12328  if (pseudo) {
12329  feat.SetPseudo(true);
12330  } else {
12331  feat.ResetPseudo();
12332  }
12333 }
12334 
12335 
// Validates `entry` and checks the result against an expected-error list
// assembled from the two flags.
//
// entry        - single-sequence entry; its first annot's first feature is
//                inspected to see whether it is an mRNA
// expect_bad_5 - when true, a PartialProblem5Prime warning on lcl|good is expected
// expect_bad_3 - when true, a PartialProblem3Prime warning on lcl|good is expected
//
// NOTE(review): the line that declares `seh`, `validator`, `options`, `eval`
// and `expected_errors` (listing line 12338) is missing here; presumably a
// setup macro -- confirm against the real file.
12336 void CheckMiscPartialErrors(CRef<CSeq_entry> entry, bool expect_bad_5, bool expect_bad_3)
12337 {
12339 
12340  eval = validator.Validate(seh, options);
12341  if (expect_bad_5) {
12342  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
12343  "PartialProblem5Prime",
12344  "Start does not include first/last residue of sequence"));
12345  }
12346  if (expect_bad_3) {
12347  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
12348  "PartialProblem3Prime",
12349  "Stop does not include first/last residue of sequence"));
12350  }
       // An mRNA feature additionally expects the "no CDS location match"
       // warning, regardless of the two flags.
12351  if (entry->GetSeq().GetAnnot().front()->GetData().GetFtable().front()->GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
12352  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
12353  "CDSmRNAMismatchLocation", "No CDS location match for 1 mRNA"));
12354  }
12355  // AddChromosomeNoLocation(expected_errors, entry);
12356  CheckErrors(*eval, expected_errors);
12357  CLEAR_ERRORS
12358 }
12359 
12360 
12361 void TestOneMiscPartial(CRef<CSeq_entry> entry, TSeqPos good_start, TSeqPos bad_start, TSeqPos good_stop, TSeqPos bad_stop, bool is_mrna)
12362 {
12363  entry->SetSeq().ResetAnnot();
12364  CRef<CSeq_feat> misc = AddMiscFeature(entry);
12365  if (is_mrna) {
12366  misc->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
12367  misc->SetData().SetRna().SetExt().SetName("fake mRNA name");
12368  }
12369  misc->SetLocation().SetPartialStart(true, eExtreme_Biological);
12370  misc->SetLocation().SetPartialStop(true, eExtreme_Biological);
12371  misc->SetPartial(true);
12372 
12373  SetUpMiscForPartialTest(*misc, good_start, good_stop, false);
12374  CheckMiscPartialErrors(entry, false, false);
12375 
12376  SetUpMiscForPartialTest(*misc, good_start, good_stop, true);
12377  CheckMiscPartialErrors(entry, false, false);
12378 
12379  SetUpMiscForPartialTest(*misc, bad_start, good_stop, false);
12380  CheckMiscPartialErrors(entry, true, false);
12381 
12382  SetUpMiscForPartialTest(*misc, bad_start, good_stop, true);
12383  CheckMiscPartialErrors(entry, false, false);
12384 
12385  SetUpMiscForPartialTest(*misc, good_start, bad_stop, false);
12386  CheckMiscPartialErrors(entry, false, true);
12387 
12388  SetUpMiscForPartialTest(*misc, good_start, bad_stop, true);
12389  CheckMiscPartialErrors(entry, false, false);
12390 
12391  SetUpMiscForPartialTest(*misc, bad_start, bad_stop, false);
12392  CheckMiscPartialErrors(entry, true, true);
12393 
12394  SetUpMiscForPartialTest(*misc, bad_start, bad_stop, true);
12395  CheckMiscPartialErrors(entry, false, false);
12396 }
12397 
12398 
12400 {
       // NOTE(review): the header line of this test case (listing line 12399)
       // is missing from this listing.
       //
       // Only the "ends of sequence" case for a non-mRNA feature is active;
       // everything inside the #if 0 region below is compiled out.
12401  CRef<CSeq_entry> entry = BuildGoodSeq();
12402 
12403  // ends
12404  TestOneMiscPartial(entry, 0, 1, entry->GetSeq().GetLength() - 1, entry->GetSeq().GetLength() - 2, false);
12405 #if 0
12406  TestOneMiscPartial(entry, 0, 1, entry->GetSeq().GetLength() - 1, entry->GetSeq().GetLength() - 2, true);
12407 
12408  // gap
       // (disabled) delta sequence: three 25-base literals separated by two
       // 10-base gaps, total length 95
12409  entry->SetSeq().SetInst().ResetSeq_data();
12410  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
12411  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("ATGATGATGCCCAAATTTGGGAAAA", CSeq_inst::eMol_dna);
12412  CRef<CDelta_seq> gap1(new CDelta_seq());
12413  gap1->SetLiteral().SetSeq_data().SetGap();
12414  gap1->SetLiteral().SetLength(10);
12415  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap1);
12416  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGATGAAATTTGGGCCCC", CSeq_inst::eMol_dna);
12417  CRef<CDelta_seq> gap2(new CDelta_seq());
12418  gap2->SetLiteral().SetSeq_data().SetGap();
12419  gap2->SetLiteral().SetLength(10);
12420  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap2);
12421  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("AAACCCATGATGATGCCAATTCCCG", CSeq_inst::eMol_dna);
12422  entry->SetSeq().SetInst().SetLength(95);
12423  TestOneMiscPartial(entry, 36, 37, 58, 57, false);
12424  TestOneMiscPartial(entry, 36, 37, 58, 57, true);
12425 
12426  // splice
12427  entry = BuildGoodSeq();
12428  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AGTTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCGT");
12429  TestOneMiscPartial(entry, 0, 2, 59, 57, false);
12430  TestOneMiscPartial(entry, 2, 3, 57, 56, true);
12431 
12432 #endif
12433 }
12434 
12435 
// Checks that a feature whose data choice has been reset is reported as
// InvalidType at Error severity.
// NOTE(review): listing lines 12438-12439 and 12442 (presumably the creation of
// `entry`/`misc` and the standard validator setup) are absent from this
// extract -- confirm against the full file.
12436 BOOST_AUTO_TEST_CASE(Test_FEAT_InvalidType)
12437 {
       // Clear the feature's data so that no feature type is selected.
12440  misc->SetData().Reset();
12441 
12443 
12444  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidType",
12445  "Invalid SeqFeat type [0]"));
12446  // AddChromosomeNoLocation(expected_errors, entry);
12447  eval = validator.Validate(seh, options);
12448  CheckErrors(*eval, expected_errors);
12449  CLEAR_ERRORS
12450 }
12451 
12452 
// Checks range-related feature errors: tRNA anticodon locations that are out of
// range or outside the tRNA, code-breaks outside the coding region, and feature
// locations that run off the sequence.
// The positions printed in the expected messages are one higher than the
// values assigned here (e.g. SetFrom(14) is reported as 15).
// NOTE(review): several listing lines (12455, 12460, 12483-12484, 12504, 12507,
// 12516, 12523, 12549) are absent from this extract; the declarations of
// `entry`, `cds`, `nentry`, `misc` and the standard setup are presumably there.
12453 BOOST_AUTO_TEST_CASE(Test_FEAT_Range)
12454 {
12456  CRef<CSeq_feat> trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
       // Move the anticodon start to 14 and leave its stop untouched.
12457  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetFrom(14);
12458  unit_test_util::AddFeat(trna, entry);
12459 
12461 
12462  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "tRNArange",
12463  "Anticodon is not 3 bases in length"));
12464  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "tRNArange",
12465  "Anticodon location not in tRNA"));
12466  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "tRNArange",
12467  "Anticodon location [lcl|good:15-14] out of range"));
12468  // AddChromosomeNoLocation(expected_errors, entry);
12469  eval = validator.Validate(seh, options);
12470  CheckErrors(*eval, expected_errors);
       // Anticodon stop at 100: only the text of the third expected error changes.
12471  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetTo(100);
12472  expected_errors[2]->SetErrMsg("Anticodon location [lcl|good:15-101] out of range");
12473  eval = validator.Validate(seh, options);
12474  CheckErrors(*eval, expected_errors);
       // Anticodon start set to kInvalidSeqPos; the expected message shows it as 0.
12475  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetTo(50);
12476  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetFrom(kInvalidSeqPos);
12477  expected_errors[2]->SetErrMsg("Anticodon location [lcl|good:0-51] out of range");
12478  eval = validator.Validate(seh, options);
12479  CheckErrors(*eval, expected_errors);
12480 
12481  CLEAR_ERRORS
12482  scope.RemoveTopLevelSeqEntry(seh);
12485 
       // Code-break at 27..29 on "nuc" is expected to be reported as outside the CDS.
12486  CRef<CCode_break> codebreak(new CCode_break());
12487  codebreak->SetLoc().SetInt().SetId().SetLocal().SetStr("nuc");
12488  codebreak->SetLoc().SetInt().SetFrom(27);
12489  codebreak->SetLoc().SetInt().SetTo(29);
12490  cds->SetData().SetCdregion().SetCode_break().push_back(codebreak);
12491  seh = scope.AddTopLevelSeqEntry(*entry);
12492 
12493  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "CDSrange",
12494  "Code-break location not in coding region"));
12495  // AddChromosomeNoLocation(expected_errors, entry);
12496  eval = validator.Validate(seh, options);
12497  CheckErrors(*eval, expected_errors);
12498 
12499  CLEAR_ERRORS
12500 
       // Code-break at 0..1 with the CDS in frame three, extended to the last base
       // of `nentry` and with partials set: the "frame problem" wording is expected.
12501  codebreak->SetLoc().SetInt().SetFrom(0);
12502  codebreak->SetLoc().SetInt().SetTo(1);
12503  cds->SetData().SetCdregion().SetFrame(CCdregion::eFrame_three);
12505  cds->SetLocation().SetInt().SetTo(nentry->GetSeq().GetInst().GetLength() - 1);
12506  unit_test_util::SetNucProtSetPartials(entry, true, true);
12508  scope.RemoveTopLevelSeqEntry(seh);
12509  seh = scope.AddTopLevelSeqEntry(*entry);
12510 
12511  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "CDSrange",
12512  "Code-break location not in coding region - may be frame problem"));
12513  // AddChromosomeNoLocation(expected_errors, entry);
       // NOTE(review): presumably filters out diagnostic 1210.8 during validation -- confirm.
12514  SetDiagFilter(eDiagFilter_All, "!(1210.8)");
12515  eval = validator.Validate(seh, options);
12517  CheckErrors(*eval, expected_errors);
12518 
12519  CLEAR_ERRORS
12520 
       // Fresh sequence: `misc` becomes a tRNA whose anticodon is at 11..13.
12521  scope.RemoveTopLevelSeqEntry(seh);
12522  entry = unit_test_util::BuildGoodSeq();
12524  misc->SetData().SetRna().SetType(CRNA_ref::eType_tRNA);
12525  misc->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('N');
12526  misc->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetId().SetLocal().SetStr("good");
12527  misc->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetFrom(11);
12528  misc->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetTo(13);
12529  seh = scope.AddTopLevelSeqEntry(*entry);
12530  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "tRNArange",
12531  "Anticodon location not in tRNA"));
12532  // AddChromosomeNoLocation(expected_errors, entry);
12533  eval = validator.Validate(seh, options);
12534  CheckErrors(*eval, expected_errors);
12535 
       // Five-base anticodon (6..10): the same expected-error object is rewritten
       // into the length warning.
12536  misc->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetFrom(6);
12537  misc->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetTo(10);
12538  expected_errors[0]->SetSeverity(eDiag_Warning);
12539  expected_errors[0]->SetErrMsg("Anticodon is not 3 bases in length");
12540  eval = validator.Validate(seh, options);
12541  CheckErrors(*eval, expected_errors);
12542 
       // Misc feature whose start (11) is changed while its stop is left as built;
       // the expected message reports it as 12-11, out of range.
12543  scope.RemoveTopLevelSeqEntry(seh);
12544  entry = unit_test_util::BuildGoodSeq();
12545  misc = unit_test_util::AddMiscFeature(entry);
12546  misc->SetLocation().SetInt().SetFrom(11);
12547  SetDiagFilter(eDiagFilter_All, "!(1204.1)");
12548  seh = scope.AddTopLevelSeqEntry(*entry);
12550  expected_errors[0]->SetErrCode("Range");
12551  expected_errors[0]->SetSeverity(eDiag_Critical);
12552  expected_errors[0]->SetErrMsg("Location: SeqLoc [lcl|good:12-11] out of range");
12553  eval = validator.Validate(seh, options);
12554  CheckErrors(*eval, expected_errors);
       // Misc feature whose stop (100) is expected to be reported as out of range.
12555  scope.RemoveTopLevelSeqEntry(seh);
12556  entry = unit_test_util::BuildGoodSeq();
12557  misc = unit_test_util::AddMiscFeature(entry);
12558  misc->SetLocation().SetInt().SetTo(100);
12559  seh = scope.AddTopLevelSeqEntry(*entry);
12560  expected_errors[0]->SetErrMsg("Location: SeqLoc [lcl|good:1-101] out of range");
12561  eval = validator.Validate(seh, options);
12562  CheckErrors(*eval, expected_errors);
12563 
12564  CLEAR_ERRORS
12565 }
12566 
12567 
// Validates a tRNA whose anticodon is a two-interval mix location (0..0 and
// 2..3), both explicitly on the plus strand. No expected error is added in the
// visible lines before CheckErrors.
// NOTE(review): listing lines 12570 and 12582 (presumably `entry` creation and
// the standard setup) are absent from this extract -- confirm that
// expected_errors starts out empty.
12568 BOOST_AUTO_TEST_CASE(Test_tRNA_Mixed_Loc) // Jira: VR_133
12569 {
12571  CRef<CSeq_feat> trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front()); // N(Asn)
12572  CRef<CSeq_loc> anticodon_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
12573  anticodon_loc->SetMix().Set().front()->SetInt().SetFrom(0); // A
12574  anticodon_loc->SetMix().Set().front()->SetInt().SetTo(0);
12575  anticodon_loc->SetMix().Set().front()->SetInt().SetStrand(eNa_strand_plus);
12576  anticodon_loc->SetMix().Set().back()->SetInt().SetFrom(2); // TT
12577  anticodon_loc->SetMix().Set().back()->SetInt().SetTo(3);
12578  anticodon_loc->SetMix().Set().back()->SetInt().SetStrand(eNa_strand_plus);
       // Copy the mix location into the tRNA's anticodon, then attach the tRNA.
12579  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().Assign(*anticodon_loc);
12580  unit_test_util::AddFeat(trna, entry);
12581 
12583 
12584  eval = validator.Validate(seh, options);
12585  // AddChromosomeNoLocation(expected_errors, entry);
12586  CheckErrors(*eval, expected_errors);
12587 
12588  CLEAR_ERRORS
12589 }
12590 
12591 
// Checks mixed-strand reporting for tRNA anticodons and for feature locations:
//   1. anticodon with a minus-strand interval plus an interval with no strand set,
//   2. anticodon with a plus-strand interval plus an interval with no strand set,
//   3. misc feature location with mixed strands (Error),
//   4. the same feature marked pseudo (Warning).
// NOTE(review): listing lines 12594 and 12605 (presumably `entry` creation and
// the standard setup) are absent from this extract.
12592 BOOST_AUTO_TEST_CASE(Test_FEAT_MixedStrand)
12593 {
12595  CRef<CSeq_feat> trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
12596  CRef<CSeq_loc> anticodon_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
       // First interval 0..0 on the minus strand; second interval 9..10 gets no strand here.
12597  anticodon_loc->SetMix().Set().front()->SetInt().SetFrom(0);
12598  anticodon_loc->SetMix().Set().front()->SetInt().SetTo(0);
12599  anticodon_loc->SetMix().Set().front()->SetInt().SetStrand(eNa_strand_minus);
12600  anticodon_loc->SetMix().Set().back()->SetInt().SetFrom(9);
12601  anticodon_loc->SetMix().Set().back()->SetInt().SetTo(10);
12602  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().Assign(*anticodon_loc);
12603  unit_test_util::AddFeat(trna, entry);
12604 
12606 
12607  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "AnticodonMixedStrand",
12608  "Mixed strands in Anticodon [[lcl|good:c1-1, 10-11]]"));
12609  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadAnticodonAA",
12610  "Codons predicted from anticodon (UAA) cannot produce amino acid (N/Asn)"));
12611  // AddChromosomeNoLocation(expected_errors, entry);
12612  eval = validator.Validate(seh, options);
12613  CheckErrors(*eval, expected_errors);
12614 
       // Rebuild with the first interval on the plus strand: both expected messages
       // change, and the two existing expected-error objects are edited in place.
12615  scope.RemoveTopLevelSeqEntry(seh);
12616  entry = unit_test_util::BuildGoodSeq();
12617  trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
12618  anticodon_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
12619  anticodon_loc->SetMix().Set().front()->SetInt().SetFrom(0);
12620  anticodon_loc->SetMix().Set().front()->SetInt().SetTo(0);
12621  anticodon_loc->SetMix().Set().front()->SetInt().SetStrand(eNa_strand_plus);
12622  anticodon_loc->SetMix().Set().back()->SetInt().SetFrom(9);
12623  anticodon_loc->SetMix().Set().back()->SetInt().SetTo(10);
12624  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().Assign(*anticodon_loc);
12625  unit_test_util::AddFeat(trna, entry);
12626  seh = scope.AddTopLevelSeqEntry(*entry);
12627 
12628  expected_errors[0]->SetErrCode("AnticodonMixedStrand");
12629  expected_errors[0]->SetErrMsg("Mixed plus and unknown strands in Anticodon [[lcl|good:1-1, 10-11]]");
12630  expected_errors[1]->SetErrMsg("Codons predicted from anticodon (AAA) cannot produce amino acid (N/Asn)");
12631  eval = validator.Validate(seh, options);
12632  CheckErrors(*eval, expected_errors);
12633 
12634  CLEAR_ERRORS
12635 
       // Mixed strands in a feature location. The variable is named `gene`, but the
       // feature is created with AddMiscFeature.
12636  scope.RemoveTopLevelSeqEntry(seh);
12637  entry = unit_test_util::BuildGoodSeq();
12638  CRef<CSeq_feat> gene = AddMiscFeature(entry);
12639  CRef<CSeq_loc> gene_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
12640  gene_loc->SetMix().Set().front()->SetInt().SetFrom(0);
12641  gene_loc->SetMix().Set().front()->SetInt().SetTo(0);
12642  gene_loc->SetMix().Set().front()->SetInt().SetStrand(eNa_strand_minus);
12643  gene_loc->SetMix().Set().back()->SetInt().SetFrom(9);
12644  gene_loc->SetMix().Set().back()->SetInt().SetTo(10);
12645  gene->SetLocation().Assign(*gene_loc);
12646  seh = scope.AddTopLevelSeqEntry(*entry);
12647  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MixedStrand",
12648  "Location: Mixed strands in SeqLoc [(lcl|good:c1-1, 10-11)]"));
12649  // AddChromosomeNoLocation(expected_errors, entry);
12650  eval = validator.Validate(seh, options);
12651  CheckErrors(*eval, expected_errors);
12652 
12653  // warning if gene is pseudo
12654  scope.RemoveTopLevelSeqEntry(seh);
12655  gene->SetPseudo(true);
12656  seh = scope.AddTopLevelSeqEntry(*entry);
12657  expected_errors[0]->SetSeverity(eDiag_Warning);
12658  eval = validator.Validate(seh, options);
12659  CheckErrors(*eval, expected_errors);
12660  CLEAR_ERRORS
12661 }
12662 
12663 
// Checks that out-of-order intervals are reported as SeqLocOrder: as a Warning
// for a tRNA anticodon and as an Error for a feature location.
// NOTE(review): listing lines 12666, 12676 and 12690 (presumably `entry`
// creation, the standard setup and the creation of `misc`) are absent from
// this extract.
12664 BOOST_AUTO_TEST_CASE(Test_FEAT_SeqLocOrder)
12665 {
12667  CRef<CSeq_feat> trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
12668  CRef<CSeq_loc> anticodon_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
       // First interval (9..10) lies after the second one (0..0).
12669  anticodon_loc->SetMix().Set().front()->SetInt().SetFrom(9);
12670  anticodon_loc->SetMix().Set().front()->SetInt().SetTo(10);
12671  anticodon_loc->SetMix().Set().back()->SetInt().SetFrom(0);
12672  anticodon_loc->SetMix().Set().back()->SetInt().SetTo(0);
12673  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().Assign(*anticodon_loc);
12674  unit_test_util::AddFeat(trna, entry);
12675 
12677 
12678  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SeqLocOrder",
12679  "Intervals out of order in Anticodon [[lcl|good:10-11, 1-1]]"));
12680  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadAnticodonAA",
12681  "Codons predicted from anticodon (AAA) cannot produce amino acid (N/Asn)"));
12682  // AddChromosomeNoLocation(expected_errors, entry);
12683  eval = validator.Validate(seh, options);
12684  CheckErrors(*eval, expected_errors);
12685 
12686  CLEAR_ERRORS
12687 
       // Reuse the same out-of-order location as the location of `misc`.
12688  scope.RemoveTopLevelSeqEntry(seh);
12689  entry = unit_test_util::BuildGoodSeq();
12691  misc->SetLocation().Assign(*anticodon_loc);
12692  seh = scope.AddTopLevelSeqEntry(*entry);
12693  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLocOrder",
12694  "Location: Intervals out of order in SeqLoc [(lcl|good:10-11, 1-1)]"));
12695  // AddChromosomeNoLocation(expected_errors, entry);
12696  eval = validator.Validate(seh, options);
12697  CheckErrors(*eval, expected_errors);
12698 
12699  CLEAR_ERRORS
12700 }
12701 
12702 
// Sets the CDS start to 27 and checks the resulting set of expected errors: an
// out-of-range location (reported as 28-27), protein length vs. a
// zero-length translation, and a missing stop codon.
// NOTE(review): listing lines 12706-12707, 12709 and 12726 (presumably `entry`
// and `cds` creation, the standard setup, and a trailing statement) are absent
// from this extract.
12703 BOOST_AUTO_TEST_CASE(Test_FEAT_CdTransFail)
12704 {
12705  SetDiagFilter(eDiagFilter_All, "!(1204.1)");
12708  cds->SetLocation().SetInt().SetFrom(27);
12710 
       // The helper itself must also report the CDS as lacking a stop codon.
12711  BOOST_CHECK_EQUAL(validator::HasNoStop(*cds, &scope), true);
12712 
12713  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "Range",
12714  "Location: SeqLoc [lcl|nuc:28-27] out of range"));
12715  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "ProductLength",
12716  "Protein product length [8] is more than 120% of the translation length [0]"));
12717  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "TransLen",
12718  "Given protein length [8] does not match translation length [0]"));
12719  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NoStop",
12720  "Missing stop codon"));
12721  // AddChromosomeNoLocation(expected_errors, entry);
12722  eval = validator.Validate(seh, options);
12723  CheckErrors(*eval, expected_errors);
12724 
12725  CLEAR_ERRORS
12727 }
12728 
12729 
// Shorthand macros for the start-codon tests. Each one appends a single
// CExpectedError for "lcl|nuc" to a variable named `expected_errors`, which
// must be in scope where the macro is used. Every expansion already ends in
// ';', so the macros are written without a trailing semicolon at the use site.

// StartCodon error: illegal start codon together with one internal stop.
12730 #define START_CODON_AND_INT_STOP_ERR \
12731 expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StartCodon",\
12732  "Illegal start codon (and 1 internal stops). Probably wrong genetic code [0]"));
// InternalStop error: one internal stop together with an illegal start codon.
12733 #define INTERNAL_STOP_ERR \
12734 expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "InternalStop",\
12735  "1 internal stops (and illegal start codon). Genetic code [0]"));
// NoStop error: missing stop codon.
12736 #define NO_STOP_ERR \
12737 expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NoStop", "Missing stop codon"));
// NoPubFound error: record has no publications.
12738 #define NO_PUB_ERR \
12739 expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NoPubFound", "No publications anywhere on this entire record."));
// TransLen error: protein length 8 vs. translation length 9.
12740 #define PROT_LEN_ERR \
12741 expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "TransLen",\
12742  "Given protein length [8] does not match translation length [9]"));
// MissingPubRequirement (Info severity): record has no submission citation.
12743 #define NO_SUB_ERR \
12744 expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "MissingPubRequirement", "No submission citation anywhere on this entire record."));
// ExceptionProblem error: the exception text is not a legal explanation.
12745 #define EXCEPTION_PROBLEM_ERR \
12746 expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "ExceptionProblem", "unclassified translation discrepancy is not a legal exception explanation"));
// NoSourceDescriptor error: record has no source information.
12747 #define NO_SRC_ERR \
12748 expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NoSourceDescriptor", "No source information included on this record."));
12749 
12750 
// Checks start-codon reporting for a CDS shifted to 1..27, both inside the
// nuc-prot set (`entry`) and on a copy of the CDS attached to `nuc` alone,
// with and without the "unclassified translation discrepancy" exception.
// NOTE(review): many listing lines are absent from this extract (12753-12754,
// 12758-12759, 12764, 12770-12771, 12784-12785, 12801-12802, 12816-12817,
// 12826-12827, 12844). They presumably declare `entry`, `cds`, `nuc`, run the
// standard setup and add further expected errors; the indices used with
// expected_errors[...] below depend on those lines -- confirm against the
// full file.
12751 BOOST_AUTO_TEST_CASE(Test_FEAT_StartCodon)
12752 {
12755  cds->SetLocation().SetInt().SetFrom(1);
12756  cds->SetLocation().SetInt().SetTo(27);
12757 
       // Independent copy of the CDS, attached to `nuc`.
12760  CRef<CSeq_feat> nuc_only_cds(new CSeq_feat());
12761  nuc_only_cds->Assign(*cds);
12762  unit_test_util::AddFeat(nuc_only_cds, nuc);
12763 
12765 
       // The standalone helper predicates must agree with the expected errors.
12766  BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
12767  BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
12768  BOOST_CHECK_EQUAL(validator::HasNoStop(*cds, &scope), true);
12769 
12772  NO_STOP_ERR
12773  PROT_LEN_ERR
12774  // AddChromosomeNoLocation(expected_errors, entry);
12775 
12776  eval = validator.Validate(seh, options);
12777  CheckErrors(*eval, expected_errors);
12778  CLEAR_ERRORS
12779 
       // Validate `nuc` on its own: the publication, submission-citation and source
       // errors are expected in addition.
12780  scope.RemoveTopLevelSeqEntry(seh);
12781  seh = scope.AddTopLevelSeqEntry(*nuc);
12782  eval = validator.Validate(seh, options);
12783 
12786  NO_STOP_ERR
12787  NO_PUB_ERR
12788  NO_SUB_ERR
12789  NO_SRC_ERR
12790  CheckErrors(*eval, expected_errors);
12791  CLEAR_ERRORS
12792 
12793 
12794  scope.RemoveTopLevelSeqEntry(seh);
12795  seh = scope.AddTopLevelSeqEntry(*entry);
12796 
12797  // don't report start codon if unclassified exception
12798  cds->SetExcept(true);
12799  cds->SetExcept_text("unclassified translation discrepancy");
12800 
12803  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
12804  "CDS has unnecessary translated product replaced exception"));
12805  // AddChromosomeNoLocation(expected_errors, entry);
12806 
       // The first expected error is downgraded to Warning for this run.
12807  expected_errors[0]->SetSeverity(eDiag_Warning);
12808  eval = validator.Validate(seh, options);
12809  CheckErrors(*eval, expected_errors);
12810  CLEAR_ERRORS
12811 
       // Same exception, applied to the copy of the CDS that is attached to `nuc`.
12812  scope.RemoveTopLevelSeqEntry(seh);
12813  nuc_only_cds->Assign(*cds);
12814  seh = scope.AddTopLevelSeqEntry(*nuc);
12815  eval = validator.Validate(seh, options);
12818  NO_PUB_ERR
12819  NO_SUB_ERR
12820  NO_SRC_ERR
12821  expected_errors[1]->SetSeverity(eDiag_Warning);
12822  CheckErrors(*eval, expected_errors);
12823  CLEAR_ERRORS
12824 
       // Remove the exception and overwrite the first two bases of the nucleotide
       // sequence with 'C'; a StartCodon error is expected.
12825  scope.RemoveTopLevelSeqEntry(seh);
12828  cds->SetExcept(false);
12829  cds->ResetExcept_text();
12830  CRef<CSeq_entry> nuc_seq = entry->SetSet().SetSeq_set().front();
12831  nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[0] = 'C';
12832  nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[1] = 'C';
12833  seh = scope.AddTopLevelSeqEntry(*entry);
12834  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StartCodon",
12835  "Illegal start codon used. Wrong genetic code [0] or protein should be partial"));
12836  // AddChromosomeNoLocation(expected_errors, entry);
12837  eval = validator.Validate(seh, options);
12838  CheckErrors(*eval, expected_errors);
12839  CLEAR_ERRORS
12840 
12841  // don't report start codon if unclassified exception
12842  cds->SetExcept(true);
12843  cds->SetExcept_text("unclassified translation discrepancy");
12845  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
12846  "CDS has unnecessary translated product replaced exception"));
12847  // AddChromosomeNoLocation(expected_errors, entry);
12848 
12849  eval = validator.Validate(seh, options);
12850  CheckErrors(*eval, expected_errors);
12851  CLEAR_ERRORS
12852 }
12853 
12854 
// Checks internal-stop reporting: first for a CDS shifted to 1..27, then for a
// CDS whose nucleotide base at index 9 is changed to 'T' and whose protein
// contains '*'. Also checks the submitter-report text for InternalStop, with
// and without a gene locus tag.
// NOTE(review): listing lines 12857-12858, 12862 and 12884-12885 (presumably
// the creation / re-creation of `entry` and `cds` and the standard setup) are
// absent from this extract.
12855 BOOST_AUTO_TEST_CASE(Test_FEAT_InternalStop)
12856 {
12859  cds->SetLocation().SetInt().SetFrom(1);
12860  cds->SetLocation().SetInt().SetTo(27);
12861 
12863 
12864  BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
12865  BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
12866  BOOST_CHECK_EQUAL(validator::HasNoStop(*cds, &scope), true);
12867 
12868  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StartCodon",
12869  "Illegal start codon (and 1 internal stops). Probably wrong genetic code [0]"));
12870  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "InternalStop",
12871  "1 internal stops (and illegal start codon). Genetic code [0]"));
12872  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "TransLen",
12873  "Given protein length [8] does not match translation length [9]"));
12874  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NoStop",
12875  "Missing stop codon"));
12876  // AddChromosomeNoLocation(expected_errors, entry);
12877 
12878  eval = validator.Validate(seh, options);
12879  CheckErrors(*eval, expected_errors);
12880 
12881  CLEAR_ERRORS
12882 
12883  scope.RemoveTopLevelSeqEntry(seh);
       // Change one nucleotide and put a matching '*' into the protein sequence.
12886  CRef<CSeq_entry> nuc_seq = entry->SetSet().SetSeq_set().front();
12887  nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[9] = 'T';
12888  entry->SetSet().SetSeq_set().back()->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPR*TEIN");
12889  seh = scope.AddTopLevelSeqEntry(*entry);
12890 
12891  BOOST_CHECK_EQUAL(validator::HasStopInProtein(*cds, scope), true);
12892  BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
12893 
12894  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein",
12895  "[1] termination symbols in protein sequence (gene? - fake protein name)"));
12896  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "InternalStop",
12897  "1 internal stops. Genetic code [0]"));
12898  // AddChromosomeNoLocation(expected_errors, entry);
12899  eval = validator.Validate(seh, options);
12900  CheckErrors(*eval, expected_errors);
12901 
       // Submitter report for InternalStop; the last tab-separated column is empty
       // because no locus tag has been set yet.
12902  CValidErrorFormat format(*objmgr);
12903  string rval = format.FormatForSubmitterReport(*eval, scope, eErr_SEQ_FEAT_InternalStop);
12904  BOOST_CHECK_EQUAL(rval, "InternalStop\nlcl|nuc:CDS\t fake protein name\tlcl|nuc:1-27\t\n");
12905 
12906  // try again with locus tag for report
12907  scope.RemoveTopLevelSeqEntry(seh);
12908  CRef<CSeq_feat> gene = MakeGeneForFeature(cds);
12909  gene->SetData().SetGene().SetLocus_tag("a_locus_tag");
12910  AddFeat(gene, nuc_seq);
12911  seh = scope.AddTopLevelSeqEntry(*entry);
12912  eval = validator.Validate(seh, options);
12913  rval = format.FormatForSubmitterReport(*eval, scope, eErr_SEQ_FEAT_InternalStop);
12914  BOOST_CHECK_EQUAL(rval, "InternalStop\nlcl|nuc:CDS\t fake protein name\tlcl|nuc:1-27\ta_locus_tag\n");
12915 
12916  CLEAR_ERRORS
12917 }
12918 
12919 
// Removes the last member of the set and the CDS product reference, then
// expects NucProtProblem, NoProtein and MissingCDSproduct errors.
// NOTE(review): listing lines 12922, 12924, 12926 and 12936 (presumably the
// creation of `entry` and `cds` and the standard setup) are absent from this
// extract.
12920 BOOST_AUTO_TEST_CASE(Test_FEAT_NoProtein)
12921 {
       // Drop the last Seq-entry of the set (presumably the protein -- confirm).
12923  entry->SetSet().SetSeq_set().pop_back();
12925  cds->ResetProduct();
12927 
12928  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NucProtProblem",
12929  "No proteins in nuc-prot set"));
12930  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NoProtein",
12931  "No protein Bioseq given"));
12932  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "MissingCDSproduct",
12933  "Expected CDS product absent"));
12934  // AddChromosomeNoLocation(expected_errors, entry);
12935 
12937  eval = validator.Validate(seh, options);
12938  CheckErrors(*eval, expected_errors);
12939 
12940  CLEAR_ERRORS
12941 }
12942 
12943 
// Checks MisMatchAA reporting: a single mismatching residue gets the
// per-residue message, while eleven mismatches get the summary message naming
// the first and last mismatch.
// NOTE(review): listing lines 12946-12947 and 12951 (presumably the creation
// of `entry` and the standard setup) are absent from this extract. The loop
// below writes protein indices 0..10, so the protein built there must have at
// least 11 residues -- confirm.
12944 BOOST_AUTO_TEST_CASE(Test_FEAT_MisMatchAA)
12945 {
12948  CRef<CSeq_entry> prot = entry->SetSet().SetSeq_set().back();
       // Single mismatch: first residue becomes 'A'.
12949  prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set()[0] = 'A';
12950 
12952 
12953  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "MisMatchAA",
12954  "Residue 1 in protein [A] != translation [M] at lcl|nuc:1-3"));
12955  // AddChromosomeNoLocation(expected_errors, entry);
12956 
12957  eval = validator.Validate(seh, options);
12958  CheckErrors(*eval, expected_errors);
12959 
       // Eleven mismatches: overwrite the first 11 residues with 'A'.
12960  for (int i = 0; i < 11; i++) {
12961  prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set()[i] = 'A';
12962  }
12963 
12964 
12965  expected_errors[0]->SetErrMsg("11 mismatches found. First mismatch at 1, residue in protein [A] != translation [M] at lcl|nuc:1-3. Last mismatch at 11, residue in protein [A] != translation [M] at lcl|nuc:31-33. Genetic code [0]");
12966 
12967  eval = validator.Validate(seh, options);
12968  CheckErrors(*eval, expected_errors);
12969 
12970  CLEAR_ERRORS
12971 }
12972 
12973 
// Checks TransLen / ProductLength reporting when the protein length and the
// translation length disagree:
//   1. protein shortened to 7 residues,
//   2. CDS extended past the stop codon,
//   3. protein lengthened to 30 residues,
//   4. case 3 with an exception plus inference qualifier, where no error is
//      added to the expected list.
// NOTE(review): listing lines 12976, 12980, 12982, 12994, 12998, 13011-13012
// and 13016 (presumably creation / re-creation of `entry` and `cds` and the
// standard setup) are absent from this extract.
12974 BOOST_AUTO_TEST_CASE(Test_FEAT_TransLen)
12975 {
12977  CRef<CSeq_entry> prot_seq = entry->SetSet().SetSeq_set().back();
12978  prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPRKTEI");
12979  prot_seq->SetSeq().SetInst().SetLength(7);
12981 
12983 
12984  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "TransLen",
12985  "Given protein length [7] does not match translation length [9]"));
12986  // AddChromosomeNoLocation(expected_errors, entry);
12987 
12988  eval = validator.Validate(seh, options);
12989  CheckErrors(*eval, expected_errors);
12990 
12991  CLEAR_ERRORS
12992 
12993  scope.RemoveTopLevelSeqEntry(seh);
       // Overwrite bases 27 and 28 and end the CDS at 28, so it runs past the stop codon.
12995  CRef<CSeq_entry> nuc_seq = entry->SetSet().SetSeq_set().front();
12996  nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[27] = 'A';
12997  nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[28] = 'T';
12999  cds->SetLocation().SetInt().SetTo(28);
13000  seh = scope.AddTopLevelSeqEntry(*entry);
13001  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "TransLen",
13002  "Coding region extends 2 base(s) past stop codon"));
13003  // AddChromosomeNoLocation(expected_errors, entry);
13004 
13005  eval = validator.Validate(seh, options);
13006  CheckErrors(*eval, expected_errors);
13007 
13008  CLEAR_ERRORS
13009 
13010  scope.RemoveTopLevelSeqEntry(seh);
       // 30-residue protein: both the ProductLength warning and the TransLen error
       // are expected.
13013  prot_seq = entry->SetSet().SetSeq_set().back();
13014  prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPRKTEINQQLLLLLLLLLLQQQQQQQQQQ");
13015  prot_seq->SetSeq().SetInst().SetLength(30);
13017  seh = scope.AddTopLevelSeqEntry(*entry);
13018  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "ProductLength",
13019  "Protein product length [30] is more than 120% of the translation length [9]"));
13020  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "TransLen",
13021  "Given protein length [30] does not match translation length [9]"));
13022  // AddChromosomeNoLocation(expected_errors, entry);
13023 
13024  eval = validator.Validate(seh, options);
13025  CheckErrors(*eval, expected_errors);
13026 
13027  CLEAR_ERRORS
13028 
13029  // setting this exception suppresses the error
13030  cds->SetExcept(true);
13031  cds->SetExcept_text("annotated by transcript or proteomic data");
13032  // inference is required for exception
13033  cds->AddQualifier("inference", "similar to DNA sequence:INSD:AY123456.1");
13034  // expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
13035  // "CDS has unnecessary translated product replaced exception"));
13036  // AddChromosomeNoLocation(expected_errors, entry);
13037  eval = validator.Validate(seh, options);
13038  CheckErrors(*eval, expected_errors);
13039 
13040  CLEAR_ERRORS
13041 }
13042 
13043 
// Ends the CDS at position 23 and expects both the HasNoStop helper and the
// validator to report the missing stop codon.
// NOTE(review): listing lines 13046-13047 and 13050 (presumably the creation of
// `entry` and `cds` and the standard setup) are absent from this extract.
13044 BOOST_AUTO_TEST_CASE(Test_FEAT_NoStop)
13045 {
13048  cds->SetLocation().SetInt().SetTo(23);
13049 
13051 
13052  BOOST_CHECK_EQUAL(validator::HasNoStop(*cds, &scope), true);
13053 
13054  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NoStop",
13055  "Missing stop codon"));
13056  // AddChromosomeNoLocation(expected_errors, entry);
13057 
13058  eval = validator.Validate(seh, options);
13059  CheckErrors(*eval, expected_errors);
13060 
13061  CLEAR_ERRORS
13062 }
13063 
13064 
// Checks reporting of an unparseable "transl_except" qualifier ("abc"): once
// alongside a protein mismatch, and once where only the qualifier warning is
// expected ("but protein is okay").
// NOTE(review): listing lines 13067-13068, 13073 and 13086-13087 (presumably
// creation / re-creation of `entry` and `cds` and the standard setup) are
// absent from this extract.
13065 BOOST_AUTO_TEST_CASE(Test_FEAT_TranslExcept)
13066 {
13069  cds->AddQualifier("transl_except", "abc");
13070  CRef<CSeq_entry> prot_seq = entry->SetSet().SetSeq_set().back();
       // Introduce a mismatch at protein residue 5 (index 4).
13071  prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set()[4] = 'E';
13072 
13074 
13075  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "MisMatchAA",
13076  "Residue 5 in protein [E] != translation [T] at lcl|nuc:13-15"));
13077  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranslExcept",
13078  "Unparsed transl_except qual. Skipped"));
13079  // AddChromosomeNoLocation(expected_errors, entry);
13080 
13081  eval = validator.Validate(seh, options);
13082  CheckErrors(*eval, expected_errors);
13083 
13084  CLEAR_ERRORS
13085  scope.RemoveTopLevelSeqEntry(seh);
       // Same qualifier again; this time only the qualifier warning is expected.
13088  cds->AddQualifier("transl_except", "abc");
13089  seh = scope.AddTopLevelSeqEntry(*entry);
13090 
13091  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranslExcept",
13092  "Unparsed transl_except qual (but protein is okay). Skipped"));
13093  // AddChromosomeNoLocation(expected_errors, entry);
13094 
13095  eval = validator.Validate(seh, options);
13096  CheckErrors(*eval, expected_errors);
13097 
13098  CLEAR_ERRORS
13099 }
13100 
13101 
// Expects MissingProteinName on the protein in two situations: the protein
// feature's location end is changed to 6, and the protein's annotations are
// removed entirely. The same expected error is reused for both.
// NOTE(review): listing lines 13104 and 13109 (presumably `entry` creation and
// the standard setup) are absent from this extract.
13102 BOOST_AUTO_TEST_CASE(Test_FEAT_NoProtRefFound)
13103 {
13105  CRef<CSeq_entry> prot_seq = entry->SetSet().SetSeq_set().back();
       // First feature of the protein's first annotation.
13106  CRef<CSeq_feat> prot_feat = prot_seq->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
13107  prot_feat->SetLocation().SetInt().SetTo(6);
13108 
13110 
13111  // see this error if prot-ref present, but wrong size, or if absent completely
13112  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "MissingProteinName",
13113  "The product name is missing from this protein."));
13114  // AddChromosomeNoLocation(expected_errors, entry);
13115 
13116  eval = validator.Validate(seh, options);
13117  CheckErrors(*eval, expected_errors);
13118 
       // Second situation: no annotations on the protein at all.
13119  scope.RemoveTopLevelSeqEntry(seh);
13120  prot_seq->SetSeq().ResetAnnot();
13121  seh = scope.AddTopLevelSeqEntry(*entry);
13122 
13123  eval = validator.Validate(seh, options);
13124  CheckErrors(*eval, expected_errors);
13125 
13126  CLEAR_ERRORS
13127 }
13128 
13129 
// Sets the ORF flag on the coding region and expects the OrfCdsHasProduct
// warning.
// NOTE(review): listing lines 13132-13133 and 13136 (presumably the creation of
// `entry` and `cds` and the standard setup) are absent from this extract.
13130 BOOST_AUTO_TEST_CASE(Test_FEAT_OrfCdsHasProduct)
13131 {
13134  cds->SetData().SetCdregion().SetOrf(true);
13135 
13137 
13138  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "OrfCdsHasProduct",
13139  "An ORF coding region should not have a product"));
13140  // AddChromosomeNoLocation(expected_errors, entry);
13141 
13142  eval = validator.Validate(seh, options);
13143  CheckErrors(*eval, expected_errors);
13144 
13145  CLEAR_ERRORS
13146 }
13147 
13148 
// Selects the gene data choice on `gene` without filling in any field and
// expects the GeneRefHasNoData warning.
// NOTE(review): listing lines 13151-13153 and 13157 (presumably the creation of
// `entry` and `gene` and the standard setup) are absent from this extract.
13149 BOOST_AUTO_TEST_CASE(Test_FEAT_GeneRefHasNoData)
13150 {
       // SetGene() with no further setters: no gene field is assigned here.
13154  gene->SetData().SetGene();
13155  gene->SetLocation().SetInt().SetTo(26);
13156 
13158 
13159  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "GeneRefHasNoData",
13160  "There is a gene feature where all fields are empty"));
13161  // AddChromosomeNoLocation(expected_errors, entry);
13162 
13163  eval = validator.Validate(seh, options);
13164  CheckErrors(*eval, expected_errors);
13165 
13166  CLEAR_ERRORS
13167 }
13167 }
13168 
13169 
13170 BOOST_AUTO_TEST_CASE(Test_FEAT_ExceptInconsistent)
13171 {
13172  string except_text = "trans-splicing";
13175  cds->AddQualifier("exception", except_text);
13176 
13178 
13179  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "MissingExceptionFlag",
13180  "Exception flag should be set in coding region"));
13181  // AddChromosomeNoLocation(expected_errors, entry);
13182 
13183  eval = validator.Validate(seh, options);
13184  CheckErrors(*eval, expected_errors);
13185 
13186  CLEAR_ERRORS
13187 
13188  cds->ResetQual();
13189  cds->SetExcept_text(except_text);
13190  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "MissingExceptionFlag",
13191  "Exception text is present, but exception flag is not set"));
13192  // AddChromosomeNoLocation(expected_errors, entry);
13193 
13194  eval = validator.Validate(seh, options);
13195  CheckErrors(*eval, expected_errors);
13196 
13197  CLEAR_ERRORS
13198 
13199  cds->ResetExcept_text();
13200  cds->SetExcept(true);
13201 
13202  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "ExceptionMissingText",
13203  "Exception flag is set, but exception text is empty"));
13204  // AddChromosomeNoLocation(expected_errors, entry);
13205 
13206  eval = validator.Validate(seh, options);
13207  CheckErrors(*eval, expected_errors);
13208  CLEAR_ERRORS
13209 }
13210 
13211 
// Test: a protein feature whose Prot-ref has been reset is expected to yield two Warnings
// on lcl|prot: "ProtRefHasNoData" and "NoNameForProtein".
// NOTE(review): the construction of `entry` and the validator setup are not visible in
// this listing - presumably supplied by omitted setup lines; confirm.
13212 BOOST_AUTO_TEST_CASE(Test_FEAT_ProtRefHasNoData)
13213 {
       // First feature of the first annot on the last Seq-entry of the set.
13215  CRef<CSeq_feat> prot_feat = entry->SetSet().SetSeq_set().back()->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
13216  prot_feat->SetData().SetProt().Reset();
13217 
13219 
13220  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "ProtRefHasNoData",
13221  "There is a protein feature where all fields are empty"));
13222  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "NoNameForProtein",
13223  "Protein feature has no name"));
13224  // AddChromosomeNoLocation(expected_errors, entry);
13225 
13226  eval = validator.Validate(seh, options);
13227  CheckErrors(*eval, expected_errors);
13228 
13229  CLEAR_ERRORS
13230 }
13231 
13232 
// Test: the genetic code on the CDS (id 3) conflicts with the genetic code implied by the
// BioSource. Two "GenCodeMismatch" messages are checked, then the same data is validated
// with the exception text "genetic code exception".
// NOTE(review): the declarations of `ce`, `cds`, `entry` and whatever changes the BioSource
// between the two mismatch checks are not visible in this listing; confirm against the file.
13233 BOOST_AUTO_TEST_CASE(Test_FEAT_GenCodeMismatch)
13234 {
13238  ce->SetId(3);
       // NOTE(review): `gcode` is not referenced by any visible line - possibly unused.
13239  CRef<CGenetic_code> gcode(new CGenetic_code());
13240  cds->SetData().SetCdregion().SetCode().Set().push_back(ce);
13242  unit_test_util::SetGcode(entry, 2);
13243  CRef<CSeq_entry> prot_seq = entry->SetSet().SetSeq_set().back();
       // Overwrite residue at index 6 of the protein sequence with 'M'.
13244  prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set()[6] = 'M';
13245 
13247  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "GenCodeMismatch",
13248  "Genetic code conflict between CDS (code 3) and BioSource.genome biological context (apicoplast) (uses code 11)"));
13249  // AddChromosomeNoLocation(expected_errors, entry);
13250 
13251  eval = validator.Validate(seh, options);
13252  CheckErrors(*eval, expected_errors);
13253 
13255 
       // Same error code and severity, different message for the second check.
13256  expected_errors[0]->SetErrMsg("Genetic code conflict between CDS (code 3) and BioSource (code 2)");
13257 
13258  eval = validator.Validate(seh, options);
13259  CheckErrors(*eval, expected_errors);
13260 
13261  CLEAR_ERRORS
13262 
13263  // ignore gencode mismatch for specified exception text
13264  cds->SetExcept(true);
13265  cds->SetExcept_text("genetic code exception");
13266  // AddChromosomeNoLocation(expected_errors, entry);
       // No expectation is pushed here; presumably CLEAR_ERRORS left the list empty - confirm.
13267  eval = validator.Validate(seh, options);
13268  CheckErrors(*eval, expected_errors);
13269 
13270  CLEAR_ERRORS
13271 }
13272 
13273 
// Test: an RNA feature with type CRNA_ref::eType_unknown is expected to yield exactly one
// Warning "RNAtype0" reported on lcl|good.
// NOTE(review): the declaration of `rna` and the validator setup are not visible in this
// listing - presumably supplied by omitted setup lines; confirm.
13274 BOOST_AUTO_TEST_CASE(Test_FEAT_RNAtype0)
13275 {
13278  rna->SetData().SetRna().SetType(CRNA_ref::eType_unknown);
13279 
13281  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "RNAtype0",
13282  "RNA type 0 (unknown) not supported"));
13283  // AddChromosomeNoLocation(expected_errors, entry);
13284 
13285  eval = validator.Validate(seh, options);
13286  CheckErrors(*eval, expected_errors);
13287 
13288  CLEAR_ERRORS
13289 }
13290 
13291 
// Test: "UnknownImpFeatKey" for import features with bad keys. Checks, in order:
//   - an unrecognized key ("bad value")        -> Warning "Unknown feature key bad value"
//   - an empty key                             -> Warning "NULL feature key"
//   - keys virion / mutation / allele / Import -> Error "Feature key <key> is no longer legal"
// The entry is removed from and re-added to the scope around each key change.
// NOTE(review): the declarations of `misc`, `entry`, `scope` and `seh` are not visible in
// this listing - presumably supplied by omitted setup lines; confirm.
13292 BOOST_AUTO_TEST_CASE(Test_FEAT_UnknownImpFeatKey)
13293 {
13296  misc->SetData().SetImp().SetKey("bad value");
13297 
13299  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnknownImpFeatKey",
13300  "Unknown feature key bad value"));
13301 
13302  // AddChromosomeNoLocation(expected_errors, entry);
13303  eval = validator.Validate(seh, options);
13304  CheckErrors(*eval, expected_errors);
13305 
       // Empty key: same expectation object, only the message changes.
13306  scope.RemoveTopLevelSeqEntry(seh);
13307  misc->SetData().SetImp().SetKey("");
13308  seh = scope.AddTopLevelSeqEntry(*entry);
13309  expected_errors[0]->SetErrMsg("NULL feature key");
13310  eval = validator.Validate(seh, options);
13311  CheckErrors(*eval, expected_errors);
13312 
13313  vector<string> illegal_keys;
13314  illegal_keys.push_back("virion");
13315  illegal_keys.push_back("mutation");
13316  illegal_keys.push_back("allele");
13317  illegal_keys.push_back("Import");
13318 
       // The keys listed above are expected at Error severity rather than Warning.
13319  expected_errors[0]->SetSeverity(eDiag_Error);
13320  for (const string& it : illegal_keys) {
13321  scope.RemoveTopLevelSeqEntry(seh);
13322  misc->SetData().SetImp().SetKey(it);
13323  seh = scope.AddTopLevelSeqEntry(*entry);
13324  expected_errors[0]->SetErrMsg("Feature key " + it + " is no longer legal");
13325  eval = validator.Validate(seh, options);
13326  CheckErrors(*eval, expected_errors);
13327  }
13328 
13329  CLEAR_ERRORS
13330 }
13331 
13332 
// Test: "UnknownImpFeatQual" for qualifiers with an unrecognized name ("bad name") and
// then with an empty name, both expected as a Warning on lcl|good.
// NOTE(review): the declaration of `misc` and the validator setup are not visible in this
// listing - presumably supplied by omitted setup lines; confirm.
13333 BOOST_AUTO_TEST_CASE(Test_FEAT_UnknownImpFeatQual)
13334 {
13337  misc->AddQualifier("bad name", "some value");
13338 
13340  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnknownImpFeatQual",
13341  "Unknown qualifier bad name"));
13342  // AddChromosomeNoLocation(expected_errors, entry);
13343 
13344  eval = validator.Validate(seh, options);
13345  CheckErrors(*eval, expected_errors);
13346 
       // Blank out the qualifier name; only the expected message changes.
13347  misc->SetQual().front()->SetQual("");
13348  expected_errors[0]->SetErrMsg("NULL qualifier");
13349  eval = validator.Validate(seh, options);
13350  CheckErrors(*eval, expected_errors);
13351 
13352  CLEAR_ERRORS
13353 }
13354 
13355 
13356 // begin automatically generated section
// Test: "MissingQualOnImpFeat" for import-feature keys that are used without a qualifier.
// Each section rebuilds a fresh good sequence, adds a misc feature, sets its Imp key and
// expects one Warning on lcl|good. Key -> qualifier named in the expected message:
//   conflict      -> citation
//   misc_binding  -> bound_moiety
//   modified_base -> mod_base
//   old_sequence  -> citation
//   operon        -> operon
//   protein_bind  -> bound_moiety
//   source        -> organism
// NOTE(review): the declarations of `entry`, `misc_feat`, `scope` and `seh` are not visible
// in this listing - presumably supplied by omitted setup lines; confirm.
13357 BOOST_AUTO_TEST_CASE(Test_FEAT_MissingQualOnImpFeat)
13358 {
13359 
13362 
13364 
       // Key "conflict".
13365  scope.RemoveTopLevelSeqEntry(seh);
13366  entry = unit_test_util::BuildGoodSeq();
13367  misc_feat = unit_test_util::AddMiscFeature(entry);
13368  misc_feat->SetData().SetImp().SetKey("conflict");
13369  seh = scope.AddTopLevelSeqEntry(*entry);
13370  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingQualOnImpFeat",
13371  "Missing qualifier citation for feature conflict"));
13372  // AddChromosomeNoLocation(expected_errors, entry);
13373  eval = validator.Validate(seh, options);
13374  CheckErrors(*eval, expected_errors);
13375  CLEAR_ERRORS
       // Key "misc_binding".
13376  scope.RemoveTopLevelSeqEntry(seh);
13377  entry = unit_test_util::BuildGoodSeq();
13378  misc_feat = unit_test_util::AddMiscFeature(entry);
13379  misc_feat->SetData().SetImp().SetKey("misc_binding");
13380  seh = scope.AddTopLevelSeqEntry(*entry);
13381  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingQualOnImpFeat",
13382  "Missing qualifier bound_moiety for feature misc_binding"));
13383  // AddChromosomeNoLocation(expected_errors, entry);
13384  eval = validator.Validate(seh, options);
13385  CheckErrors(*eval, expected_errors);
13386  CLEAR_ERRORS
       // Key "modified_base".
13387  scope.RemoveTopLevelSeqEntry(seh);
13388  entry = unit_test_util::BuildGoodSeq();
13389  misc_feat = unit_test_util::AddMiscFeature(entry);
13390  misc_feat->SetData().SetImp().SetKey("modified_base");
13391  seh = scope.AddTopLevelSeqEntry(*entry);
13392  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingQualOnImpFeat",
13393  "Missing qualifier mod_base for feature modified_base"));
13394  // AddChromosomeNoLocation(expected_errors, entry);
13395  eval = validator.Validate(seh, options);
13396  CheckErrors(*eval, expected_errors);
13397  CLEAR_ERRORS
       // Key "old_sequence".
13398  scope.RemoveTopLevelSeqEntry(seh);
13399  entry = unit_test_util::BuildGoodSeq();
13400  misc_feat = unit_test_util::AddMiscFeature(entry);
13401  misc_feat->SetData().SetImp().SetKey("old_sequence");
13402  seh = scope.AddTopLevelSeqEntry(*entry);
13403  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingQualOnImpFeat",
13404  "Missing qualifier citation for feature old_sequence"));
13405  // AddChromosomeNoLocation(expected_errors, entry);
13406  eval = validator.Validate(seh, options);
13407  CheckErrors(*eval, expected_errors);
13408  CLEAR_ERRORS
       // Key "operon".
13409  scope.RemoveTopLevelSeqEntry(seh);
13410  entry = unit_test_util::BuildGoodSeq();
13411  misc_feat = unit_test_util::AddMiscFeature(entry);
13412  misc_feat->SetData().SetImp().SetKey("operon");
13413  seh = scope.AddTopLevelSeqEntry(*entry);
13414  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingQualOnImpFeat",
13415  "Missing qualifier operon for feature operon"));
13416  // AddChromosomeNoLocation(expected_errors, entry);
13417  eval = validator.Validate(seh, options);
13418  CheckErrors(*eval, expected_errors);
13419  CLEAR_ERRORS
       // Key "protein_bind".
13420  scope.RemoveTopLevelSeqEntry(seh);
13421  entry = unit_test_util::BuildGoodSeq();
13422  misc_feat = unit_test_util::AddMiscFeature(entry);
13423  misc_feat->SetData().SetImp().SetKey("protein_bind");
13424  seh = scope.AddTopLevelSeqEntry(*entry);
13425  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingQualOnImpFeat",
13426  "Missing qualifier bound_moiety for feature protein_bind"));
13427  // AddChromosomeNoLocation(expected_errors, entry);
13428  eval = validator.Validate(seh, options);
13429  CheckErrors(*eval, expected_errors);
13430  CLEAR_ERRORS
       // Key "source".
13431  scope.RemoveTopLevelSeqEntry(seh);
13432  entry = unit_test_util::BuildGoodSeq();
13433  misc_feat = unit_test_util::AddMiscFeature(entry);
13434  misc_feat->SetData().SetImp().SetKey("source");
13435  seh = scope.AddTopLevelSeqEntry(*entry);
13436  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingQualOnImpFeat",
13437  "Missing qualifier organism for feature source"));
13438  // AddChromosomeNoLocation(expected_errors, entry);
13439  eval = validator.Validate(seh, options);
13440  CheckErrors(*eval, expected_errors);
13441  CLEAR_ERRORS
13442 }
13443 //end automatically generated section
13444 
13445 
// Test: a coding region marked pseudo is expected to yield one Error "PseudoCdsHasProduct"
// on lcl|nuc. The same expectation is checked three times: with the gene also pseudo, with
// the gene no longer pseudo, and after the entry is removed from and re-added to the scope.
// NOTE(review): the declarations of `cds`, `gene`, `entry`, `scope`, `seh` and any statements
// between RemoveTopLevelSeqEntry and the final SetPseudo are not visible in this listing.
13446 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PseudoCdsHasProduct)
13447 {
13450  cds->SetPseudo(true);
13453  gene->SetPseudo(true);
13455 
13457 
13458  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PseudoCdsHasProduct", "A pseudo coding region should not have a product"));
13459  // AddChromosomeNoLocation(expected_errors, entry);
13460  eval = validator.Validate(seh, options);
13461  CheckErrors(*eval, expected_errors);
13462 
       // Clearing pseudo on the gene alone leaves the expectation unchanged.
13463  gene->SetPseudo(false);
13464  eval = validator.Validate(seh, options);
13465  CheckErrors(*eval, expected_errors);
13466 
13467  scope.RemoveTopLevelSeqEntry(seh);
13470  cds->SetPseudo(true);
13471  seh = scope.AddTopLevelSeqEntry(*entry);
13472  eval = validator.Validate(seh, options);
13473  CheckErrors(*eval, expected_errors);
13474 
13475  CLEAR_ERRORS
13476 }
13477 
13478 
/// Return a copy of @a str with the case of every letter inverted
/// (upper-case letters become lower-case and vice versa); all other
/// characters are copied through unchanged.
///
/// The tests use this to build a mis-capitalized variant of a db_xref
/// database name (e.g. "taxon" -> "TAXON").
///
/// @param str  Text whose letter case should be flipped.
/// @return     The case-inverted copy; empty if @a str is empty.
static string MakeWrongCap(const string& str)
{
    string bad;
    bad.reserve(str.size());
    for (char c : str) {
        // The <cctype> classification/conversion functions require a value
        // representable as unsigned char (or EOF); passing a negative plain
        // char is undefined behaviour, so convert first.
        const unsigned char uc = static_cast<unsigned char>(c);
        if (isupper(uc)) {
            c = static_cast<char>(tolower(uc));
        } else if (islower(uc)) {
            c = static_cast<char>(toupper(uc));
        }
        bad += c;
    }
    return bad;
}
13492 
13493 
// Test: "IllegalDbXref" warnings for db_xref database names, first on the OrgRef of the
// entry and then on a feature (`feat`). Three name lists drive the checks:
//   legal_strings  - recognized database names
//   src_strings    - the subset of names that the test treats as acceptable on an OrgRef
//   refseq_strings - names the expected messages describe as RefSeq-specific
// Mis-capitalized variants are produced with MakeWrongCap(). A single expectation object
// (expected_errors[0]) is reused throughout; only its message/accession are updated.
// NOTE(review): the declarations of `entry`, `feat`, `scope`, `seh` and the validator setup
// are not visible in this listing - presumably supplied by omitted setup lines; confirm.
13494 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_IllegalDbXref)
13495 {
13497 
13499 
13500  vector<string> legal_strings;
13501  legal_strings.push_back("AceView/WormGenes");
13502  legal_strings.push_back("AFTOL");
13503  legal_strings.push_back("AntWeb");
13504  legal_strings.push_back("APHIDBASE");
13505  legal_strings.push_back("ApiDB");
13506  legal_strings.push_back("ApiDB_CryptoDB");
13507  legal_strings.push_back("ApiDB_PlasmoDB");
13508  legal_strings.push_back("ApiDB_ToxoDB");
13509  legal_strings.push_back("ASAP");
13510  legal_strings.push_back("ATCC");
13511  legal_strings.push_back("ATCC(in host)");
13512  legal_strings.push_back("ATCC(dna)");
13513  legal_strings.push_back("Axeldb");
13514  legal_strings.push_back("BDGP_EST");
13515  legal_strings.push_back("BDGP_INS");
13516  legal_strings.push_back("BEETLEBASE");
13517  legal_strings.push_back("BOLD");
13518  legal_strings.push_back("CDD");
13519  legal_strings.push_back("CK");
13520  legal_strings.push_back("COG");
13521  legal_strings.push_back("dbClone");
13522  legal_strings.push_back("dbCloneLib");
13523  legal_strings.push_back("dbEST");
13524  legal_strings.push_back("dbProbe");
13525  legal_strings.push_back("dbSNP");
13526  legal_strings.push_back("dbSTS");
13527  legal_strings.push_back("dictyBase");
13528  legal_strings.push_back("DDBJ");
13529  legal_strings.push_back("EcoGene");
13530  legal_strings.push_back("EMBL");
13531  //legal_strings.push_back("ENSEMBL");
13532  legal_strings.push_back("Ensembl");
13533  legal_strings.push_back("ESTLIB");
13534  legal_strings.push_back("FANTOM_DB");
13535  legal_strings.push_back("FLYBASE");
13536  legal_strings.push_back("GABI");
13537  legal_strings.push_back("GDB");
13538  legal_strings.push_back("GeneDB");
13539  legal_strings.push_back("GeneID");
13540  legal_strings.push_back("GO");
13541  legal_strings.push_back("GOA");
13542  legal_strings.push_back("Greengenes");
13543  legal_strings.push_back("GRIN");
13544  legal_strings.push_back("H-InvDB");
13545  legal_strings.push_back("HGNC");
13546  legal_strings.push_back("HMP");
13547  legal_strings.push_back("HOMD");
13548  legal_strings.push_back("HSSP");
13549  legal_strings.push_back("IMGT/GENE-DB");
13550  legal_strings.push_back("IMGT/HLA");
13551  legal_strings.push_back("IMGT/LIGM");
13552  legal_strings.push_back("InterimID");
13553  legal_strings.push_back("InterPro");
13554  legal_strings.push_back("IRD");
13555  legal_strings.push_back("ISD");
13556  legal_strings.push_back("ISFinder");
13557  legal_strings.push_back("JCM");
13558  legal_strings.push_back("JGIDB");
13559  legal_strings.push_back("LocusID");
13560  legal_strings.push_back("MaizeGDB");
13561  legal_strings.push_back("MGI");
13562  legal_strings.push_back("MIM");
13563  legal_strings.push_back("miRBase");
13564  legal_strings.push_back("MycoBank");
13565  legal_strings.push_back("NBRC");
13566  legal_strings.push_back("NextDB");
13567  legal_strings.push_back("niaEST");
13568  legal_strings.push_back("NMPDR");
13569  legal_strings.push_back("NRESTdb");
13570  legal_strings.push_back("Osa1");
13571  legal_strings.push_back("Pathema");
13572  legal_strings.push_back("PBmice");
13573  legal_strings.push_back("PDB");
13574  legal_strings.push_back("PFAM");
13575  legal_strings.push_back("PGN");
13576  legal_strings.push_back("PIR");
13577  legal_strings.push_back("PSEUDO");
13578  // legal_strings.push_back("PseudoCap");
13579  legal_strings.push_back("PseudoCAP");
13580  legal_strings.push_back("RAP-DB");
13581  legal_strings.push_back("RATMAP");
13582  legal_strings.push_back("RFAM");
13583  legal_strings.push_back("RGD");
13584  legal_strings.push_back("RiceGenes");
13585  legal_strings.push_back("RZPD");
13586  legal_strings.push_back("SEED");
13587  legal_strings.push_back("SGD");
13588  legal_strings.push_back("SGN");
13589  legal_strings.push_back("SoyBase");
13590  legal_strings.push_back("SubtiList");
13591  legal_strings.push_back("TAIR");
13592  legal_strings.push_back("taxon");
13593  legal_strings.push_back("TIGRFAM");
13594  legal_strings.push_back("UniGene");
13595  legal_strings.push_back("UNILIB");
13596  legal_strings.push_back("UniProtKB/Swiss-Prot");
13597  legal_strings.push_back("UniProtKB/TrEMBL");
13598  legal_strings.push_back("UniSTS");
13599  legal_strings.push_back("UNITE");
13600  legal_strings.push_back("VBASE2");
13601  legal_strings.push_back("VectorBase");
13602  legal_strings.push_back("WorfDB");
13603  legal_strings.push_back("WormBase");
13604  legal_strings.push_back("Xenbase");
13605  legal_strings.push_back("ZFIN");
13606  vector<string> src_strings;
13607  src_strings.push_back("AFTOL");
13608  src_strings.push_back("AntWeb");
13609  src_strings.push_back("ATCC");
13610  src_strings.push_back("ATCC(dna)");
13611  src_strings.push_back("ATCC(in host)");
13612  src_strings.push_back("BOLD");
13613  src_strings.push_back("FANTOM_DB");
13614  src_strings.push_back("FLYBASE");
13615  src_strings.push_back("Greengenes");
13616  src_strings.push_back("GRIN");
13617  src_strings.push_back("HMP");
13618  src_strings.push_back("HOMD");
13619  src_strings.push_back("IMGT/HLA");
13620  src_strings.push_back("IMGT/LIGM");
13621  src_strings.push_back("JCM");
13622  src_strings.push_back("MGI");
13623  src_strings.push_back("MycoBank");
13624  src_strings.push_back("NBRC");
13625  src_strings.push_back("RZPD");
13626  src_strings.push_back("taxon");
13627  src_strings.push_back("UNILIB");
13628  src_strings.push_back("UNITE");
13629  vector<string> refseq_strings;
13630  refseq_strings.push_back("CCDS");
13631  refseq_strings.push_back("CGNC");
13632  refseq_strings.push_back("CloneID");
13633  refseq_strings.push_back("HPRD");
13634  refseq_strings.push_back("LRG");
13635  refseq_strings.push_back("PBR");
13636  refseq_strings.push_back("REBASE");
13637  refseq_strings.push_back("SK-FST");
13638  refseq_strings.push_back("VBRC");
13639 
       // The initial message is a placeholder; every check below overwrites it via SetErrMsg.
13640  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "IllegalDbXref",
13641  "db_xref type %s (1234) should not be used on an OrgRef"));
13642  // AddChromosomeNoLocation(expected_errors, entry);
13643 
       // OrgRef, source-legal names with wrong capitalization.
13644  string bad;
13645  for (const string& sit : src_strings) {
       // For "taxon" the existing dbxref is removed first and restored with SetTaxon afterwards.
13646  if (NStr::Equal(sit, "taxon")) {
13647  unit_test_util::RemoveDbxref(entry, sit, 0);
13648  }
13649  bad = MakeWrongCap(sit);
13650  unit_test_util::SetDbxref(entry, bad, 1234);
13651  expected_errors[0]->SetErrMsg("Illegal db_xref type " + bad + " (1234), legal capitalization is " + sit);
13652  eval = validator.Validate(seh, options);
13653  CheckErrors(*eval, expected_errors);
13654  unit_test_util::RemoveDbxref(entry, bad, 0);
13655  if (NStr::Equal(sit, "taxon")) {
13656  unit_test_util::SetTaxon(entry, 592768);
13657  }
13658  }
13659 
       // OrgRef, legal names that are NOT in src_strings: checked both mis-capitalized and
       // correctly capitalized.
13660  for (const string& sit : legal_strings) {
13661  bool found = false;
13662  for (const string& ss : src_strings) {
13663  if (NStr::Equal(ss, sit)) {
13664  found = true;
13665  break;
13666  }
13667  }
13668  if (found) {
13669  continue;
13670  }
13671  bad = MakeWrongCap(sit);
13672  unit_test_util::SetDbxref(entry, bad, 1234);
13673  expected_errors[0]->SetErrMsg("Illegal db_xref type " + bad + " (1234), legal capitalization is " + sit
13674  + ", but should not be used on an OrgRef");
13675  eval = validator.Validate(seh, options);
13676  CheckErrors(*eval, expected_errors);
13677  unit_test_util::RemoveDbxref(entry, bad, 0);
13678 
13679  unit_test_util::SetDbxref(entry, sit, 1234);
13680  expected_errors[0]->SetErrMsg("db_xref type " + sit + " (1234) should not be used on an OrgRef");
13681  eval = validator.Validate(seh, options);
13682  CheckErrors(*eval, expected_errors);
13683  unit_test_util::RemoveDbxref(entry, sit, 0);
13684  }
13685 
       // OrgRef, RefSeq-specific names while the sequence still has its local ID.
13686  for (const string& sit : refseq_strings) {
13687  unit_test_util::SetDbxref(entry, sit, 1234);
13688  expected_errors[0]->SetErrMsg("RefSeq-specific db_xref type " + sit + " (1234) should not be used on a non-RefSeq OrgRef");
13689  eval = validator.Validate(seh, options);
13690  CheckErrors(*eval, expected_errors);
13691  unit_test_util::RemoveDbxref(entry, sit, 0);
13692  }
13693 
       // OrgRef, a name that appears in none of the lists.
13694  unit_test_util::SetDbxref(entry, "unrecognized", 1234);
13695  expected_errors[0]->SetErrMsg("Illegal db_xref type unrecognized (1234)");
13696  eval = validator.Validate(seh, options);
13697  CheckErrors(*eval, expected_errors);
13698  unit_test_util::RemoveDbxref(entry, "unrecognized", 0);
13699 
       // Switch the sequence ID to accession NC_123456 and repeat the RefSeq-specific check;
       // the expected message drops the "non-RefSeq" wording.
13700  scope.RemoveTopLevelSeqEntry(seh);
13701  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
13702  seh = scope.AddTopLevelSeqEntry(*entry);
13703  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
13704  for (const string& sit : refseq_strings) {
13705  unit_test_util::SetDbxref(entry, sit, 1234);
13706  expected_errors[0]->SetErrMsg("RefSeq-specific db_xref type " + sit + " (1234) should not be used on an OrgRef");
13707  eval = validator.Validate(seh, options);
13708  CheckErrors(*eval, expected_errors);
13709  unit_test_util::RemoveDbxref(entry, sit, 0);
13710  }
13711 
       // Back to the local ID "good"; the remaining checks put the dbxref on `feat`.
13712  scope.RemoveTopLevelSeqEntry(seh);
13713  entry->SetSeq().SetId().front()->SetLocal().SetStr("good");
13715  seh = scope.AddTopLevelSeqEntry(*entry);
13716  ChangeErrorAcc(expected_errors, "lcl|good");
13717 
       // Feature, every legal name with wrong capitalization ("taxon" has its own message).
13718  for (const string& sit : legal_strings) {
13719  bad = MakeWrongCap(sit);
13720  unit_test_util::SetDbxref(feat, bad, 1234);
13721  if (NStr::Equal(sit, "taxon")) {
13722  expected_errors[0]->SetErrMsg("Illegal db_xref type TAXON (1234), legal capitalization is taxon, but should only be used on an OrgRef");
13723  } else {
13724  expected_errors[0]->SetErrMsg("Illegal db_xref type " + bad + " (1234), legal capitalization is " + sit);
13725  }
13726  eval = validator.Validate(seh, options);
13727  CheckErrors(*eval, expected_errors);
13728  unit_test_util::RemoveDbxref(feat, bad, 0);
13729  }
13730 
       // Feature, RefSeq-specific names.
13731  for (const string& sit : refseq_strings) {
13732  unit_test_util::SetDbxref(feat, sit, 1234);
13733  expected_errors[0]->SetErrMsg("db_xref type " + sit + " (1234) is only legal for RefSeq");
13734  eval = validator.Validate(seh, options);
13735  CheckErrors(*eval, expected_errors);
13736  unit_test_util::RemoveDbxref(feat, sit, 0);
13737  }
13738 
       // Feature, correctly capitalized "taxon".
13739  unit_test_util::SetDbxref(feat, "taxon", 1234);
13740  expected_errors[0]->SetErrMsg("db_xref type taxon (1234) should only be used on an OrgRef");
13741  eval = validator.Validate(seh, options);
13742  CheckErrors(*eval, expected_errors);
13743  unit_test_util::RemoveDbxref(feat, "taxon", 0);
13744 
       // Feature, a name that appears in none of the lists.
13745  unit_test_util::SetDbxref(feat, "unrecognized", 1234);
13746  expected_errors[0]->SetErrMsg("Illegal db_xref type unrecognized (1234)");
13747  eval = validator.Validate(seh, options);
13748  CheckErrors(*eval, expected_errors);
13749  unit_test_util::RemoveDbxref(feat, "unrecognized", 0);
13750 
13751  CLEAR_ERRORS
13752 }
13753 
13754 
// Test: a feature with a mix location whose last interval points at GenBank accession
// AY123456 instead of the entry's own sequence. Expected on lcl|good: Warning "FarLocation"
// and Error "BadLocation".
// NOTE(review): the declarations of `misc`, `entry` and the validator setup are not visible
// in this listing - presumably supplied by omitted setup lines; confirm.
13755 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FarLocation)
13756 {
13759  misc->SetLocation().Assign(*unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front()));
       // Re-point only the last interval of the mix at a different sequence ID.
13760  misc->SetLocation().SetMix().Set().back()->SetInt().SetId().SetGenbank().SetAccession("AY123456");
13761 
13763 
13764  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FarLocation", "Feature has 'far' location - accession not packaged in record"));
13765  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadLocation", "Feature location intervals should all be on the same sequence"));
13766  // AddChromosomeNoLocation(expected_errors, entry);
13767  eval = validator.Validate(seh, options);
13768  CheckErrors(*eval, expected_errors);
13769 
13770  CLEAR_ERRORS
13771 }
13772 
// Test: "DuplicateFeat" for two features with identical intervals but different labels.
// The scenario is repeated for a local ID, a GenBank accession and a general ID, with
// partial and pseudo variants of the gene pair, and finally with the two features placed
// in different annots (different expected message).
// NOTE(review): the declarations of `feat1`, `feat2`, `entry`, `scope`, `seh` are not visible
// in this listing. Also, the "error if ..." comments below do not match the visible
// expectations: expected_errors[0] is created as eDiag_Warning and every visible
// SetSeverity call keeps it at eDiag_Warning. Lines omitted from this listing may
// account for that - confirm against the full file.
13773 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_DuplicateFeat)
13774 {
13778  feat2->SetComment("a");
13779 
13781 
13782  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "DuplicateFeat", "Features have identical intervals, but labels differ"));
13783  // AddChromosomeNoLocation(expected_errors, entry);
13784  eval = validator.Validate(seh, options);
13785  CheckErrors(*eval, expected_errors);
13786 
13787  // error if genbank accession
13788  scope.RemoveTopLevelSeqEntry(seh);
13789  entry = unit_test_util::BuildGoodSeq();
13790  entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
       // Two gene features built the same way, differing only in locus.
13791  feat1 = unit_test_util::AddMiscFeature(entry);
13792  feat1->SetData().SetGene().SetLocus("locus1");
13793  feat2 = unit_test_util::AddMiscFeature(entry);
13794  feat2->SetData().SetGene().SetLocus("locus2");
13795  seh = scope.AddTopLevelSeqEntry(*entry);
13796  ChangeErrorAcc(expected_errors, "gb|AY123456|");
13797  eval = validator.Validate(seh, options);
13798  CheckErrors(*eval, expected_errors);
13799 
13801  expected_errors[0]->SetSeverity(eDiag_Warning);
13802  eval = validator.Validate(seh, options);
13803  CheckErrors(*eval, expected_errors);
13804 
13805  // warning if genes are partial
13807  feat1->SetPartial(true);
13808  feat1->SetLocation().SetPartialStart(true, eExtreme_Biological);
13809  feat2->SetPartial(true);
13810  feat2->SetLocation().SetPartialStart(true, eExtreme_Biological);
13811  expected_errors[0]->SetSeverity(eDiag_Warning);
13812  eval = validator.Validate(seh, options);
13813  CheckErrors(*eval, expected_errors);
13814 
13815  // warning if genes are pseudo
       // Partial state is undone first so only the pseudo flag differs from the base case.
13816  feat1->SetPartial(false);
13817  feat1->SetLocation().SetPartialStart(false, eExtreme_Biological);
13818  feat2->SetPartial(false);
13819  feat2->SetLocation().SetPartialStart(false, eExtreme_Biological);
13820  feat1->SetPseudo(true);
13821  feat2->SetPseudo(true);
13822  expected_errors[0]->SetSeverity(eDiag_Warning);
13823  eval = validator.Validate(seh, options);
13824  CheckErrors(*eval, expected_errors);
13825 
13826  // error if general ID
13827  scope.RemoveTopLevelSeqEntry(seh);
13828  entry = unit_test_util::BuildGoodSeq();
13829  entry->SetSeq().SetId().front()->SetGeneral().SetDb("abc");
13830  entry->SetSeq().SetId().front()->SetGeneral().SetTag().SetId(123456);
13831  feat1 = unit_test_util::AddMiscFeature(entry);
13832  feat1->SetData().SetGene().SetLocus("locus1");
13833  feat2 = unit_test_util::AddMiscFeature(entry);
13834  feat2->SetData().SetGene().SetLocus("locus2");
13835  seh = scope.AddTopLevelSeqEntry(*entry);
13836  ChangeErrorAcc(expected_errors, "gnl|abc|123456");
13837  eval = validator.Validate(seh, options);
13838  CheckErrors(*eval, expected_errors);
13839 
13841  expected_errors[0]->SetSeverity(eDiag_Warning);
13842  eval = validator.Validate(seh, options);
13843  CheckErrors(*eval, expected_errors);
13844 
13845  // warning if genes are partial
13847  feat1->SetPartial(true);
13848  feat1->SetLocation().SetPartialStart(true, eExtreme_Biological);
13849  feat2->SetPartial(true);
13850  feat2->SetLocation().SetPartialStart(true, eExtreme_Biological);
13851  expected_errors[0]->SetSeverity(eDiag_Warning);
13852  eval = validator.Validate(seh, options);
13853  CheckErrors(*eval, expected_errors);
13854 
13855  // warning if genes are pseudo
13856  feat1->SetPartial(false);
13857  feat1->SetLocation().SetPartialStart(false, eExtreme_Biological);
13858  feat2->SetPartial(false);
13859  feat2->SetLocation().SetPartialStart(false, eExtreme_Biological);
13860  feat1->SetPseudo(true);
13861  feat2->SetPseudo(true);
13862  expected_errors[0]->SetSeverity(eDiag_Warning);
13863  eval = validator.Validate(seh, options);
13864  CheckErrors(*eval, expected_errors);
13865 
13866 
13867  // always warning if on different annots
13868  scope.RemoveTopLevelSeqEntry(seh);
13869  entry = unit_test_util::BuildGoodSeq();
13870  feat1 = unit_test_util::AddMiscFeature(entry);
13871  CRef<CSeq_annot> annot2(new CSeq_annot());
       // feat2 is not re-created in the visible lines: the existing object is overwritten
       // with a copy of feat1 and then attached to a second annot on the new entry.
13872  feat2->Assign(*feat1);
13873  feat2->SetComment("a");
13874  annot2->SetData().SetFtable().push_back(feat2);
13875  entry->SetSeq().SetAnnot().push_back(annot2);
13876  seh = scope.AddTopLevelSeqEntry(*entry);
13877  ChangeErrorAcc(expected_errors, "lcl|good");
13878  expected_errors[0]->SetSeverity(eDiag_Warning);
13879  expected_errors[0]->SetErrMsg("Features have identical intervals, but labels differ (packaged in different feature table)");
13880  eval = validator.Validate(seh, options);
13881  CheckErrors(*eval, expected_errors);
13882 
13883  CLEAR_ERRORS
13884 }
13885 
13886 
// Test: gene cross-references. The first two validations run without any expectation
// being pushed in the visible code; the last one expects a Warning "UnnecessaryGeneXref"
// once a gene feature carries a gene xref to its own locus ("bar").
// NOTE(review): the declarations of `gene`, `feat1`, `gene2`, `entry`, `scope`, `seh` are not
// visible in this listing - presumably supplied by omitted setup lines; confirm.
13887 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UnnecessaryGeneXref)
13888 {
13892  gene->SetData().SetGene().SetLocus("foo");
13893  feat1->SetGeneXref().SetLocus("foo");
13894 
13896 
13897  eval = validator.Validate(seh, options);
13898  // AddChromosomeNoLocation(expected_errors, entry);
13899  CheckErrors(*eval, expected_errors);
13900 
13901  // now gene xref is necessary
13902  scope.RemoveTopLevelSeqEntry(seh);
13904  gene2->SetLocation().SetPartialStart(true, eExtreme_Biological);
13905  gene2->SetPartial(true);
13906  gene2->SetData().SetGene().SetLocus("bar");
13907  seh = scope.AddTopLevelSeqEntry(*entry);
13908  eval = validator.Validate(seh, options);
       // CLEAR_ERRORS runs before the check here, unlike the other checks in this test.
13909  CLEAR_ERRORS
13910  // AddChromosomeNoLocation(expected_errors, entry);
13911  CheckErrors(*eval, expected_errors);
13912 
13913  // error if gene references itself
       // Note: the expectation pushed below is a Warning, not an Error.
13914  gene2->SetGeneXref().SetLocus("bar");
13915  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnnecessaryGeneXref",
13916  "Gene feature has gene cross-reference"));
13917  eval = validator.Validate(seh, options);
13918  CheckErrors(*eval, expected_errors);
13919 
13920  CLEAR_ERRORS
13921 }
13922 
13923 
// Test: a code-break (transl_except) at nucleotide positions 4..6 on "nuc". Expected on
// lcl|nuc: Critical "CDSrange" and Warning "TranslExceptPhase". The test also checks the
// output of CValidErrorFormat::FormatCompleteSubmitterReport and that locus tags appear on
// the validator errors only when CValidator::eVal_collect_locus_tags is requested.
// NOTE(review): the declarations of `cds`, `nuc`, `objmgr`, `scope`, `seh` and the validator
// setup are not visible in this listing - presumably supplied by omitted lines; confirm.
13924 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TranslExceptPhase)
13925 {
       // Gene for the CDS, tagged "xyz"; that tag is what the report lines and the
       // locus-tag checks below look for.
13929  CRef<CSeq_feat> gene = MakeGeneForFeature(cds);
13930  gene->SetData().SetGene().SetLocus_tag("xyz");
13931  AddFeat(gene, nuc);
13932 
13933  CRef<CCode_break> codebreak(new CCode_break());
13934  codebreak->SetLoc().SetInt().SetId().SetLocal().SetStr("nuc");
13935  codebreak->SetLoc().SetInt().SetFrom(4);
13936  codebreak->SetLoc().SetInt().SetTo(6);
13937  cds->SetData().SetCdregion().SetCode_break().push_back(codebreak);
13938 
13940 
13941  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "CDSrange",
13942  "Code-break location not in coding region - may be frame problem"));
13943  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranslExceptPhase",
13944  "transl_except qual out of frame."));
13945  // AddChromosomeNoLocation(expected_errors, entry);
13946  eval = validator.Validate(seh, options);
13947  CheckErrors(*eval, expected_errors);
13948 
13949 
       // Expected submitter report, one string per output line after splitting on '\n'.
13950  CValidErrorFormat format(*objmgr);
13951  vector<string> expected;
13952  expected.push_back("TranslExceptPhase");
13953  expected.push_back("lcl|nuc:CDS\t fake protein name\tlcl|nuc:1-27\txyz");
13954  expected.push_back("");
13955  expected.push_back("CDSrange");
13956  expected.push_back("lcl|nuc:CDS\t fake protein name\tlcl|nuc:1-27\txyz");
13957  expected.push_back("");
13958  vector<string> seen;
13959  vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
       // Flatten each report entry into individual lines for comparison.
13960  for (const string& it : cat_list) {
13961  vector<string> sublist;
13962  NStr::Split(it, "\n", sublist);
13963  for (const string& sit : sublist) {
13964  seen.push_back(sit);
13965  }
13966  }
13967 
13968  CheckStrings(seen, expected);
13969 
13970 
13971  // only see locus tags when requested
13972  for (auto it : eval->GetErrs()) {
13973  BOOST_CHECK_EQUAL(it->IsSetLocus_tag(), false);
13974  }
13975 
       // Re-validate with locus-tag collection enabled.
13976  eval = validator.Validate(seh, options | CValidator::eVal_collect_locus_tags);
13977  CheckErrors(*eval, expected_errors);
13978  for (const auto& it : eval->GetErrs()) {
       // ChromosomeWithoutLocation errors are exempt from the locus-tag check.
13979  if (!NStr::Equal(it->GetErrCode(), "ChromosomeWithoutLocation")) {
13980  BOOST_CHECK_EQUAL(it->IsSetLocus_tag(), true);
13981  BOOST_CHECK_EQUAL(it->GetLocus_tag(), "xyz");
13982  }
13983  }
13984 
       // The report is expected to be identical to the first one.
13985  expected.clear();
13986  expected.push_back("TranslExceptPhase");
13987  expected.push_back("lcl|nuc:CDS\t fake protein name\tlcl|nuc:1-27\txyz");
13988  expected.push_back("");
13989  expected.push_back("CDSrange");
13990  expected.push_back("lcl|nuc:CDS\t fake protein name\tlcl|nuc:1-27\txyz");
13991  expected.push_back("");
13992 
13993  cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
13994  seen.clear();
13995  for (const string& it : cat_list) {
13996  vector<string> sublist;
13997  NStr::Split(it, "\n", sublist);
13998  for (const string& sit : sublist) {
13999  seen.push_back(sit);
14000  }
14001  }
14002 
14003  CheckStrings(seen, expected);
14004 
14005  CLEAR_ERRORS
14006 }
14007 
14008 
// Test: "TrnaCodonWrong" for a tRNA whose recognized codon (value 0, reported as UUU in the
// expected messages) does not match its amino acid. Amino acid 'A' is expected as an Error;
// 'U' and 'O' are expected as Warnings with their own messages.
// NOTE(review): the declaration of `trna` and the validator setup are not visible in this
// listing - presumably supplied by omitted setup lines; confirm.
14009 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TrnaCodonWrong)
14010 {
14013  trna->SetData().SetRna().SetType(CRNA_ref::eType_tRNA);
14014  trna->SetData().SetRna().SetExt().SetTRNA().SetCodon().push_back(0);
14015  trna->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('A');
14016 
14018 
14019  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TrnaCodonWrong",
14020  "Codon recognized by tRNA (UUU) does not match amino acid (A/Ala) specified by genetic code (1/Standard)"));
14021  // AddChromosomeNoLocation(expected_errors, entry);
14022  eval = validator.Validate(seh, options);
14023  CheckErrors(*eval, expected_errors);
14024 
14025  // drop to warning if aa is 'U' or 'O'
14026  trna->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('U');
14027  expected_errors[0]->SetSeverity(eDiag_Warning);
14028  expected_errors[0]->SetErrMsg("Codon recognized by tRNA (UUU) does not match amino acid (U/Sec) specified by genetic code (1/Standard)");
14029  eval = validator.Validate(seh, options);
14030  CheckErrors(*eval, expected_errors);
14031 
       // Severity stays at Warning from the previous step; only the message changes.
14032  trna->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('O');
14033  expected_errors[0]->SetErrMsg("Codon recognized by tRNA (UUU) does not match amino acid (O/Pyl) specified by genetic code (1/Standard)");
14034  eval = validator.Validate(seh, options);
14035  CheckErrors(*eval, expected_errors);
14036 
14037  CLEAR_ERRORS
14038 }
14039 
14040 
// Test for the BothStrands diagnostic across several feature types and
// strand combinations. Five scenarios are validated in sequence:
//   1. gene on a single interval with strand eNa_strand_both;
//   2. gene on a mix location (both + both_rev) flagged as trans-spliced;
//   3. gene on a mix location with both intervals on both_rev
//      (MultiIntervalGene is also expected here);
//   4. pseudo, trans-spliced mRNA on a mix location (both + both_rev) with a
//      gene feature added to the entry;
//   5. pseudo CDS on a mix location with both intervals on both_rev.
// Each scenario removes the entry from the scope, rebuilds it, and re-adds it
// before validating.
// NOTE(review): the declarations of entry, feat, gene, seh and eval are on
// lines not visible in this excerpt - confirm against the setup code.
14041 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BothStrands)
14042 {
14045  feat->SetData().SetGene().SetLocus("X");
14046  feat->SetLocation().SetInt().SetStrand(eNa_strand_both);
14047 
14049 
14050  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BothStrands",
14051  "gene may not be on both (forward) strands"));
14052  // AddChromosomeNoLocation(expected_errors, entry);
14053  eval = validator.Validate(seh, options);
14054  CheckErrors(*eval, expected_errors);
14055 
       // Scenario 2: first interval on "both", last interval on "both_rev".
14056  scope.RemoveTopLevelSeqEntry(seh);
14057  entry = unit_test_util::BuildGoodSeq();
14058  feat = unit_test_util::AddMiscFeature(entry);
14059  feat->SetData().SetGene().SetLocus("X");
14060  feat->SetLocation().Assign(*unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front()));
14061  feat->SetLocation().SetMix().Set().front()->SetInt().SetStrand(eNa_strand_both);
14062  feat->SetLocation().SetMix().Set().back()->SetInt().SetStrand(eNa_strand_both_rev);
14063  // set trans-splicing exception to prevent mixed-strand error
14064  feat->SetExcept(true);
14065  feat->SetExcept_text("trans-splicing");
14066  seh = scope.AddTopLevelSeqEntry(*entry);
14067  expected_errors[0]->SetErrMsg("gene may not be on both (forward and reverse) strands");
14068  eval = validator.Validate(seh, options);
14069  CheckErrors(*eval, expected_errors);
14070 
       // Scenario 3: both intervals on "both_rev"; no exception is set this time.
14071  scope.RemoveTopLevelSeqEntry(seh);
14072  entry = unit_test_util::BuildGoodSeq();
14073  feat = unit_test_util::AddMiscFeature(entry);
14074  feat->SetData().SetGene().SetLocus("X");
14075  feat->SetLocation().Assign(*unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front()));
14076  feat->SetLocation().SetMix().Set().front()->SetInt().SetStrand(eNa_strand_both_rev);
14077  feat->SetLocation().SetMix().Set().back()->SetInt().SetStrand(eNa_strand_both_rev);
14078  seh = scope.AddTopLevelSeqEntry(*entry);
14079  expected_errors[0]->SetErrMsg("gene may not be on both (reverse) strands");
14080  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultiIntervalGene",
14081  "Gene feature on non-segmented sequence should not have multiple intervals"));
14082  eval = validator.Validate(seh, options);
14083  CheckErrors(*eval, expected_errors);
14084 
14085  CLEAR_ERRORS
14086 
       // Scenario 4: mRNA feature on mixed "both"/"both_rev" intervals.
14087  scope.RemoveTopLevelSeqEntry(seh);
14088  entry = unit_test_util::BuildGoodSeq();
14089  feat = unit_test_util::AddMiscFeature(entry);
14090  feat->SetLocation().Assign(*unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front()));
14091  feat->SetLocation().SetMix().Set().front()->SetInt().SetStrand(eNa_strand_both);
14092  feat->SetLocation().SetMix().Set().back()->SetInt().SetStrand(eNa_strand_both_rev);
14093  feat->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
14094  feat->SetData().SetRna().SetExt().SetName("mRNA product");
       // NOTE(review): the line that creates `gene` is not visible in this excerpt.
14096  unit_test_util::AddFeat(gene, entry);
14097  // make pseudo to prevent splice errors
14098  feat->SetPseudo(true);
14099  // set trans-splicing exception to prevent mixed-strand error
14100  feat->SetExcept(true);
14101  feat->SetExcept_text("trans-splicing");
14102  seh = scope.AddTopLevelSeqEntry(*entry);
14103 
14104  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
14105  "No CDS location match for 1 mRNA"));
14106  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BothStrands",
14107  "mRNA may not be on both (forward and reverse) strands"));
14108  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "mRNAgeneRange",
14109  "gene [gene locus:lcl|good:1-57] overlaps mRNA but does not completely contain it"));
14110  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "StrandOther",
14111  "Strand 'other' in location"));
14112  // AddChromosomeNoLocation(expected_errors, entry);
14113 
14114  eval = validator.Validate(seh, options);
14115  CheckErrors(*eval, expected_errors);
14116 
14117  CLEAR_ERRORS
14118 
       // Scenario 5: pseudo CDS with both intervals on "both_rev".
14119  scope.RemoveTopLevelSeqEntry(seh);
14120  entry = unit_test_util::BuildGoodSeq();
14121  feat = unit_test_util::AddMiscFeature(entry);
14122  feat->SetLocation().Assign(*unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front()));
14123  feat->SetLocation().SetMix().Set().front()->SetInt().SetStrand(eNa_strand_both_rev);
14124  feat->SetLocation().SetMix().Set().back()->SetInt().SetStrand(eNa_strand_both_rev);
14125  feat->SetPseudo(true);
14126  feat->SetData().SetCdregion();
14127  seh = scope.AddTopLevelSeqEntry(*entry);
14128  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BothStrands",
14129  "CDS may not be on both (reverse) strands"));
14130  // AddChromosomeNoLocation(expected_errors, entry);
14131  eval = validator.Validate(seh, options);
14132  CheckErrors(*eval, expected_errors);
14133 
14134  CLEAR_ERRORS
14135 }
14136 
// Test for the CDSmRNArange diagnostic.
// Steps, in order:
//   1. CDS and mRNA both get mix locations, but the mRNA's first interval is
//      shortened to end at 17; three warnings are expected, including
//      CDSmRNArange ("internal intron-exon boundaries do not match").
//   2. With the CDS exception "ribosomal slippage", validation runs after
//      CLEAR_ERRORS with nothing pushed (presumably no diagnostics are
//      expected - the macro definition is not visible here; confirm).
//   3. With the CDS exception "trans-splicing", only CDSwithNoMRNA and
//      CDSmRNAMismatchLocation are expected.
//   4. A rebuilt entry whose mRNA first interval ends at 12 additionally
//      expects CDSmRNArange ("does not completely contain intervals").
// NOTE(review): the declarations of entry, cds, seh and eval are on lines not
// visible in this excerpt - confirm against the setup code.
14137 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSmRNArange)
14138 {
14140  CRef<CSeq_entry> nuc_seq = entry->SetSet().SetSeq_set().front();
14143  cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc_seq->SetSeq().SetId().front()));
14144  CRef<CSeq_feat> mrna(new CSeq_feat());
14145  mrna->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc_seq->SetSeq().SetId().front()));
14146  mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
14147  mrna->SetData().SetRna().SetExt().SetName("mRNA product");
14148  mrna->SetLocation().SetMix().Set().front()->SetInt().SetTo(17);
       // Bases right after the shortened mRNA interval are set to "GT"
       // (presumably to form a splice donor site - confirm).
14149  nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[18] = 'G';
14150  nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[19] = 'T';
14151  unit_test_util::AddFeat(mrna, entry->SetSet().SetSeq_set().front());
14152 
14154 
14155  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSwithNoMRNA",
14156  "Unmatched CDS"));
14157  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSmRNAMismatchLocation",
14158  "No CDS location match for 1 mRNA"));
14159  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSmRNArange",
14160  "mRNA contains CDS but internal intron-exon boundaries do not match"));
14161  // AddChromosomeNoLocation(expected_errors, entry);
14162  eval = validator.Validate(seh, options);
14163  CheckErrors(*eval, expected_errors);
14164 
14165  // turn off error for ribosomal slippage and trans-splicing
14166  CLEAR_ERRORS
14167  scope.RemoveTopLevelSeqEntry(seh);
14168  cds->SetExcept(true);
14169  cds->SetExcept_text("ribosomal slippage");
14170  nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[16] = 'A';
14171  seh = scope.AddTopLevelSeqEntry(*entry);
14172  // AddChromosomeNoLocation(expected_errors, entry);
14173  eval = validator.Validate(seh, options);
14174  CheckErrors(*eval, expected_errors);
14175 
       // Same entry (not re-added to the scope), exception text switched to trans-splicing.
14176  cds->SetExcept_text("trans-splicing");
14177  nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[16] = 'G';
14178  eval = validator.Validate(seh, options);
14179  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSwithNoMRNA",
14180  "Unmatched CDS"));
14181  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSmRNAMismatchLocation",
14182  "No CDS location match for 1 mRNA"));
14183  CheckErrors(*eval, expected_errors);
14184 
14185  // overlap problem rather than internal boundary problem
       // The two expectations pushed above are kept; a third is appended below.
14186  scope.RemoveTopLevelSeqEntry(seh);
14188  nuc_seq = entry->SetSet().SetSeq_set().front();
14191  cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc_seq->SetSeq().SetId().front()));
14192  mrna = new CSeq_feat();
14193  mrna->SetLocation().Assign(cds->GetLocation());
14194  mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
14195  mrna->SetData().SetRna().SetExt().SetName("mRNA product");
14196  mrna->SetLocation().SetMix().Set().front()->SetInt().SetTo(12);
14197  nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[13] = 'G';
14198  nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[14] = 'T';
14199  unit_test_util::AddFeat(mrna, entry->SetSet().SetSeq_set().front());
14200  CRef<CSeq_entry> prot_seq = entry->SetSet().SetSeq_set().back();
14201  prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set()[4] = 'S';
14202  seh = scope.AddTopLevelSeqEntry(*entry);
14203  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSmRNArange",
14204  "mRNA overlaps or contains CDS but does not completely contain intervals"));
14205  eval = validator.Validate(seh, options);
14206  CheckErrors(*eval, expected_errors);
14207  CLEAR_ERRORS
14208 }
14209 
// Test for the OverlappingPeptideFeat diagnostic.
//   1. Two protein features p1 and p2, both processed as signal peptides, are
//      expected to yield two warnings on lcl|prot (the message names the
//      parent CDS on lcl|nuc).
//   2. On a different entry, a mature peptide and a transit peptide are
//      expected to yield two warnings on lcl|good (no parent CDS in the
//      message).
//   3. After p1 gets the exception "alternative processing", validation runs
//      with nothing pushed after CLEAR_ERRORS (presumably no diagnostics are
//      expected - confirm against the macro definition).
// NOTE(review): the lines that build entry and create p1/p2 in step 1 are not
// visible in this excerpt.
14210 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_OverlappingPeptideFeat)
14211 {
14213  CRef<CSeq_entry> prot_seq = entry->SetSet().SetSeq_set().back();
14215  p1->SetData().SetProt().SetProcessed(CProt_ref::eProcessed_signal_peptide);
14216  p1->SetData().SetProt().SetName().push_back("unnamed");
14218  p2->SetData().SetProt().SetProcessed(CProt_ref::eProcessed_signal_peptide);
14219  p2->SetData().SetProt().SetName().push_back("unnamed");
14220 
14222 
       // One warning is expected per overlapping peptide feature.
14223  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "OverlappingPeptideFeat",
14224  "Signal, Transit, or Mature peptide features overlap (parent CDS is on lcl|nuc)"));
14225  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "OverlappingPeptideFeat",
14226  "Signal, Transit, or Mature peptide features overlap (parent CDS is on lcl|nuc)"));
14227  // AddChromosomeNoLocation(expected_errors, entry);
14228  eval = validator.Validate(seh, options);
14229  CheckErrors(*eval, expected_errors);
14230 
14231  CLEAR_ERRORS
14232 
14233  scope.RemoveTopLevelSeqEntry(seh);
       // NOTE(review): the second argument of AddMiscFeature is presumably the
       // feature's end position - confirm in unit_test_util.
14235  p1 = unit_test_util::AddMiscFeature(entry, 4);
14236  p1->SetData().SetProt().SetProcessed(CProt_ref::eProcessed_mature);
14237  p1->SetData().SetProt().SetName().push_back("unnamed");
14238  p2 = unit_test_util::AddMiscFeature(entry, 5);
14239  p2->SetData().SetProt().SetProcessed(CProt_ref::eProcessed_transit_peptide);
14240  p2->SetData().SetProt().SetName().push_back("unnamed");
14241  seh = scope.AddTopLevelSeqEntry(*entry);
14242  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OverlappingPeptideFeat",
14243  "Signal, Transit, or Mature peptide features overlap"));
14244  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OverlappingPeptideFeat",
14245  "Signal, Transit, or Mature peptide features overlap"));
14246  // AddChromosomeNoLocation(expected_errors, entry);
14247  eval = validator.Validate(seh, options);
14248  CheckErrors(*eval, expected_errors);
14249 
14250  CLEAR_ERRORS
14251 
14252  //no error if peptide exceptions
14253  p1->SetExcept(true);
14254  p1->SetExcept_text("alternative processing");
14255  // AddChromosomeNoLocation(expected_errors, entry);
14256  eval = validator.Validate(seh, options);
14257  CheckErrors(*eval, expected_errors);
14258 
14259  CLEAR_ERRORS
14260 }
14261 
14262 
// Test for the SerialInComment diagnostic.
// A feature comment containing a bracketed number ("[123456]") is expected to
// produce one Info-level message on lcl|good.
// NOTE(review): the lines that build entry and create `misc` are not visible
// in this excerpt.
14263 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SerialInComment)
14264 {
14267  misc->SetComment("blah blah [123456]");
14268 
14270 
14271  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "SerialInComment",
14272  "Feature comment may refer to reference by serial number - attach reference specific comments to the reference REMARK instead."));
14273  // AddChromosomeNoLocation(expected_errors, entry);
14274  eval = validator.Validate(seh, options);
14275  CheckErrors(*eval, expected_errors);
14276 
14277  CLEAR_ERRORS
14278 }
14279 
14280 
// Test for the MultipleCDSproducts diagnostic.
// A second coding region (cds2, interval 30..56) is given a product that is
// the first ID of the last Bioseq in the set; the validator is expected to
// report one Critical message on lcl|nuc.
// NOTE(review): the lines that build entry and create cds2 are not visible in
// this excerpt; the existing CDS presumably already uses that product.
14281 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MultipleCDSproducts)
14282 {
14285  cds2->SetData().SetCdregion();
14286  cds2->SetProduct().SetWhole().Assign(*(entry->SetSet().SetSeq_set().back()->SetSeq().SetId().front()));
14287  cds2->SetLocation().SetInt().SetFrom(30);
14288  cds2->SetLocation().SetInt().SetTo(56);
14289 
14291 
14292  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "MultipleCDSproducts",
14293  "Same product Bioseq from multiple CDS features"));
14294  // AddChromosomeNoLocation(expected_errors, entry);
14295  eval = validator.Validate(seh, options);
14296  CheckErrors(*eval, expected_errors);
14297 
14298  CLEAR_ERRORS
14299 }
14300 
14301 
// Test for the FocusOnBioSourceFeature diagnostic.
// The focus flag is set on a BioSource feature (src), and
// unit_test_util::SetFocus is also applied to the entry; one Error is expected
// on lcl|good.
// NOTE(review): the lines that build entry and create `src` are not visible
// in this excerpt.
14302 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FocusOnBioSourceFeature)
14303 {
14306  src->SetData().SetBiosrc().SetIs_focus();
14307  unit_test_util::SetFocus(entry);
14308 
14310 
14311  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "FocusOnBioSourceFeature",
14312  "Focus must be on BioSource descriptor, not BioSource feature."));
14313  // AddChromosomeNoLocation(expected_errors, entry);
14314  eval = validator.Validate(seh, options);
14315  CheckErrors(*eval, expected_errors);
14316 
14317  CLEAR_ERRORS
14318 }
14319 
14320 
// Test for the PeptideFeatOutOfFrame diagnostic.
// A "sig_peptide" import feature is placed on the nucleotide sequence three
// times with different extents; PeptideFeatureLacksCDS is expected every time,
// and the second expected message changes between "Stop", "Start", and
// "Start and stop ... are out of frame".
// NOTE(review): the lines that (re)build entry before each scenario are not
// visible in this excerpt; the second argument of AddMiscFeature is presumably
// the feature's end position - confirm in unit_test_util.
14321 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PeptideFeatOutOfFrame)
14322 {
14324  CRef<CSeq_entry> nuc_seq = entry->SetSet().SetSeq_set().front();
14325  CRef<CSeq_feat> peptide = unit_test_util::AddMiscFeature(nuc_seq, 6);
14326  peptide->SetData().SetImp().SetKey("sig_peptide");
14327 
14329 
14330  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PeptideFeatureLacksCDS",
14331  "Peptide processing feature should be converted to the appropriate protein feature subtype"));
14332  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PeptideFeatOutOfFrame",
14333  "Stop of sig_peptide is out of frame with CDS codons"));
14334  // AddChromosomeNoLocation(expected_errors, entry);
14335  eval = validator.Validate(seh, options);
14336  CheckErrors(*eval, expected_errors);
14337 
       // Second scenario: start moved to 1; only the start is expected out of frame.
14338  scope.RemoveTopLevelSeqEntry(seh);
14340  nuc_seq = entry->SetSet().SetSeq_set().front();
14341  peptide = unit_test_util::AddMiscFeature(nuc_seq, 5);
14342  peptide->SetLocation().SetInt().SetFrom(1);
14343  peptide->SetData().SetImp().SetKey("sig_peptide");
14344  seh = scope.AddTopLevelSeqEntry(*entry);
14345  expected_errors[1]->SetErrMsg("Start of sig_peptide is out of frame with CDS codons");
14346  eval = validator.Validate(seh, options);
14347  CheckErrors(*eval, expected_errors);
14348 
       // Third scenario: both ends are expected out of frame.
14349  scope.RemoveTopLevelSeqEntry(seh);
14351  nuc_seq = entry->SetSet().SetSeq_set().front();
14352  peptide = unit_test_util::AddMiscFeature(nuc_seq, 6);
14353  peptide->SetLocation().SetInt().SetFrom(1);
14354  peptide->SetData().SetImp().SetKey("sig_peptide");
14355  seh = scope.AddTopLevelSeqEntry(*entry);
14356  expected_errors[1]->SetErrMsg("Start and stop of sig_peptide are out of frame with CDS codons");
14357  eval = validator.Validate(seh, options);
14358  CheckErrors(*eval, expected_errors);
14359 
14360  CLEAR_ERRORS
14361 }
14362 
14363 
// Test for the CDSgeneRange diagnostic.
//   1. A gene whose location starts at position 1 is expected to trigger the
//      warning (the message reports the gene as 2-27, so messages appear to
//      use 1-based coordinates).
//   2. The same warning is expected after gene and CDS are cross-referenced
//      through local feature IDs 1 and 2.
//   3. VR-821: the CDS gets a mix location (0..8, 21..26) and the gene gets the
//      same intervals in reverse order; MultiIntervalGene, CDSgeneRange and
//      SeqLocOrder are expected.
//   4. With the CDS exception "trans-splicing", CDSgeneRange is no longer
//      expected while the other two remain.
// NOTE(review): the declarations of entry, gene, cds, nuc, prot and prot_feat
// are on lines not visible in this excerpt.
14364 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSgeneRange)
14365 {
14369  gene->SetLocation().SetInt().SetFrom(1);
14370  unit_test_util::AddFeat(gene, entry->SetSet().SetSeq_set().front());
14371 
14373 
14374  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSgeneRange",
14375  "gene [gene locus:lcl|nuc:2-27] overlaps CDS but does not completely contain it"));
14376  // AddChromosomeNoLocation(expected_errors, entry);
14377  eval = validator.Validate(seh, options);
14378  CheckErrors(*eval, expected_errors);
14379 
14380  CLEAR_ERRORS
14381 
       // Link gene (id 1) and CDS (id 2) to each other via feature xrefs.
14382  scope.RemoveTopLevelSeqEntry(seh);
14383  gene->SetId().SetLocal().SetId(1);
14384  cds->SetId().SetLocal().SetId(2);
14385  CRef<CSeqFeatXref> gene_xref(new CSeqFeatXref());
14386  gene_xref->SetId().SetLocal().SetId(1);
14387  cds->SetXref().push_back(gene_xref);
14388  CRef<CSeqFeatXref> cds_xref(new CSeqFeatXref());
14389  cds_xref->SetId().SetLocal().SetId(2);
14390  gene->SetXref().push_back(cds_xref);
14391 
14392  seh = scope.AddTopLevelSeqEntry(*entry);
14393 
14394  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSgeneRange",
14395  "gene [gene locus:lcl|nuc:2-27] overlaps CDS but does not completely contain it"));
14396  // AddChromosomeNoLocation(expected_errors, entry);
14397  eval = validator.Validate(seh, options);
14398  CheckErrors(*eval, expected_errors);
14399 
14400  CLEAR_ERRORS
14401 
14402  // for VR-821
14403  scope.RemoveTopLevelSeqEntry(seh);
14406 
       // cl1/cl2 are the CDS intervals; gl1/gl2 are copies in swapped order for the gene.
14407  CRef<CSeq_loc> cl1(new CSeq_loc());
14408  cl1->SetInt().SetFrom(0);
14409  cl1->SetInt().SetTo(8);
14410  cl1->SetInt().SetId().Assign(*(nuc->GetSeq().GetId().front()));
14411  CRef<CSeq_loc> cl2(new CSeq_loc());
14412  cl2->SetInt().SetFrom(21);
14413  cl2->SetInt().SetTo(26);
14414  cl2->SetInt().SetId().Assign(*(nuc->GetSeq().GetId().front()));
14415  CRef<CSeq_loc> gl1(new CSeq_loc());
14416  gl1->Assign(*cl2);
14417  CRef<CSeq_loc> gl2(new CSeq_loc());
14418  gl2->Assign(*cl1);
14419 
14420  cds->SetLocation().SetMix().Set().push_back(cl1);
14421  cds->SetLocation().SetMix().Set().push_back(cl2);
14422 
14423  gene->SetLocation().SetMix().Set().push_back(gl1);
14424  gene->SetLocation().SetMix().Set().push_back(gl2);
14425 
14426  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAGTAACAGAGAAGAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
14427 
       // Protein sequence, its length, and the protein feature end are updated together.
14428  prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPRN");
14429  prot->SetSeq().SetInst().SetLength(4);
14431  prot_feat->SetLocation().SetInt().SetTo(3);
14432 
14433  seh = scope.AddTopLevelSeqEntry(*entry);
14434 
14435  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning,
14436  "MultiIntervalGene", "Gene feature on non-segmented sequence should not have multiple intervals"));
14437  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning,
14438  "CDSgeneRange", "gene [gene locus:[lcl|nuc:22-27, 1-9]] overlaps CDS but does not completely contain it"));
14439  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error,
14440  "SeqLocOrder", "Location: Intervals out of order in SeqLoc [(lcl|nuc:22-27, 1-9)]"));
14441  // AddChromosomeNoLocation(expected_errors, entry);
14442  eval = validator.Validate(seh, options);
14443  CheckErrors(*eval, expected_errors);
14444 
14445  CLEAR_ERRORS
14446 
14447  // no CDSGeneRange error if trans-spliced
14448  cds->SetExcept(true);
14449  cds->SetExcept_text("trans-splicing");
14450 
14451  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning,
14452  "MultiIntervalGene", "Gene feature on non-segmented sequence should not have multiple intervals"));
14453  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error,
14454  "SeqLocOrder", "Location: Intervals out of order in SeqLoc [(lcl|nuc:22-27, 1-9)]"));
14455  // AddChromosomeNoLocation(expected_errors, entry);
14456  eval = validator.Validate(seh, options);
14457  CheckErrors(*eval, expected_errors);
14458 
14459  CLEAR_ERRORS
14460 }
14461 
14462 
// Test for diagnostics raised when several mRNA features share one product.
// An additional mRNA feature is given the product lcl|nuc; four diagnostics
// are expected on lcl|good: FeatureProductInconsistency,
// CDSmRNAMismatchLocation, TranscriptLen and IdenticalMRNAtranscriptIDs.
// NOTE(review): the lines that build entry and create `feat` are not visible
// in this excerpt; `contig` is not used in the visible lines.
14463 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MultipleMRNAproducts)
14464 {
14466  CRef<CSeq_entry> contig = entry->SetSet().SetSeq_set().front();
14468  feat->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
14469  feat->SetData().SetRna().SetExt().SetName("fake protein name");
14470  feat->SetProduct().SetWhole().SetLocal().SetStr("nuc");
14471 
14473 
14474  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatureProductInconsistency",
14475  "mRNA products are not unique"));
14476  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
14477  "No CDS location match for 1 mRNA"));
14478  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TranscriptLen",
14479  "Transcript length [11] less than product length [27], and tail < 95% polyA"));
14480  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "IdenticalMRNAtranscriptIDs",
14481  "Identical transcript IDs found on multiple mRNAs"));
14482  // AddChromosomeNoLocation(expected_errors, entry);
14483  eval = validator.Validate(seh, options);
14484  CheckErrors(*eval, expected_errors);
14485 
14486  CLEAR_ERRORS
14487 }
14488 
14489 
// Test for the mRNAgeneRange diagnostic.
//   1. A gene starting at 5 and an mRNA ending at 10 are expected to yield
//      mRNAgeneRange plus CDSmRNAMismatchLocation.
//   2. After an additional gene ("locus2", ending at 10) is added, only
//      CDSmRNAMismatchLocation is expected.
//   3. The same expectation holds after that extra feature is turned into an
//      "operon" import feature.
// NOTE(review): the declarations of entry, gene, mrna and overlap are on lines
// not visible in this excerpt.
14490 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_mRNAgeneRange)
14491 {
14494  gene->SetData().SetGene().SetLocus("locus");
14495  gene->SetLocation().SetInt().SetFrom(5);
14497  mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
14498  mrna->SetLocation().SetInt().SetTo(10);
14499 
14501 
14502  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "mRNAgeneRange",
14503  "gene [locus:lcl|good:6-11] overlaps mRNA but does not completely contain it"));
14504  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
14505  "No CDS location match for 1 mRNA"));
14506  // AddChromosomeNoLocation(expected_errors, entry);
14507  eval = validator.Validate(seh, options);
14508  CheckErrors(*eval, expected_errors);
14509 
14510  CLEAR_ERRORS
14511 
14512  // if there is an overlapping gene or operon, error is suppressed
14513  scope.RemoveTopLevelSeqEntry(seh);
14515  overlap->SetData().SetGene().SetLocus("locus2");
14516  overlap->SetLocation().SetInt().SetTo(10);
14517  seh = scope.AddTopLevelSeqEntry(*entry);
       // Validation runs first; the expectation is registered before CheckErrors.
14518  eval = validator.Validate(seh, options);
14519  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
14520  "No CDS location match for 1 mRNA"));
14521  // AddChromosomeNoLocation(expected_errors, entry);
14522  CheckErrors(*eval, expected_errors);
14523 
       // Same expectation list is reused for the operon variant.
14524  scope.RemoveTopLevelSeqEntry(seh);
14525  overlap->SetData().SetImp().SetKey("operon");
14526  overlap->AddQualifier("operon", "operon name");
14527  seh = scope.AddTopLevelSeqEntry(*entry);
14528  eval = validator.Validate(seh, options);
14529  CheckErrors(*eval, expected_errors);
14530 
14531  CLEAR_ERRORS
14532 }
14533 
// Test for the TranscriptLen / PolyATail diagnostics.
// The last feature in the contig's first annotation (used as the mRNA) has its
// interval end changed three times:
//   - To = 10: TranscriptLen Error ("[11] less than product length [27]")
//     plus three CDS/mRNA matching warnings;
//   - To = 25: the fourth expectation becomes PolyATail at Info severity;
//   - To = 37: a single TranscriptLen Error ("[38] greater than ...").
// The reported transcript lengths equal To + 1, consistent with an interval
// starting at 0 (the start is not set in the visible lines - confirm).
// NOTE(review): the lines that build entry and add it to the scope are not
// visible in this excerpt.
14534 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TranscriptLen)
14535 {
14537  CRef<CSeq_entry> contig = entry->SetSet().SetSeq_set().front();
14538  CRef<CSeq_feat> mrna = contig->SetSeq().SetAnnot().front()->SetData().SetFtable().back();
14539  mrna->SetLocation().SetInt().SetTo(10);
14540 
14542  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSwithNoMRNA",
14543  "Unmatched CDS"));
14544  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
14545  "No CDS location match for 1 mRNA"));
14546  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNArange",
14547  "mRNA overlaps or contains CDS but does not completely contain intervals"));
14548  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TranscriptLen",
14549  "Transcript length [11] less than product length [27], and tail < 95% polyA"));
14550  // AddChromosomeNoLocation(expected_errors, entry);
14551  eval = validator.Validate(seh, options);
14552  CheckErrors(*eval, expected_errors);
14553  // allow for polyA tail
14554  scope.RemoveTopLevelSeqEntry(seh);
14555  mrna->SetLocation().SetInt().SetTo(25);
14556  seh = scope.AddTopLevelSeqEntry(*entry);
       // Only the fourth expectation (index 3) is rewritten; the first three stay.
14557  expected_errors[3]->SetErrCode("PolyATail");
14558  expected_errors[3]->SetSeverity(eDiag_Info);
14559  expected_errors[3]->SetErrMsg("Transcript length [26] less than product length [27], but tail is 100% polyA");
14560  eval = validator.Validate(seh, options);
14561  CheckErrors(*eval, expected_errors);
14562 
14563  CLEAR_ERRORS
14564 
       // mRNA longer than its product.
14565  scope.RemoveTopLevelSeqEntry(seh);
14566  mrna->SetLocation().SetInt().SetTo(37);
14567  seh = scope.AddTopLevelSeqEntry(*entry);
14568  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TranscriptLen",
14569  "Transcript length [38] greater than product length [27]"));
14570  // AddChromosomeNoLocation(expected_errors, entry);
14571  eval = validator.Validate(seh, options);
14572  CheckErrors(*eval, expected_errors);
14573  CLEAR_ERRORS
14574 }
14575 
// Test for the TranscriptMismatches diagnostic.
// The mRNA product sequence is replaced with a 27-base sequence, and one Error
// reporting "1 mismatches out of 27 bases" is expected. After the mRNA feature
// gets the exception "mismatches in transcription", validation runs with
// nothing pushed after CLEAR_ERRORS (presumably no diagnostics are expected -
// confirm against the macro definition).
// NOTE(review): the lines that build entry and obtain mrna_seq are not visible
// in this excerpt.
14576 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TranscriptMismatches)
14577 {
14581  mrna_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATAAATTAA");
14582 
14584 
14585  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TranscriptMismatches",
14586  "There are 1 mismatches out of 27 bases between the transcript and product sequence"));
14587  // AddChromosomeNoLocation(expected_errors, entry);
14588  eval = validator.Validate(seh, options);
14589  CheckErrors(*eval, expected_errors);
14590 
14591  CLEAR_ERRORS
14592 
14593  // suppress error if exception
14594  scope.RemoveTopLevelSeqEntry(seh);
14595  CRef<CSeq_entry> contig = entry->SetSet().SetSeq_set().front();
       // The mRNA is taken to be the last feature of the contig's first annotation.
14596  CRef<CSeq_feat> mrna = contig->SetSeq().SetAnnot().front()->SetData().SetFtable().back();
14597  mrna->SetExcept(true);
14598  mrna->SetExcept_text("mismatches in transcription");
14599  seh = scope.AddTopLevelSeqEntry(*entry);
14600  // AddChromosomeNoLocation(expected_errors, entry);
14601  eval = validator.Validate(seh, options);
14602  CheckErrors(*eval, expected_errors);
14603 
14604  CLEAR_ERRORS
14605 }
14606 
14607 
// Test for the CDSproductPackagingProblem diagnostic.
// Two diagnostics are expected on lcl|nuc: CDSproductPackagingProblem (Error)
// and FeaturePackagingProblem (Critical, "1 mispackaged feature").
// NOTE(review): the lines that construct the mispackaged entry and add it to
// the scope are not visible in this excerpt, so the exact setup cannot be
// described here.
14608 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSproductPackagingProblem)
14609 {
14612 
14614 
14615  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "CDSproductPackagingProblem",
14616  "Protein product not packaged in nuc-prot set with nucleotide"));
14617  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "FeaturePackagingProblem",
14618  "There is 1 mispackaged feature in this record."));
14619  // AddChromosomeNoLocation(expected_errors, "lcl|nuc");
14620  eval = validator.Validate(seh, options);
14621  CheckErrors(*eval, expected_errors);
14622 
14623  CLEAR_ERRORS
14624 }
14625 
14626 
// Test for duplicate-interval diagnostics.
//   1. A tRNA anticodon mix location whose two intervals are both 8..10 is
//      expected to yield tRNArange and DuplicateAnticodonInterval warnings.
//   2. A feature mix location whose last interval is set to 0..15 is expected
//      to yield a DuplicateExonInterval Error (the first interval produced by
//      MakeMixLoc is presumably also 0..15 - confirm in unit_test_util).
// NOTE(review): the lines that build the first entry, create `feat`, and add
// the entry to the scope are not visible in this excerpt.
14627 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_DuplicateInterval)
14628 {
14629  // error for duplicate in tRNA anticodon location
14631  CRef<CSeq_feat> trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
14632  CRef<CSeq_loc> anticodon_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
       // Make both anticodon intervals identical (8..10).
14633  anticodon_loc->SetMix().Set().front()->SetInt().SetFrom(8);
14634  anticodon_loc->SetMix().Set().front()->SetInt().SetTo(10);
14635  anticodon_loc->SetMix().Set().back()->SetInt().SetFrom(8);
14636  anticodon_loc->SetMix().Set().back()->SetInt().SetTo(10);
14637  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().Assign(*anticodon_loc);
14638  unit_test_util::AddFeat(trna, entry);
14639 
14641 
14642  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "tRNArange",
14643  "Anticodon is not 3 bases in length"));
14644  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "DuplicateAnticodonInterval",
14645  "Duplicate anticodon exons in location"));
14646  // AddChromosomeNoLocation(expected_errors, entry);
14647  eval = validator.Validate(seh, options);
14648  CheckErrors(*eval, expected_errors);
14649 
14650  CLEAR_ERRORS
14651 
14652  // different error for feature location
14653  scope.RemoveTopLevelSeqEntry(seh);
14654  entry = unit_test_util::BuildGoodSeq();
14656  CRef<CSeq_loc> loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
14657  loc->SetMix().Set().back()->SetInt().SetFrom(0);
14658  loc->SetMix().Set().back()->SetInt().SetTo(15);
14659  feat->SetLocation().Assign(*loc);
14660 
14661  seh = scope.AddTopLevelSeqEntry(*entry);
14662  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "DuplicateExonInterval",
14663  "Duplicate exons in location"));
14664  // AddChromosomeNoLocation(expected_errors, entry);
14665  eval = validator.Validate(seh, options);
14666  CheckErrors(*eval, expected_errors);
14667 
14668 
14669  CLEAR_ERRORS
14670 }
14671 
14672 
// Test for the PolyAsiteNotPoint diagnostic.
// An import feature keyed "polyA_site" is expected to yield one Warning on
// lcl|good. After its location is replaced by a single point (position 5 on
// local ID "good"), validation runs with nothing pushed after CLEAR_ERRORS.
// NOTE(review): the lines that build entry and create `feat` are not visible
// in this excerpt; the original location is presumably an interval.
14673 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PolyAsiteNotPoint)
14674 {
14677  feat->SetData().SetImp().SetKey("polyA_site");
14678 
14680 
14681  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PolyAsiteNotPoint",
14682  "PolyA_site should be a single point"));
14683  // AddChromosomeNoLocation(expected_errors, entry);
14684  eval = validator.Validate(seh, options);
14685  CheckErrors(*eval, expected_errors);
14686 
14687  CLEAR_ERRORS
14688  // error should go away if feature location is single point
14689  feat->SetLocation().SetPnt().SetId().SetLocal().SetStr("good");
14690  feat->SetLocation().SetPnt().SetPoint(5);
14691 
14692  // AddChromosomeNoLocation(expected_errors, entry);
14693  eval = validator.Validate(seh, options);
14694  CheckErrors(*eval, expected_errors);
14695 
14696  CLEAR_ERRORS
14697 }
14698 
14699 
// Test for the ImpFeatBadLoc diagnostic.
// The import feature's textual loc is first set to "one-of three" (expected
// message about obsolete 'one-of' text), then to "5..12" (expected message
// that the loc does not equal the feature location 1..11). Both are expected
// as a single Error on lcl|good.
// NOTE(review): the lines that build entry and create `feat` are not visible
// in this excerpt.
14700 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ImpFeatBadLoc)
14701 {
14704  feat->SetData().SetImp().SetLoc("one-of three");
14705 
14707 
14708  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ImpFeatBadLoc",
14709  "ImpFeat loc one-of three has obsolete 'one-of' text for feature misc_feature"));
14710  // AddChromosomeNoLocation(expected_errors, entry);
14711  eval = validator.Validate(seh, options);
14712  CheckErrors(*eval, expected_errors);
14713 
       // Same expectation object is reused; only its message is replaced.
14714  feat->SetData().SetImp().SetLoc("5..12");
14715  expected_errors[0]->SetErrMsg("ImpFeat loc 5..12 does not equal feature location 1..11 for feature misc_feature");
14716  eval = validator.Validate(seh, options);
14717  CheckErrors(*eval, expected_errors);
14718 
14719  CLEAR_ERRORS
14720 }
14721 
14722 
// Test for the UnnecessaryCitPubEquiv diagnostic.
// Two publications are attached to the feature's citation set; the second one
// is set to a Pub-equiv, and one Warning is expected on lcl|good.
// NOTE(review): the lines that build entry, create `feat`, and populate the
// first `pub` are not visible in this excerpt.
14723 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UnnecessaryCitPubEquiv)
14724 {
14727  CRef<CPub> pub(new CPub());
14729  feat->SetCit().SetPub().push_back(pub);
14730  CRef<CPub> pub2(new CPub());
14731  pub2->SetEquiv();
14732  feat->SetCit().SetPub().push_back(pub2);
14734 
14735  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnnecessaryCitPubEquiv",
14736  "Citation on feature has unexpected internal Pub-equiv"));
14737  // AddChromosomeNoLocation(expected_errors, entry);
14738  eval = validator.Validate(seh, options);
14739  CheckErrors(*eval, expected_errors);
14740 
14741  CLEAR_ERRORS
14742 }
14743 
14744 
// Test for the ImpCDShasTranslation diagnostic.
// A pseudo import feature keyed "CDS" with a "translation" qualifier is
// expected to yield one Error on lcl|good.
// NOTE(review): the lines that build entry and create `feat` are not visible
// in this excerpt.
14745 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ImpCDShasTranslation)
14746 {
14749  feat->SetPseudo(true);
14750  feat->SetData().SetImp().SetKey("CDS");
14751  feat->AddQualifier("translation", "unexpected translation");
14752 
14754 
14755  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ImpCDShasTranslation",
14756  "ImpFeat CDS with /translation found"));
14757  // AddChromosomeNoLocation(expected_errors, entry);
14758  eval = validator.Validate(seh, options);
14759  CheckErrors(*eval, expected_errors);
14760 
14761  CLEAR_ERRORS
14762 }
14763 
14764 
// Test for the ImpCDSnotPseudo diagnostic.
// An import feature keyed "CDS" that is not marked pseudo is expected to yield
// one Info message on lcl|good. After a pseudo gene is added to the entry,
// validation runs with nothing pushed after CLEAR_ERRORS (presumably no
// diagnostics are expected - confirm against the macro definition).
// NOTE(review): the lines that build entry and create `feat` and `gene` are
// not visible in this excerpt.
14765 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ImpCDSnotPseudo)
14766 {
14769  feat->SetData().SetImp().SetKey("CDS");
14770 
14772 
14773  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "ImpCDSnotPseudo",
14774  "ImpFeat CDS should be pseudo"));
14775  // AddChromosomeNoLocation(expected_errors, entry);
14776  eval = validator.Validate(seh, options);
14777  CheckErrors(*eval, expected_errors);
14778 
14779  CLEAR_ERRORS
14780 
14781  // overlapping pseudogene should suppress
14782  scope.RemoveTopLevelSeqEntry(seh);
14784  gene->SetPseudo(true);
14785  unit_test_util::AddFeat(gene, entry);
14786  seh = scope.AddTopLevelSeqEntry(*entry);
14787 
14788  // AddChromosomeNoLocation(expected_errors, entry);
14789  eval = validator.Validate(seh, options);
14790  CheckErrors(*eval, expected_errors);
14791 
14792  CLEAR_ERRORS
14793 }
14794 
14795 
// Test for the MissingMRNAproduct diagnostic; runs under CGenBankFixture.
// An mRNA feature is given the product local ID "not_present_ever"; four
// diagnostics are expected on lcl|good: CDSmRNAMismatchLocation,
// ProductFetchFailure, MissingMRNAproduct and GenomicProductPackagingProblem.
// NOTE(review): the lines that build entry and create `feat` are not visible
// in this excerpt; `contig` is not used in the visible lines.
14796 BOOST_FIXTURE_TEST_CASE(Test_SEQ_FEAT_MissingMRNAproduct, CGenBankFixture)
14797 {
14799  CRef<CSeq_entry> contig = entry->SetSet().SetSeq_set().front();
14801  feat->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
14802  feat->SetData().SetRna().SetExt().SetName("fake protein name");
14803  feat->SetProduct().SetWhole().SetLocal().SetStr("not_present_ever");
14804 
14806 
14807  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
14808  "No CDS location match for 1 mRNA"));
14809  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProductFetchFailure",
14810  "Unable to fetch mRNA transcript 'lcl|not_present_ever'"));
14811  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingMRNAproduct",
14812  "Product Bioseq of mRNA feature is not packaged in the record"));
14813  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GenomicProductPackagingProblem",
14814  "Product of mRNA feature (lcl|not_present_ever) not packaged in genomic product set"));
14815  // AddChromosomeNoLocation(expected_errors, entry);
14816  eval = validator.Validate(seh, options);
14817  CheckErrors(*eval, expected_errors);
14818 
14819  CLEAR_ERRORS
14820 }
14821 
14822 
// Expects AbuttingIntervals in two forms: a warning for adjacent intervals in
// a tRNA anticodon location, and an error for adjacent intervals in a feature
// location.
14823 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_AbuttingIntervals)
14824 {
14825  // error for abutting tRNA anticodon location
       // NOTE(review): the line that creates `entry` is elided from this listing.
14827  CRef<CSeq_feat> trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
14828  CRef<CSeq_loc> anticodon_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
       // First interval is 8-8 and the last is 9-10, so they touch with no gap.
14829  anticodon_loc->SetMix().Set().front()->SetInt().SetFrom(8);
14830  anticodon_loc->SetMix().Set().front()->SetInt().SetTo(8);
14831  anticodon_loc->SetMix().Set().back()->SetInt().SetFrom(9);
14832  anticodon_loc->SetMix().Set().back()->SetInt().SetTo(10);
14833  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().Assign(*anticodon_loc);
14834  trna->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('F');
14835  unit_test_util::AddFeat(trna, entry);
14836 
14838 
14839  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "AbuttingIntervals",
14840  "Adjacent intervals in Anticodon"));
14841  // AddChromosomeNoLocation(expected_errors, entry);
14842  eval = validator.Validate(seh, options);
14843  CheckErrors(*eval, expected_errors);
14844 
14845  CLEAR_ERRORS
14846 
14847  // different error for feature location
14848  scope.RemoveTopLevelSeqEntry(seh);
14849  entry = unit_test_util::BuildGoodSeq();
       // NOTE(review): the line that creates `feat` is elided from this listing.
14851  CRef<CSeq_loc> loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
       // Intervals 0-7 and 8-15; the expected message below shows them as
       // 1-8, 9-16.
14852  loc->SetMix().Set().front()->SetInt().SetFrom(0);
14853  loc->SetMix().Set().front()->SetInt().SetTo(7);
14854  loc->SetMix().Set().back()->SetInt().SetFrom(8);
14855  loc->SetMix().Set().back()->SetInt().SetTo(15);
14856  feat->SetLocation().Assign(*loc);
14857 
14858  seh = scope.AddTopLevelSeqEntry(*entry);
14859  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "AbuttingIntervals",
14860  "Location: Adjacent intervals in SeqLoc [(lcl|good:1-8, 9-16)]"));
14861  // AddChromosomeNoLocation(expected_errors, entry);
14862  eval = validator.Validate(seh, options);
14863  CheckErrors(*eval, expected_errors);
14864 
14865 
14866  CLEAR_ERRORS
14867 }
14868 
14869 
// Validates two gene features whose loci collide, across four scenarios:
// same locus at different locations, loci differing only in capitalization,
// differently capitalized loci at identical locations, and differently
// capitalized loci at distinct locations again.
14870 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CollidingGeneNames)
14871 {
       // NOTE(review): the lines creating entry/gene1/gene2 and the scope
       // setup are elided from this listing.
14874  gene1->SetLocation().SetInt().SetFrom(0);
14875  gene1->SetLocation().SetInt().SetTo(7);
14876  gene1->SetData().SetGene().SetLocus("see_it_twice");
14877 
14879  gene2->SetLocation().SetInt().SetFrom(15);
14880  gene2->SetLocation().SetInt().SetTo(20);
14881  gene2->SetData().SetGene().SetLocus("see_it_twice");
14882 
14884  // used to produce an error, removed per VR-811
       // No expected errors are pushed before this validation.
14885  // AddChromosomeNoLocation(expected_errors, entry);
14886  eval = validator.Validate(seh, options);
14887  CheckErrors(*eval, expected_errors);
14888 
       // Same locations, locus now differs from gene1 only in capitalization;
       // still no expected errors are pushed.
14889  scope.RemoveTopLevelSeqEntry(seh);
14890  gene2->SetData().SetGene().SetLocus("See_It_Twice");
14891  seh = scope.AddTopLevelSeqEntry(*entry);
14892  eval = validator.Validate(seh, options);
14893  CheckErrors(*eval, expected_errors);
14894 
14895  CLEAR_ERRORS
14896 
       // gene2 is moved onto gene1's interval (0-7): two messages expected.
14897  scope.RemoveTopLevelSeqEntry(seh);
14898  gene2->SetLocation().SetInt().SetFrom(0);
14899  gene2->SetLocation().SetInt().SetTo(7);
14900  seh = scope.AddTopLevelSeqEntry(*entry);
14901  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "DuplicateFeat",
14902  "Features have identical intervals, but labels differ"));
14903  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "DuplicateGeneConflictingLocusTag",
14904  "Colliding names (with different capitalization) in gene features, but feature locations are identical"));
14905  // AddChromosomeNoLocation(expected_errors, entry);
14906  eval = validator.Validate(seh, options);
14907  CheckErrors(*eval, expected_errors);
14908 
14909  CLEAR_ERRORS
14910 
14911  scope.RemoveTopLevelSeqEntry(seh);
14912  // this situation used to produce an error, removed VR-801
       // gene2 is moved to 10-17 (gene1 stays at 0-7); nothing is pushed to
       // expected_errors for this validation.
14913  gene2->SetLocation().SetInt().SetFrom(10);
14914  gene2->SetLocation().SetInt().SetTo(17);
14915  seh = scope.AddTopLevelSeqEntry(*entry);
14916  // AddChromosomeNoLocation(expected_errors, entry);
14917  eval = validator.Validate(seh, options);
14918  CheckErrors(*eval, expected_errors);
14919 
14920  CLEAR_ERRORS
14921 }
14922 
14923 
// Expects MultiIntervalGene (eDiag_Warning) for a gene feature whose location
// is the mix location produced by unit_test_util::MakeMixLoc.
14924 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MultiIntervalGene)
14925 {
       // NOTE(review): the lines creating `entry` and `gene` are elided from
       // this listing.
14928  gene->SetData().SetGene().SetLocus("multi-interval");
14929  CRef<CSeq_loc> loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
14930  gene->SetLocation().Assign(*loc);
14931 
14933  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultiIntervalGene",
14934  "Gene feature on non-segmented sequence should not have multiple intervals"));
14935  // AddChromosomeNoLocation(expected_errors, entry);
14936 
14937  eval = validator.Validate(seh, options);
14938  CheckErrors(*eval, expected_errors);
14939 
14940  CLEAR_ERRORS
14941 }
14942 
14943 
// Exercises the FeatContentDup ("Duplicate feature") warning across several
// feature types, alternating between scenarios where it is expected and
// scenarios where nothing is expected. After each validation the size of
// `dups` is checked as well.
// NOTE(review): every line that declares or fills `dups` is elided from this
// listing (the embedded numbering skips one line before each
// BOOST_CHECK_EQUAL); confirm what it holds against the full source.
14944 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatContentDup)
14945 {
       // NOTE(review): the lines creating entry/feat1/feat2 are elided.
14949 
14951  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatContentDup",
14952  "Duplicate feature"));
14953  // AddChromosomeNoLocation(expected_errors, entry);
14954  eval = validator.Validate(seh, options);
14955  CheckErrors(*eval, expected_errors);
14957  BOOST_CHECK_EQUAL(dups.size(), 1);
14958 
14959 
14960  // many suppression conditions
14961  // region
       // Both features become Region features with the same text; the single
       // expected warning pushed above is still in expected_errors.
14962  scope.RemoveTopLevelSeqEntry(seh);
14963  feat1->SetData().SetRegion("region");
14964  feat2->SetData().SetRegion("region");
14965  seh = scope.AddTopLevelSeqEntry(*entry);
14966  eval = validator.Validate(seh, options);
14967  CheckErrors(*eval, expected_errors);
14968 
14970  BOOST_CHECK_EQUAL(dups.size(), 1);
14971 
14972  CLEAR_ERRORS
14973  //suppress if different dbxrefs
14974  scope.RemoveTopLevelSeqEntry(seh);
14975  unit_test_util::SetDbxref(feat1, "ASAP", "first");
14976  unit_test_util::SetDbxref(feat2, "ASAP", "second");
14977  seh = scope.AddTopLevelSeqEntry(*entry);
14978  // AddChromosomeNoLocation(expected_errors, entry);
14979  eval = validator.Validate(seh, options);
14980  CheckErrors(*eval, expected_errors);
14981 
14983  BOOST_CHECK_EQUAL(dups.size(), 0);
14984 
14985  // variation
       // Both features become Imp "variation" features; the duplicate warning
       // is expected again.
14986  scope.RemoveTopLevelSeqEntry(seh);
14987  feat1->SetData().SetImp().SetKey("variation");
14988  feat2->SetData().SetImp().SetKey("variation");
14989  seh = scope.AddTopLevelSeqEntry(*entry);
14990  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatContentDup",
14991  "Duplicate feature"));
14992  eval = validator.Validate(seh, options);
14993  CheckErrors(*eval, expected_errors);
14994 
14996  BOOST_CHECK_EQUAL(dups.size(), 1);
14997 
14998  CLEAR_ERRORS
14999  // suppress if different replace qualifiers
15000  scope.RemoveTopLevelSeqEntry(seh);
15001  feat1->AddQualifier("replace", "a");
15002  feat2->AddQualifier("replace", "t");
15003  seh = scope.AddTopLevelSeqEntry(*entry);
15004  // AddChromosomeNoLocation(expected_errors, entry);
15005  eval = validator.Validate(seh, options);
15006  CheckErrors(*eval, expected_errors);
15007 
15008  CLEAR_ERRORS
15010  BOOST_CHECK_EQUAL(dups.size(), 0);
15011 
15012  // coding regions/mRNAs with different links
15013  scope.RemoveTopLevelSeqEntry(seh);
       // NOTE(review): the lines that rebuild `entry` and create cds1/cds2,
       // pentry, mrna1 and mrna2 are elided from this listing. The visible
       // lines add a second CDS and a protein entry to the set, then add two
       // mRNA features (product reset) to the first member of the seq-set.
15017  unit_test_util::AddFeat(cds2, entry);
15019  entry->SetSet().SetSeq_set().push_back(pentry);
15020  CRef<CSeq_entry> nentry = entry->SetSet().SetSeq_set().front();
15022  mrna1->ResetProduct();
15023  mrna1->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
15024  unit_test_util::AddFeat(mrna1, nentry);
15026  mrna2->ResetProduct();
15027  mrna2->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
15028  unit_test_util::AddFeat(mrna2, nentry);
15029  seh = scope.AddTopLevelSeqEntry(*entry);
15030 
15031  // two duplicate feature errors, one for cds, one for mRNA
15032 // expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "CDSwithMultipleMRNAs",
15033 // "CDS overlapped by 2 mRNAs, but product locations are unique"));
15034 // expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "CDSwithMultipleMRNAs",
15035 // "CDS overlapped by 2 mRNAs, but product locations are unique"));
15036  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "FeatContentDup",
15037  "Duplicate feature"));
15038  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "FeatContentDup",
15039  "Duplicate feature"));
15040  // AddChromosomeNoLocation(expected_errors, entry);
15041 
15042  eval = validator.Validate(seh, options);
15043  CheckErrors(*eval, expected_errors);
15044 
15046  BOOST_CHECK_EQUAL(dups.size(), 2);
15047 
15048  // suppress errors if cdss and mrnas are linked AND mRNAs have different locations
15049  CLEAR_ERRORS
15050  scope.RemoveTopLevelSeqEntry(seh);
       // Give each feature a local id and cross-reference cds1<->mrna1 and
       // cds2<->mrna2.
15051  cds1->SetId().SetLocal().SetId(1);
15052  cds2->SetId().SetLocal().SetId(2);
15053  mrna1->SetId().SetLocal().SetId(3);
15054  mrna2->SetId().SetLocal().SetId(4);
15055  cds1->AddSeqFeatXref(mrna1->GetId());
15056  cds2->AddSeqFeatXref(mrna2->GetId());
15057  mrna1->AddSeqFeatXref(cds1->GetId());
15058  mrna2->AddSeqFeatXref(cds2->GetId());
       // Extend mrna2 by 10 positions so the two mRNA locations differ.
15059  mrna2->SetLocation().SetInt().SetTo(mrna2->GetLocation().GetInt().GetTo() + 10);
15060  seh = scope.AddTopLevelSeqEntry(*entry);
15061  // AddChromosomeNoLocation(expected_errors, entry);
15062  eval = validator.Validate(seh, options);
15063  CheckErrors(*eval, expected_errors);
15064 
15066  BOOST_CHECK_EQUAL(dups.size(), 0);
15067 
15068  CLEAR_ERRORS
15069 }
15070 
15071 
15073 {
       // NOTE(review): the signature line and the lines that obtain cds,
       // prot_seq and prot_feat are elided from this listing; judging by the
       // call sites in Test_SEQ_FEAT_BadProductSeqId this is the body of
       // ChangeGoodNucProtSetIdToGenbankName(entry, name) -- confirm.
15078 
       // Put `name` in the Genbank Textseq-id 'name' slot in three places:
       // the CDS product, the first id of the protein sequence, and the id
       // of the protein feature's location.
15079  cds->SetProduct().SetWhole().SetGenbank().SetName(name);
15080  prot_seq->SetSeq().SetId().front()->SetGenbank().SetName(name);
15081  prot_feat->SetLocation().SetInt().SetId().SetGenbank().SetName(name);
15082 }
15083 
15084 
// Expects BadProductSeqId in three scenarios: a Genbank 'name' that looks like
// an accession, a Genbank 'name' that looks like a local id, and a product id
// that differs from the product sequence id only in capitalization.
15085 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadProductSeqId)
15086 {
       // NOTE(review): the line that creates `entry` is elided from this listing.
15088  // try one that looks like a valid ID
15089  ChangeGoodNucProtSetIdToGenbankName(entry, "AY123456");
15090 
15092  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadProductSeqId",
15093  "Feature product should not put an accession in the Textseq-id 'name' slot"));
15094  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadProductSeqId",
15095  "Protein bioseq has Textseq-id 'name' that looks like it is derived from a nucleotide accession"));
15096  // AddChromosomeNoLocation(expected_errors, entry);
15097  eval = validator.Validate(seh, options);
15098  CheckErrors(*eval, expected_errors);
15099 
15100  CLEAR_ERRORS
15101  // try one that looks like a local ID
15102  scope.RemoveTopLevelSeqEntry(seh);
15103  ChangeGoodNucProtSetIdToGenbankName(entry, "lcl|prot");
15104  seh = scope.AddTopLevelSeqEntry(*entry);
15105  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadProductSeqId",
15106  "Feature product should not use Textseq-id 'name' slot"));
15107  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadProductSeqId",
15108  "Protein bioseq has Textseq-id 'name' and no accession"));
15109  // AddChromosomeNoLocation(expected_errors, entry);
15110  eval = validator.Validate(seh, options);
15111  CheckErrors(*eval, expected_errors);
15112 
15113  CLEAR_ERRORS
15114 
15115  // change capitalization
15116  scope.RemoveTopLevelSeqEntry(seh);
15117  entry = BuildGoodNucProtSet();
       // NOTE(review): the line that obtains `cds` is elided from this listing.
       // Only the local id string on the CDS product is upper-cased here.
15119  NStr::ToUpper(cds->SetProduct().SetWhole().SetLocal().SetStr());
15120  seh = scope.AddTopLevelSeqEntry(*entry);
15121  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "BadProductSeqId",
15122  "Capitalization change from product location on feature to product sequence"));
15123  // AddChromosomeNoLocation(expected_errors, entry);
15124  eval = validator.Validate(seh, options);
15125  CheckErrors(*eval, expected_errors);
15126 
15127  CLEAR_ERRORS
15128 }
15129 
15130 
// Expects RnaProductMismatch (eDiag_Error) when the RNA type on a feature does
// not match the MolInfo of its product Bioseq, for mRNA, tRNA and rRNA in
// turn, each followed by a "make error go away" re-validation.
// NOTE(review): in each "make error go away" step the line that presumably
// adjusts the product's MolInfo is elided from this listing (15161, 15187,
// 15206); confirm against the full source.
15131 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_RnaProductMismatch)
15132 {
       // NOTE(review): the lines creating nuc/rna_feat are elided.
15135  rna_feat->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
15136  rna_feat->SetLocation().SetInt().SetTo(59);
15137  rna_feat->SetProduct().SetWhole().SetLocal().SetStr("rna");
15138 
       // NOTE(review): the line creating rna_seq is elided; its first id is
       // renamed to lcl|rna to match the feature's product above.
15140  rna_seq->SetSeq().SetId().front()->SetLocal().SetStr("rna");
15141 
       // Package the nucleotide and the RNA sequence into one set entry.
15142  CRef<CSeq_entry> entry(new CSeq_entry());
15144  entry->SetSet().SetSeq_set().push_back(nuc);
15145  entry->SetSet().SetSeq_set().push_back(rna_seq);
15146 
15148 
15149  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
15150  "No CDS location match for 1 mRNA"));
15151  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "RnaProductMismatch",
15152  "Type of RNA does not match MolInfo of product Bioseq"));
15153  // AddChromosomeNoLocation(expected_errors, entry);
15154  eval = validator.Validate(seh, options);
15155  CheckErrors(*eval, expected_errors);
15156 
15157  // make error go away
15158  CLEAR_ERRORS
15159  scope.RemoveTopLevelSeqEntry(seh);
15160  rna_seq->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
15162  seh = scope.AddTopLevelSeqEntry(*entry);
       // Validation runs before the expectation is pushed; only the
       // CDSmRNAMismatchLocation warning is expected to remain.
15163  eval = validator.Validate(seh, options);
15164  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
15165  "No CDS location match for 1 mRNA"));
15166  // AddChromosomeNoLocation(expected_errors, entry);
15167  CheckErrors(*eval, expected_errors);
15168 
15169  CLEAR_ERRORS
15170  // also get errors for tRNA
15171  scope.RemoveTopLevelSeqEntry(seh);
15172  rna_feat->SetData().SetRna().SetType(CRNA_ref::eType_tRNA);
15173  rna_feat->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('N');
15174  rna_feat->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetId().SetLocal().SetStr("good");
15175  rna_feat->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetFrom(11);
15176  rna_feat->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetTo(13);
15177  seh = scope.AddTopLevelSeqEntry(*entry);
15178  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "RnaProductMismatch",
15179  "Type of RNA does not match MolInfo of product Bioseq"));
15180  // AddChromosomeNoLocation(expected_errors, entry);
15181  eval = validator.Validate(seh, options);
15182  CheckErrors(*eval, expected_errors);
15183 
15184  // make error go away
15185  CLEAR_ERRORS
15186  scope.RemoveTopLevelSeqEntry(seh);
15188  seh = scope.AddTopLevelSeqEntry(*entry);
15189  // AddChromosomeNoLocation(expected_errors, entry);
15190  eval = validator.Validate(seh, options);
15191  CheckErrors(*eval, expected_errors);
15192 
15193  // also get errors for rRNA
15194  scope.RemoveTopLevelSeqEntry(seh);
15195  rna_feat->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
15196  rna_feat->SetData().SetRna().SetExt().SetName("a ribosomal RNA");
15197  seh = scope.AddTopLevelSeqEntry(*entry);
15198  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "RnaProductMismatch",
15199  "Type of RNA does not match MolInfo of product Bioseq"));
15200  eval = validator.Validate(seh, options);
15201  CheckErrors(*eval, expected_errors);
15202 
15203  // make error go away
15204  CLEAR_ERRORS
15205  scope.RemoveTopLevelSeqEntry(seh);
15207  seh = scope.AddTopLevelSeqEntry(*entry);
15208  // AddChromosomeNoLocation(expected_errors, entry);
15209  eval = validator.Validate(seh, options);
15210  CheckErrors(*eval, expected_errors);
15211 
15212  CLEAR_ERRORS
15213 }
15214 
15215 
// Expects MissingCDSproduct for a CDS whose product cannot be found or is
// absent, then validates three variants with no expected errors pushed:
// pseudo CDS, CDS with an exception, and a CDS described as containing just
// a stop codon.
15216 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MissingCDSproduct)
15217 {
15218  CRef<CSeq_entry> entry(new CSeq_entry());
       // NOTE(review): the lines creating nuc/cds and the scope/validator
       // setup are elided from this listing.
15222  cds->SetData().SetCdregion();
15223  cds->SetProduct().SetWhole().SetLocal().SetStr("not_present_ever");
15224  entry->SetSet().SetSeq_set().push_back(nuc);
15226 
       // The helper predicates are checked directly before running the
       // validator; both are expected to return true for this CDS.
15227  BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
15228  BOOST_CHECK_EQUAL(validator::HasNoStop(*cds, &scope), true);
15229 
15230  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "StartCodon",
15231  "Illegal start codon used. Wrong genetic code [0] or protein should be partial"));
15232  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoStop",
15233  "Missing stop codon"));
15234  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingCDSproduct",
15235  "Unable to find product Bioseq from CDS feature"));
15236  // AddChromosomeNoLocation(expected_errors, entry);
15237  eval = validator.Validate(seh, options);
15238  CheckErrors(*eval, expected_errors);
15239 
       // With the product reference removed, the third expectation is
       // rewritten in place: different message, and severity raised from
       // warning to error.
15240  cds->ResetProduct();
15241  expected_errors[2]->SetErrMsg("Expected CDS product absent");
15242  expected_errors[2]->SetSeverity(eDiag_Error);
15243  eval = validator.Validate(seh, options);
15244  CheckErrors(*eval, expected_errors);
15245 
15246  // ok if pseudo
15247  CLEAR_ERRORS
15248  cds->SetPseudo(true);
15249  // AddChromosomeNoLocation(expected_errors, entry);
15250  eval = validator.Validate(seh, options);
15251  CheckErrors(*eval, expected_errors);
15252 
15253  // also ok if exception
15254  cds->ResetPseudo();
15255  cds->SetExcept(true);
15256  cds->SetExcept_text("rearrangement required for product");
15257  eval = validator.Validate(seh, options);
15258  CheckErrors(*eval, expected_errors);
15259 
15260  // also ok if CDS contains just stop codon
15261  scope.RemoveTopLevelSeqEntry(seh);
15262  cds->ResetExcept();
15263  cds->ResetExcept_text();
15264  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATAAGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAA");
       // assumes the CDS location starts at position 0 (set in elided lines)
       // -- TODO confirm; with To=4 and frame three the only full codon
       // would then be positions 2-4, "TAA" in the sequence above.
15265  cds->SetLocation().SetInt().SetTo(4);
15266  cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
15267  cds->SetPartial(true);
15268  cds->SetData().SetCdregion().SetFrame(CCdregion::eFrame_three);
15269  seh = scope.AddTopLevelSeqEntry(*entry);
15270  eval = validator.Validate(seh, options);
15271  CheckErrors(*eval, expected_errors);
15272 
15273  CLEAR_ERRORS
15274 }
15275 
15276 
// Expects BadTrnaCodon (eDiag_Error) for a tRNA feature whose codon list
// contains the value 64, one above the maximum of 63 named in the message.
15277 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadTrnaCodon)
15278 {
       // NOTE(review): the line that creates `entry` is elided from this listing.
15280  CRef<CSeq_feat> trna = unit_test_util::BuildGoodtRNA(entry->SetSeq().SetId().front());
15281  trna->SetData().SetRna().SetExt().SetTRNA().SetCodon().push_back(64);
15282  unit_test_util::AddFeat(trna, entry);
15283 
15285  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadTrnaCodon",
15286  "tRNA codon value 64 is greater than maximum 63"));
15287  // AddChromosomeNoLocation(expected_errors, entry);
15288  eval = validator.Validate(seh, options);
15289  CheckErrors(*eval, expected_errors);
15290 
15291  CLEAR_ERRORS
15292 }
15293 
15294 
// Expects BadTrnaAA (eDiag_Error) in two forms: "Missing tRNA amino acid"
// when the amino acid is reset, and "Invalid tRNA amino acid" when it is set
// to the Iupacaa value 29.
15295 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadTrnaAA)
15296 {
       // NOTE(review): the line that creates `entry` is elided from this listing.
15298  CRef<CSeq_feat> trna = unit_test_util::BuildGoodtRNA(entry->SetSeq().SetId().front());
15299  trna->SetData().SetRna().SetExt().SetTRNA().ResetAa();
15300  unit_test_util::AddFeat(trna, entry);
15301 
15303  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadTrnaAA",
15304  "Missing tRNA amino acid"));
15305  // AddChromosomeNoLocation(expected_errors, entry);
15306  eval = validator.Validate(seh, options);
15307  CheckErrors(*eval, expected_errors);
15308 
15309  CLEAR_ERRORS
       // The amino acid is changed on the same feature object without
       // removing/re-adding the entry; a BadAnticodonAA warning is expected
       // alongside the BadTrnaAA error.
15310  trna->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa(29);
15311 
15312  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadAnticodonAA",
15313  "Codons predicted from anticodon (AAA) cannot produce amino acid ( /OTHER)"));
15314  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadTrnaAA",
15315  "Invalid tRNA amino acid"));
15316  // AddChromosomeNoLocation(expected_errors, entry);
15317  eval = validator.Validate(seh, options);
15318  CheckErrors(*eval, expected_errors);
15319 
15320  CLEAR_ERRORS
15321 }
15322 
15323 
// Expects GeneXrefWithoutGene and OnlyGeneXrefs warnings when a feature
// carries a gene locus cross-reference ("foo") in a record for which the
// expected messages report no gene features.
15324 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_OnlyGeneXrefs)
15325 {
       // NOTE(review): the lines creating `entry` and `feat` are elided from
       // this listing.
15328  feat->SetGeneXref().SetLocus("foo");
15329 
15331  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GeneXrefWithoutGene",
15332  "Feature has gene locus cross-reference but no equivalent gene feature exists"));
15333  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OnlyGeneXrefs",
15334  "There are 1 gene xrefs and no gene features in this record."));
15335  // AddChromosomeNoLocation(expected_errors, entry);
15336  eval = validator.Validate(seh, options);
15337  CheckErrors(*eval, expected_errors);
15338  CLEAR_ERRORS
15339 }
15340 
15341 
// Expects UTRdoesNotAbutCDS warnings in three scenarios: UTRs that leave a gap
// next to the CDS, UTRs placed on the minus strand, and the same record after
// unit_test_util::RevComp.
15342 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UTRdoesNotAbutCDS)
15343 {
       // NOTE(review): the lines creating entry/cds/nseq/pseq are elided from
       // this listing.
       // The CDS start moves to position 3, where "ATG" begins in the new
       // nucleotide sequence; the protein is replaced by a 7-residue one.
15348  cds->SetLocation().SetInt().SetFrom(3);
15349  nseq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("CCCATGAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
15350  pseq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MRKTEIN");
15351  pseq->SetSeq().SetInst().SetLength(7);
15353 
       // NOTE(review): the lines creating utr5/utr3 are elided.
       // The 5'UTR ends at 1, so position 2 lies between it and the CDS start.
15355  utr5->SetData().SetImp().SetKey("5'UTR");
15356  utr5->SetLocation().SetInt().SetTo(1);
15357 
15359  utr3->SetData().SetImp().SetKey("3'UTR");
15360  utr3->SetLocation().SetInt().SetFrom(28);
15361  utr3->SetLocation().SetInt().SetTo(59);
15362 
15364 
15365  expected_errors.push_back(new CExpectedError("lcl|nuc",eDiag_Warning,"UTRdoesNotAbutCDS",
15366  "5'UTR does not abut CDS"));
15367  expected_errors.push_back(new CExpectedError("lcl|nuc",eDiag_Warning,"UTRdoesNotAbutCDS",
15368  "CDS does not abut 3'UTR"));
15369  // AddChromosomeNoLocation(expected_errors, entry);
15370 
15371  eval = validator.Validate(seh, options);
15372  CheckErrors(*eval, expected_errors);
15373 
       // Move the UTR edges by one (5'UTR end 1 -> 2, 3'UTR start 28 -> 27)
       // and put both UTRs on the minus strand; the two expectations are
       // reused with strand-related messages.
15374  scope.RemoveTopLevelSeqEntry(seh);
15375  utr5->SetLocation().SetInt().SetTo(2);
15376  utr5->SetLocation().SetInt().SetStrand(eNa_strand_minus);
15377  utr3->SetLocation().SetInt().SetFrom(27);
15378  utr3->SetLocation().SetInt().SetStrand(eNa_strand_minus);
15379  seh = scope.AddTopLevelSeqEntry(*entry);
15380 
15381  expected_errors[0]->SetErrMsg("5'UTR is not on plus strand");
15382  expected_errors[1]->SetErrMsg("3'UTR is not on plus strand");
15383  eval = validator.Validate(seh, options);
15384  CheckErrors(*eval, expected_errors);
15385 
       // After RevComp the messages name the minus strand, and the 3'UTR
       // message is expected first.
15386  scope.RemoveTopLevelSeqEntry(seh);
15387  unit_test_util::RevComp(entry);
15388  seh = scope.AddTopLevelSeqEntry(*entry);
15389  expected_errors[0]->SetErrMsg("3'UTR is not on minus strand");
15390  expected_errors[1]->SetErrMsg("5'UTR is not on minus strand");
15391  eval = validator.Validate(seh, options);
15392  CheckErrors(*eval, expected_errors);
15393 
15394  CLEAR_ERRORS
15395 }
15396 
15397 
// Exercises the ExceptionProblem message for a series of exception texts on
// one feature: text repeated in the comment, missing citation, missing
// inference, illegal text, and RefSeq-specific cases after the sequence id is
// changed to NC_123456. A single expected error object is reused and edited
// in place for most of the steps.
15398 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ExceptionProblem)
15399 {
       // NOTE(review): the lines creating `entry` and `feat` and the scope
       // setup are elided from this listing.
15402 
15404 
15405  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ExceptionProblem", "Exception explanation text is also found in feature comment"));
15406  // AddChromosomeNoLocation(expected_errors, entry);
15407 
15408  feat->SetExcept(true);
15409 
15410  // look for exception in comment
15411  feat->SetExcept_text("RNA editing");
15412  feat->SetComment("RNA editing");
15413  eval = validator.Validate(seh, options);
15414  CheckErrors(*eval, expected_errors);
15415 
15416  // look for one exception in comment
       // The comment still reads "RNA editing"; the same message is expected
       // when that text is one of two listed explanations.
15417  feat->SetExcept_text("RNA editing, rearrangement required for product");
15418  eval = validator.Validate(seh, options);
15419  CheckErrors(*eval, expected_errors);
15420 
15421  // no citation
15422  feat->SetExcept_text("reasons given in citation");
15423  expected_errors[0]->SetErrMsg("Reasons given in citation exception does not have the required citation");
15424  eval = validator.Validate(seh, options);
15425  CheckErrors(*eval, expected_errors);
15426 
15427  // no inference
15428  feat->SetExcept_text("annotated by transcript or proteomic data");
15429  expected_errors[0]->SetErrMsg("Annotated by transcript or proteomic data exception does not have the required inference qualifier");
15430  eval = validator.Validate(seh, options);
15431  CheckErrors(*eval, expected_errors);
15432 
15433  // not legal
       // For the local id this is expected at error severity; the NC_ case
       // further down expects a warning for the same text.
15434  feat->SetExcept_text("not a legal exception");
15435  expected_errors[0]->SetErrMsg("not a legal exception is not a legal exception explanation");
15436  expected_errors[0]->SetSeverity(eDiag_Error);
15437  eval = validator.Validate(seh, options);
15438  CheckErrors(*eval, expected_errors);
15439 
15440  // change to ref-seq
       // The sequence id and the feature location id both become the 'other'
       // accession NC_123456.
15441  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
15442  scope.RemoveTopLevelSeqEntry(seh);
15443  seh = scope.AddTopLevelSeqEntry(*entry);
15444  feat->SetLocation().SetInt().SetId().SetOther().SetAccession("NC_123456");
15445 
15446 
15447  // multiple ref-seq exceptions
15448  feat->SetExcept_text("unclassified transcription discrepancy, RNA editing");
15449  feat->SetComment("misc_feature needs a comment");
15450  expected_errors[0]->SetErrMsg("Genome processing exception should not be combined with other explanations");
15451  expected_errors[0]->SetSeverity(eDiag_Warning);
       // Expected accession follows the id change made above.
15452  ChangeErrorAcc(expected_errors, "ref|NC_123456|");
15453  eval = validator.Validate(seh, options);
15454  CheckErrors(*eval, expected_errors);
15455 
15456  CLEAR_ERRORS
15457  // not legal (is warning for NC or NT)
15458  feat->SetExcept_text("not a legal exception");
15459  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Warning, "ExceptionProblem", "not a legal exception is not a legal exception explanation"));
15460  // AddChromosomeNoLocation(expected_errors, entry);
15461  eval = validator.Validate(seh, options);
15462  CheckErrors(*eval, expected_errors);
15463 
15464  CLEAR_ERRORS
15465 
15466  // these are now legal for RefSeq
       // The feature becomes an rRNA and four overlap exception texts are
       // validated in turn; no expected errors are pushed for any of them.
15467  feat->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
15468  feat->SetData().SetRna().SetExt().SetName("23S ribosomal RNA");
15469  feat->ResetComment();
15470  feat->SetExcept_text("23S ribosomal RNA and 5S ribosomal RNA overlap");
15471  // AddChromosomeNoLocation(expected_errors, entry);
15472  eval = validator.Validate(seh, options);
15473  CheckErrors(*eval, expected_errors);
15474  feat->SetExcept_text("5S ribosomal RNA and 16S ribosomal RNA overlap");
15475  eval = validator.Validate(seh, options);
15476  CheckErrors(*eval, expected_errors);
15477  feat->SetExcept_text("5S ribosomal RNA and 23S ribosomal RNA overlap");
15478  eval = validator.Validate(seh, options);
15479  CheckErrors(*eval, expected_errors);
15480  feat->SetExcept_text("23S ribosomal RNA and 16S ribosomal RNA overlap");
15481  eval = validator.Validate(seh, options);
15482  CheckErrors(*eval, expected_errors);
15483 
15484  CLEAR_ERRORS
15485 }
15486 
15487 
15488 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SeqDataLenWrong)
15489 {
15491 
15493  // need to call this statement before calling AddDefaults
15494  // to make sure that we can fetch the sequence referenced by the
15495  // delta sequence so that we can detect that the loc in the
15496  // delta sequence is longer than the referenced sequence
15498  CScope scope(*objmgr);
15499  scope.AddDefaults();
15500 
15501  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
15502 
15503  CValidator validator(*objmgr);
15504 
15505  // Set validator options
15506  unsigned int options = CValidator::eVal_need_isojta
15510 
15511  // list of expected errors
15512  vector<CExpectedError*> expected_errors;
15513 
15514  // validate - should be fine
15515  // AddChromosomeNoLocation(expected_errors, entry);
15516  CConstRef<CValidError> eval = validator.Validate(seh, options);
15517  CheckErrors(*eval, expected_errors);
15518 
15519  // longer and shorter for iupacna
15520  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "SeqDataLenWrong", "Bioseq.seq_data too short [60] for given length [65]"));
15521  entry->SetSeq().SetInst().SetLength(65);
15522  eval = validator.Validate(seh, options);
15523  CheckErrors(*eval, expected_errors);
15524 
15525  entry->SetSeq().SetInst().SetLength(55);
15526  expected_errors[0]->SetErrMsg("Bioseq.seq_data is larger [60] than given length [55]");
15527  eval = validator.Validate(seh, options);
15528  CheckErrors(*eval, expected_errors);
15529 
15530  // try other divisors
15531  entry->SetSeq().SetInst().SetLength(60);
15532  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('A');
15533  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('T');
15534  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('G');
15535  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('C');
15536  CRef<CSeq_data> packed_data(new CSeq_data);
15537  // convert seq data to another format
15538  // (NCBI2na = 2 bit nucleic acid code)
15539  CSeqportUtil::Convert(entry->SetSeq().SetInst().GetSeq_data(),
15540  packed_data,
15542  entry->SetSeq().SetInst().SetSeq_data(*packed_data);
15543  expected_errors[0]->SetErrMsg("Bioseq.seq_data is larger [64] than given length [60]");
15544  eval = validator.Validate(seh, options);
15545  CheckErrors(*eval, expected_errors);
15546 
15547  entry->SetSeq().SetInst().SetSeq_data().SetNcbi2na().Set().pop_back();
15548  entry->SetSeq().SetInst().SetSeq_data().SetNcbi2na().Set().pop_back();
15549  expected_errors[0]->SetErrMsg("Bioseq.seq_data too short [56] for given length [60]");
15550  eval = validator.Validate(seh, options);
15551  CheckErrors(*eval, expected_errors);
15552 
15553  CSeqportUtil::Convert(entry->SetSeq().SetInst().GetSeq_data(),
15554  packed_data,
15556  entry->SetSeq().SetInst().SetSeq_data(*packed_data);
15557  eval = validator.Validate(seh, options);
15558  CheckErrors(*eval, expected_errors);
15559 
15560  entry->SetSeq().SetInst().SetSeq_data().SetNcbi4na().Set().push_back('1');
15561  entry->SetSeq().SetInst().SetSeq_data().SetNcbi4na().Set().push_back('8');
15562  entry->SetSeq().SetInst().SetSeq_data().SetNcbi4na().Set().push_back('1');
15563  entry->SetSeq().SetInst().SetSeq_data().SetNcbi4na().Set().push_back('8');
15564  expected_errors[0]->SetErrMsg("Bioseq.seq_data is larger [64] than given length [60]");
15565  eval = validator.Validate(seh, options);
15566  CheckErrors(*eval, expected_errors);
15567 
15568  CRef<CSeq_id> id(new CSeq_id("gb|AY123456"));
15569 #if 0
15570  // removed per VR-779
15571  // now try seg and ref
15572  entry->SetSeq().SetInst().ResetSeq_data();
15573  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
15574  CRef<CSeq_loc> loc(new CSeq_loc(*id, 0, 55));
15575  entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(loc);
15576  expected_errors[0]->SetErrMsg("Bioseq.seq_data too short [56] for given length [60]");
15577  eval = validator.Validate(seh, options);
15578  CheckErrors(*eval, expected_errors);
15579 
15580  loc->SetInt().SetTo(63);
15581  expected_errors[0]->SetErrMsg("Bioseq.seq_data is larger [64] than given length [60]");
15582  eval = validator.Validate(seh, options);
15583  CheckErrors(*eval, expected_errors);
15584 
15585  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_ref);
15586  entry->SetSeq().SetInst().SetExt().SetRef().SetInt().SetId(*id);
15587  entry->SetSeq().SetInst().SetExt().SetRef().SetInt().SetFrom(0);
15588  entry->SetSeq().SetInst().SetExt().SetRef().SetInt().SetTo(55);
15589  expected_errors[0]->SetErrMsg("Bioseq.seq_data too short [56] for given length [60]");
15590  eval = validator.Validate(seh, options);
15591  CheckErrors(*eval, expected_errors);
15592 
15593  entry->SetSeq().SetInst().SetExt().SetRef().SetInt().SetTo(63);
15594  expected_errors[0]->SetErrMsg("Bioseq.seq_data is larger [64] than given length [60]");
15595  eval = validator.Validate(seh, options);
15596  CheckErrors(*eval, expected_errors);
15597 #endif
15598 
15599  CLEAR_ERRORS
15600  entry->SetSeq().SetInst().ResetSeq_data();
15601  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "SeqDataLenWrong",
15602  "Bioseq.seq_data too short [56] for given length [60]"));
15603  // AddChromosomeNoLocation(expected_errors, entry);
15604  // delta sequence
15605  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
15606  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*id, 0, 55);
15607  eval = validator.Validate(seh, options);
15608  CheckErrors(*eval, expected_errors);
15609  entry->SetSeq().SetInst().SetExt().Reset();
15610  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*id, 0, 30);
15611  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*id, 40, 72);
15612  expected_errors[0]->SetErrMsg("Bioseq.seq_data is larger [64] than given length [60]");
15613  eval = validator.Validate(seh, options);
15614  CheckErrors(*eval, expected_errors);
15615 
15616  entry->SetSeq().SetInst().SetExt().Reset();
15617  entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*id, 0, 59);
15618  CRef<CDelta_seq> delta_seq;
15619  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(delta_seq);
15620  expected_errors[0]->SetErrMsg("NULL pointer in delta seq_ext valnode (segment 2)");
15621  expected_errors[0]->SetSeverity(eDiag_Error);
15622  eval = validator.Validate(seh, options);
15623  CheckErrors(*eval, expected_errors);
15624 
15625  entry->SetSeq().SetInst().SetExt().Reset();
15626  CRef<CDelta_seq> delta_seq2(new CDelta_seq());
15627  delta_seq2->SetLoc().SetInt().SetId(*id);
15628  delta_seq2->SetLoc().SetInt().SetFrom(0);
15629  delta_seq2->SetLoc().SetInt().SetTo(485);
15630  entry->SetSeq().SetInst().SetLength(486);
15631  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(delta_seq2);
15632  expected_errors[0]->SetErrMsg("Seq-loc extent (486) greater than length of gb|AY123456| (485)");
15633  expected_errors[0]->SetSeverity(eDiag_Critical);
15634  eval = validator.Validate(seh, options);
15635  CheckErrors(*eval, expected_errors);
15636 
15637  CLEAR_ERRORS
15638 }
15639 
15640 
// Test case: sets the conflict flag on the coding region and expects the
// validator to report an Error-level "BadConflictFlag" on lcl|nuc.
// NOTE(review): this listing skips some original lines (see the gaps in the
// embedded numbering), so the declarations of entry, cds_feat, seh, validator,
// options, eval and expected_errors are not visible here; confirm the setup
// against the complete source.
15641 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadConflictFlag)
15642 {
15645  cds_feat->SetData().SetCdregion().SetConflict(true);
15646 
15648 
       // The single finding this scenario is expected to produce.
15649  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "BadConflictFlag",
15650  "Coding region conflict flag should not be set"));
15651  // AddChromosomeNoLocation(expected_errors, entry);
15652  eval = validator.Validate(seh, options);
15653  CheckErrors(*eval, expected_errors);
15654 
15655  CLEAR_ERRORS
15656 }
15657 
15658 
// Test case: sets the coding-region conflict flag and then replaces the
// protein sequence with "MPRKTEIXX" (length 9), stretching the protein
// feature to position 8. The expected finding is a Warning-level
// "ConflictFlagSet" on lcl|nuc.
// NOTE(review): presumably the altered protein is what turns the finding into
// ConflictFlagSet instead of BadConflictFlag -- confirm. The declarations of
// entry, cds_feat, seh, validator, options, eval and expected_errors fall in
// lines this listing skips.
15659 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ConflictFlagSet)
15660 {
15663  cds_feat->SetData().SetCdregion().SetConflict(true);
       // The protein is the last entry of the set; give it a new 9-residue sequence.
15664  CRef<CSeq_entry> prot = entry->SetSet().SetSeq_set().back();
15665  prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPRKTEIXX");
15666  prot->SetSeq().SetInst().SetLength(9);
       // Keep the first protein feature in step with the new length (0..8).
15667  CRef<CSeq_feat> prot_feat = prot->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
15668  prot_feat->SetLocation().SetInt().SetTo(8);
15669 
15671 
15672  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "ConflictFlagSet",
15673  "Coding region conflict flag is set"));
15674  // AddChromosomeNoLocation(expected_errors, entry);
15675  eval = validator.Validate(seh, options);
15676  CheckErrors(*eval, expected_errors);
15677 
15678  CLEAR_ERRORS
15679 }
15680 
15681 
// Test case covering four locus_tag scenarios on a gene feature:
//   1. locus_tag containing spaces           -> LocusTagHasSpace (Warning)
//   2. old_locus_tag equal to the locus_tag  -> adds RedundantFields (Warning)
//                                               and LocusTagProblem (Error)
//   3. locus equal to locus_tag              -> LocusTagGeneLocusMatch (Error)
//   4. old_locus_tag containing commas       -> OldLocusTagBadFormat (Warning)
// NOTE(review): the declarations of entry, gene, seh, validator, options, eval
// and expected_errors fall in lines this listing skips; confirm the setup
// against the complete source.
15682 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_LocusTagProblem)
15683 {
15686  gene->SetData().SetGene().SetLocus_tag("a b c");
15687 
15689 
15690  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LocusTagHasSpace",
15691  "Gene locus_tag 'a b c' should be a single word without any spaces"));
15692  // AddChromosomeNoLocation(expected_errors, entry);
15693  eval = validator.Validate(seh, options);
15694  CheckErrors(*eval, expected_errors);
15695 
       // Scenario 2: the expected list is not cleared, so the two new findings
       // are expected in addition to LocusTagHasSpace.
15696  gene->AddQualifier("old_locus_tag", "a b c");
15697  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "RedundantFields",
15698  "old_locus_tag has same value as gene locus_tag"));
15699  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "LocusTagProblem",
15700  "Gene locus_tag and old_locus_tag 'a b c' match"));
15701  eval = validator.Validate(seh, options);
15702  CheckErrors(*eval, expected_errors);
15703 
       // Scenario 3: drop the qualifiers and make locus and locus_tag identical.
15704  CLEAR_ERRORS
15705  gene->ResetQual();
15706  gene->SetData().SetGene().SetLocus_tag("abc");
15707  gene->SetData().SetGene().SetLocus("abc");
15708  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "LocusTagGeneLocusMatch",
15709  "Gene locus and locus_tag 'abc' match"));
15710  // AddChromosomeNoLocation(expected_errors, entry);
15711  eval = validator.Validate(seh, options);
15712  CheckErrors(*eval, expected_errors);
15713 
       // Scenario 4: remove the locus and add a comma-separated old_locus_tag.
15714  CLEAR_ERRORS
15715  gene->SetData().SetGene().ResetLocus();
15716  gene->AddQualifier("old_locus_tag", "a, b, c");
15717  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OldLocusTagBadFormat",
15718  "old_locus_tag has comma, multiple old_locus_tags should be split into separate qualifiers"));
15719  // AddChromosomeNoLocation(expected_errors, entry);
15720  eval = validator.Validate(seh, options);
15721  CheckErrors(*eval, expected_errors);
15722 
15723  CLEAR_ERRORS
15724 }
15725 
15726 
// Test case: marks the CDS with the exception text "alternative start codon".
// The first validation runs with an empty expected-error list. After the
// nucleotide id and the CDS location id are switched to the accession
// NM_123456, a Warning-level "AltStartCodonException" is expected.
// NOTE(review): the declarations of entry, nseq, cds, scope, seh, validator,
// options, eval and expected_errors fall in lines this listing skips; confirm
// the setup against the complete source.
15727 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_AltStartCodonException)
15728 {
15732  cds->SetExcept(true);
15733  cds->SetExcept_text("alternative start codon");
15734 
15736 
15737  // first, no errors because not refseq
15738  // AddChromosomeNoLocation(expected_errors, entry);
15739  eval = validator.Validate(seh, options);
15740  CheckErrors(*eval, expected_errors);
15741 
15742  CLEAR_ERRORS
15743 
15744  // report error if refseq
       // The entry is taken out of the scope while its ids are edited and is
       // re-added afterwards, yielding a fresh handle for validation.
15745  scope.RemoveTopLevelSeqEntry(seh);
15746  nseq->SetSeq().SetId().front()->SetOther().SetAccession("NM_123456");
15747  cds->SetLocation().SetInt().SetId().SetOther().SetAccession("NM_123456");
15748  seh = scope.AddTopLevelSeqEntry(*entry);
15749  expected_errors.push_back(new CExpectedError("ref|NM_123456|", eDiag_Warning, "AltStartCodonException",
15750  "Unnecessary alternative start codon exception"));
15751  // AddChromosomeNoLocation(expected_errors, entry);
15752  eval = validator.Validate(seh, options);
15753  CheckErrors(*eval, expected_errors);
15754 
15755  CLEAR_ERRORS
15756 }
15757 
15758 
// Test case: two gene features that both end at position 26 are given
// different loci ("locus1" on mgene, "locus2" on cgene). The expected finding
// is a Warning-level "GenesInconsistent" on lcl|nuc.
// NOTE(review): going by the expected message, mgene presumably sits on the
// mRNA bioseq and cgene on the genomic bioseq, but their declarations (and
// those of entry, seh, validator, options, eval, expected_errors) fall in
// lines this listing skips -- confirm against the complete source.
15759 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_GenesInconsistent)
15760 {
15765  mgene->SetLocation().SetInt().SetTo(26);
15766  mgene->SetData().SetGene().SetLocus("locus1");
15767 
15770  cgene->SetLocation().SetInt().SetTo(26);
15771  cgene->SetData().SetGene().SetLocus("locus2");
15772 
15774 
15775  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "GenesInconsistent",
15776  "Gene on mRNA bioseq does not match gene on genomic bioseq"));
15777  // AddChromosomeNoLocation(expected_errors, entry);
15778  eval = validator.Validate(seh, options);
15779  CheckErrors(*eval, expected_errors);
15780 
15781  CLEAR_ERRORS
15782 }
15783 
15784 
// Test case: attaches two code-breaks with the identical interval (24..26 on
// local id "nuc") to the CDS and expects an Error-level
// "DuplicateTranslExcept". The expected message prints the interval as 25-27.
// NOTE(review): the declarations of entry, cds, seh, validator, options, eval
// and expected_errors fall in lines this listing skips; confirm the setup
// against the complete source.
15785 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_DuplicateTranslExcept)
15786 {
       // First code-break.
15789  CRef<CCode_break> codebreak1(new CCode_break());
15790  codebreak1->SetLoc().SetInt().SetId().SetLocal().SetStr("nuc");
15791  codebreak1->SetLoc().SetInt().SetFrom(24);
15792  codebreak1->SetLoc().SetInt().SetTo(26);
15793  cds->SetData().SetCdregion().SetCode_break().push_back(codebreak1);
       // Second code-break at exactly the same location.
15794  CRef<CCode_break> codebreak2(new CCode_break());
15795  codebreak2->SetLoc().SetInt().SetId().SetLocal().SetStr("nuc");
15796  codebreak2->SetLoc().SetInt().SetFrom(24);
15797  codebreak2->SetLoc().SetInt().SetTo(26);
15798  cds->SetData().SetCdregion().SetCode_break().push_back(codebreak2);
15799 
15801 
15802  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "DuplicateTranslExcept",
15803  "Multiple code-breaks at same location [lcl|nuc:25-27]"));
15804  // AddChromosomeNoLocation(expected_errors, entry);
15805  eval = validator.Validate(seh, options);
15806  CheckErrors(*eval, expected_errors);
15807 
15808  CLEAR_ERRORS
15809 }
15810 
15811 
// Test case: gives the CDS both a code-break (24..26 on local id "nuc") and
// the exception text "RNA editing". Two warnings are expected:
// "TranslExceptAndRnaEditing" and "UnnecessaryException". The test also checks
// that validator::DoesFeatureHaveUnnecessaryException() returns true for the CDS.
// NOTE(review): the declarations of entry, cds, scope, seh, validator,
// options, eval and expected_errors fall in lines this listing skips; confirm
// the setup against the complete source.
15812 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TranslExceptAndRnaEditing)
15813 {
15816  CRef<CCode_break> codebreak1(new CCode_break());
15817  codebreak1->SetLoc().SetInt().SetId().SetLocal().SetStr("nuc");
15818  codebreak1->SetLoc().SetInt().SetFrom(24);
15819  codebreak1->SetLoc().SetInt().SetTo(26);
15820  cds->SetData().SetCdregion().SetCode_break().push_back(codebreak1);
15821  cds->SetExcept(true);
15822  cds->SetExcept_text("RNA editing");
15823 
15825 
15826  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranslExceptAndRnaEditing",
15827  "CDS has both RNA editing /exception and /transl_except qualifiers"));
15828  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
15829  "CDS has exception but passes translation test"));
15830  // AddChromosomeNoLocation(expected_errors, entry);
15831  eval = validator.Validate(seh, options);
15832  CheckErrors(*eval, expected_errors);
15833 
       // The standalone helper must agree with the validator's warning.
15834  BOOST_CHECK_EQUAL(validator::DoesFeatureHaveUnnecessaryException(*cds, scope), true);
15835 
15836  CLEAR_ERRORS
15837 }
15838 
15839 
// Test case: removes the protein name and walks through the fields that can
// remain on the protein feature. Each step expects a Warning-level
// "NoNameForProtein" with a different message:
//   description only -> "... has description but no name"
//   activity only    -> "... has function but no name"
//   EC number only   -> "... has EC number but no name"
//   nothing left     -> "ProtRefHasNoData" plus "Protein feature has no name"
// NOTE(review): the declarations of entry, prot_feat, seh, validator, options,
// eval and expected_errors fall in lines this listing skips; confirm the setup
// against the complete source.
15840 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_NoNameForProtein)
15841 {
15844  prot_feat->SetData().SetProt().ResetName();
15845  prot_feat->SetData().SetProt().SetDesc("protein description");
15847 
15848  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "NoNameForProtein",
15849  "Protein feature has description but no name"));
15850  // AddChromosomeNoLocation(expected_errors, entry);
15851  eval = validator.Validate(seh, options);
15852  CheckErrors(*eval, expected_errors);
15853 
       // Swap the description for an activity; only the message text changes.
15854  prot_feat->SetData().SetProt().ResetDesc();
15855  prot_feat->SetData().SetProt().SetActivity().push_back("activity");
15856  expected_errors[0]->SetErrMsg("Protein feature has function but no name");
15857  eval = validator.Validate(seh, options);
15858  CheckErrors(*eval, expected_errors);
15859 
       // Swap the activity for an EC number.
15860  prot_feat->SetData().SetProt().ResetActivity();
15861  prot_feat->SetData().SetProt().SetEc().push_back("1.2.3.4");
15862  expected_errors[0]->SetErrMsg("Protein feature has EC number but no name");
15863  eval = validator.Validate(seh, options);
15864  CheckErrors(*eval, expected_errors);
15865 
       // Final step: with the EC number removed as well, two warnings are expected.
15866  CLEAR_ERRORS
15867  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "ProtRefHasNoData",
15868  "There is a protein feature where all fields are empty"));
15869  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "NoNameForProtein",
15870  "Protein feature has no name"));
15871  // AddChromosomeNoLocation(expected_errors, entry);
15872 
15873  prot_feat->SetData().SetProt().ResetEc();
15874  eval = validator.Validate(seh, options);
15875  CheckErrors(*eval, expected_errors);
15876 
15877  CLEAR_ERRORS
15878 }
15879 
// Test case: adds a gene ending at position 40 and two mRNA features
// ("product 1", and "product 2" ending at position 40) to the nucleotide.
// Three warnings are expected: "CDSmRNAmismatchCount",
// "CDSwithMultipleMRNAs" and "CDSmRNAMismatchLocation".
// NOTE(review): the declarations of entry, nseq, gene, mrna1, mrna2, seh,
// validator, options, eval and expected_errors fall in lines this listing
// skips; confirm the setup against the complete source.
15880 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSmRNAmismatch)
15881 {
15886  gene->SetLocation().SetInt().SetTo(40);
15887  unit_test_util::AddFeat(gene, nseq);
15889  mrna1->SetData().SetRna().SetExt().SetName("product 1");
15890  unit_test_util::AddFeat(mrna1, nseq);
15891 
       // The second mRNA gets a different end point from the first.
15893  mrna2->SetData().SetRna().SetExt().SetName("product 2");
15894  mrna2->SetLocation().SetInt().SetTo(40);
15895  unit_test_util::AddFeat(mrna2, nseq);
15896 
15898  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSmRNAmismatchCount",
15899  "mRNA count (2) does not match CDS (1) count for gene"));
15900  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSwithMultipleMRNAs",
15901  "CDS matches 2 mRNAs"));
15902  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSmRNAMismatchLocation",
15903  "No CDS location match for 1 mRNA"));
15904  // AddChromosomeNoLocation(expected_errors, entry);
15905 
15906  eval = validator.Validate(seh, options);
15907  CheckErrors(*eval, expected_errors);
15908 
15909  CLEAR_ERRORS
15910 }
15911 
15912 
// Test case for "UnnecessaryException" in two parts.
//   Part 1: the CDS carries "RNA editing" and the mRNA carries "transcribed
//           product replaced". One warning is expected for each, on lcl|good.
//   Part 2: the CDS, mRNA and exon all carry "artificial frameshift" on a mix
//           location. With eVal_val_exons added to the options, three
//           identical splice-site warnings are expected on lcl|nuc.
// In both parts DoesFeatureHaveUnnecessaryException() is checked to return
// true for every feature involved.
// NOTE(review): the declarations of entry, cds, mrna, nuc_seq, exon, scope,
// seh, validator, options, eval and expected_errors fall in lines this listing
// skips (including the lines between 15937 and 15942, where the entry appears
// to be rebuilt); confirm the setup against the complete source.
15913 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UnnecessaryException)
15914 {
15918  cds->SetExcept(true);
15919  cds->SetExcept_text("RNA editing");
15920  mrna->SetExcept(true);
15921  mrna->SetExcept_text("transcribed product replaced");
15922 
15924  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnnecessaryException",
15925  "CDS has exception but passes translation test"));
15926  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnnecessaryException",
15927  "mRNA has exception but passes transcription test"));
15928  // AddChromosomeNoLocation(expected_errors, entry);
15929 
15930  eval = validator.Validate(seh, options);
15931  CheckErrors(*eval, expected_errors);
15932  BOOST_CHECK_EQUAL(DoesFeatureHaveUnnecessaryException(*cds, scope), true);
15933  BOOST_CHECK_EQUAL(DoesFeatureHaveUnnecessaryException(*mrna, scope), true);
15934 
15935  CLEAR_ERRORS
15936 
       // Part 2 starts here: the previous entry is removed from the scope.
15937  scope.RemoveTopLevelSeqEntry(seh);
       // Give the CDS a multi-interval (mix) location and derive the mRNA from it.
15942  cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc_seq->SetSeq().SetId().front()));
15943  mrna = unit_test_util::MakemRNAForCDS(cds);
15944  unit_test_util::AddFeat (mrna, nuc_seq);
       // The exon covers the first interval of the CDS mix location.
15946  exon->SetData().SetImp().SetKey("exon");
15947  exon->SetLocation().Assign(*(cds->SetLocation().SetMix().Set().front()));
15948  cds->SetExcept(true);
15949  cds->SetExcept_text("artificial frameshift");
15950  mrna->SetExcept(true);
15951  mrna->SetExcept_text("artificial frameshift");
15952  exon->SetExcept(true);
15953  exon->SetExcept_text("artificial frameshift");
15954  seh = scope.AddTopLevelSeqEntry(*entry);
15955 
       // One identical warning per feature (CDS, mRNA, exon).
15956  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
15957  "feature has exception but passes splice site test"));
15958  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
15959  "feature has exception but passes splice site test"));
15960  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
15961  "feature has exception but passes splice site test"));
15962  // AddChromosomeNoLocation(expected_errors, entry);
15963 
       // Exon validation is switched on in addition to the existing options.
15964  options |= CValidator::eVal_val_exons;
15965  eval = validator.Validate(seh, options);
15966  CheckErrors(*eval, expected_errors);
15967 
15968  BOOST_CHECK_EQUAL(DoesFeatureHaveUnnecessaryException(*cds, scope), true);
15969  BOOST_CHECK_EQUAL(DoesFeatureHaveUnnecessaryException(*mrna, scope), true);
15970  BOOST_CHECK_EQUAL(DoesFeatureHaveUnnecessaryException(*exon, scope), true);
15971  CLEAR_ERRORS
15972 }
15973 
15974 
// Test case: builds a general Seq-id (db "a", tag "good"), adds a local id
// "x" to the protein, and sets the gene locus_tag to "something". The
// expected finding is an Error-level "LocusTagProductMismatch" on
// ref|NC_123456|.
// NOTE(review): where the general id is attached and how the NC_123456
// accession is assigned are not visible -- those statements, and the
// declarations of entry, prot, gene, seh, validator, options, eval and
// expected_errors, fall in lines this listing skips. Confirm against the
// complete source.
15975 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_LocusTagProductMismatch)
15976 {
15981  CRef<CSeq_id> id(new CSeq_id());
15982  id->SetGeneral().SetDb("a");
15983  id->SetGeneral().SetTag().SetStr("good");
15985  CRef<CSeq_id> lcl_id(new CSeq_id());
15986  lcl_id->SetLocal().SetStr("x");
15987  prot->SetSeq().SetId().push_back(lcl_id);
15988 
15991 
       // The locus_tag deliberately differs from the general id tag "good".
15993  gene->SetData().SetGene().SetLocus_tag("something");
15995 
15997 
15999  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "LocusTagProductMismatch",
16000  "Gene locus_tag does not match general ID of product"));
16001  // AddChromosomeNoLocation(expected_errors, entry);
16002 
16003  eval = validator.Validate(seh, options);
16004  CheckErrors(*eval, expected_errors);
16005  CLEAR_ERRORS
16006 }
16007 
16008 
// Test case: marks the gene as pseudo and expects an Error-level
// "PseudoCdsViaGeneHasProduct" on lcl|nuc.
// NOTE(review): the declarations of entry, gene, seh, validator, options, eval
// and expected_errors, and the statement that places the gene over the coding
// region, fall in lines this listing skips; confirm against the complete source.
16009 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PseudoCdsViaGeneHasProduct)
16010 {
16015  gene->SetPseudo(true);
16017 
16019 
16020  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PseudoCdsViaGeneHasProduct",
16021  "A coding region overlapped by a pseudogene should not have a product"));
16022  // AddChromosomeNoLocation(expected_errors, entry);
16023 
16024  eval = validator.Validate(seh, options);
16025  CheckErrors(*eval, expected_errors);
16026  CLEAR_ERRORS
16027 }
16028 
16029 
// Test case: a misc feature starting at position 5 is accompanied by two gene
// features, "first" (starting at 0) and "second" (ending 5 past the end of the
// misc feature). The expected finding is a Warning-level "MissingGeneXref" on
// lcl|good.
// NOTE(review): the declarations of entry, misc, gene1, gene2, seh, validator,
// options, eval and expected_errors fall in lines this listing skips, so the
// genes' other end points are not visible here; confirm against the complete
// source.
16030 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MissingGeneXref)
16031 {
16034  misc->SetLocation().SetInt().SetFrom(5);
16035 
16037  gene1->SetData().SetGene().SetLocus("first");
16038  gene1->SetLocation().SetInt().SetFrom(0);
16039  unit_test_util::AddFeat(gene1, entry);
16041  gene2->SetData().SetGene().SetLocus("second");
       // gene2's end is derived from the misc feature's end plus 5.
16042  gene2->SetLocation().SetInt().SetTo(misc->GetLocation().GetInt().GetTo() + 5);
16043  unit_test_util::AddFeat(gene2, entry);
16044 
16046 
16047  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingGeneXref",
16048  "Feature overlapped by 2 identical-length genes but has no cross-reference"));
16049  // AddChromosomeNoLocation(expected_errors, entry);
16050 
16051  eval = validator.Validate(seh, options);
16052  CheckErrors(*eval, expected_errors);
16053  CLEAR_ERRORS
16054 }
16055 
16056 
// Test case: attaches a publication citation to the misc feature and expects
// a Warning-level "FeatureCitationProblem" on lcl|good.
// NOTE(review): the expected message mentions uid [2], so the skipped line
// between 16061 and 16063 presumably sets that id on `pub` -- confirm. The
// declarations of entry, misc, seh, validator, options, eval and
// expected_errors also fall in lines this listing skips.
16057 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatureCitationProblem)
16058 {
16061  CRef<CPub> pub(new CPub());
16063  misc->SetCit().SetPub().push_back(pub);
16064 
16066 
16067  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatureCitationProblem",
16068  "Citation on feature refers to uid [2] not on a publication in the record"));
16069  // AddChromosomeNoLocation(expected_errors, entry);
16070 
16071  eval = validator.Validate(seh, options);
16072  CheckErrors(*eval, expected_errors);
16073  CLEAR_ERRORS
16074 }
16075 
16076 
// Test case: gives the misc feature a mix location whose second element is
// itself a mix, i.e. [0..10, [20..30, 40..50]] on local id "good", and copies
// that location into the feature's product. Four findings are expected:
// "NestedSeqLocMix" for the location and for the product,
// "SelfReferentialProduct", and an Info-level "ProductShouldBeWhole".
// NOTE(review): the declarations of entry, misc, seh, validator, options, eval
// and expected_errors fall in lines this listing skips; confirm the setup
// against the complete source.
16077 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_NestedSeqLocMix)
16078 {
       // Three plain intervals on the same sequence.
16081  CRef<CSeq_loc> loc1(new CSeq_loc());
16082  loc1->SetInt().SetId().SetLocal().SetStr("good");
16083  loc1->SetInt().SetFrom(0);
16084  loc1->SetInt().SetTo(10);
16085  CRef<CSeq_loc> loc2(new CSeq_loc());
16086  loc2->SetInt().SetId().SetLocal().SetStr("good");
16087  loc2->SetInt().SetFrom(20);
16088  loc2->SetInt().SetTo(30);
16089  CRef<CSeq_loc> loc3(new CSeq_loc());
16090  loc3->SetInt().SetId().SetLocal().SetStr("good");
16091  loc3->SetInt().SetFrom(40);
16092  loc3->SetInt().SetTo(50);
       // Inner mix holding the second and third intervals.
16093  CRef<CSeq_loc> loc4(new CSeq_loc());
16094  loc4->SetMix().Set().push_back(loc2);
16095  loc4->SetMix().Set().push_back(loc3);
16096 
       // Outer mix = first interval + inner mix; this is the nesting under test.
16097  misc->SetLocation().SetMix().Set().push_back(loc1);
16098  misc->SetLocation().SetMix().Set().push_back(loc4);
16099  misc->SetProduct().Assign(misc->SetLocation());
16100 
16102 
16103  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NestedSeqLocMix",
16104  "Location: SeqLoc [[lcl|good:1-11, [21-31, 41-51]]] has nested SEQLOC_MIX elements"));
16105  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NestedSeqLocMix",
16106  "Product: SeqLoc [[lcl|good:1-11, [21-31, 41-51]]] has nested SEQLOC_MIX elements"));
16107  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SelfReferentialProduct",
16108  "Self-referential feature product"));
16109  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "ProductShouldBeWhole",
16110  "Feature products should be entire sequences."));
16111  // AddChromosomeNoLocation(expected_errors, entry);
16112  eval = validator.Validate(seh, options);
16113  CheckErrors(*eval, expected_errors);
16114  CLEAR_ERRORS
16115 }
16116 
16117 
// Test case: adds a "codon" qualifier (value "1") to the CDS and expects an
// Error-level "CodonQualifierUsed" on lcl|nuc.
// NOTE(review): the declarations of entry, cds, seh, validator, options, eval
// and expected_errors fall in lines this listing skips; confirm the setup
// against the complete source.
16118 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CodonQualifierUsed)
16119 {
16122  cds->AddQualifier("codon", "1");
16123 
16125 
16126  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "CodonQualifierUsed",
16127  "Use the proper genetic code, if available, or set transl_excepts on specific codons"));
16128 
16129  // AddChromosomeNoLocation(expected_errors, entry);
16130  eval = validator.Validate(seh, options);
16131  CheckErrors(*eval, expected_errors);
16132  CLEAR_ERRORS
16133 }
16134 
16135 
// Test case: adds a publication descriptor whose article author has the first
// name "F1rst" and expects a Warning-level "BadCharInAuthorName" on lcl|good.
// NOTE(review): the declarations of entry, pub, auth, seh, validator, options,
// eval and expected_errors fall in lines this listing skips; confirm the setup
// against the complete source.
16136 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadCharInAuthorName)
16137 {
16139  CRef<CSeqdesc> desc(new CSeqdesc());
16142  auth->SetName().SetName().SetFirst("F1rst");
       // Author -> article -> pub descriptor -> sequence descriptors.
16143  pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(auth);
16144  desc->SetPub().SetPub().Set().push_back(pub);
16145  entry->SetSeq().SetDescr().Set().push_back(desc);
16146 
16148 
16149  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadCharInAuthorName",
16150  "Bad characters in author F1rst"));
16151  // AddChromosomeNoLocation(expected_errors, entry);
16152 
16153  eval = validator.Validate(seh, options);
16154  CheckErrors(*eval, expected_errors);
16155  CLEAR_ERRORS
16156 }
16157 
// Test case: shortens the mRNA feature on the contig so that it ends at
// position 25. Besides three CDS/mRNA mismatch warnings, an Info-level
// "PolyATail" is expected. The transcript sequence is then replaced with a
// 46-base sequence and only the PolyATail message is expected to change
// ("tail >= 95% polyA").
// NOTE(review): the declarations of entry, transcript, scope, seh, validator,
// options, eval and expected_errors fall in lines this listing skips; confirm
// the setup against the complete source.
16158 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PolyATail)
16159 {
       // The contig is the first entry of the set; the mRNA is the last feature
       // of its first annotation.
16161  CRef<CSeq_entry> contig = entry->SetSet().SetSeq_set().front();
16162  CRef<CSeq_feat> mrna = contig->SetSeq().SetAnnot().front()->SetData().SetFtable().back();
16163  mrna->SetLocation().SetInt().SetTo(25);
16164 
16166 
16167  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSwithNoMRNA",
16168  "Unmatched CDS"));
16169  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
16170  "No CDS location match for 1 mRNA"));
16171  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNArange",
16172  "mRNA overlaps or contains CDS but does not completely contain intervals"));
16173  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "PolyATail",
16174  "Transcript length [26] less than product length [27], but tail is 100% polyA"));
16175  // AddChromosomeNoLocation(expected_errors, entry);
16176  eval = validator.Validate(seh, options);
16177  CheckErrors(*eval, expected_errors);
16178 
       // Second scenario: swap in a longer transcript while the entry is out of
       // the scope, then re-add it.
16179  scope.RemoveTopLevelSeqEntry(seh);
16182  transcript->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATAAACTAAAAAAAAAAAAAAAAAATAA");
16183  transcript->SetSeq().SetInst().SetLength(46);
16184  seh = scope.AddTopLevelSeqEntry(*entry);
       // Index 3 is the PolyATail entry pushed above; the others stay unchanged.
16185  expected_errors[3]->SetErrMsg("Transcript length [26] less than product length [46], but tail >= 95% polyA");
16186  eval = validator.Validate(seh, options);
16187  CheckErrors(*eval, expected_errors);
16188  CLEAR_ERRORS
16189 }
16190 
// Test case in two parts.
//   Part 1: a second mRNA whose product is the same local id "nuc" is added
//           to the genomic sequence. Five findings are expected, including a
//           Critical "IdenticalMRNAtranscriptIDs".
//   Part 2: the second mRNA's product is pointed at a new id "nuc2" and an
//           extra entry `np` is appended to the set. "CDSwithMultipleMRNAs"
//           is then expected at Info level with the "product locations are
//           unique" wording, alongside three other warnings.
// NOTE(review): the declarations of entry, genomic, second_mrna, np, scope,
// seh, validator, options, eval and expected_errors fall in lines this listing
// skips. prot_id is created below but its use is not visible here (presumably
// when building np) -- confirm against the complete source.
16191 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSwithMultipleMRNAs)
16192 {
16197  second_mrna->SetProduct().SetWhole().SetLocal().SetStr("nuc");
16198  unit_test_util::AddFeat(second_mrna, genomic);
16199 
16201  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatureProductInconsistency",
16202  "mRNA products are not unique"));
16203  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSwithMultipleMRNAs",
16204  "CDS matches 2 mRNAs"));
16205  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
16206  "No CDS location match for 1 mRNA"));
16207  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatContentDup",
16208  "Duplicate feature"));
16209  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "IdenticalMRNAtranscriptIDs",
16210  "Identical transcript IDs found on multiple mRNAs"));
16211  // AddChromosomeNoLocation(expected_errors, entry);
16212 
16213  eval = validator.Validate(seh, options);
16214  CheckErrors(*eval, expected_errors);
16215  // now try with unique products
16216  scope.RemoveTopLevelSeqEntry(seh);
16217  CRef<CSeq_id> nuc_id(new CSeq_id());
16218  nuc_id->SetLocal().SetStr("nuc2");
16219  CRef<CSeq_id> prot_id(new CSeq_id());
16220  prot_id->SetLocal().SetStr("prot2");
16222  entry->SetSet().SetSeq_set().push_back(np);
       // Re-target the second mRNA so the two mRNA products differ.
16223  second_mrna->SetProduct().SetWhole().Assign(*nuc_id);
16224  seh = scope.AddTopLevelSeqEntry(*entry);
16225 
16226  CLEAR_ERRORS
16227 
16228  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "CDSwithMultipleMRNAs",
16229  "CDS matches 2 mRNAs, but product locations are unique"));
16230  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
16231  "No CDS location match for 1 mRNA"));
16232  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatContentDup", "Duplicate feature"));
16233  expected_errors.push_back(new CExpectedError("lcl|prot2", eDiag_Warning, "GenomicProductPackagingProblem",
16234  "Protein bioseq should be product of CDS feature on contig, but is not"));
16235  // AddChromosomeNoLocation(expected_errors, entry);
16236 
16237  eval = validator.Validate(seh, options);
16238  CheckErrors(*eval, expected_errors);
16239  CLEAR_ERRORS
16240 }
16241 
16242 
// Helper shared by the MultipleEquivBioSources test case: configures two
// "Homo sapiens" source features with the same lineage and validates.
//   lineage      - lineage string assigned to both source features
//   first_end    - end position of the first source feature
//   second_start - start position of the second source feature, which ends at
//                  second_start + 9
//   expected     - when true, a Warning-level "MultipleEquivBioSources" is
//                  expected; when false, the expected-error list is left as is
// NOTE(review): the declarations of entry, src1, src2, seh, validator,
// options, eval and expected_errors fall in lines this listing skips; confirm
// the setup against the complete source.
16243 void TestMultipleEquivBioSources(const string& lineage, TSeqPos first_end, TSeqPos second_start, bool expected)
16244 {
16247  src1->SetData().SetBiosrc().SetOrg().SetTaxname("Homo sapiens");
16248  src1->SetData().SetBiosrc().SetOrg().SetOrgname().SetLineage(lineage);
16249  src1->SetLocation().SetInt().SetTo(first_end);
16251  src2->SetData().SetBiosrc().SetOrg().SetTaxname("Homo sapiens");
16252  src2->SetData().SetBiosrc().SetOrg().SetOrgname().SetLineage(lineage);
16253  src2->SetLocation().SetInt().SetFrom(second_start);
16254  src2->SetLocation().SetInt().SetTo(second_start + 9);
16255  unit_test_util::SetTransgenic(entry, true);
16256 
16258 
16259  if (expected) {
16260  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleEquivBioSources",
16261  "Multiple equivalent source features should be combined into one multi-interval feature"));
16262  }
16263  // AddChromosomeNoLocation(expected_errors, entry);
16264 
16266  eval = validator.Validate(seh, options);
16267  CheckErrors(*eval, expected_errors);
16268 
16269  CLEAR_ERRORS
16270 }
16271 
16272 
// Test case: drives TestMultipleEquivBioSources() over separated, abutting and
// overlapping placements of the two source features, first with a generic
// lineage and then with the lineage "Viruses", for which the warning is never
// expected.
16273 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MultipleEquivBioSources)
16274 {
16275  // not expected, because not overlapping or abutting
16276  TestMultipleEquivBioSources("some lineage", 10, 15, false);
16277  // abutting, expected
16278  TestMultipleEquivBioSources("some lineage", 10, 11, true);
16279  // overlap, expected
16280  TestMultipleEquivBioSources("some lineage", 10, 8, true);
16281 
16282  // not expected for viruses
16283  TestMultipleEquivBioSources("Viruses", 10, 15, false);
16284  TestMultipleEquivBioSources("Viruses", 10, 11, false);
16285  TestMultipleEquivBioSources("Viruses", 10, 8, false);
16286 
16287 }
16288 
16289 
// Test case: two publication features each carry a pub with PMID 2; the
// second one is placed at 30..40. The expected finding is a Warning-level
// "MultipleEquivPublications" on lcl|good.
// NOTE(review): the declarations of entry, feat1, feat2, seh, validator,
// options, eval and expected_errors fall in lines this listing skips, so
// feat1's location is not visible here; confirm against the complete source.
16290 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MultipleEquivPublications)
16291 {
16292 
16293 
16296  CRef<CPub> pub1(new CPub());
16297  pub1->SetPmid(CPub::TPmid(ENTREZ_ID_CONST(2)));
16298  feat1->SetData().SetPub().SetPub().Set().push_back(pub1);
       // Second feature: same PMID, different interval.
16300  CRef<CPub> pub2(new CPub());
16301  pub2->SetPmid(CPub::TPmid(ENTREZ_ID_CONST(2)));
16302  feat2->SetData().SetPub().SetPub().Set().push_back(pub2);
16303  feat2->SetLocation().SetInt().SetFrom(30);
16304  feat2->SetLocation().SetInt().SetTo(40);
16305 
16307  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleEquivPublications",
16308  "Multiple equivalent publication features should be combined into one multi-interval feature"));
16309  // AddChromosomeNoLocation(expected_errors, entry);
16310 
16311  eval = validator.Validate(seh, options);
16312  CheckErrors(*eval, expected_errors);
16313 
16314  CLEAR_ERRORS
16315 }
16316 
16317 
// Test case in two parts; every feature involved ends at the last base of the
// sequence (length - 1).
//   Part 1: one source feature and one publication feature. Only the
//           publication "BadFullLengthFeature" warning is expected; the
//           source-feature variant is commented out.
//   Part 2: a second source feature ("Drosophila melanogaster") is added.
//           "DuplicateFeat", the "Multiple full-length source features"
//           warning and the publication warning are expected.
// NOTE(review): the declarations of entry, src1, src2, feat1, scope, seh,
// validator, options, eval and expected_errors fall in lines this listing
// skips; confirm the setup against the complete source.
16318 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadFullLengthFeature)
16319 {
16322  src1->SetData().SetBiosrc().SetOrg().SetTaxname("Homo sapiens");
16323  src1->SetData().SetBiosrc().SetOrg().SetOrgname().SetLineage("some lineage");
16324  src1->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength() - 1);
16326  CRef<CPub> pub1(new CPub());
16327  pub1->SetPmid(CPub::TPmid(ENTREZ_ID_CONST(2)));
16328  feat1->SetData().SetPub().SetPub().Set().push_back(pub1);
16329  feat1->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength() - 1);
16330  unit_test_util::SetTransgenic(entry, true);
16331 
16333  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadFullLengthFeature",
16334  // "Source feature is full length, should be descriptor"));
16335  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadFullLengthFeature",
16336  "Publication feature is full length, should be descriptor"));
16337  // AddChromosomeNoLocation(expected_errors, entry);
16339  eval = validator.Validate(seh, options);
16340  CheckErrors(*eval, expected_errors);
16341 
16342  CLEAR_ERRORS
16343 
       // Part 2: add a second full-length source feature with a different taxname.
16344  scope.RemoveTopLevelSeqEntry(seh);
16346  src2->SetData().SetBiosrc().SetOrg().SetTaxname("Drosophila melanogaster");
16347  src2->SetData().SetBiosrc().SetOrg().SetOrgname().SetLineage("some lineage");
16348  src2->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength() - 1);
16349  seh = scope.AddTopLevelSeqEntry(*entry);
16350 
16351  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "DuplicateFeat",
16352  "Features have identical intervals, but labels differ"));
16353  // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadFullLengthFeature",
16354  // "Source feature is full length, should be descriptor"));
16355  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadFullLengthFeature",
16356  "Multiple full-length source features, should only be one if descriptor is transgenic"));
16357  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadFullLengthFeature",
16358  "Publication feature is full length, should be descriptor"));
16359  // AddChromosomeNoLocation(expected_errors, entry);
16360  eval = validator.Validate(seh, options);
16361  CheckErrors(*eval, expected_errors);
16362 
16363  CLEAR_ERRORS
16364 }
16365 
16366 
// Test case covering four "RedundantFields" scenarios:
//   1. gene comment equals gene locus
//   2. gene comment equals gene locus_tag
//   3. old_locus_tag qualifier equals locus_tag (also yields LocusTagProblem)
//   4. protein comment equals both protein name and protein description
// The expected-error list is cleared between scenarios.
// NOTE(review): the declarations of entry, gene, prot, seh, validator,
// options, eval and expected_errors fall in lines this listing skips; confirm
// the setup against the complete source.
16367 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_RedundantFields)
16368 {
16374  gene->SetData().SetGene().SetLocus("redundant_g");
16375  gene->SetComment("redundant_g");
16376 
16378 
16379  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "RedundantFields",
16380  "Comment has same value as gene locus"));
16381  // AddChromosomeNoLocation(expected_errors, entry);
16382  eval = validator.Validate(seh, options);
16383  CheckErrors(*eval, expected_errors);
16384 
       // Scenario 2: move the value from locus to locus_tag.
16385  CLEAR_ERRORS
16386  gene->SetData().SetGene().ResetLocus();
16387  gene->SetData().SetGene().SetLocus_tag("redundant_g");
16388  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "RedundantFields",
16389  "Comment has same value as gene locus_tag"));
16390  // AddChromosomeNoLocation(expected_errors, entry);
16391  eval = validator.Validate(seh, options);
16392  CheckErrors(*eval, expected_errors);
16393 
16394  CLEAR_ERRORS
16395 
       // Scenario 3: drop the comment, duplicate the locus_tag as old_locus_tag.
16396  gene->ResetComment();
16397  gene->AddQualifier("old_locus_tag", "redundant_g");
16398  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "RedundantFields",
16399  "old_locus_tag has same value as gene locus_tag"));
16400  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "LocusTagProblem",
16401  "Gene locus_tag and old_locus_tag 'redundant_g' match"));
16402  // AddChromosomeNoLocation(expected_errors, entry);
16403 
16404  eval = validator.Validate(seh, options);
16405  CheckErrors(*eval, expected_errors);
16406 
16407  CLEAR_ERRORS
16408 
16409  gene->ResetQual();
16410 
       // Scenario 4: protein name, comment and description all share one value.
16412  prot->SetData().SetProt().SetName().front().assign("redundant_p");
16413  prot->SetComment("redundant_p");
16414  prot->SetData().SetProt().SetDesc("redundant_p");
16415 
16416  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "RedundantFields",
16417  "Comment has same value as protein name"));
16418  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "RedundantFields",
16419  "Comment has same value as protein description"));
16420  // AddChromosomeNoLocation(expected_errors, entry);
16421  eval = validator.Validate(seh, options);
16422  CheckErrors(*eval, expected_errors);
16423 
16424  CLEAR_ERRORS
16425 }
16426 
16427 
// Adds one coding region and one protein entry to a nuc-prot set.
//   entry   - set entry that receives the CDS feature and the new protein entry
//   nuc_id  - local id string of the nucleotide the CDS is located on
//   prot_id - local id string used for the CDS product
//   offset  - start of the CDS; the interval is [offset, offset + 26]
// NOTE(review): the statement that builds `pentry` falls in a line this
// listing skips, so how prot_id is applied to the protein entry is not visible
// here; confirm against the complete source.
16428 static void AddCDSAndProtForBigGoodNucProtSet(CRef<CSeq_entry> entry, string nuc_id, string prot_id, TSeqPos offset)
16429 {
16430  CRef<CSeq_feat> cds(new CSeq_feat());
16431  cds->SetData().SetCdregion();
16432  cds->SetProduct().SetWhole().SetLocal().SetStr(prot_id);
16433  cds->SetLocation().SetInt().SetId().SetLocal().SetStr(nuc_id);
16434  cds->SetLocation().SetInt().SetFrom(offset + 0);
16435  cds->SetLocation().SetInt().SetTo(offset + 26);
16436  unit_test_util::AddFeat(cds, entry);
16437 
16439 
16440  entry->SetSet().SetSeq_set().push_back(pentry);
16441 
16442 }
16443 
16444 
16446 {
16448  set->SetClass(CBioseq_set::eClass_nuc_prot);
16449 
16450  // make nucleotide
16451  CRef<CBioseq> nseq(new CBioseq());
16452  nseq->SetInst().SetMol(CSeq_inst::eMol_dna);
16453  nseq->SetInst().SetRepr(CSeq_inst::eRepr_raw);
16454  nseq->SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
16455  nseq->SetInst().SetLength(360);
16456 
16457  CRef<CSeq_id> id(new CSeq_id());
16458  id->SetLocal().SetStr("nuc");
16459  nseq->SetId().push_back(id);
16460 
16461  CRef<CSeqdesc> mdesc(new CSeqdesc());
16463  nseq->SetDescr().Set().push_back(mdesc);
16464 
16465  CRef<CSeq_entry> nentry(new CSeq_entry());
16466  nentry->SetSeq(*nseq);
16467 
16468  set->SetSeq_set().push_back(nentry);
16469 
16470  CRef<CSeq_entry> set_entry(new CSeq_entry());
16471  set_entry->SetSet(*set);
16472 
16473  int i = 1;
16474  for (TSeqPos offset = 0; offset < nseq->GetInst().GetLength() - 26; offset += 30, i++) {
16475  string prot_id = "prot" + NStr::IntToString(i);
16476  AddCDSAndProtForBigGoodNucProtSet(set_entry, "nuc", prot_id, offset);
16477  }
16478 
16479  unit_test_util::AddGoodSource(set_entry);
16480  unit_test_util::AddGoodPub(set_entry);
16481  return set_entry;
16482 }
16483 
16484 
// Exercises the CDSwithNoMRNA warning in two steps: first a single summary
// message ("11 out of 12 CDSs unmatched") is expected, then, after three more
// CDS features are given an mRNA, eight individual "Unmatched CDS" warnings.
16485 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSwithNoMRNA)
16486 {
16488  // make mRNA for first CDS
16490
// cds_it starts at the first feature of the set-level feature table.
16491  CSeq_annot::TData::TFtable::iterator cds_it = entry->SetSet().SetAnnot().front()->SetData().SetFtable().begin();
16492
16498
16500
16501  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSwithNoMRNA",
16502  "11 out of 12 CDSs unmatched"));
16503  // AddChromosomeNoLocation(expected_errors, entry);
16504
16505  eval = validator.Validate(seh, options);
16506  CheckErrors(*eval, expected_errors);
16507  CLEAR_ERRORS
// The entry is taken out of the scope while features are added and is
// re-added afterwards.
16508  scope.RemoveTopLevelSeqEntry(seh);
// Advance to each of the next three features and add an mRNA built from it.
16509  for (int i = 0; i < 3; i++) {
16510  ++cds_it;
16511  CRef<CSeq_feat> new_mrna = unit_test_util::MakemRNAForCDS(*cds_it);
16512  unit_test_util::AddFeat(new_mrna, nuc);
16513  }
16514  seh = scope.AddTopLevelSeqEntry(*entry);
// Eight separate warnings are now expected instead of one summary message.
16515  for (int i = 0; i < 8; i++) {
16516  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSwithNoMRNA",
16517  "Unmatched CDS"));
16518  }
16519
16520  // AddChromosomeNoLocation(expected_errors, entry);
16521  eval = validator.Validate(seh, options);
16522  CheckErrors(*eval, expected_errors);
16523
16524  CLEAR_ERRORS
16525 }
16526 
16527 
// Exercises the FeatureProductInconsistency warning in four scenarios:
//   1. two CDS features with only one product reference,
//   2. two CDS features that reference the same product ("prot"),
//   3. two mRNA features with only one product reference,
//   4. two mRNA features that reference the same product (AY123456).
// Each scenario also lists the other messages expected alongside it.
16528 BOOST_FIXTURE_TEST_CASE(Test_SEQ_FEAT_FeatureProductInconsistency, CGenBankFixture)
16529 {
// Scenario 1: a second coding region at 30..56 plus an mRNA built from it.
16536  bad_cds->SetData().SetCdregion();
16537  bad_cds->SetLocation().SetInt().SetFrom(30);
16538  bad_cds->SetLocation().SetInt().SetTo(56);
16539  CRef<CSeq_feat> bad_mrna = unit_test_util::MakemRNAForCDS(bad_cds);
16540  unit_test_util::AddFeat(bad_mrna, nuc);
16541
16543
16544  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "FeatureProductInconsistency",
16545  "2 CDS features have 1 product references"));
16546  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "MissingCDSproduct", "Expected CDS product absent"));
16547  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NoProtein", "No protein Bioseq given"));
16548  // AddChromosomeNoLocation(expected_errors, entry);
16549  eval = validator.Validate(seh, options);
16550  CheckErrors(*eval, expected_errors);
16551
16552  CLEAR_ERRORS
16553
// Scenario 2: give bad_cds the product "prot"; the expected messages then
// report non-unique CDS products.
16554  scope.RemoveTopLevelSeqEntry(seh);
16555  bad_cds->SetProduct().SetWhole().SetLocal().SetStr("prot");
16556  seh = scope.AddTopLevelSeqEntry(*entry);
16557
16558  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "FeatureProductInconsistency",
16559  "CDS products are not unique"));
16560  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "MultipleCDSproducts",
16561  "Same product Bioseq from multiple CDS features"));
16562  // AddChromosomeNoLocation(expected_errors, entry);
16563  eval = validator.Validate(seh, options);
16564  CheckErrors(*eval, expected_errors);
16565
16566  CLEAR_ERRORS
16567
// Scenario 3: drop the nucleotide's annotations, add a fresh CDS/protein
// pair at offset 30, and build two mRNAs of which only `mrna` gets a product
// (GenBank accession AY123456).
16568  scope.RemoveTopLevelSeqEntry(seh);
16569  nuc->SetSeq().ResetAnnot();
16570  AddCDSAndProtForBigGoodNucProtSet(entry, "nuc", "prot1", 30);
16571  bad_mrna = unit_test_util::MakemRNAForCDS(entry->SetSet().SetAnnot().front()->SetData().SetFtable().back());
16572  unit_test_util::AddFeat(bad_mrna, nuc);
16573  mrna = unit_test_util::MakemRNAForCDS(cds);
16574  mrna->SetProduct().SetWhole().SetGenbank().SetAccession("AY123456");
16576
16577  seh = scope.AddTopLevelSeqEntry(*entry);
16578  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "FeatureProductInconsistency",
16579  "2 mRNA features have 1 product references"));
16580  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranscriptLen",
16581  "Transcript length [27] less than (far) product length [485], and tail < 95% polyA"));
16582  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranscriptMismatches",
16583  "There are 16 mismatches out of 27 bases between the transcript and (far) product sequence"));
16584  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
16585  "Inconsistent: Product= partial, Location= complete, Feature.partial= FALSE"));
16586  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "RnaProductMismatch",
16587  "Type of RNA does not match MolInfo of product Bioseq"));
16588  // AddChromosomeNoLocation(expected_errors, entry);
16589
16590  eval = validator.Validate(seh, options);
16591  CheckErrors(*eval, expected_errors);
16592
16593  CLEAR_ERRORS
16594
// Scenario 4: bad_mrna gets the same accession as its product, so the
// product-related messages are expected once per mRNA, together with the
// non-unique-products and identical-transcript-ID messages.
16595  scope.RemoveTopLevelSeqEntry(seh);
16596  bad_mrna->SetProduct().SetWhole().SetGenbank().SetAccession("AY123456");
16597  seh = scope.AddTopLevelSeqEntry(*entry);
16598  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "FeatureProductInconsistency",
16599  "mRNA products are not unique"));
16600  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
16601  "Inconsistent: Product= partial, Location= complete, Feature.partial= FALSE"));
16602  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "IdenticalMRNAtranscriptIDs",
16603  "Identical transcript IDs found on multiple mRNAs"));
16604  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranscriptLen",
16605  "Transcript length [27] less than (far) product length [485], and tail < 95% polyA"));
16606  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranscriptMismatches",
16607  "There are 16 mismatches out of 27 bases between the transcript and (far) product sequence"));
16608  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "RnaProductMismatch",
16609  "Type of RNA does not match MolInfo of product Bioseq"));
16610  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranscriptLen",
16611  "Transcript length [27] less than (far) product length [485], and tail < 95% polyA"));
16612  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranscriptMismatches",
16613  "There are 16 mismatches out of 27 bases between the transcript and (far) product sequence"));
16614  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
16615  "Inconsistent: Product= partial, Location= complete, Feature.partial= FALSE"));
16616  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "RnaProductMismatch",
16617  "Type of RNA does not match MolInfo of product Bioseq"));
16618  // AddChromosomeNoLocation(expected_errors, entry);
16619
16620  eval = validator.Validate(seh, options);
16621  CheckErrors(*eval, expected_errors);
16622
16623  CLEAR_ERRORS
16624 }
16625 
16626 
// Gives a feature a bond location whose two points lie on the same locally
// identified sequence.
//   feat - feature whose location is set to a bond
//   id   - local id string used for both bond points
//   pt1  - position of bond point A
//   pt2  - position of bond point B
16627 static void SetFeatureLocationBond(CRef<CSeq_feat> feat, string id, TSeqPos pt1, TSeqPos pt2)
16628 {
16629  feat->SetLocation().SetBond().SetA().SetId().SetLocal().SetStr(id);
// Use the caller-supplied positions; fixed values here would silently ignore
// pt1 and pt2.
16630  feat->SetLocation().SetBond().SetA().SetPoint(pt1);
16631  feat->SetLocation().SetBond().SetB().SetId().SetLocal().SetStr(id);
16632  feat->SetLocation().SetBond().SetB().SetPoint(pt2);
16633 }
16634 
16635 
// Puts the same bond location (points 0 and 5 on "good") on four features
// f1..f4 and expects four StrandOther warnings but only three
// ImproperBondLocation warnings.
// NOTE(review): presumably f4, whose data is a bond, is the feature that does
// not trigger ImproperBondLocation - confirm.
16636 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ImproperBondLocation)
16637 {
16640  SetFeatureLocationBond(f1, "good", 0, 5);
16641
// f2: heterogen feature.
16643  f2->SetData().SetHet();
16644  SetFeatureLocationBond(f2, "good", 0, 5);
16645
// f3: pseudo coding region.
16647  f3->SetData().SetCdregion();
16648  f3->SetPseudo(true);
16649  SetFeatureLocationBond(f3, "good", 0, 5);
16650
// f4: bond feature.
16652  f4->SetData().SetBond();
16653  SetFeatureLocationBond(f4, "good", 0, 5);
16654
16656  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "StrandOther",
16657  "Strand 'other' in location"));
16658  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ImproperBondLocation",
16659  "Bond location should only be on bond features"));
16660  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "StrandOther",
16661  "Strand 'other' in location"));
16662  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ImproperBondLocation",
16663  "Bond location should only be on bond features"));
16664  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "StrandOther",
16665  "Strand 'other' in location"));
16666  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ImproperBondLocation",
16667  "Bond location should only be on bond features"));
16668  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "StrandOther",
16669  "Strand 'other' in location"));
16670  // AddChromosomeNoLocation(expected_errors, entry);
16671  eval = validator.Validate(seh, options);
16672  CheckErrors(*eval, expected_errors);
16673
16674  CLEAR_ERRORS
16675 }
16676 
16677 
// Exercises the GeneXrefWithoutGene warning for a gene xref given by locus
// and then by locus_tag; in both cases an OnlyGeneXrefs warning is expected
// as well.
16678 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_GeneXrefWithoutGene)
16679 {
// Case 1: the xref names the gene by locus.
16682  feat->SetGeneXref().SetLocus("missing");
16683
16685
16686  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GeneXrefWithoutGene",
16687  "Feature has gene locus cross-reference but no equivalent gene feature exists"));
16688  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OnlyGeneXrefs",
16689  "There are 1 gene xrefs and no gene features in this record."));
16690  // AddChromosomeNoLocation(expected_errors, entry);
16691  eval = validator.Validate(seh, options);
16692  CheckErrors(*eval, expected_errors);
16693  CLEAR_ERRORS
16694
// Case 2: the xref names the gene by locus_tag instead.
16695  feat->SetGeneXref().ResetLocus();
16696  feat->SetGeneXref().SetLocus_tag("missing");
16697
16698  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GeneXrefWithoutGene",
16699  "Feature has gene locus_tag cross-reference but no equivalent gene feature exists"));
16700  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OnlyGeneXrefs",
16701  "There are 1 gene xrefs and no gene features in this record."));
16702  // AddChromosomeNoLocation(expected_errors, entry);
16703  eval = validator.Validate(seh, options);
16704  CheckErrors(*eval, expected_errors);
16705  CLEAR_ERRORS
16706 }
16707 
16708 
16710 {
16711  f1.AddSeqFeatXref(f2.GetId());
16712  f2.AddSeqFeatXref(f1.GetId());
16713 }
16714 
16715 
// Exercises the SeqFeatXrefProblem warning. Four features with local ids 1-4
// (cds, mrna, gene, misc) are cross-referenced in different ways:
//   - an empty xref on the CDS,
//   - reciprocal links between CDS and misc_feature (reported),
//   - a one-directional link from the CDS to the misc_feature (reported),
//   - reciprocal CDS/mRNA, CDS/gene and mRNA/gene links (no messages expected),
//   - a gene xref on the CDS that disagrees with / agrees with the linked gene.
16716 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SeqFeatXrefProblem)
16717 {
16721
16722  // add ID to CDS
16723  cds->SetId().SetLocal().SetId(1);
16724
16725  // create mRNA feature
16727  mrna->SetId().SetLocal().SetId(2);
16729
16730  // create gene feature
16732  gene->SetId().SetLocal().SetId(3);
16734
16735  // add misc_feature
16737  misc->SetId().SetLocal().SetId(4);
16738
16740
16741  // add broken SeqFeatXref to coding region
16742  CRef<CSeqFeatXref> x1(new CSeqFeatXref());
16743  cds->SetXref().push_back(x1);
16744
16745  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefProblem",
16746  "SeqFeatXref with no id or data field"));
16747  // AddChromosomeNoLocation(expected_errors, entry);
16748  eval = validator.Validate(seh, options);
16749  CheckErrors(*eval, expected_errors);
16750  cds->ResetXref();
16751
16752  CLEAR_ERRORS
16753
16754  // xref between CDS and misc_feat is not allowed,
16755  // triggers error for non-ambiguous CDS/mRNA
16756  scope.RemoveTopLevelSeqEntry(seh);
16757  CreateReciprocalLinks(*cds, *misc);
16758  seh = scope.AddTopLevelSeqEntry(*entry);
16759
16760 // expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefNotReciprocal",
16761 // "CDS/mRNA unambiguous pair have erroneous cross-references"));
// One message is expected per direction of the CDS <-> misc_feature link.
16762  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefProblem",
16763  "Cross-references are not between CDS and mRNA pair or between a gene and a CDS or mRNA (misc_feature,CDS)"));
16764  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefProblem",
16765  "Cross-references are not between CDS and mRNA pair or between a gene and a CDS or mRNA (CDS,misc_feature)"));
16766  // AddChromosomeNoLocation(expected_errors, entry);
16767  eval = validator.Validate(seh, options);
16768  CheckErrors(*eval, expected_errors);
16769
16770  CLEAR_ERRORS
16771
16772  // complain if linked-to feature has no xrefs of its own
16773  scope.RemoveTopLevelSeqEntry(seh);
16774  misc->ResetXref();
16775  seh = scope.AddTopLevelSeqEntry(*entry);
16776 // expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefNotReciprocal",
16777 // "CDS/mRNA unambiguous pair have erroneous cross-references"));
16778  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefProblem",
16779  "Cross-referenced feature does not have its own cross-reference"));
16780  // AddChromosomeNoLocation(expected_errors, entry);
16781  eval = validator.Validate(seh, options);
16782  CheckErrors(*eval, expected_errors);
16783
16784  cds->ResetXref();
16785
16786  CLEAR_ERRORS
16787
// From here until the gene-xref mismatch case, expected_errors stays empty:
// each validation is expected to produce no messages.
16788  // create xref between mRNA and coding region - this is allowed
16789  scope.RemoveTopLevelSeqEntry(seh);
16790  CreateReciprocalLinks(*cds, *mrna);
16791  seh = scope.AddTopLevelSeqEntry(*entry);
16792
16793  // AddChromosomeNoLocation(expected_errors, entry);
16794  eval = validator.Validate(seh, options);
16795  CheckErrors(*eval, expected_errors);
16796
16797  // create xref between coding region and gene - this is allowed
16798  scope.RemoveTopLevelSeqEntry(seh);
16799  CreateReciprocalLinks(*cds, *gene);
16800  seh = scope.AddTopLevelSeqEntry(*entry);
16801
16802  eval = validator.Validate(seh, options);
16803  CheckErrors(*eval, expected_errors);
16804
16805  // create xref between mRNA and gene - this is allowed
16806  scope.RemoveTopLevelSeqEntry(seh);
16807  CreateReciprocalLinks(*mrna, *gene);
16808  seh = scope.AddTopLevelSeqEntry(*entry);
16809
16810  eval = validator.Validate(seh, options);
16811  CheckErrors(*eval, expected_errors);
16812
16813  // shouldn't matter what order the links are created in
16814  scope.RemoveTopLevelSeqEntry(seh);
16815  mrna->ResetXref();
16816  cds->ResetXref();
16817  gene->ResetXref();
16818  CreateReciprocalLinks(*cds, *gene);
16819  CreateReciprocalLinks(*mrna, *gene);
16820  CreateReciprocalLinks(*cds, *mrna);
16821  seh = scope.AddTopLevelSeqEntry(*entry);
16822
16823  eval = validator.Validate(seh, options);
16824  CheckErrors(*eval, expected_errors);
16825
16826  // if feature has gene xref AND a feature ID xref to a gene feature,
16827  // they should not conflict
16828  scope.RemoveTopLevelSeqEntry(seh);
16830  other_gene->SetData().SetGene().SetLocus("mismatch");
16831  // note that gene and other_gene cannot have the same location or will
16832  // trigger duplicate feature errors, gene xref gene should not be the
16833  // gene mapped to by overlap
// other_gene copies gene's location and is extended by one base at the end.
16834  other_gene->SetLocation().Assign(gene->GetLocation());
16835  other_gene->SetLocation().SetInt().SetTo(other_gene->GetLocation().GetInt().GetTo() + 1);
16836  seh = scope.AddTopLevelSeqEntry(*entry);
16837
16838  CRef<CSeqFeatXref> gene_xref(new CSeqFeatXref());
16839  gene_xref->SetData().SetGene().SetLocus("mismatch");
16840  cds->SetXref().push_back(gene_xref);
16841
16842  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefProblem",
16843  "Feature gene xref does not match Feature ID cross-referenced gene feature"));
16844  eval = validator.Validate(seh, options);
16845  CheckErrors(*eval, expected_errors);
16846
16847  CLEAR_ERRORS
16848
16849  // ignore if gene xref and linked gene feature match
16850  scope.RemoveTopLevelSeqEntry(seh);
16851  gene_xref->SetData().SetGene().SetLocus("gene locus");
// Now other_gene takes gene's location and gene is the one extended by a base.
16852  other_gene->SetLocation().Assign(gene->GetLocation());
16853  gene->SetLocation().SetInt().SetTo(gene->GetLocation().GetInt().GetTo() + 1);
16854  seh = scope.AddTopLevelSeqEntry(*entry);
16855
16856  // AddChromosomeNoLocation(expected_errors, entry);
16857  eval = validator.Validate(seh, options);
16858  CheckErrors(*eval, expected_errors);
16859
16860  CLEAR_ERRORS
16861 }
16862 
16863 
// Marks a feature as a tRNA without setting anything else on the RNA and
// expects the MissingTrnaAA warning.
16864 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MissingTrnaAA)
16865 {
16868  feat->SetData().SetRna().SetType(CRNA_ref::eType_tRNA);
16869
16871  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingTrnaAA",
16872  "Missing encoded amino acid qualifier in tRNA"));
16873  // AddChromosomeNoLocation(expected_errors, entry);
16874
16875  eval = validator.Validate(seh, options);
16876  CheckErrors(*eval, expected_errors);
16877
16878  CLEAR_ERRORS
16879 }
16880 
16881 
// Gives two features (feat and gene) the same local feature id 1 and expects
// the critical CollidingFeatureIDs message twice.
16882 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CollidingFeatureIDs)
16883 {
16886  feat->SetId().SetLocal().SetId(1);
16888  gene->SetId().SetLocal().SetId(1);
16889  unit_test_util::AddFeat(gene, entry);
16890
16892  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "CollidingFeatureIDs",
16893  "Colliding feature ID 1"));
16894  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "CollidingFeatureIDs",
16895  "Colliding feature ID 1"));
16896  // AddChromosomeNoLocation(expected_errors, entry);
16897  eval = validator.Validate(seh, options);
16898  CheckErrors(*eval, expected_errors);
16899
16900  CLEAR_ERRORS
16901 }
16902 
16903 
// Turns a feature into a polyA_signal whose interval ends at position 0 and
// expects the PolyAsignalNotRange warning.
16904 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PolyAsignalNotRange)
16905 {
16908  feat->SetData().SetImp().SetKey("polyA_signal");
// NOTE(review): presumably the interval starts at 0, making this a single
// position - the start is set outside the visible lines; confirm.
16909  feat->SetLocation().SetInt().SetTo(0);
16910
16912  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PolyAsignalNotRange",
16913  "PolyA_signal should be a range"));
16914  // AddChromosomeNoLocation(expected_errors, entry);
16915  eval = validator.Validate(seh, options);
16916  CheckErrors(*eval, expected_errors);
16917
16918  CLEAR_ERRORS
16919 }
16920 
16921 
// Puts different old_locus_tag qualifiers on a feature and on a gene and
// expects the OldLocusTagMismtach warning together with an
// OldLocusTagWithoutLocusTag error. The "Mismtach" spelling is the error
// name the test passes to CExpectedError and must be kept as is.
16922 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_OldLocusTagMismtach)
16923 {
16925
16927  feat->AddQualifier("old_locus_tag", "one value");
16928
16930  gene->AddQualifier("old_locus_tag", "another value");
16931  unit_test_util::AddFeat(gene, entry);
16932
16934
16935  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OldLocusTagMismtach",
16936  "Old locus tag on feature (one value) does not match that on gene (another value)"));
16937  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "OldLocusTagWithoutLocusTag",
16938  "old_locus_tag without inherited locus_tag"));
16939  // AddChromosomeNoLocation(expected_errors, entry);
16940  eval = validator.Validate(seh, options);
16941  CheckErrors(*eval, expected_errors);
16942
16943  CLEAR_ERRORS
16944 }
16945 
16946 
// Creates a user field labelled "a go term" for the GO-term tests.
//   text     - passed to SetGoTermText
//   evidence - passed to AddGoTermEvidence
// The id is always "123" and the PMID is always 4. The helper calls are kept
// in this order because they all modify the same field.
16947 static CRef<CUser_field> MakeGoTerm(string text = "something", string evidence = "some evidence")
16948 {
16949  CRef<CUser_field> term(new CUser_field());
16950  term->SetLabel().SetStr("a go term");
16951
16952  SetGoTermId(*term, "123");
16953
16954  SetGoTermPMID(*term, 4);
16955
16956  SetGoTermText(*term, text);
16957
16958  AddGoTermEvidence(*term, evidence);
16959
16960  return term;
16961 }
16962 
16963 
16965 {
16967  AddFeat(feat, entry);
16968 
16970 
16971  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "DuplicateGeneOntologyTerm",
16972  "Duplicate GO term on feature"));
16973  // AddChromosomeNoLocation(expected_errors, entry);
16974  eval = validator.Validate(seh, options);
16975  CheckErrors(*eval, expected_errors);
16976 
16977  CLEAR_ERRORS
16978 
16979  BOOST_CHECK_EQUAL(CountProcessGoTerms(*feat), (size_t)2);
16980  RemoveDuplicateGoTerms(*feat);
16981  BOOST_CHECK_EQUAL(CountProcessGoTerms(*feat), (size_t)1);
16982  RemoveDuplicateGoTerms(*feat);
16983  BOOST_CHECK_EQUAL(CountProcessGoTerms(*feat), (size_t)1);
16984 
16985 }
16986 
16987 
16989 {
16991  AddFeat(feat, entry);
16992 
16994 
16995  eval = validator.Validate(seh, options);
16996  // AddChromosomeNoLocation(expected_errors, entry);
16997  CheckErrors(*eval, expected_errors);
16998 
16999  CLEAR_ERRORS
17000 
17001  BOOST_CHECK_EQUAL(CountProcessGoTerms(*feat), (size_t)2);
17002  RemoveDuplicateGoTerms(*feat);
17003  BOOST_CHECK_EQUAL(CountProcessGoTerms(*feat), (size_t)2);
17004 }
17005 
17006 
17008 {
17009  CRef<CSeq_feat> feat(new CSeq_feat());
17010  feat->SetLocation().SetInt().SetFrom(0);
17011  feat->SetLocation().SetInt().SetTo(10);
17012  feat->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
17013  feat->SetData().SetImp().SetKey("misc_feature");
17014  feat->SetComment("comment is required");
17015 
17016  AddProcessGoTerm(*feat, term1);
17017  AddProcessGoTerm(*feat, term2);
17018 
17019  return feat;
17020 }
17021 
17022 
// Builds pairs of GO terms that are identical or differ in id, evidence or
// PMID, and wraps each pair in a feature via MakeGeneOntologyFeat.
// NOTE(review): the statements that check each feature sit on lines not
// shown here, so which pairs count as duplicates cannot be read off this
// block - confirm against those calls.
17023 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_DuplicateGeneOntologyTerm)
17024 {
17025
// Two terms built with the same default arguments.
17026  CRef<CUser_field> term1 = MakeGoTerm();
17027  CRef<CUser_field> term2 = MakeGoTerm();
17028  CRef<CSeq_feat> feat = MakeGeneOntologyFeat(term1, term2);
17029
17031
// Second term gets a different id.
17032  SetGoTermId(*term2, "234");
17033  feat = MakeGeneOntologyFeat(term1, term2);
17035
// Fresh default second term; evidence cleared on the first term only.
17036  term2 = MakeGoTerm();
17037  ClearGoTermEvidence(*term1);
17038  feat = MakeGeneOntologyFeat(term1, term2);
17040
// Evidence cleared on both terms.
17041  ClearGoTermEvidence(*term2);
17042  feat = MakeGeneOntologyFeat(term1, term2);
17044
// Evidence lists A,B versus C,B.
17045  AddGoTermEvidence(*term1, "A");
17046  AddGoTermEvidence(*term1, "B");
17047
17048  AddGoTermEvidence(*term2, "C");
17049  AddGoTermEvidence(*term2, "B");
17050  feat = MakeGeneOntologyFeat(term1, term2);
17052
// No evidence on either term; PMID cleared on the second term only.
17053  ClearGoTermEvidence(*term1);
17054  ClearGoTermEvidence(*term2);
17055  ClearGoTermPMID(*term2);
17056  feat = MakeGeneOntologyFeat(term1, term2);
17058
// PMID cleared on both terms.
17059  ClearGoTermPMID(*term1);
17060  feat = MakeGeneOntologyFeat(term1, term2);
17062
17063 }
17064 
17065 
// Exercises the InvalidInferenceValue warning. The value of the feature's
// first qualifier (an "inference" qualifier) is replaced repeatedly; for each
// malformed value the single expected warning has its message updated, and
// the last two values (an SRA accession and a GeneDB reference) are expected
// to produce no messages.
17066 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_InvalidInferenceValue)
17067 {
17070  feat->AddQualifier("inference", " ");
17071
17073
17074  feat->SetQual().front()->SetVal("bad");
17075  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InvalidInferenceValue",
17076  "Inference qualifier problem - bad inference prefix (bad)"));
17077  // AddChromosomeNoLocation(expected_errors, entry);
17078  eval = validator.Validate(seh, options);
17079  CheckErrors(*eval, expected_errors);
17080
// From here on the same expected-error object is reused; only its message
// changes.
17081  feat->SetQual().front()->SetVal("similar to sequence");
17082  expected_errors[0]->SetErrMsg("Inference qualifier problem - bad inference body (similar to sequence)");
17083  eval = validator.Validate(seh, options);
17084  CheckErrors(*eval, expected_errors);
17085
17086  feat->SetQual().front()->SetVal("profile(same species): INSD:AY123456.1");
17087  expected_errors[0]->SetErrMsg("Inference qualifier problem - same species misused (profile(same species): INSD:AY123456.1)");
17088  eval = validator.Validate(seh, options);
17089  CheckErrors(*eval, expected_errors);
17090
17091  feat->SetQual().front()->SetVal("similar to RNA sequence: INSD:AY123456.1 INSD:AY123457");
17092  expected_errors[0]->SetErrMsg("Inference qualifier problem - spaces in inference (similar to RNA sequence: INSD:AY123456.1 INSD:AY123457)");
17093  eval = validator.Validate(seh, options);
17094  CheckErrors(*eval, expected_errors);
17095
17096  feat->SetQual().front()->SetVal("similar to RNA sequence: INSD:AY123456");
17097  expected_errors[0]->SetErrMsg("Inference qualifier problem - bad inference accession version (similar to RNA sequence: INSD:AY123456)");
17098  eval = validator.Validate(seh, options);
17099  CheckErrors(*eval, expected_errors);
17100
17101  feat->SetQual().front()->SetVal("similar to RNA sequence: RefSeq:AY123456.1");
17102  expected_errors[0]->SetErrMsg("Inference qualifier problem - bad accession type (similar to RNA sequence: RefSeq:AY123456.1)");
17103  eval = validator.Validate(seh, options);
17104  CheckErrors(*eval, expected_errors);
17105
17106  feat->SetQual().front()->SetVal("similar to RNA sequence: BLAST:AY123456.1");
17107  expected_errors[0]->SetErrMsg("Inference qualifier problem - bad accession type (similar to RNA sequence: BLAST:AY123456.1)");
17108  eval = validator.Validate(seh, options);
17109  CheckErrors(*eval, expected_errors);
17110
// Here the validation runs before the expected message is updated; the
// comparison still happens afterwards in CheckErrors.
17111  feat->SetQual().front()->SetVal("similar to AA sequence:RefSeq:gi|21240850|ref|NP_640432.1|");
17112  eval = validator.Validate(seh, options);
17113  expected_errors[0]->SetErrMsg("Inference qualifier problem - the value in the accession field is not legal. The only allowed value is accession.version, eg AF123456.1. Problem = (similar to AA sequence:RefSeq:gi|21240850|ref|NP_640432.1|)");
17114  CheckErrors(*eval, expected_errors);
17115
17116
17117  CLEAR_ERRORS
17118
17119  // SRA inferences are ok
17120  feat->SetQual().front()->SetVal("similar to RNA sequence:INSD:ERP003431");
17121  // AddChromosomeNoLocation(expected_errors, entry);
17122  eval = validator.Validate(seh, options);
17123  CheckErrors(*eval, expected_errors);
17124
17125  // GeneDB is ok for similar to
17126  feat->SetQual().front()->SetVal("similar to RNA sequence:GeneDB:LmjF.01.0090");
17127  eval = validator.Validate(seh, options);
17128  CheckErrors(*eval, expected_errors);
17129
17130  CLEAR_ERRORS
17131 }
17132 
17133 
// Gives the protein the accession XP_654321 while naming it
// "hypothetical protein XP_123" and expects the HypotheticalProteinMismatch
// warning.
17134 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_HypotheticalProteinMismatch) {
17136
17137  CRef<CSeq_id> protid(new CSeq_id());
17138  protid->SetOther().SetAccession("XP_654321");
17139  unit_test_util::ChangeProtId(entry, protid);
// Replace any existing protein names with a single one.
17141  prot->SetData().SetProt().ResetName();
17142  prot->SetData().SetProt().SetName().push_back("hypothetical protein XP_123");
17143
17145
17146  expected_errors.push_back(new CExpectedError("ref|XP_654321|", eDiag_Warning, "HypotheticalProteinMismatch",
17147  "Hypothetical protein reference does not match accession"));
17148  // AddChromosomeNoLocation(expected_errors, entry);
17149  eval = validator.Validate(seh, options);
17150  CheckErrors(*eval, expected_errors);
17151
17152  CLEAR_ERRORS
17153 }
17154 
17155 
// Builds a partial coding region whose product is the first id of the
// sequence in `entry` itself, and expects the SelfReferentialProduct error
// plus the accompanying PartialsInconsistent and CDSproductPackagingProblem
// messages.
17156 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SelfReferentialProduct)
17157 {
17160  cds->SetData().SetCdregion();
17161  cds->SetLocation().SetInt().SetTo(59);
17162  cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
17163  cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
17164  cds->SetPartial(true);
// The product is a copy of the entry's own first Seq-id.
17165  cds->SetProduct().SetWhole().Assign(*(entry->SetSeq().SetId().front()));
17166
17168
17169  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SelfReferentialProduct",
17170  "Self-referential feature product"));
17171  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PartialsInconsistent",
17172  "Inconsistent: Product= complete, Location= partial, Feature.partial= TRUE"));
17173  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "CDSproductPackagingProblem",
17174  "Protein product not packaged in nuc-prot set with nucleotide"));
17175  // AddChromosomeNoLocation(expected_errors, entry);
17176  eval = validator.Validate(seh, options);
17177  CheckErrors(*eval, expected_errors);
17178
17179  CLEAR_ERRORS
17180 }
17181 
17182 
// Places an internal transcribed spacer after an rRNA with a gap between
// them and expects the ITSdoesNotAbutRRNA warning. The same expectation is
// checked after reverse-complementing the entry, after renaming the features
// to "5.8S ribosomal subunit" / "internal transcribed spacer 2", and after
// reverse-complementing once more.
17183 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ITSdoesNotAbutRRNA)
17184 {
17187  rrna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
17188  rrna->SetData().SetRna().SetExt().SetName("18s ribosomal subunit");
17189
17191  its->SetData().SetRna().SetType(CRNA_ref::eType_miscRNA);
17192  its->SetData().SetRna().SetExt().SetName("internal transcribed spacer 1");
// The ITS starts two positions after the rRNA's end, so one position lies
// between the two features.
17193  its->SetLocation().SetInt().SetFrom(rrna->GetLocation().GetInt().GetTo() + 2);
17194  its->SetLocation().SetInt().SetTo(rrna->GetLocation().GetInt().GetTo() + 12);
17195
17197
17198  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ITSdoesNotAbutRRNA",
17199  "ITS does not abut adjacent rRNA component"));
17200  // AddChromosomeNoLocation(expected_errors, entry);
17201  eval = validator.Validate(seh, options);
17202  CheckErrors(*eval, expected_errors);
17203
// Same check on the reverse-complemented entry.
17204  scope.RemoveTopLevelSeqEntry(seh);
17205  unit_test_util::RevComp(entry);
17206  seh = scope.AddTopLevelSeqEntry(*entry);
17207  eval = validator.Validate(seh, options);
17208  CheckErrors(*eval, expected_errors);
17209
// Same check with the 5.8S / ITS2 names.
17210  rrna->SetData().SetRna().SetExt().SetName("5.8S ribosomal subunit");
17211  its->SetData().SetRna().SetExt().SetName("internal transcribed spacer 2");
17212  eval = validator.Validate(seh, options);
17213  CheckErrors(*eval, expected_errors);
17214
// And once more after reverse-complementing again.
17215  scope.RemoveTopLevelSeqEntry(seh);
17216  unit_test_util::RevComp(entry);
17217  seh = scope.AddTopLevelSeqEntry(*entry);
17218  eval = validator.Validate(seh, options);
17219  CheckErrors(*eval, expected_errors);
17220
17221  CLEAR_ERRORS
17222 }
17223 
17224 
// Sets the feature location's local id to "Good" while the expected error is
// reported on "lcl|good", i.e. the ids differ only in capitalization; expects
// the FeatureSeqIDCaseDifference error.
17225 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatureSeqIDCaseDifference)
17226 {
17229  feat->SetLocation().SetInt().SetId().SetLocal().SetStr("Good");
17231
17232  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "FeatureSeqIDCaseDifference",
17233  "Sequence identifier in feature location differs in capitalization with identifier on Bioseq"));
17234  // AddChromosomeNoLocation(expected_errors, entry);
17235  eval = validator.Validate(seh, options);
17236  CheckErrors(*eval, expected_errors);
17237
17238  CLEAR_ERRORS
17239 }
17240 
17241 
// Replaces the sequence's first id with gi 0 and expects the critical
// FeatureLocationIsGi0 message along with ZeroGiNumber and
// GiWithoutAccession.
17242 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatureLocationIsGi0)
17243 {
17245  entry->SetSeq().SetId().front()->SetGi(ZERO_GI);
17247
17249
17250  expected_errors.push_back(new CExpectedError("gi|0", eDiag_Critical, "ZeroGiNumber",
17251  "Invalid GI number"));
17252  expected_errors.push_back(new CExpectedError("gi|0", eDiag_Error, "GiWithoutAccession",
17253  "No accession on sequence with gi number"));
17254  expected_errors.push_back(new CExpectedError("gi|0", eDiag_Critical, "FeatureLocationIsGi0",
17255  "Feature has 1 gi|0 location on Bioseq gi|0"));
17256  // AddChromosomeNoLocation(expected_errors, entry);
17257  eval = validator.Validate(seh, options);
17258  CheckErrors(*eval, expected_errors);
17259
17260  CLEAR_ERRORS
17261 }
17262 
17263 
// Exercises the GapFeatureProblem error with a "gap" feature
// (estimated_length 11) placed at different positions of a delta sequence:
// over real bases only, partly over real bases, over real bases and an N,
// and with a feature length (10) that disagrees with estimated_length.
17264 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_GapFeatureProblem)
17265 {
// The last delta literal gets an N as its second base.
17267  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("CNCATGATGATG");
17268
17270  gap->SetData().SetImp().SetKey("gap");
17271  gap->AddQualifier("estimated_length", "11");
17272
17274
17275  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GapFeatureProblem",
17276  "Gap feature over 11 real bases"));
17277  // AddChromosomeNoLocation(expected_errors, entry);
17278  eval = validator.Validate(seh, options);
17279  CheckErrors(*eval, expected_errors);
17280
17281  CLEAR_ERRORS
17282
// For the next two positions an InstantiatedGapMismatch warning is expected
// in addition to GapFeatureProblem.
17283  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InstantiatedGapMismatch",
17284  "Gap feature location does not match delta gap coordinates"));
17285  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GapFeatureProblem",
17286  "Gap feature over 2 real bases"));
17287
17288  gap->SetLocation().SetInt().SetFrom(10);
17289  gap->SetLocation().SetInt().SetTo(20);
17290  eval = validator.Validate(seh, options);
17291  CheckErrors(*eval, expected_errors);
17292
17293  gap->SetLocation().SetInt().SetFrom(20);
17294  gap->SetLocation().SetInt().SetTo(30);
17295  expected_errors[1]->SetErrMsg("Gap feature over 8 real bases and 1 Ns");
17296  eval = validator.Validate(seh, options);
17297  CheckErrors(*eval, expected_errors);
17298
17299  CLEAR_ERRORS
17300
// Positions 12..21 span ten bases, one fewer than estimated_length.
17301  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GapFeatureProblem",
17302  "Gap feature estimated_length 11 does not match 10 feature length"));
17303  gap->SetLocation().SetInt().SetFrom(12);
17304  gap->SetLocation().SetInt().SetTo(21);
17305  eval = validator.Validate(seh, options);
17306  CheckErrors(*eval, expected_errors);
17307
17308  CLEAR_ERRORS
17309 }
17310 
17311 
17313 {
       // Helper body: fills `nuc` with a long IUPACna sequence marked as RNA with
       // length 366, fills `prot` with an IUPACaa sequence of declared length 121,
       // stretches `cds` over the whole nucleotide, and returns `np`.
       // NOTE(review): this appears to be the body of the helper called as
       // BuildGenProdSetBigNucProtSet(nuc_id, prot_id) elsewhere in this file —
       // confirm against the signature.
17316  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATAA");
17317  nuc->SetSeq().SetInst().SetLength(366);
17318  nuc->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
17321  prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MFFFFFFFFFFPPPPPPPPPPGGGGGGGGGGKKKKKKKKKKFFFFFFFFFFPPPPPPPPPPGGGGGGGGGGKKKKKKKKKKFFFFFFFFFFPPPPPPPPPPGGGGGGGGGGKKKKKKKKKK");
17322  prot->SetSeq().SetInst().SetLength(121);
       // The coding region covers position 0 through the last base of `nuc`.
17325  cds->SetLocation().SetInt().SetFrom(0);
17326  cds->SetLocation().SetInt().SetTo(nuc->GetSeq().GetInst().GetLength()-1);
       // Optional steps: each runs only when the corresponding ID argument tests true.
17327  if (nuc_id) {
17329  }
17330  if (prot_id) {
17332  }
17333  return np;
17334 }
17335 
17336 
17338 {
       // Helper body: assembles and returns a set-type Seq-entry containing
       //   - `contig`, given a 375-long IUPACna sequence, and
       //   - the nuc-prot set built by BuildGenProdSetBigNucProtSet using the
       //     local IDs "nuc" and "prot".
       // A `cds` feature located on local ID "good" and an `mrna` feature whose
       // product is the whole "nuc" sequence are both added to `contig`.
17339  CRef<CSeq_entry> entry(new CSeq_entry());
17342  contig->SetSeq().SetInst().SetSeq_data().SetIupacna().Set(
17343  "ATGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATAAGGGCCCTTT"
17344  );
17345  contig->SetSeq().SetInst().SetLength(375);
17346  entry->SetSet().SetSeq_set().push_back(contig);
       // Local IDs handed to the nuc-prot set builder.
17347  CRef<CSeq_id> nuc_id(new CSeq_id());
17348  nuc_id->SetLocal().SetStr("nuc");
17349  CRef<CSeq_id> prot_id(new CSeq_id());
17350  prot_id->SetLocal().SetStr("prot");
17351  CRef<CSeq_entry> np = BuildGenProdSetBigNucProtSet(nuc_id, prot_id);
17352  entry->SetSet().SetSeq_set().push_back(np);
17353 
17354  CRef<CSeq_feat> cds(new CSeq_feat());
17356  cds->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
17357  unit_test_util::AddFeat(cds, contig);
17359  mrna->SetProduct().SetWhole().Assign(*nuc_id); // mRNA product points at the whole "nuc" sequence
17360  unit_test_util::AddFeat(mrna, contig);
17361 
17362  return entry;
17363 }
17364 
17365 
// Test: a CDS flagged "unclassified translation discrepancy" and an mRNA
// flagged "unclassified transcription discrepancy" are each expected to draw
// an ErroneousException warning on "lcl|good", with messages reporting a
// single mismatch (out of 121 residues and 366 bases respectively).
17366 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ErroneousException)
17367 {
17370  cds->SetExcept(true);
17371  cds->SetExcept_text("unclassified translation discrepancy");
17373  mrna->SetExcept(true);
17374  mrna->SetExcept_text("unclassified transcription discrepancy");
       // NOTE(review): this literal starts "ATGTTTC..." — presumably the lone 'C'
       // is the one mismatch the expected messages refer to; confirm against the
       // sequence used when the entry was built.
17376  genomic->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATAAGGGCCCTTT");
17377 
17379 
17380  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ErroneousException",
17381  "CDS has unclassified exception but only difference is 1 mismatches out of 121 residues"));
17382  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ErroneousException",
17383  "mRNA has unclassified exception but only difference is 1 mismatches out of 366 bases"));
17384  // AddChromosomeNoLocation(expected_errors, entry);
17385  eval = validator.Validate(seh, options);
17386  CheckErrors(*eval, expected_errors);
17387 
17388  CLEAR_ERRORS
17389 }
17390 
17391 
// Test: three features (`misc`, a pseudo CDS and a pseudo mRNA) are each given
// a "whole" location on the entry's first Seq-id. For every feature a
// StrandOther warning followed by a WholeLocation warning is expected; the
// WholeLocation message is prefixed "Feature", "CDS" and "mRNA" respectively.
17392 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_WholeLocation)
17393 {
17396  misc->SetLocation().SetWhole().Assign(*(entry->SetSeq().SetId().front()));
17398  cds->SetData().SetCdregion();
17399  cds->SetLocation().SetWhole().Assign(*(entry->SetSeq().SetId().front()));
17400  cds->SetPseudo(true);
17401 
17403  mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
17404  mrna->SetLocation().SetWhole().Assign(*(entry->SetSeq().SetId().front()));
17405  mrna->SetPseudo(true);
17406 
17407 
17409 
       // Expectations come in (StrandOther, WholeLocation) pairs, one pair per feature.
17410  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "StrandOther",
17411  "Strand 'other' in location"));
17412  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "WholeLocation",
17413  "Feature may not have whole location"));
17414  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "StrandOther",
17415  "Strand 'other' in location"));
17416  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "WholeLocation",
17417  "CDS may not have whole location"));
17418  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "StrandOther",
17419  "Strand 'other' in location"));
17420  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "WholeLocation",
17421  "mRNA may not have whole location"));
17422  // AddChromosomeNoLocation(expected_errors, entry);
17423  eval = validator.Validate(seh, options);
17424  CheckErrors(*eval, expected_errors);
17425 
17426  CLEAR_ERRORS
17427 }
17428 
17429 
// Test: EC-number diagnostics. "EC:1.1.1.10" is placed in the CDS comment, in
// the protein name and in the protein comment; an empty EC number is added to
// the protein, and an exon gets an empty EC_number qualifier.
// Pass 1 expects empty-EC warnings on both "lcl|nuc" and "lcl|prot".
// Pass 2 (after the protein EC list is reset) expects the protein's empty-EC
// warning to be gone and an Info-level EcNumberInCDSComment instead.
17430 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_EcNumberProblem)
17431 {
17434  cds->SetComment("EC:1.1.1.10");
17436  prot->SetData().SetProt().SetName().front().append("; EC:1.1.1.10");
17437  prot->SetComment("EC:1.1.1.10");
17438  prot->SetData().SetProt().SetEc().push_back(""); // deliberately empty EC number
17439 
17442  exon->SetData().SetImp().SetKey("exon");
17443  exon->AddQualifier("EC_number", ""); // deliberately empty qualifier value
17444 
17446 
17447  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InvalidPunctuation",
17448  "Qualifier other than replace has just quotation marks"));
17449  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "EcNumberEmpty",
17450  "EC number should not be empty"));
17451  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "EcNumberInProteinName",
17452  "Apparent EC number in protein title"));
17453  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "EcNumberInProteinComment",
17454  "Apparent EC number in protein comment"));
17455  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "EcNumberEmpty",
17456  "EC number should not be empty"));
17457  // AddChromosomeNoLocation(expected_errors, entry);
17458  eval = validator.Validate(seh, options);
17459  CheckErrors(*eval, expected_errors);
17460 
17461  CLEAR_ERRORS
17462 
       // Second pass: drop the protein's EC list and re-validate.
17463  prot->SetData().SetProt().ResetEc();
17464  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InvalidPunctuation",
17465  "Qualifier other than replace has just quotation marks"));
17466  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "EcNumberEmpty",
17467  "EC number should not be empty"));
17468  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "EcNumberInProteinName",
17469  "Apparent EC number in protein title"));
17470  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "EcNumberInProteinComment",
17471  "Apparent EC number in protein comment"));
17472  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "EcNumberInCDSComment",
17473  "Apparent EC number in CDS comment"));
17474  // AddChromosomeNoLocation(expected_errors, entry);
17475 
17476  eval = validator.Validate(seh, options);
17477  CheckErrors(*eval, expected_errors);
17478 
17479  CLEAR_ERRORS
17480 }
17481 
17482 
// Test: a feature carrying the qualifier standard_name="Vector Contamination"
// is expected to produce one VectorContamination warning on "lcl|good".
17483 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_VectorContamination)
17484 {
17487  misc->AddQualifier("standard_name", "Vector Contamination");
17488 
17490 
17491  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "VectorContamination",
17492  "Vector Contamination region should be trimmed from sequence"));
17493  // AddChromosomeNoLocation(expected_errors, entry);
17494 
17495  eval = validator.Validate(seh, options);
17496  CheckErrors(*eval, expected_errors);
17497 
17498  CLEAR_ERRORS
17499 }
17500 
17501 
// Test: a feature whose interval is put on the minus strand (and shortened to
// end at position 5) is expected to produce one MinusStrandProtein warning on
// "lcl|good".
17502 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MinusStrandProtein)
17503 {
17506  misc->SetLocation().SetInt().SetStrand(eNa_strand_minus);
17507  misc->SetLocation().SetInt().SetTo(5);
17508 
17510 
17511  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MinusStrandProtein",
17512  "Feature on protein indicates negative strand"));
17513  // AddChromosomeNoLocation(expected_errors, entry);
17514 
17515  eval = validator.Validate(seh, options);
17516  CheckErrors(*eval, expected_errors);
17517 
17518  CLEAR_ERRORS
17519 }
17520 
17521 
// Test: a protein that has an EC number ("1.1.1.20") together with a name of
// "Hypothetical protein", "hypothetical protein", "Unknown protein" or
// "unknown protein" is expected to draw the same BadProteinName warning on
// "lcl|prot" in all four cases. The single expectation is reused for every
// pass; only the protein name changes.
17522 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadProteinName)
17523 {
17526  prot->SetData().SetProt().ResetName();
17527  prot->SetData().SetProt().SetName().push_back("Hypothetical protein");
17528  prot->SetData().SetProt().SetEc().push_back("1.1.1.20");
17529 
17531 
17532  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "BadProteinName",
17533  "Unknown or hypothetical protein should not have EC number"));
17534  // AddChromosomeNoLocation(expected_errors, entry);
17535  eval = validator.Validate(seh, options);
17536  CheckErrors(*eval, expected_errors);
17537 
       // Lower-case variant.
17538  prot->SetData().SetProt().ResetName();
17539  prot->SetData().SetProt().SetName().push_back("hypothetical protein");
17540  eval = validator.Validate(seh, options);
17541  CheckErrors(*eval, expected_errors);
17542 
       // "Unknown" variants, capitalized then lower-case.
17543  prot->SetData().SetProt().ResetName();
17544  prot->SetData().SetProt().SetName().push_back("Unknown protein");
17545  eval = validator.Validate(seh, options);
17546  CheckErrors(*eval, expected_errors);
17547 
17548  prot->SetData().SetProt().ResetName();
17549  prot->SetData().SetProt().SetName().push_back("unknown protein");
17550  eval = validator.Validate(seh, options);
17551  CheckErrors(*eval, expected_errors);
17552 
17553  CLEAR_ERRORS
17554 }
17555 
17556 
// Test: `misc` receives a gene xref that has only a locus_tag ("locus_tag"),
// while a gene feature (`gene2`) with the same locus_tag and a locus
// ("second locus") exists on the entry. One GeneXrefWithoutLocus warning on
// "lcl|good" is expected.
17557 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_GeneXrefWithoutLocus)
17558 {
17562  unit_test_util::AddFeat(gene1, entry);
17564  gene2->SetData().SetGene().SetLocus_tag("locus_tag");
17565  gene2->SetData().SetGene().SetLocus("second locus");
17566  gene2->SetLocation().SetInt().SetTo(misc->GetLocation().GetInt().GetTo() + 5); // gene2 ends 5 positions past the end of misc
17567  unit_test_util::AddFeat(gene2, entry);
17569  x->SetData().SetGene().SetLocus_tag("locus_tag"); // xref carries the locus_tag only, no locus
17570  misc->SetXref().push_back(x);
17571 
17573 
17574  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GeneXrefWithoutLocus",
17575  "Feature has Gene Xref with locus_tag but no locus, gene with locus_tag and locus exists"));
17576  // AddChromosomeNoLocation(expected_errors, entry);
17577  eval = validator.Validate(seh, options);
17578  CheckErrors(*eval, expected_errors);
17579 
17580  CLEAR_ERRORS
17581 }
17582 
17583 
// Test: on an RNA sequence with declared length 36, a 3'UTR feature starts
// right after the CDS and stops at position 30. One UTRdoesNotExtendToEnd
// warning on "lcl|nuc" is expected.
17584 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UTRdoesNotExtendToEnd)
17585 {
17588  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATAAACTAAAAAGGGAAA");
17589  nuc->SetSeq().SetInst().SetLength(36);
17590  nuc->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
17594  utr3->SetData().SetImp().SetKey("3'UTR");
17595  utr3->SetLocation().SetInt().SetFrom(cds->GetLocation().GetInt().GetTo() + 1); // UTR begins immediately after the CDS end
17596  utr3->SetLocation().SetInt().SetTo(30); // stops before the last position (35) of the 36-long sequence
17597 
17599 
17600  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UTRdoesNotExtendToEnd",
17601  "3'UTR does not extend to end of mRNA"));
17602  // AddChromosomeNoLocation(expected_errors, entry);
17603  eval = validator.Validate(seh, options);
17604  CheckErrors(*eval, expected_errors);
17605 
17606  CLEAR_ERRORS
17607 }
17608 
17609 
// Test: the nucleotide sequence is given a run of N's right after the start
// codon and the protein is set to "MXXXXXIN". Three Info-level diagnostics on
// "lcl|nuc" are expected: FeatureIsMostlyNs, CDShasTooManyXs and
// HighNpercent5Prime.
17610 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDShasTooManyXs)
17611 {
17614  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGNNNNNNNNNNNNNNNATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
17616  prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MXXXXXIN");
17617 
17619 
17620  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "FeatureIsMostlyNs",
17621  "Feature contains more than 50% Ns"));
17622  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "CDShasTooManyXs",
17623  "CDS translation consists of more than 50% X residues"));
17624  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "HighNpercent5Prime",
17625  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
17626  // AddChromosomeNoLocation(expected_errors, entry);
17627 
17628  eval = validator.Validate(seh, options);
17629  CheckErrors(*eval, expected_errors);
17630 
17631  CLEAR_ERRORS
17632 }
17633 
// Test: SuspiciousFrame diagnostics for a CDS whose reading frame is not one.
//   Pass 1: frame two, CDS ending at 21, not partial
//           -> Error "reading frame > 1 but not 5' partial".
//   Pass 2: frame three, CDS 1..26, 5' partial
//           -> Warning PartialProblemNotSpliceConsensus5Prime plus
//              Warning SuspiciousFrame "not at consensus splice site".
// Before each pass the protein sequence is regenerated with
// CSeqTranslator::Translate, and the entry is removed from and re-added to
// the scope around the edit.
17634 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SuspiciousFrame)
17635 {
17638  cds->SetData().SetCdregion().SetFrame(CCdregion::eFrame_two);
17639  cds->SetLocation().SetInt().SetTo(21);
17640 
17642  string tmp;
       // Translate while the entry is still in the scope, then detach it before editing.
17643  CSeqTranslator::Translate(*cds, scope, tmp, false, false);
17644  scope.RemoveTopLevelSeqEntry(seh);
17646  prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set(tmp);
17647  prot->SetSeq().SetInst().SetLength(tmp.length());
17650  seh = scope.AddTopLevelSeqEntry(*entry);
17651 
17652  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "SuspiciousFrame",
17653  "Suspicious CDS location - reading frame > 1 but not 5' partial"));
17654  // AddChromosomeNoLocation(expected_errors, entry);
17655  eval = validator.Validate(seh, options);
17656  CheckErrors(*eval, expected_errors);
17657 
       // Second pass: frame three on a 5' partial CDS that starts at position 1.
17658  cds->SetData().SetCdregion().SetFrame(CCdregion::eFrame_three);
17659  cds->SetLocation().SetInt().SetFrom(1);
17660  cds->SetLocation().SetInt().SetTo(26);
17661  cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
17662  cds->SetPartial(true);
17663  tmp.clear();
17664  CSeqTranslator::Translate(*cds, scope, tmp, false, false);
17665  scope.RemoveTopLevelSeqEntry(seh);
17666  prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set(tmp);
17667  prot->SetSeq().SetInst().SetLength(tmp.length());
       // Mark the protein feature 5' partial as well, matching the CDS.
17670  prot_feat->SetLocation().SetPartialStart(true, eExtreme_Biological);
17671  prot_feat->SetPartial(true);
17672  seh = scope.AddTopLevelSeqEntry(*entry);
17673  CLEAR_ERRORS
17674 
17675  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus5Prime",
17676  "5' partial is not at beginning of sequence, gap, or consensus splice site"));
17677  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "SuspiciousFrame",
17678  "Suspicious CDS location - reading frame > 1 and not at consensus splice site"));
17679  // AddChromosomeNoLocation(expected_errors, entry);
17680  eval = validator.Validate(seh, options);
17681  CheckErrors(*eval, expected_errors);
17682 
17683  CLEAR_ERRORS
17684 }
17685 
17686 
// Test: a 3' partial CDS running from position 30 to the end of the
// nucleotide is paired with a protein "MPRKTEINXX" of declared length 10.
// Two diagnostics on "lcl|nuc" are expected: Error TransLen and
// Warning TerminalXDiscrepancy (0 terminal X in the translation vs. 2 in the
// protein product).
17687 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TerminalXDiscrepancy)
17688 {
17691  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACNAAGGG");
17693  cds->SetPartial(true);
17694  cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
17695  cds->SetLocation().SetInt().SetFrom(30);
17696  cds->SetLocation().SetInt().SetTo(nuc->GetSeq().GetInst().GetLength() - 1); // CDS runs to the last base
17698  prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPRKTEINXX"); // ends in two X residues
17699  prot->SetSeq().SetInst().SetLength(10);
17703  prot_feat->SetPartial(true);
17704  prot_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
17705 
17707  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "TransLen",
17708  "Given protein length [8] does not match translation length [10]"));
17709  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TerminalXDiscrepancy",
17710  "Terminal X count for CDS translation (0) and protein product sequence (2) are not equal"));
17711  // AddChromosomeNoLocation(expected_errors, entry);
17712  eval = validator.Validate(seh, options);
17713  CheckErrors(*eval, expected_errors);
17714 
17715  CLEAR_ERRORS
17716 }
17717 
17718 
// Test: a code break (transl_except) specifying amino acid 'P' is attached
// to the CDS.
//   Pass 1: code break on bases 3..5 of "nuc"
//           -> Warning UnnecessaryTranslExcept at position 2.
//   Pass 2: code break moved to bases 0..2 (first codon)
//           -> Warning TranslExcept plus Error MisMatchAA
//              (protein [M] vs. translation [P]).
17719 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UnnecessaryTranslExcept)
17720 {
17723  CRef<CCode_break> codebreak(new CCode_break());
17724  codebreak->SetLoc().SetInt().SetId().SetLocal().SetStr("nuc");
17725  codebreak->SetLoc().SetInt().SetFrom(3);
17726  codebreak->SetLoc().SetInt().SetTo(5);
17727  codebreak->SetAa().SetNcbieaa('P');
17728  cds->SetData().SetCdregion().SetCode_break().push_back(codebreak);
17729 
17731 
17732  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryTranslExcept",
17733  "Unnecessary transl_except P at position 2"));
17734  // AddChromosomeNoLocation(expected_errors, entry);
17735  eval = validator.Validate(seh, options);
17736  CheckErrors(*eval, expected_errors);
17737 
17738  CLEAR_ERRORS
       // Move the code break onto the first codon of the CDS.
17739  codebreak->SetLoc().SetInt().SetFrom(0);
17740  codebreak->SetLoc().SetInt().SetTo(2);
17741  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranslExcept",
17742  "Suspicious transl_except P at first codon of complete CDS"));
17743  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "MisMatchAA",
17744  "Residue 1 in protein [M] != translation [P] at lcl|nuc:1-3"));
17745  // AddChromosomeNoLocation(expected_errors, entry);
17746  eval = validator.Validate(seh, options);
17747  CheckErrors(*eval, expected_errors);
17748 
17749  CLEAR_ERRORS
17750 }
17751 
17752 
// Test: a misc_difference feature with the qualifier replace="aattggccaaa" is
// expected to produce one Info-level InvalidMatchingReplace on "lcl|good",
// whose message says the replacement already matches the underlying sequence.
17753 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_InvalidMatchingReplace)
17754 {
17757  feat->SetData().SetImp().SetKey("misc_difference");
17758  feat->AddQualifier("replace", "aattggccaaa");
17759 
17761 
17762  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "InvalidMatchingReplace",
17763  "/replace already matches underlying sequence (aattggccaaa)"));
17764  // AddChromosomeNoLocation(expected_errors, entry);
17765  eval = validator.Validate(seh, options);
17766  CheckErrors(*eval, expected_errors);
17767  CLEAR_ERRORS
17768 }
17769 
17770 
// Test: NotSpliceConsensusDonor diagnostics for a CDS with a mix location
// and a matching intron feature. The same data is validated in six passes:
//   1. bases 44/45 set to "AG"              -> two donor warnings (positions 17 / 16)
//   2. entry reverse-complemented           -> same warnings at positions 44 / 45
//   3. reverse-complemented back, then bases 16/17 set to byte 0xFB
//                                           -> InvalidResidue x2, donor warnings
//                                              ("Bad sequence" variant), NonAsciiAsn
//   4. reverse-complemented again           -> same set with mirrored positions
//   5. fresh BuildGoodSeq entry with an "intron" feature
//                                           -> terminal-intron donor Info +
//                                              acceptor Warning
//   6. that entry reverse-complemented      -> same pair at positions 60 / 50
// The entry is removed from the scope before every structural edit and
// re-added afterwards.
17771 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_NotSpliceConsensusDonor)
17772 {
17776  cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc->SetSeq().SetId().front()));
17777  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[44] = 'A';
17778  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[45] = 'G';
17779  CRef<CSeq_feat> intron = unit_test_util::MakeIntronForMixLoc(nuc->SetSeq().SetId().front());
17780  unit_test_util::AddFeat(intron, nuc);
17781 
17783 
       // One warning comes from the intron, the other from the exon boundary.
17784  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor",
17785  "Splice donor consensus (GT) not found at start of intron, position 17 of lcl|nuc"));
17786  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor",
17787  "Splice donor consensus (GT) not found after exon ending at position 16 of lcl|nuc"));
17788  // AddChromosomeNoLocation(expected_errors, entry);
17789  eval = validator.Validate(seh, options);
17790  CheckErrors(*eval, expected_errors);
17791 
       // Pass 2: reverse-complement the entry; only the reported positions change.
17792  scope.RemoveTopLevelSeqEntry(seh);
17793  unit_test_util::RevComp(entry);
17794  seh = scope.AddTopLevelSeqEntry(*entry);
17795  expected_errors[0]->SetErrMsg("Splice donor consensus (GT) not found at start of intron, position 44 of lcl|nuc");
17796  expected_errors[1]->SetErrMsg("Splice donor consensus (GT) not found after exon ending at position 45 of lcl|nuc");
17797  eval = validator.Validate(seh, options);
17798  CheckErrors(*eval, expected_errors);
17799 
       // Pass 3: reverse-complement back and write byte 0xFB (reported as residue 251)
       // at indices 16 and 17.
17800  scope.RemoveTopLevelSeqEntry(seh);
17801  unit_test_util::RevComp(entry);
17802  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[16] = '\xFB';
17803  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[17] = '\xFB';
17804  seh = scope.AddTopLevelSeqEntry(*entry);
17805  CLEAR_ERRORS
17806  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [17]"));
17807  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [18]"));
17808  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor", "Splice donor consensus (GT) not found at start of intron, position 17 of lcl|nuc"));
17809  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor", "Bad sequence at splice donor after exon ending at position 16 of lcl|nuc"));
17810  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Fatal, "NonAsciiAsn", "Non-ASCII character '251' found in item"));
17811  // AddChromosomeNoLocation(expected_errors, entry);
17812  eval = validator.Validate(seh, options);
17813  CheckErrors(*eval, expected_errors);
17814 
       // Pass 4: same invalid residues, reverse-complemented orientation.
17815  scope.RemoveTopLevelSeqEntry(seh);
17816  unit_test_util::RevComp(entry);
17817  seh = scope.AddTopLevelSeqEntry(*entry);
17818  CLEAR_ERRORS
17819  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [43]"));
17820  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [44]"));
17821  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor", "Splice donor consensus (GT) not found at start of intron, position 44 of lcl|nuc"));
17822  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor", "Bad sequence at splice donor after exon ending at position 45 of lcl|nuc"));
17823  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Fatal, "NonAsciiAsn", "Non-ASCII character '251' found in item"));
17824  // AddChromosomeNoLocation(expected_errors, entry);
17825 
17826  eval = validator.Validate(seh, options);
17827  CheckErrors(*eval, expected_errors);
17828 
17829  CLEAR_ERRORS
17830 
       // Pass 5: start over with a fresh BuildGoodSeq entry carrying one "intron" feature.
17831  scope.RemoveTopLevelSeqEntry(seh);
17832  entry = unit_test_util::BuildGoodSeq();
17833  intron = unit_test_util::AddMiscFeature(entry);
17834  intron->SetData().SetImp().SetKey("intron");
17835  seh = scope.AddTopLevelSeqEntry(*entry);
17836  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NotSpliceConsensusDonorTerminalIntron",
17837  "Splice donor consensus (GT) not found at start of terminal intron, position 1 of lcl|good"));
17838  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
17839  "Splice acceptor consensus (AG) not found at end of intron, position 11 of lcl|good"));
17840  // AddChromosomeNoLocation(expected_errors, entry);
17841  eval = validator.Validate(seh, options);
17842  CheckErrors(*eval, expected_errors);
17843 
       // Pass 6: the same entry, reverse-complemented.
17844  scope.RemoveTopLevelSeqEntry(seh);
17845  unit_test_util::RevComp(entry);
17846  seh = scope.AddTopLevelSeqEntry(*entry);
17847  CLEAR_ERRORS
17848  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NotSpliceConsensusDonorTerminalIntron",
17849  "Splice donor consensus (GT) not found at start of terminal intron, position 60 of lcl|good"));
17850  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
17851  "Splice acceptor consensus (AG) not found at end of intron, position 50 of lcl|good"));
17852  // AddChromosomeNoLocation(expected_errors, entry);
17853  eval = validator.Validate(seh, options);
17854  CheckErrors(*eval, expected_errors);
17855 
17856  CLEAR_ERRORS
17857 }
17858 
17859 
// Test: NotSpliceConsensusAcceptor diagnostics for a CDS with a mix location
// and a matching intron feature. Bases 16/17 are set to "GT" and bases 44/45
// to "TC". The data is validated in six passes:
//   1. as built                             -> two acceptor warnings (positions 46 / 47)
//   2. entry reverse-complemented           -> same warnings at positions 15 / 14
//   3. reverse-complemented back, then bases 44/45 set to byte 0xFB
//                                           -> InvalidResidue x2, acceptor warnings
//                                              ("Bad sequence" variant), NonAsciiAsn
//   4. reverse-complemented again           -> same set with mirrored positions
//   5. fresh BuildGoodSeq entry with an "intron" feature
//                                           -> terminal-intron donor Info +
//                                              acceptor Warning
//   6. that entry reverse-complemented      -> same pair at positions 60 / 50
17860 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_NotSpliceConsensusAcceptor)
17861 {
17865  cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc->SetSeq().SetId().front()));
17866  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[16] = 'G';
17867  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[17] = 'T';
17868  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[44] = 'T';
17869  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[45] = 'C';
17870  CRef<CSeq_feat> intron = unit_test_util::MakeIntronForMixLoc(nuc->SetSeq().SetId().front());
17871  unit_test_util::AddFeat(intron, nuc);
17872 
17874 
17875  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusAcceptor",
17876  "Splice acceptor consensus (AG) not found at end of intron, position 46 of lcl|nuc"));
17877  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusAcceptor",
17878  "Splice acceptor consensus (AG) not found before exon starting at position 47 of lcl|nuc"));
17879  // AddChromosomeNoLocation(expected_errors, entry);
17880  eval = validator.Validate(seh, options);
17881  CheckErrors(*eval, expected_errors);
17882 
       // Pass 2: reverse-complement the entry; only the reported positions change.
17883  scope.RemoveTopLevelSeqEntry(seh);
17884  unit_test_util::RevComp(entry);
17885  seh = scope.AddTopLevelSeqEntry(*entry);
17886  expected_errors[0]->SetErrMsg("Splice acceptor consensus (AG) not found at end of intron, position 15 of lcl|nuc");
17887  expected_errors[1]->SetErrMsg("Splice acceptor consensus (AG) not found before exon starting at position 14 of lcl|nuc");
17888  eval = validator.Validate(seh, options);
17889  CheckErrors(*eval, expected_errors);
17890 
       // Pass 3: reverse-complement back and write byte 0xFB (reported as residue 251)
       // at indices 44 and 45.
17891  scope.RemoveTopLevelSeqEntry(seh);
17892  unit_test_util::RevComp(entry);
17893  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[44] = '\xFB';
17894  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[45] = '\xFB';
17895  seh = scope.AddTopLevelSeqEntry(*entry);
17896  CLEAR_ERRORS
17897  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue",
17898  "Invalid residue [251] at position [45]"));
17899  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue",
17900  "Invalid residue [251] at position [46]"));
17901  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusAcceptor",
17902  "Splice acceptor consensus (AG) not found at end of intron, position 46 of lcl|nuc"));
17903  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusAcceptor",
17904  "Bad sequence at splice acceptor before exon starting at position 47 of lcl|nuc"));
17905  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Fatal, "NonAsciiAsn", "Non-ASCII character '251' found in item"));
17906  // AddChromosomeNoLocation(expected_errors, entry);
17907  eval = validator.Validate(seh, options);
17908  CheckErrors(*eval, expected_errors);
17909 
       // Pass 4: same invalid residues, reverse-complemented orientation.
17910  scope.RemoveTopLevelSeqEntry(seh);
17911  unit_test_util::RevComp(entry);
17912  seh = scope.AddTopLevelSeqEntry(*entry);
17913  CLEAR_ERRORS
17914  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical,
17915  "InvalidResidue", "Invalid residue [251] at position [15]"));
17916  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue",
17917  "Invalid residue [251] at position [16]"));
17918  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusAcceptor",
17919  "Splice acceptor consensus (AG) not found at end of intron, position 15 of lcl|nuc"));
17920  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusAcceptor",
17921  "Bad sequence at splice acceptor before exon starting at position 14 of lcl|nuc"));
17922  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Fatal, "NonAsciiAsn", "Non-ASCII character '251' found in item"));
17923  // AddChromosomeNoLocation(expected_errors, entry);
17924 
17925  eval = validator.Validate(seh, options);
17926  CheckErrors(*eval, expected_errors);
17927 
17928  CLEAR_ERRORS
17929 
       // Pass 5: start over with a fresh BuildGoodSeq entry carrying one "intron" feature.
17930  scope.RemoveTopLevelSeqEntry(seh);
17931  entry = unit_test_util::BuildGoodSeq();
17932  intron = unit_test_util::AddMiscFeature(entry);
17933  intron->SetData().SetImp().SetKey("intron");
17934  seh = scope.AddTopLevelSeqEntry(*entry);
17935  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NotSpliceConsensusDonorTerminalIntron",
17936  "Splice donor consensus (GT) not found at start of terminal intron, position 1 of lcl|good"));
17937  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
17938  "Splice acceptor consensus (AG) not found at end of intron, position 11 of lcl|good"));
17939  // AddChromosomeNoLocation(expected_errors, entry);
17940  eval = validator.Validate(seh, options);
17941  CheckErrors(*eval, expected_errors);
17942 
       // Pass 6: the same entry, reverse-complemented.
17943  scope.RemoveTopLevelSeqEntry(seh);
17944  unit_test_util::RevComp(entry);
17945  seh = scope.AddTopLevelSeqEntry(*entry);
17946  CLEAR_ERRORS
17947  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NotSpliceConsensusDonorTerminalIntron",
17948  "Splice donor consensus (GT) not found at start of terminal intron, position 60 of lcl|good"));
17949  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
17950  "Splice acceptor consensus (AG) not found at end of intron, position 50 of lcl|good"));
17951  // AddChromosomeNoLocation(expected_errors, entry);
17952  eval = validator.Validate(seh, options);
17953  CheckErrors(*eval, expected_errors);
17954 
17955  CLEAR_ERRORS
17956 }
17957 
17958 
// Test: a CDS with a mix location and matching intron where bases 16/17 are
// "GC" and bases 44/45 are "AG". No expectations are pushed here (the in-code
// note says this is "no longer report"ed), and the check is repeated after
// reverse-complementing the entry.
17959 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_RareSpliceConsensusDonor)
17960 {
17964  cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc->SetSeq().SetId().front()));
17965  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[16] = 'G';
17966  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[17] = 'C';
17967  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[44] = 'A';
17968  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[45] = 'G';
17969  CRef<CSeq_feat> intron = unit_test_util::MakeIntronForMixLoc(nuc->SetSeq().SetId().front());
17970  unit_test_util::AddFeat(intron, nuc);
17971 
17973  // no longer report
17974  // AddChromosomeNoLocation(expected_errors, entry);
17975  eval = validator.Validate(seh, options);
17976  CheckErrors(*eval, expected_errors);
17977 
       // Repeat on the reverse-complemented entry.
17978  scope.RemoveTopLevelSeqEntry(seh);
17979  unit_test_util::RevComp(entry);
17980  seh = scope.AddTopLevelSeqEntry(*entry);
17981  CLEAR_ERRORS
17982 
17983  // AddChromosomeNoLocation(expected_errors, entry);
17984  eval = validator.Validate(seh, options);
17985  CheckErrors(*eval, expected_errors);
17986 
17987  CLEAR_ERRORS
17988 }
17989 
// Test (VR_65 variant): a CDS with a mix location and matching intron where
// bases 16/17 are "AT" and bases 44/45 are "AC". No expectations are pushed
// here (the in-code notes say this is "no longer report"ed), and the check is
// repeated after reverse-complementing the entry.
17990 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_RareSpliceConsensusDonor_VR_65)
17991 {
17995  cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc->SetSeq().SetId().front()));
17996  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[16] = 'A';
17997  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[17] = 'T';
17998  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[44] = 'A';
17999  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[45] = 'C';
18000  CRef<CSeq_feat> intron = unit_test_util::MakeIntronForMixLoc(nuc->SetSeq().SetId().front());
18001  unit_test_util::AddFeat(intron, nuc);
18002 
18004 
18005  // no longer report
18006  eval = validator.Validate(seh, options);
18007  // AddChromosomeNoLocation(expected_errors, entry);
18008  CheckErrors(*eval, expected_errors);
18009 
       // Repeat on the reverse-complemented entry.
18010  scope.RemoveTopLevelSeqEntry(seh);
18011  unit_test_util::RevComp(entry);
18012  seh = scope.AddTopLevelSeqEntry(*entry);
18013  CLEAR_ERRORS
18014  // no longer report
18015  // AddChromosomeNoLocation(expected_errors, entry);
18016  eval = validator.Validate(seh, options);
18017  CheckErrors(*eval, expected_errors);
18018 
18019  CLEAR_ERRORS
18020 }
18021 
// Gives cds, mrna and gene the local feature ids 1, 2 and 3, then adds
// one-directional cross-references (cds -> mrna, mrna -> gene) and expects a
// single SeqFeatXrefNotReciprocal warning on lcl|nuc.
// NOTE(review): the lines that create entry/cds/mrna/gene and the validator
// setup are missing from this listing.
18022 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SeqFeatXrefNotReciprocal)
18023 {
18027  cds->SetId().SetLocal().SetId(1);
18029  mrna->SetId().SetLocal().SetId(2);
18033  gene->SetId().SetLocal().SetId(3);
18034 
 // Neither xref below is given a matching back-reference in the visible code.
18035  cds->AddSeqFeatXref(mrna->GetId());
18036  mrna->AddSeqFeatXref(gene->GetId());
18037 
18038 
18040  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefNotReciprocal",
18041  "Cross-referenced feature does not link reciprocally"));
18042  // AddChromosomeNoLocation(expected_errors, entry);
18043 
18044  eval = validator.Validate(seh, options);
18045  CheckErrors(*eval, expected_errors);
18046 
18047  CLEAR_ERRORS
18048 }
18049 
18050 
// Gives the CDS local id 1 and attaches a cross-reference to local id 2,
// which no feature in the visible code carries; expects a
// SeqFeatXrefFeatureMissing warning on lcl|nuc.
// NOTE(review): entry/cds construction and validator setup are on lines
// missing from this listing.
18051 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SeqFeatXrefFeatureMissing)
18052 {
18055  cds->SetId().SetLocal().SetId(1);
18056  CRef<CSeqFeatXref> x1(new CSeqFeatXref());
18057  x1->SetId().SetLocal().SetId(2);
18058  cds->SetXref().push_back(x1);
18060  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefFeatureMissing",
18061  "Cross-referenced feature cannot be found"));
18062  // AddChromosomeNoLocation(expected_errors, entry);
18063 
18064  eval = validator.Validate(seh, options);
18065  CheckErrors(*eval, expected_errors);
18066 
18067  CLEAR_ERRORS
18068 }
18069 
18070 
// Two scenarios on the same entry:
//  1. misc feature at 12..20 -> expects the FeatureInsideGap warning.
//  2. after appending a 10-base gap segment and an N-rich literal (sequence
//     length set to 116) and moving the feature to 48..98 -> expects
//     HighNContentPercent ("51 percent Ns") and the Info-level
//     FeatureIsMostlyNs.
// NOTE(review): entry/misc construction and validator setup are on lines
// missing from this listing.
18071 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatureInsideGap)
18072 {
18075  misc->SetLocation().SetInt().SetFrom(12);
18076  misc->SetLocation().SetInt().SetTo(20);
18077 
18079  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatureInsideGap",
18080  "Feature inside sequence gap"));
18081  // AddChromosomeNoLocation(expected_errors, entry);
18082 
18083  eval = validator.Validate(seh, options);
18084  CheckErrors(*eval, expected_errors);
18085 
18086  CLEAR_ERRORS
 // Scenario 2: extend the delta sequence while the entry is out of the scope.
18087  scope.RemoveTopLevelSeqEntry(seh);
18088  CRef<CDelta_seq> gap_seg(new CDelta_seq());
18089  gap_seg->SetLiteral().SetSeq_data().SetGap();
18090  gap_seg->SetLiteral().SetLength(10);
18091  entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
18092  entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGATGATG", CSeq_inst::eMol_dna);
18093  entry->SetSeq().SetInst().SetLength(116);
18094  misc->SetLocation().SetInt().SetFrom(48);
18095  misc->SetLocation().SetInt().SetTo(98);
18096  seh = scope.AddTopLevelSeqEntry(*entry);
18097  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent",
18098  "Sequence contains 51 percent Ns"));
 // The two HighNpercent expectations below are disabled (commented out).
18099  /*
18100  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
18101  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
18102  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
18103  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
18104  */
18105  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "FeatureIsMostlyNs",
18106  "Feature contains more than 50% Ns"));
18107  // AddChromosomeNoLocation(expected_errors, entry);
18108 
18109  eval = validator.Validate(seh, options);
18110  CheckErrors(*eval, expected_errors);
18111 
18112  CLEAR_ERRORS
18113 }
18114 
18115 
// Marks every gap literal of the delta sequence with an "unknown" length fuzz,
// turns the misc feature into an mRNA at 5..30 and expects
// CDSmRNAMismatchLocation plus FeatureCrossesGap.
// Then two intervals (3..15 and 22..30) are pushed onto the feature's mix
// location and the test expects CDSmRNAMismatchLocation plus the Info-level
// IntervalBeginsOrEndsInGap.
// NOTE(review): entry/misc construction and validator setup are on lines
// missing from this listing.
18116 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatureCrossesGap)
18117 {
18119  for (auto& it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
18120  if (it->IsLiteral() && it->GetLiteral().GetSeq_data().IsGap()) {
18121  it->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
18122  }
18123  }
18124 
18126  misc->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
18127  misc->SetLocation().SetInt().SetFrom(5);
18128  misc->SetLocation().SetInt().SetTo(30);
18129 
18131  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
18132  "No CDS location match for 1 mRNA"));
18133  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatureCrossesGap",
18134  "Feature crosses gap of unknown length"));
18135  // AddChromosomeNoLocation(expected_errors, entry);
18136 
18137  eval = validator.Validate(seh, options);
18138  CheckErrors(*eval, expected_errors);
18139 
18140  CLEAR_ERRORS
18141 
 // Second scenario: two-interval mix location on the same sequence id "good".
18142  scope.RemoveTopLevelSeqEntry(seh);
18143  CRef<CSeq_loc> int1(new CSeq_loc());
18144  int1->SetInt().SetFrom(3);
18145  int1->SetInt().SetTo(15);
18146  int1->SetInt().SetId().SetLocal().SetStr("good");
18147  CRef<CSeq_loc> int2(new CSeq_loc());
18148  int2->SetInt().SetFrom(22);
18149  int2->SetInt().SetTo(30);
18150  int2->SetInt().SetId().SetLocal().SetStr("good");
18151  misc->SetLocation().SetMix().Set().push_back(int1);
18152  misc->SetLocation().SetMix().Set().push_back(int2);
18153  seh = scope.AddTopLevelSeqEntry(*entry);
18154  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
18155  "No CDS location match for 1 mRNA"));
18156  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "IntervalBeginsOrEndsInGap",
18157  "Internal interval begins or ends in gap"));
18158  // AddChromosomeNoLocation(expected_errors, entry);
18159 
18160  eval = validator.Validate(seh, options);
18161  CheckErrors(*eval, expected_errors);
18162 
18163  CLEAR_ERRORS
18164 }
18165 
18166 
// Attaches a publication descriptor to the entry and sets the first author's
// suffix to "foo", expecting a BadAuthorSuffix warning; then switches the
// suffix to "3rd" and validates with no expectation added.
// NOTE(review): entry/pub construction and validator setup are on lines
// missing from this listing.
18167 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadAuthorSuffix)
18168 {
18171  CRef<CSeqdesc> desc(new CSeqdesc());
18172  desc->SetPub().SetPub().Set().push_back(pub);
18173  entry->SetDescr().Set().push_back(desc);
18174  pub->SetArticle().SetAuthors().SetNames().SetStd().front()->SetName().SetName().SetSuffix("foo");
18175 
18177  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadAuthorSuffix",
18178  "Bad author suffix foo"));
18179  // AddChromosomeNoLocation(expected_errors, entry);
18180 
18181  eval = validator.Validate(seh, options);
18182  CheckErrors(*eval, expected_errors);
18183 
18184  CLEAR_ERRORS
18185 
18186  // don't report good suffixes
18187  pub->SetArticle().SetAuthors().SetNames().SetStd().front()->SetName().SetName().SetSuffix("3rd");
18188  // AddChromosomeNoLocation(expected_errors, entry);
18189  eval = validator.Validate(seh, options);
18190  CheckErrors(*eval, expected_errors);
18191 
18192  CLEAR_ERRORS
18193 }
18194 
18195 
// Builds a tRNA whose anticodon is at 8..10 and whose amino acid is set to
// 'S'; expects the BadAnticodonAA warning quoted below (anticodon AAA cannot
// produce S/Ser).
// NOTE(review): entry construction and validator setup are on lines missing
// from this listing.
18196 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadAnticodonAA)
18197 {
18199  CRef<CSeq_feat> trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
18200  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetFrom(8);
18201  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetTo(10);
18202  trna->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('S');
18203  unit_test_util::AddFeat(trna, entry);
18204 
18206  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadAnticodonAA",
18207  "Codons predicted from anticodon (AAA) cannot produce amino acid (S/Ser)"));
18208  // AddChromosomeNoLocation(expected_errors, entry);
18209 
18210  eval = validator.Validate(seh, options);
18211  CheckErrors(*eval, expected_errors);
18212 
18213  CLEAR_ERRORS
18214 }
18215 
18216 
// Builds a tRNA with anticodon at 8..10, amino acid 'K' and a recognized
// codon value of 42; expects both BadAnticodonAA and BadAnticodonCodon
// warnings.
// NOTE(review): entry construction and validator setup are on lines missing
// from this listing.
18217 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadAnticodonCodon)
18218 {
18220  CRef<CSeq_feat> trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
18221  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetFrom(8);
18222  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetTo(10);
18223  trna->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('K');
18224  trna->SetData().SetRna().SetExt().SetTRNA().SetCodon().push_back(42);
18225  unit_test_util::AddFeat(trna, entry);
18226 
18228  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadAnticodonAA",
18229  "Codons predicted from anticodon (AAA) cannot produce amino acid (K/Lys)"));
18230  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadAnticodonCodon",
18231  "Codon recognized cannot be produced from anticodon (AAA)"));
18232  // AddChromosomeNoLocation(expected_errors, entry);
18233 
18234  eval = validator.Validate(seh, options);
18235  CheckErrors(*eval, expected_errors);
18236 
18237  CLEAR_ERRORS
18238 }
18239 
18240 
// Checks the AnticodonStrandConflict error in two configurations:
//  1. anticodon interval on the minus strand, tRNA location left as built;
//  2. anticodon strand reset, tRNA location set to minus, amino acid 'F'.
// expected_errors is not cleared between the two runs, so the same single
// error is expected both times.
// NOTE(review): entry construction and validator setup are on lines missing
// from this listing.
18241 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadAnticodonStrand)
18242 {
18244  CRef<CSeq_feat> trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
18245  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetFrom(8);
18246  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetTo(10);
18247  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetStrand(eNa_strand_minus);
18248  trna->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('K');
18249  unit_test_util::AddFeat(trna, entry);
18250 
18252  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "AnticodonStrandConflict",
18253  "Anticodon strand and tRNA strand do not match."));
18254  // AddChromosomeNoLocation(expected_errors, entry);
18255 
18256  eval = validator.Validate(seh, options);
18257  CheckErrors(*eval, expected_errors);
18258 
 // Swap which side carries the minus strand and validate again.
18259  scope.RemoveTopLevelSeqEntry(seh);
18260  trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().ResetStrand();
18261  trna->SetLocation().SetInt().SetStrand(eNa_strand_minus);
18262  trna->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('F');
18263  seh = scope.AddTopLevelSeqEntry(*entry);
18264  eval = validator.Validate(seh, options);
18265  CheckErrors(*eval, expected_errors);
18266 
18267  CLEAR_ERRORS
18268 }
18269 
18270 
// test_gene_syn(name): replaces the gene's synonym list with the single entry
// `name`, rewrites expected_errors[0]'s message to
// "Uninformative gene synonym '<name>'", then validates and checks.
// Relies on gene, msg, expected_errors, eval, validator, seh and options being
// in scope at the expansion site, and on expected_errors[0] already existing.
// `name` is expanded three times, so pass a plain string literal.
18271 #define test_gene_syn(name) \
18272  gene->SetData().SetGene().ResetSyn(); \
18273  gene->SetData().SetGene().SetSyn().push_back(name); \
18274  msg = "Uninformative gene synonym '"; \
18275  msg.append(name); \
18276  msg.append("'"); \
18277  expected_errors[0]->SetErrMsg(msg); \
18278  eval = validator.Validate(seh, options); \
18279  CheckErrors(*eval, expected_errors);
18280 
18281 
// Exercises the UndesiredGeneSynonym warning on an entry whose id is the
// accession NC_123456. A single expected error (empty message) is created
// once and its message is rewritten for each case: first by test_gene_syn for
// every uninformative synonym listed, then for synonym == locus,
// description == locus, and a synonym with neither locus nor description.
// NOTE(review): entry/gene construction and validator setup are on lines
// missing from this listing.
18282 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UndesiredGeneSynonym)
18283 {
18285  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
18287  gene->SetData().SetGene().SetLocus("something");
18288  string msg;
18289 
18291 
 // Placeholder expectation; test_gene_syn fills in the message each time.
18292  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Warning, "UndesiredGeneSynonym", ""));
18293  // AddChromosomeNoLocation(expected_errors, entry);
18294 
18295  test_gene_syn("alpha")
18296  test_gene_syn("alternative")
18297  test_gene_syn("beta")
18298  test_gene_syn("cellular")
18299  test_gene_syn("cytokine")
18300  test_gene_syn("delta")
18301  test_gene_syn("drosophila")
18302  test_gene_syn("epsilon")
18303  test_gene_syn("gamma")
18304  test_gene_syn("HLA")
18305  test_gene_syn("homolog")
18306  test_gene_syn("mouse")
18307  test_gene_syn("orf")
18308  test_gene_syn("partial")
18309  test_gene_syn("plasma")
18310  test_gene_syn("precursor")
18311  test_gene_syn("pseudogene")
18312  test_gene_syn("putative")
18313  test_gene_syn("rearranged")
18314  test_gene_syn("small")
18315  test_gene_syn("trna")
18316  test_gene_syn("unknown")
18317  test_gene_syn("unknown function")
18318  test_gene_syn("unknown protein")
18319  test_gene_syn("unnamed")
18320 
18321 
 // Synonym identical to the locus.
18322  gene->SetData().SetGene().ResetSyn();
18323  gene->SetData().SetGene().SetSyn().push_back("same_as");
18324  gene->SetData().SetGene().SetLocus("same_as");
18325  expected_errors[0]->SetErrMsg("gene synonym has same value as gene locus");
18326  eval = validator.Validate(seh, options);
18327  CheckErrors(*eval, expected_errors);
18328 
 // Description identical to the locus (no synonyms).
18329  gene->SetData().SetGene().ResetSyn();
18330  gene->SetData().SetGene().SetDesc("same_as");
18331  expected_errors[0]->SetErrMsg("gene description has same value as gene locus");
18332  eval = validator.Validate(seh, options);
18333  CheckErrors(*eval, expected_errors);
18334 
 // Synonym present while locus and description are both unset.
18335  gene->SetData().SetGene().ResetDesc();
18336  gene->SetData().SetGene().ResetLocus();
18337  gene->SetData().SetGene().SetSyn().push_back("only_syn");
18338  expected_errors[0]->SetErrMsg("gene synonym without gene locus or description");
18339  eval = validator.Validate(seh, options);
18340  CheckErrors(*eval, expected_errors);
18341 
18342 
18343  CLEAR_ERRORS
18344 }
18345 
18346 
// test_undesired_protein_name(name): replaces the protein's name list with the
// single entry `name`, rewrites expected_errors[0]'s message to
// "Uninformative protein name '<name>'", then validates and checks.
// Relies on prot, msg, expected_errors, eval, validator, seh and options being
// in scope at the expansion site, and on expected_errors[0] already existing.
18347 #define test_undesired_protein_name(name) \
18348  prot->SetData().SetProt().ResetName(); \
18349  prot->SetData().SetProt().SetName().push_back(name); \
18350  msg = "Uninformative protein name '"; \
18351  msg.append(name); \
18352  msg.append("'"); \
18353  expected_errors[0]->SetErrMsg(msg); \
18354  eval = validator.Validate(seh, options); \
18355  CheckErrors(*eval, expected_errors);
18356 
// Exercises the UndesiredProteinName warning on lcl|prot. One expected error
// with an empty message is created and reused by test_undesired_protein_name
// for each listed protein name.
// NOTE(review): several lines are missing from this listing (entry/prot
// construction, what is done with `id`, the validator setup, and the
// statements at listing lines 18372 and 18375), so the purpose of the
// temporary BadInternalCharacter expectation cannot be confirmed from here.
18357 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UndesiredProteinName)
18358 {
18360  CRef<CSeq_id> id(new CSeq_id());
18361  id->SetOther().SetAccession("NC_123456");
18364 
18366 
18367  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "UndesiredProteinName",
18368  ""));
18369  // AddChromosomeNoLocation(expected_errors, entry);
18370  string msg;
18371 
 // A second expectation is added temporarily ...
18373  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "BadInternalCharacter",
18374  "Protein name contains undesired character"));
 // ... and removed again; it was allocated with new above, so it is deleted
 // by hand before being popped from the vector.
18376  delete expected_errors[1];
18377  expected_errors.pop_back();
18378  test_undesired_protein_name("uniprot protein")
18379  test_undesired_protein_name("uniprotkb protein")
18380  test_undesired_protein_name("refers to pmid 23")
18381  test_undesired_protein_name("refers to dbxref")
18382  // test_undesired_protein_name("hypothetical protein")
18383  test_undesired_protein_name("uncharacterized conserved membrane protein")
18384  test_undesired_protein_name("unknown; predicted coding region")
18385  test_undesired_protein_name("unnamed")
18386 
18387  CLEAR_ERRORS
18388 }
18389 
18390 
// Marks every gap literal with an "unknown" length fuzz, then checks that the
// same FeatureBeginsOrEndsInGap warning ("... gap starting at 13") is reported
// for four placements of the misc feature: 5..20 with the strand as built,
// 5..20 minus strand, 14..30 minus strand, and 14..30 with the strand reset.
// expected_errors is set once and reused for all four runs.
// NOTE(review): entry/misc construction and validator setup are on lines
// missing from this listing.
18391 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatureBeginsOrEndsInGap)
18392 {
18394  for (auto& it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
18395  if (it->IsLiteral() && it->GetLiteral().GetSeq_data().IsGap()) {
18396  it->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
18397  }
18398  }
18399 
18401  misc->SetLocation().SetInt().SetFrom(5);
18402  misc->SetLocation().SetInt().SetTo(20);
18403 
18405  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatureBeginsOrEndsInGap",
18406  "Feature begins or ends in gap starting at 13"));
18407  // AddChromosomeNoLocation(expected_errors, entry);
18408 
18409  eval = validator.Validate(seh, options);
18410  CheckErrors(*eval, expected_errors);
18411 
 // Same interval, minus strand.
18412  scope.RemoveTopLevelSeqEntry(seh);
18413  misc->SetLocation().SetInt().SetStrand(eNa_strand_minus);
18414  seh = scope.AddTopLevelSeqEntry(*entry);
18415  eval = validator.Validate(seh, options);
18416  CheckErrors(*eval, expected_errors);
18417 
 // Move the interval to 14..30; the strand is still minus here.
18418  scope.RemoveTopLevelSeqEntry(seh);
18419  misc->SetLocation().SetInt().SetFrom(14);
18420  misc->SetLocation().SetInt().SetTo(30);
18421  seh = scope.AddTopLevelSeqEntry(*entry);
18422 
18423  eval = validator.Validate(seh, options);
18424  CheckErrors(*eval, expected_errors);
18425 
 // 14..30 with the strand cleared.
18426  scope.RemoveTopLevelSeqEntry(seh);
18427  misc->SetLocation().SetInt().ResetStrand();
18428  seh = scope.AddTopLevelSeqEntry(*entry);
18429  eval = validator.Validate(seh, options);
18430  CheckErrors(*eval, expected_errors);
18431 
18432  CLEAR_ERRORS
18433 }
18434 
18435 
// Attaches a "GeneOntology" user object to the feature containing a "Process"
// list with one term. The term gets "pubmed id", "text string" and "evidence"
// fields but no GO identifier field, and the test expects the
// GeneOntologyTermMissingGOID warning.
// NOTE(review): entry/feat construction and validator setup are on lines
// missing from this listing.
18436 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_GeneOntologyTermMissingGOID)
18437 {
18440  feat->SetExt().SetType().SetStr("GeneOntology");
18441  CRef<CUser_field> go_list(new CUser_field());
18442  go_list->SetLabel().SetStr("Process");
18443  CRef<CUser_field> go_term(new CUser_field());
18444  go_term->SetLabel().SetStr("a go term");
18445 
18446  CRef<CUser_field> pmid(new CUser_field());
18447  pmid->SetLabel().SetStr("pubmed id");
18448  pmid->SetData().SetInt(4);
18449  go_term->SetData().SetFields().push_back(pmid);
18450 
18451  CRef<CUser_field> term(new CUser_field());
18452  term->SetLabel().SetStr("text string");
18453  term->SetData().SetStr("something");
18454  go_term->SetData().SetFields().push_back(term);
18455 
18456  CRef<CUser_field> ev(new CUser_field());
18457  ev->SetLabel().SetStr("evidence");
18458  ev->SetData().SetStr("some evidence");
18459  go_term->SetData().SetFields().push_back(ev);
18460 
 // Nest: term -> "Process" list -> feature extension.
18461  go_list->SetData().SetFields().push_back(go_term);
18462  feat->SetExt().SetData().push_back(go_list);
18463 
18465 
18466  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GeneOntologyTermMissingGOID",
18467  "GO term does not have GO identifier"));
18468  // AddChromosomeNoLocation(expected_errors, entry);
18469  eval = validator.Validate(seh, options);
18470  CheckErrors(*eval, expected_errors);
18471 
18472  CLEAR_ERRORS
18473 }
18474 
18475 
18476 // note - this test also covers PseudoRnaViaGeneHasProduct
// Four steps on a pseudo rRNA feature that has a product (AY123456):
//  1. pseudo rRNA with product            -> PseudoRnaHasProduct warning
//  2. + exception "transcribed pseudogene" -> nothing expected
//  3. exception removed, pseudo gene added -> PseudoRnaHasProduct warning
//  4. rna no longer pseudo itself          -> PseudoRnaViaGeneHasProduct
// NOTE(review): entry/rna/gene construction and validator setup are on lines
// missing from this listing.
18477 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PseudoRnaHasProduct)
18478 {
18481  rna->ResetComment();
18482  rna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
18483  rna->SetPseudo(true);
18484  rna->SetProduct().SetWhole().SetGenbank().SetAccession("AY123456");
18485 
18487 
18488  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PseudoRnaHasProduct",
18489  "A pseudo RNA should not have a product"));
18490  // AddChromosomeNoLocation(expected_errors, entry);
18491  eval = validator.Validate(seh, options);
18492  CheckErrors(*eval, expected_errors);
18493 
18494  // this exception should turn off the warning
18495  rna->SetExcept(true);
18496  rna->SetExcept_text("transcribed pseudogene");
18497  CLEAR_ERRORS
18498  // AddChromosomeNoLocation(expected_errors, entry);
18499  eval = validator.Validate(seh, options);
18500  CheckErrors(*eval, expected_errors);
18501 
18502  // should get error if overlapping gene is pseudo (and not except text)
18503  scope.RemoveTopLevelSeqEntry(seh);
18504  rna->ResetExcept();
18505  rna->ResetExcept_text();
18507  gene->SetPseudo(true);
18508  unit_test_util::AddFeat(gene, entry);
18509  seh = scope.AddTopLevelSeqEntry(*entry);
18510  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PseudoRnaHasProduct",
18511  "A pseudo RNA should not have a product"));
18512  eval = validator.Validate(seh, options);
18513  CheckErrors(*eval, expected_errors);
18514 
18515 
18516  // now get PseudoRnaViaGeneHasProduct when rna is not pseudo itself
18517  rna->ResetPseudo();
18518  CLEAR_ERRORS
18519  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PseudoRnaViaGeneHasProduct",
18520  "An RNA overlapped by a pseudogene should not have a product"));
18521  // AddChromosomeNoLocation(expected_errors, entry);
18522  eval = validator.Validate(seh, options);
18523  CheckErrors(*eval, expected_errors);
18524 
18525  CLEAR_ERRORS
18526 }
18527 
18528 
18529 // note - this test also covers PseudoRnaViaGeneHasProduct
// VR-803: same pseudo-rRNA-with-product scenario as the PseudoRnaHasProduct
// test, but the entry id is changed to NC_000001.1 first. No expected error is
// added anywhere in the visible lines, so all three validations (pseudo rna,
// pseudo gene added, rna no longer pseudo) are checked against the
// expectations left by setup.
// NOTE(review): entry/rna/gene construction and validator setup are on lines
// missing from this listing.
18530 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_VR_803)
18531 {
18533  CRef<CSeq_id> id(new CSeq_id("NC_000001.1"));
18534  unit_test_util::ChangeId(entry, id);
18536  rna->ResetComment();
18537  rna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
18538  rna->SetPseudo(true);
18539  rna->SetProduct().SetWhole().SetGenbank().SetAccession("AY123456");
18540 
18542 
18543  // AddChromosomeNoLocation(expected_errors, entry);
18544 
18545  // no error expected because RefSeq
18546  eval = validator.Validate(seh, options);
18547  CheckErrors(*eval, expected_errors);
18548 
 // NOTE(review): the next comment says "should get error", but no expectation
 // is pushed before CheckErrors in the visible lines - wording looks carried
 // over from the non-RefSeq test; confirm.
18549  // should get error if overlapping gene is pseudo (and not except text)
18550  scope.RemoveTopLevelSeqEntry(seh);
18552  gene->SetPseudo(true);
18553  unit_test_util::AddFeat(gene, entry);
18554  seh = scope.AddTopLevelSeqEntry(*entry);
18555  eval = validator.Validate(seh, options);
18556  CheckErrors(*eval, expected_errors);
18557 
18558 
 // NOTE(review): likewise, no PseudoRnaViaGeneHasProduct expectation is added
 // here in the visible lines.
18559  // now get PseudoRnaViaGeneHasProduct when rna is not pseudo itself
18560  rna->ResetPseudo();
18561  eval = validator.Validate(seh, options);
18562  CheckErrors(*eval, expected_errors);
18563 
18564  CLEAR_ERRORS
18565 }
18566 
18567 
// Adds three abutting RNA features - "26S ribosomal RNA" (0..10),
// "internal transcribed spacer 2" as misc RNA (11..20) and
// "16S ribosomal RNA" (21..30) - and expects two BadRRNAcomponentOrder
// warnings, both before and after reverse-complementing the entry.
// The final run validates with no expectation added.
// NOTE(review): entry construction, validator setup and the statement at
// listing line 18614 (presumably what makes the sequence an organelle, per
// the comment next to it) are missing from this listing.
18568 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadRRNAcomponentOrder)
18569 {
18571  CRef<CSeq_feat> r1(new CSeq_feat());
18572  r1->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
18573  r1->SetData().SetRna().SetExt().SetName("26S ribosomal RNA");
18574  r1->SetLocation().SetInt().SetId().Assign(*(entry->SetSeq().SetId().front()));
18575  r1->SetLocation().SetInt().SetFrom(0);
18576  r1->SetLocation().SetInt().SetTo(10);
18577  unit_test_util::AddFeat(r1, entry);
18578  CRef<CSeq_feat> r2(new CSeq_feat());
18579  r2->SetData().SetRna().SetType(CRNA_ref::eType_miscRNA);
18580  r2->SetData().SetRna().SetExt().SetName("internal transcribed spacer 2");
18581  r2->SetLocation().SetInt().SetId().Assign(*(entry->SetSeq().SetId().front()));
18582  r2->SetLocation().SetInt().SetFrom(11);
18583  r2->SetLocation().SetInt().SetTo(20);
18584  unit_test_util::AddFeat(r2, entry);
18585  CRef<CSeq_feat> r3(new CSeq_feat());
18586  r3->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
18587  r3->SetData().SetRna().SetExt().SetName("16S ribosomal RNA");
18588  r3->SetLocation().SetInt().SetId().Assign(*(entry->SetSeq().SetId().front()));
18589  r3->SetLocation().SetInt().SetFrom(21);
18590  r3->SetLocation().SetInt().SetTo(30);
18591  unit_test_util::AddFeat(r3, entry);
18592 
18593 
18595 
18596  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadRRNAcomponentOrder",
18597  "Problem with order of abutting rRNA components"));
18598  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadRRNAcomponentOrder",
18599  "Problem with order of abutting rRNA components"));
18600  // AddChromosomeNoLocation(expected_errors, entry);
18601  eval = validator.Validate(seh, options);
18602  CheckErrors(*eval, expected_errors);
18603 
 // Same two warnings are expected on the reverse-complemented entry.
18604  scope.RemoveTopLevelSeqEntry(seh);
18605  unit_test_util::RevComp(entry);
18606  seh = scope.AddTopLevelSeqEntry(*entry);
18607 
18608  eval = validator.Validate(seh, options);
18609  CheckErrors(*eval, expected_errors);
18610 
18611  CLEAR_ERRORS
18612 
18613  // no errors if organelle
18615  // AddChromosomeNoLocation(expected_errors, entry);
18616  eval = validator.Validate(seh, options);
18617  CheckErrors(*eval, expected_errors);
18618 
18619  CLEAR_ERRORS
18620 }
18621 
18622 
// On an entry with accession NC_123456, gene1 gets locus "a" and locus tag
// "tag1", while gene2 (20..30) gets locus "b" and no locus tag in the visible
// code; expects one MissingGeneLocusTag warning.
// NOTE(review): entry/gene1/gene2 construction and validator setup are on
// lines missing from this listing.
18623 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MissingGeneLocusTag)
18624 {
18626  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
18628  gene1->ResetComment();
18629  gene1->SetData().SetGene().SetLocus("a");
18630  gene1->SetData().SetGene().SetLocus_tag("tag1");
18632  gene2->ResetComment();
18633  gene2->SetData().SetGene().SetLocus("b");
18634  gene2->SetLocation().SetInt().SetFrom(20);
18635  gene2->SetLocation().SetInt().SetTo(30);
18636 
18638 
18639  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Warning, "MissingGeneLocusTag",
18640  "Missing gene locus tag"));
18641  // AddChromosomeNoLocation(expected_errors, entry);
18642  eval = validator.Validate(seh, options);
18643  CheckErrors(*eval, expected_errors);
18644 
18645  CLEAR_ERRORS
18646 }
18647 
18648 
// Gives a second protein feature (prot2) its own name and extends it to the
// last residue of the protein sequence, then expects MultipleProtRefs,
// DuplicateFeat and two ExtraProteinFeature errors on lcl|prot.
// NOTE(review): entry/prot_seq/prot2 construction and validator setup are on
// lines missing from this listing.
18649 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MultipleProtRefs)
18650 {
18654  prot2->SetData().SetProt().SetName().push_back("a second protein name");
18655  prot2->SetLocation().SetInt().SetTo(prot_seq->GetSeq().GetInst().GetLength()-1);
18657 
18658  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "MultipleProtRefs",
18659  "2 full-length protein features present on protein"));
18660  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "DuplicateFeat",
18661  "Features have identical intervals, but labels differ"));
18662  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "ExtraProteinFeature",
18663  "Protein sequence has multiple unprocessed protein features"));
18664  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "ExtraProteinFeature",
18665  "Protein sequence has multiple unprocessed protein features"));
18666  // AddChromosomeNoLocation(expected_errors, entry);
18667  eval = validator.Validate(seh, options);
18668  CheckErrors(*eval, expected_errors);
18669 
18670  CLEAR_ERRORS
18671 }
18672 
18673 
// Puts an undesired character inside four names - protein "name~something",
// mRNA "name~something", gene locus "gene?something", rRNA "rna!something" -
// and expects one BadInternalCharacter warning per feature (three on lcl|nuc,
// one on lcl|prot).
// NOTE(review): entry/prot/mrna/gene/rrna construction and validator setup
// are on lines missing from this listing.
18674 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadInternalCharacter)
18675 {
18679  prot->SetData().SetProt().ResetName();
18680  prot->SetData().SetProt().SetName().push_back("name~something");
18682 
18684  mrna->SetData().SetRna().SetExt().SetName("name~something");
18687  gene->SetData().SetGene().SetLocus("gene?something");
18689 
18691  rrna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
18692  rrna->SetData().SetRna().SetExt().SetName("rna!something");
18693 
18695  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadInternalCharacter",
18696  "mRNA name contains undesired character"));
18697  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadInternalCharacter",
18698  "Gene locus contains undesired character"));
18699  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadInternalCharacter",
18700  "rRNA name contains undesired character"));
18701  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "BadInternalCharacter",
18702  "Protein name contains undesired character"));
18703  // AddChromosomeNoLocation(expected_errors, entry);
18704  eval = validator.Validate(seh, options);
18705  CheckErrors(*eval, expected_errors);
18706 
18707  CLEAR_ERRORS
18708 }
18709 
18710 
// NOTE(review): the BOOST_AUTO_TEST_CASE header for this body (listing line
// 18711) and its setup lines are missing from this listing, so the test name
// cannot be stated here.
// The visible body sets the gene locus to "gene|synonym" and expects a
// BadInternalCharacter warning ("Gene locus contains undesired character").
18712 {
18717  gene->SetData().SetGene().SetLocus("gene|synonym");
18719 
18721  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadInternalCharacter",
18722  "Gene locus contains undesired character"));
18723  // AddChromosomeNoLocation(expected_errors, entry);
18724 
18725  eval = validator.Validate(seh, options);
18726  CheckErrors(*eval, expected_errors);
18727 
18728  CLEAR_ERRORS
18729 }
18730 
18731 
// Ends four names with an undesired character - protein "name something,",
// mRNA "name something_", gene locus "gene something;", rRNA
// "rna something:" - and expects one BadTrailingCharacter warning per feature
// (three on lcl|nuc, one on lcl|prot).
// NOTE(review): entry/prot/mrna/gene/rrna construction and validator setup
// are on lines missing from this listing.
18732 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadTrailingCharacter)
18733 {
18737  prot->SetData().SetProt().ResetName();
18738  prot->SetData().SetProt().SetName().push_back("name something,");
18740 
18742  mrna->SetData().SetRna().SetExt().SetName("name something_");
18745  gene->SetData().SetGene().SetLocus("gene something;");
18747 
18749  rrna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
18750  rrna->SetData().SetRna().SetExt().SetName("rna something:");
18751 
18753  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadTrailingCharacter",
18754  "mRNA name ends with undesired character"));
18755  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadTrailingCharacter",
18756  "Gene locus ends with undesired character"));
18757  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadTrailingCharacter",
18758  "rRNA name ends with undesired character"));
18759  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "BadTrailingCharacter",
18760  "Protein name ends with undesired character"));
18761  // AddChromosomeNoLocation(expected_errors, entry);
18762  eval = validator.Validate(seh, options);
18763  CheckErrors(*eval, expected_errors);
18764 
18765  CLEAR_ERRORS
18766 }
18767 
18768 
// Ends the protein name, mRNA name, gene locus and rRNA name with a hyphen
// and expects one BadTrailingHyphen warning per feature (three on lcl|nuc,
// one on lcl|prot).
// NOTE(review): entry/prot/mrna/gene/rrna construction and validator setup
// are on lines missing from this listing.
18769 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadTrailingHyphen)
18770 {
18774  prot->SetData().SetProt().ResetName();
18775  prot->SetData().SetProt().SetName().push_back("name something-");
18777 
18779  mrna->SetData().SetRna().SetExt().SetName("name something-");
18782  gene->SetData().SetGene().SetLocus("gene something-");
18784 
18786  rrna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
18787  rrna->SetData().SetRna().SetExt().SetName("rna something-");
18788 
18790  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadTrailingHyphen",
18791  "mRNA name ends with hyphen"));
18792  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadTrailingHyphen",
18793  "Gene locus ends with hyphen"));
18794  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadTrailingHyphen",
18795  "rRNA name ends with hyphen"));
18796  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "BadTrailingHyphen",
18797  "Protein name ends with hyphen"));
18798  // AddChromosomeNoLocation(expected_errors, entry);
18799  eval = validator.Validate(seh, options);
18800  CheckErrors(*eval, expected_errors);
18801 
18802  CLEAR_ERRORS
18803 }
18804 
18805 
// gene1 ("a") is stretched over the whole sequence and further genes are laid
// inside it. The first validation (genes a, b, c configured) adds no
// expectation; after genes d, e and f are configured the test expects
// MultipleGeneOverlap with "Gene contains 5 other genes".
// NOTE(review): the lines that create the gene features and add them to the
// entry, and the validator setup, are missing from this listing.
18806 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MultipleGeneOverlap)
18807 {
18810  gene1->SetData().SetGene().SetLocus("a");
18811  gene1->SetLocation().SetInt().SetFrom(0);
18812  gene1->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength()-1);
18814  gene2->SetData().SetGene().SetLocus("b");
18816  gene3->SetData().SetGene().SetLocus("c");
18817  gene3->SetLocation().SetInt().SetFrom(11);
18818  gene3->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength()-1);
18819 
18821  // no error for only two genes
18822  // AddChromosomeNoLocation(expected_errors, entry);
18823  eval = validator.Validate(seh, options);
18824  CheckErrors(*eval, expected_errors);
18825 
 // Add three more genes, each starting later and running to the sequence end.
18826  scope.RemoveTopLevelSeqEntry(seh);
18828  gene4->SetData().SetGene().SetLocus("d");
18829  gene4->SetLocation().SetInt().SetFrom(15);
18830  gene4->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength()-1);
18832  gene5->SetData().SetGene().SetLocus("e");
18833  gene5->SetLocation().SetInt().SetFrom(20);
18834  gene5->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength()-1);
18836  gene6->SetData().SetGene().SetLocus("f");
18837  gene6->SetLocation().SetInt().SetFrom(25);
18838  gene6->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength()-1);
18839  seh = scope.AddTopLevelSeqEntry(*entry);
18840 
18841  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleGeneOverlap",
18842  "Gene contains 5 other genes"));
18843  eval = validator.Validate(seh, options);
18844  CheckErrors(*eval, expected_errors);
18845 
18846  CLEAR_ERRORS
18847 }
18848 
18849 
// Builds an article publication whose single author has the last name
// "Gr@nt", attaches it to the entry as a pub descriptor and expects the
// BadCharInAuthorLastName warning.
// NOTE(review): entry/art_title construction and validator setup are on lines
// missing from this listing.
18850 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadCharInAuthorLastName)
18851 {
18853  CRef<CAuthor> author(new CAuthor());
18854  author->SetName().SetName().SetLast("Gr@nt");
18855  CRef<CPub> pub(new CPub());
18856  pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author);
18858  art_title->SetName("article title");
18859  pub->SetArticle().SetTitle().Set().push_back(art_title);
18860  CRef<CSeqdesc> desc(new CSeqdesc());
18861  desc->SetPub().SetPub().Set().push_back(pub);
18862  entry->SetDescr().Set().push_back(desc);
18863 
18865  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadCharInAuthorLastName",
18866  "Bad characters in author Gr@nt"));
18867  // AddChromosomeNoLocation(expected_errors, entry);
18868  eval = validator.Validate(seh, options);
18869  CheckErrors(*eval, expected_errors);
18870 
18871  CLEAR_ERRORS
18872 }
18873 
// Pseudo CDS with a mix location plus a pseudo mRNA whose intervals differ:
//  1. the mRNA's first interval is set to end at 16 -> expects
//     CDSmRNAMismatchLocation and the Info-level PseudoCDSmRNArange
//     "internal intron-exon boundaries do not match";
//  2. the mRNA's last interval is then set to end at 55 -> same two
//     expectations, with the second message switched to
//     "does not completely contain intervals".
// NOTE(review): entry/cds/mrna construction and validator setup are on lines
// missing from this listing.
18874 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PseudoCDSmRNArange)
18875 {
18878  cds->ResetComment();
18879  cds->SetData().SetCdregion();
18880  cds->SetPseudo(true);
18881  cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front()));
18883  mrna->SetLocation().SetMix().Set().front()->SetInt().SetTo(16);
18884  unit_test_util::AddFeat(mrna, entry);
18885  mrna->SetPseudo(true);
18886 
18888  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
18889  "No CDS location match for 1 mRNA"));
18890  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "PseudoCDSmRNArange",
18891  "mRNA contains CDS but internal intron-exon boundaries do not match"));
18892  // AddChromosomeNoLocation(expected_errors, entry);
18893  eval = validator.Validate(seh, options);
18894  CheckErrors(*eval, expected_errors);
18895 
 // Second case reuses the expectations; only message [1] changes.
18896  scope.RemoveTopLevelSeqEntry(seh);
18897  mrna->SetLocation().SetMix().Set().back()->SetInt().SetTo(55);
18898  seh = scope.AddTopLevelSeqEntry(*entry);
18899  expected_errors[1]->SetErrMsg("mRNA overlaps or contains CDS but does not completely contain intervals");
18900  eval = validator.Validate(seh, options);
18901  CheckErrors(*eval, expected_errors);
18902  CLEAR_ERRORS
18903 }
18904 
// Test_SEQ_FEAT_GeneXrefNeeded: adds two genes with the same locus ("a1") but
// different alleles ("x" and "y"), one starting 3 bases earlier and the other
// ending 3 bases later than their initial locations, and expects a GeneXrefNeeded
// warning on lcl|nuc.
// NOTE(review): listing lines 18907, 18908, 18911, 18916 and 18922 are absent
// from this extract; presumably they declare `entry`, `nuc`, `gene1`, `gene2`
// and the scope/validator setup - confirm against the repository copy.
18905 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_GeneXrefNeeded)
18906 {
18909  AddCDSAndProtForBigGoodNucProtSet(entry, "nuc", "prot2", 30);
       // Last feature in the set-level feature table after the call above.
18910  CRef<CSeq_feat> cds = entry->SetSet().SetAnnot().front()->SetData().SetFtable().back();
       // gene1: start moved 3 bases lower, allele "x".
18912  gene1->SetLocation().SetInt().SetFrom(gene1->GetLocation().GetInt().GetFrom() - 3);
18913  gene1->SetData().SetGene().SetLocus("a1");
18914  gene1->SetData().SetGene().SetAllele("x");
18915  unit_test_util::AddFeat(gene1, nuc);
       // gene2: same locus, allele "y", end moved 3 bases higher.
18917  gene2->SetData().SetGene().SetLocus("a1");
18918  gene2->SetData().SetGene().SetAllele("y");
18919  gene2->SetLocation().SetInt().SetTo(gene2->GetLocation().GetInt().GetTo() + 3);
18920  unit_test_util::AddFeat(gene2, nuc);
18921 
18923  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "GeneXrefNeeded",
18924  "Feature overlapped by 2 identical-length equivalent genes but has no cross-reference"));
18925  // AddChromosomeNoLocation(expected_errors, entry);
18926  eval = validator.Validate(seh, options);
18927  CheckErrors(*eval, expected_errors);
18928 
18929  CLEAR_ERRORS
18930 }
18931 
18932 
// Test_SEQ_FEAT_RubiscoProblem: replaces the last protein name with
// "ribulose bisphosphate" and expects a RubiscoProblem warning on lcl|prot.
// NOTE(review): listing lines 18935, 18936, 18939 and 18943 are absent from this
// extract; presumably they declare `entry`, `prot` and the scope/validator
// setup - confirm against the repository copy.
18933 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_RubiscoProblem)
18934 {
       // Swap the final name in the protein's name list.
18937  prot->SetData().SetProt().SetName().pop_back();
18938  prot->SetData().SetProt().SetName().push_back("ribulose bisphosphate");
18940  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "RubiscoProblem",
18941  "Nonstandard ribulose bisphosphate protein name"));
18942  // AddChromosomeNoLocation(expected_errors, entry);
18944  eval = validator.Validate(seh, options);
18945  CheckErrors(*eval, expected_errors);
18946 
18947  CLEAR_ERRORS
18948 }
18949 
18950 
// Test_SEQ_FEAT_UnqualifiedException: flags an mRNA with the exception text
// "transcribed product replaced" and a CDS with "translated product replaced",
// replaces the genomic sequence data, and expects only an
// mRNAUnnecessaryException warning. The CDS UnnecessaryException expectation is
// deliberately commented out in the body.
// NOTE(review): listing lines 18953, 18954, 18957, 18960 and 18963 are absent
// from this extract; presumably they declare `entry`, `mrna`, `cds`, `genomic`
// and the scope/validator setup - confirm against the repository copy.
18951 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UnqualifiedException)
18952 {
18955  mrna->SetExcept(true);
18956  mrna->SetExcept_text("transcribed product replaced");
18958  cds->SetExcept(true);
18959  cds->SetExcept_text("translated product replaced");
18961  genomic->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGGGGAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
18962 
18964  /*
18965  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnnecessaryException",
18966  "CDS has unnecessary translated product replaced exception"));
18967  */
18968  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "mRNAUnnecessaryException",
18969  "mRNA has transcribed product replaced exception"));
18970  // AddChromosomeNoLocation(expected_errors, entry);
18971  eval = validator.Validate(seh, options);
18972  CheckErrors(*eval, expected_errors);
18973 
18974  CLEAR_ERRORS
18975 }
18976 
18977 
// Test_SEQ_FEAT_ProteinNameHasPMID: overwrites the first protein name with
// "(PMID 1234)" and expects a ProteinNameHasPMID warning on lcl|prot.
// NOTE(review): listing lines 18980, 18981 and 18983 are absent from this
// extract; presumably they declare `entry`, `prot` and the scope/validator
// setup - confirm against the repository copy.
18978 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ProteinNameHasPMID)
18979 {
18982  prot->SetData().SetProt().SetName().front().assign("(PMID 1234)");
18984  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "ProteinNameHasPMID",
18985  "Protein name has internal PMID"));
18986  // AddChromosomeNoLocation(expected_errors, entry);
18987  eval = validator.Validate(seh, options);
18988  CheckErrors(*eval, expected_errors);
18989 
18990  CLEAR_ERRORS
18991 }
18992 
18993 
// Test_SEQ_FEAT_BadGeneOntologyFormat: attaches a "GeneOntology" user-object
// extension to a feature and then changes its contents step by step. After each
// change the expected BadGeneOntologyFormat message is updated and validation is
// re-run, so the steps below must stay in this order.
// NOTE(review): listing lines 18996, 18997 and 19003 are absent from this
// extract; presumably they declare `entry`, `feat` and the scope/validator
// setup - confirm against the repository copy.
18994 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadGeneOntologyFormat)
18995 {
       // Step 1: top-level field holds a plain string instead of nested fields.
18998  feat->SetExt().SetType().SetStr("GeneOntology");
18999  CRef<CUser_field> go_list(new CUser_field());
19000  go_list->SetData().SetStr("something");
19001  feat->SetExt().SetData().push_back(go_list);
19002 
19004  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadGeneOntologyFormat",
19005  "Bad data format for GO term"));
19006  // AddChromosomeNoLocation(expected_errors, entry);
19007  eval = validator.Validate(seh, options);
19008  CheckErrors(*eval, expected_errors);
19009 
       // Step 2: top-level field now holds a nested field but has no label.
19010  CRef<CUser_field> go_term(new CUser_field());
19011  go_list->SetData().SetFields().push_back(go_term);
19012  expected_errors[0]->SetErrMsg("Unrecognized GO term label [blank]");
19013  eval = validator.Validate(seh, options);
19014  CheckErrors(*eval, expected_errors);
19015 
       // Step 3: label is set to "something".
19016  go_list->SetLabel().SetStr("something");
19017  expected_errors[0]->SetErrMsg("Unrecognized GO term label something");
19018  eval = validator.Validate(seh, options);
19019  CheckErrors(*eval, expected_errors);
19020 
       // Step 4: label is "Process"; the nested term itself is still empty.
19021  go_list->SetLabel().SetStr("Process");
19022  expected_errors[0]->SetErrMsg("Bad GO term format");
19023  eval = validator.Validate(seh, options);
19024  CheckErrors(*eval, expected_errors);
19025 
       // Step 5: term gets an unlabeled qualifier field; a second expectation
       // (GeneOntologyTermMissingGOID) is added from here on.
19026  CRef<CUser_field> go_field(new CUser_field());
19027  go_term->SetData().SetFields().push_back(go_field);
19028  expected_errors[0]->SetErrMsg("Unrecognized label on GO term qualifier field [blank]");
19029  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GeneOntologyTermMissingGOID",
19030  "GO term does not have GO identifier"));
19031  eval = validator.Validate(seh, options);
19032  CheckErrors(*eval, expected_errors);
19033 
       // Step 6: qualifier label is set to "notlabel".
19034  go_field->SetLabel().SetStr("notlabel");
19035  expected_errors[0]->SetErrMsg("Unrecognized label on GO term qualifier field notlabel");
19036  eval = validator.Validate(seh, options);
19037  CheckErrors(*eval, expected_errors);
19038 
       // Step 7: qualifier is labeled "go id" but no data value has been set yet.
19039  go_field->SetLabel().SetStr("go id");
19040  expected_errors[0]->SetErrMsg("Bad data format for GO term qualifier GO ID");
19041  eval = validator.Validate(seh, options);
19042  CheckErrors(*eval, expected_errors);
19043 
       // Step 8: "go id" gets an integer value and a "text string" qualifier is
       // added with an integer value; the missing-GO-ID expectation is dropped.
19044  go_field->SetData().SetInt(123);
19045  CRef<CUser_field> go_field2(new CUser_field());
19046  go_field2->SetLabel().SetStr("text string");
19047  go_field2->SetData().SetInt(123);
19048  go_term->SetData().SetFields().push_back(go_field2);
19049  expected_errors[0]->SetErrMsg("Bad data format for GO term qualifier term");
       // The expectation object is deleted by hand before being removed from the
       // list, matching the raw `new` used when it was pushed.
19050  delete expected_errors[1];
19051  expected_errors.pop_back();
19052  eval = validator.Validate(seh, options);
19053  CheckErrors(*eval, expected_errors);
19054 
       // Step 9: "text string" becomes a string; a "pubmed id" qualifier is
       // added with a string value.
19055  go_field2->SetData().SetStr("some text");
19056  CRef<CUser_field> go_field3(new CUser_field());
19057  go_field3->SetLabel().SetStr("pubmed id");
19058  go_field3->SetData().SetStr("some text");
19059  go_term->SetData().SetFields().push_back(go_field3);
19060  expected_errors[0]->SetErrMsg("Bad data format for GO term qualifier PMID");
19061  eval = validator.Validate(seh, options);
19062  CheckErrors(*eval, expected_errors);
19063 
       // Step 10: "pubmed id" becomes an integer; an "evidence" qualifier is
       // added with an integer value.
19064  go_field3->SetData().SetInt(123);
19065  CRef<CUser_field> go_field4(new CUser_field());
19066  go_field4->SetLabel().SetStr("evidence");
19067  go_field4->SetData().SetInt(123);
19068  go_term->SetData().SetFields().push_back(go_field4);
19069  expected_errors[0]->SetErrMsg("Bad data format for GO term qualifier evidence");
19070  eval = validator.Validate(seh, options);
19071  CheckErrors(*eval, expected_errors);
19072 
19073  CLEAR_ERRORS
19074 }
19075 
19076 
// Test_SEQ_FEAT_InconsistentGeneOntologyTermAndId: adds two process GO terms,
// built with MakeGoTerm("a1", "evidence 1") and MakeGoTerm("a2", "evidence 2"),
// to one feature and expects an InconsistentGeneOntologyTermAndId warning that
// names GO ID 123.
// NOTE(review): MakeGoTerm and AddProcessGoTerm are defined outside this view;
// presumably MakeGoTerm assigns the same GO ID (123) to both terms - verify.
// NOTE(review): listing lines 19079, 19080 and 19086 are absent from this
// extract; presumably they declare `entry`, `feat` and the scope/validator
// setup - confirm against the repository copy.
19077 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_InconsistentGeneOntologyTermAndId)
19078 {
19081  CRef<CUser_field> term1 = MakeGoTerm("a1", "evidence 1");
19082  AddProcessGoTerm(*feat, term1);
19083  CRef<CUser_field> term2 = MakeGoTerm("a2", "evidence 2");
19084  AddProcessGoTerm(*feat, term2);
19085 
19087  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentGeneOntologyTermAndId",
19088  "Inconsistent GO terms for GO ID 123"));
19089  // AddChromosomeNoLocation(expected_errors, entry);
19090  eval = validator.Validate(seh, options);
19091  CheckErrors(*eval, expected_errors);
19092 
19093  CLEAR_ERRORS
19094 }
19095 
19096 
// Test_SEQ_FEAT_DuplicateGeneConflictingLocusTag: gives two gene features the
// same locus and checks the diagnostics, first with identical spelling
// ("gene1"/"gene1") and then with different capitalization ("gene1"/"GENE1").
// NOTE(review): listing lines 19099, 19100, 19102 and 19106 are absent from this
// extract; presumably they declare `entry`, `gene1`, `gene2` (at identical
// locations, judging by the expected messages) and the scope/validator setup -
// confirm against the repository copy.
19097 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_DuplicateGeneConflictingLocusTag)
19098 {
19101  gene1->SetData().SetGene().SetLocus("gene1");
19103  gene2->SetData().SetGene().SetLocus("gene1");
19104 
19105 
       // Case 1: identical locus text.
19107  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatContentDup",
19108  "Duplicate feature"));
19109  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "DuplicateGeneConflictingLocusTag",
19110  "Colliding names in gene features, but feature locations are identical"));
19111  // AddChromosomeNoLocation(expected_errors, entry);
19112  eval = validator.Validate(seh, options);
19113  CheckErrors(*eval, expected_errors);
19114 
19115  CLEAR_ERRORS
19116 
       // Case 2: locus differs only in capitalization; a different pair of
       // diagnostics is expected.
19117  gene2->SetData().SetGene().SetLocus("GENE1");
19118  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "DuplicateFeat",
19119  "Features have identical intervals, but labels differ"));
19120  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "DuplicateGeneConflictingLocusTag",
19121  "Colliding names (with different capitalization) in gene features, but feature locations are identical"));
19122  // AddChromosomeNoLocation(expected_errors, entry);
19123  eval = validator.Validate(seh, options);
19124  CheckErrors(*eval, expected_errors);
19125 
19126  CLEAR_ERRORS
19127 }
19128 
19129 
// Test_SEQ_FEAT_ReplicatedGeneSequence: places two genes with the same locus at
// different locations (gene2 starts at 30) and validates twice, once with
// identical spelling and once with "GENE1". No expectation is pushed in the
// visible body, in line with the in-body note that the error is no longer
// expected (VR-801).
// NOTE(review): listing lines 19132, 19133, 19135 and 19140 are absent from this
// extract; presumably they declare `entry`, `gene1`, `gene2` and the
// scope/validator setup - confirm against the repository copy.
19130 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ReplicatedGeneSequence)
19131 {
19134  gene1->SetData().SetGene().SetLocus("gene1");
19136  gene2->SetData().SetGene().SetLocus("gene1");
       // gene2 spans [30, 30 + gene1's end position].
19137  gene2->SetLocation().SetInt().SetFrom(30);
19138  gene2->SetLocation().SetInt().SetTo(30 + gene1->GetLocation().GetInt().GetTo());
19139 
19141  // AddChromosomeNoLocation(expected_errors, entry);
19142  // error no longer expected, VR-801
19143  eval = validator.Validate(seh, options);
19144  CheckErrors(*eval, expected_errors);
19145 
19146  gene2->SetData().SetGene().SetLocus("GENE1");
19147  eval = validator.Validate(seh, options);
19148  CheckErrors(*eval, expected_errors);
19149 
19150  CLEAR_ERRORS
19151 }
19152 
19153 
// Test_SEQ_FEAT_GeneXrefStrandProblem: gives a feature a gene cross-reference
// and adds the referenced gene on the minus strand, expecting a
// GeneXrefStrandProblem warning. The same expectation is re-checked after
// reverse-complementing the entry, after switching the cross-reference from
// locus to locus_tag, and after reverse-complementing once more.
// NOTE(review): listing lines 19156, 19157, 19159 and 19164 are absent from this
// extract; presumably they declare `entry`, `feat`, `gene` and the
// scope/validator setup - confirm against the repository copy.
19154 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_GeneXrefStrandProblem)
19155 {
19158  feat->SetGeneXref().SetLocus("gene locus");
19160  gene->SetLocation().SetInt().SetStrand(eNa_strand_minus);
19161  gene->SetData().SetGene().SetLocus("gene locus");
19162  unit_test_util::AddFeat(gene, entry);
19163 
19165  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GeneXrefStrandProblem",
19166  "Gene cross-reference is not on expected strand"));
19167  // AddChromosomeNoLocation(expected_errors, entry);
19168  eval = validator.Validate(seh, options);
19169  CheckErrors(*eval, expected_errors);
19170 
       // Reverse-complement the entry; it is removed from the scope first and
       // re-added afterwards.
19171  scope.RemoveTopLevelSeqEntry(seh);
19172  unit_test_util::RevComp(entry);
19173  seh = scope.AddTopLevelSeqEntry(*entry);
19174  eval = validator.Validate(seh, options);
19175  CheckErrors(*eval, expected_errors);
19176 
       // Match by locus_tag instead of locus.
19177  feat->SetGeneXref().ResetLocus();
19178  feat->SetGeneXref().SetLocus_tag("LOCUSTAG");
19179  gene->SetData().SetGene().ResetLocus();
19180  gene->SetData().SetGene().SetLocus_tag("LOCUSTAG");
19181  eval = validator.Validate(seh, options);
19182  CheckErrors(*eval, expected_errors);
19183 
       // Reverse-complement again with the locus_tag cross-reference in place.
19184  scope.RemoveTopLevelSeqEntry(seh);
19185  unit_test_util::RevComp(entry);
19186  seh = scope.AddTopLevelSeqEntry(*entry);
19187  eval = validator.Validate(seh, options);
19188  CheckErrors(*eval, expected_errors);
19189 
19190  CLEAR_ERRORS
19191 }
19192 
19193 
// Test_SEQ_FEAT_CDSmRNAXrefLocationProblem: cross-references a CDS (local id 1)
// and an mRNA (local id 2) to each other, shortens the mRNA by one base at its
// end, and expects CDSmRNAXrefLocationProblem and CDSmRNArange warnings on
// lcl|nuc.
// NOTE(review): listing lines 19196-19198, 19204, 19210 and 19212 are absent from
// this extract; presumably they declare `entry`, `cds`, `mrna`, add the mRNA to
// the entry and perform the scope/validator setup - confirm against the
// repository copy.
19194 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSmRNAXrefLocationProblem)
19195 {
       // CDS has id 1 and points at feature id 2.
19199  cds->SetId().SetLocal().SetId(1);
19200  CRef<CSeqFeatXref> x1(new CSeqFeatXref());
19201  x1->SetId().SetLocal().SetId(2);
19202  cds->SetXref().push_back(x1);
19203 
       // mRNA has id 2 and points back at feature id 1.
19205  mrna->SetId().SetLocal().SetId(2);
19206  CRef<CSeqFeatXref> x2(new CSeqFeatXref());
19207  x2->SetId().SetLocal().SetId(1);
19208  mrna->SetXref().push_back(x2);
       // Pull the mRNA end in by one position.
19209  mrna->SetLocation().SetInt().SetTo(mrna->GetLocation().GetInt().GetTo() - 1);
19211 
19213 
19214  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSmRNAXrefLocationProblem",
19215  "CDS not contained within cross-referenced mRNA"));
19216  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSmRNArange",
19217  "mRNA overlaps or contains CDS but does not completely contain intervals"));
19218  // AddChromosomeNoLocation(expected_errors, entry);
19219  eval = validator.Validate(seh, options);
19220  CheckErrors(*eval, expected_errors);
19221  CLEAR_ERRORS
19222 }
19223 
19224 
// Test_SEQ_FEAT_IdenticalGeneSymbolAndSynonym: adds gene1 (locus "gene1",
// positions 0-3) and gene2 (locus "gene2", synonym "gene1", positions 4 to the
// end of the sequence) and expects an IdenticalGeneSymbolAndSynonym warning.
// NOTE(review): listing lines 19227 and 19244 are absent from this extract;
// presumably they declare `entry` and perform the scope/validator setup -
// confirm against the repository copy.
19225 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_IdenticalGeneSymbolAndSynonym)
19226 {
19228 
       // First gene: locus "gene1" on the first four positions.
19229  CRef<CSeq_feat> gene1(new CSeq_feat());
19230  gene1->SetData().SetGene().SetLocus("gene1");
19231  gene1->SetLocation().SetInt().SetId().Assign(*(entry->GetSeq().GetId().front()));
19232  gene1->SetLocation().SetInt().SetFrom(0);
19233  gene1->SetLocation().SetInt().SetTo(3);
19234  unit_test_util::AddFeat(gene1, entry);
19235 
       // Second gene: different locus, but its synonym repeats gene1's locus.
19236  CRef<CSeq_feat> gene2(new CSeq_feat());
19237  gene2->SetData().SetGene().SetLocus("gene2");
19238  gene2->SetData().SetGene().SetSyn().push_back("gene1");
19239  gene2->SetLocation().SetInt().SetId().Assign(*(entry->GetSeq().GetId().front()));
19240  gene2->SetLocation().SetInt().SetFrom(4);
19241  gene2->SetLocation().SetInt().SetTo(entry->GetSeq().GetLength() - 1);
19242  unit_test_util::AddFeat(gene2, entry);
19243 
19245 
19246  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "IdenticalGeneSymbolAndSynonym",
19247  "gene synonym has same value (gene1) as locus of another gene feature"));
19248  // AddChromosomeNoLocation(expected_errors, entry);
19249  eval = validator.Validate(seh, options);
19250  CheckErrors(*eval, expected_errors);
19251 
19252  CLEAR_ERRORS
19253 }
19254 
19255 
// Test_SEQ_FEAT_PartialProblem: makes a coding region 5' partial and shorter,
// shortens the protein to "PRKTEIN" (length 7), and expects six partial-related
// diagnostics on lcl|nuc. It then sets the CDS partial flag and expects the
// PartialsInconsistent entry (index 2) to disappear.
// NOTE(review): listing lines 19258, 19262, 19270, 19272 and 19275 are absent
// from this extract; presumably they declare `entry` and `cds_feat`, adjust the
// protein feature/partialness and perform the scope/validator setup - confirm
// against the repository copy.
19256 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PartialProblem)
19257 {
       // First set member is taken as the nucleotide, last as the protein.
19259  CRef<CSeq_entry> nuc = entry->SetSet().SetSeq_set().front();
19260  CRef<CSeq_entry> prot = entry->SetSet().SetSeq_set().back();
19261  CRef<CSeq_feat> prot_feat = prot->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
19263 
19264  // make coding region shorter, 5' partial
19265  cds_feat->SetLocation().SetInt().SetFrom(3);
19266  cds_feat->SetLocation().SetPartialStart(true, eExtreme_Biological);
19267  // shorten protein sequence
19268  prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("PRKTEIN");
19269  prot->SetSeq().SetInst().SetLength(7);
19271  // make protein sequence 3' partial
19273 
19274 
19276 
19277  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
19278  "Coding region and protein feature partials conflict"));
19279  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus5Prime",
19280  "5' partial is not at beginning of sequence, gap, or consensus splice site"));
19281  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
19282  "Inconsistent: Product= partial, Location= partial, Feature.partial= FALSE"));
19283  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop", "Got stop codon, but 3'end is labeled partial"));
19284  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem", "CDS is 3' complete but protein is CO2 partial"));
19285  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem", "CDS is 5' partial but protein is CO2 partial"));
19286  // AddChromosomeNoLocation(expected_errors, entry);
19287  eval = validator.Validate(seh, options);
19288  CheckErrors(*eval, expected_errors);
19289 
19290  // set partial on CDS, third error should go away
19291  cds_feat->SetPartial(true);
       // The slot is freed and nulled rather than erased, so the list keeps its
       // size; NOTE(review): presumably CheckErrors skips null entries - verify.
19292  delete expected_errors[2];
19293  expected_errors[2] = nullptr;
19294  eval = validator.Validate(seh, options);
19295  CheckErrors(*eval, expected_errors);
19296 
19297  CLEAR_ERRORS
19298 }
19299 
19300 
// Test_SEQ_FEAT_ProteinNameEndsInBracket: sets three product names that end in
// ']' and checks when the ProteinNameEndsInBracket warning is expected: for
// "[ends with bracket]" and for a name with no opening bracket, but not for a
// bracketed section starting with "[NAD".
// NOTE(review): listing lines 19303 and 19306 are absent from this extract;
// presumably they declare `entry` and perform the scope/validator setup -
// confirm against the repository copy.
19301 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ProteinNameEndsInBracket)
19302 {
19304  unit_test_util::SetNucProtSetProductName(entry, "something [ends with bracket]");
19305 
19307  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "ProteinNameEndsInBracket",
19308  "Protein name ends with bracket and may contain organism name"));
19309  // AddChromosomeNoLocation(expected_errors, entry);
19310  eval = validator.Validate(seh, options);
19311  CheckErrors(*eval, expected_errors);
19312 
19313  // report if no beginning bracket
19314  unit_test_util::SetNucProtSetProductName(entry, "something NAD with bracket]");
19315  eval = validator.Validate(seh, options);
19316  CheckErrors(*eval, expected_errors);
19317 
19318  CLEAR_ERRORS
19319  // no report if [NAD
19320 
       // No expectation is pushed before this validation.
19321  unit_test_util::SetNucProtSetProductName(entry, "something [NAD with bracket]");
19322  // AddChromosomeNoLocation(expected_errors, entry);
19323  eval = validator.Validate(seh, options);
19324  CheckErrors(*eval, expected_errors);
19325 
19326  CLEAR_ERRORS
19327 }
19328 
19329 
// Test_SEQ_FEAT_ShortIntron: builds a gene (0-59), a two-interval coding region
// (0-15 and 19-59) and an explicit intron feature (16-18) on one sequence. The
// three-base gap between the CDS intervals coincides with the intron feature.
// The test then checks which ShortIntron and related diagnostics are expected
// as the pseudo flag is moved between the CDS, the intron and the gene, and
// finally after the sequence data is replaced.
// NOTE(review): positions in the expected messages (16, 17, 19, 20) look
// 1-based relative to the 0-based interval bounds set here - confirm.
// NOTE(review): listing lines 19332 and 19369 are absent from this extract;
// presumably they declare `entry` and perform the scope/validator setup -
// confirm against the repository copy.
19330 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ShortIntron)
19331 {
19333  CRef<CSeq_id> id = entry->SetSeq().SetId().front();
19334 
19335  // add gene
19336  CRef<CSeq_feat> gene(new CSeq_feat());
19337  gene->SetData().SetGene().SetLocus("locus");
19338  gene->SetLocation().SetInt().SetFrom(0);
19339  gene->SetLocation().SetInt().SetTo(59);
19340  gene->SetLocation().SetInt().SetId().Assign(*id);
19341  unit_test_util::AddFeat(gene, entry);
19342 
19343  // add coding region
19344  CRef<CSeq_feat> cds(new CSeq_feat());
19345  cds->SetData().SetCdregion();
19346 
       // First CDS interval: 0-15.
19347  CRef<CSeq_loc> loc1(new CSeq_loc());
19348  loc1->SetInt().SetFrom(0);
19349  loc1->SetInt().SetTo(15);
19350  loc1->SetInt().SetId().Assign(*id);
19351 
       // Second CDS interval: 19-59.
19352  CRef<CSeq_loc> loc2(new CSeq_loc());
19353  loc2->SetInt().SetFrom(19);
19354  loc2->SetInt().SetTo(59);
19355  loc2->SetInt().SetId().Assign(*id);
19356 
19357  cds->SetLocation().SetMix().Set().push_back(loc1);
19358  cds->SetLocation().SetMix().Set().push_back(loc2);
19359  unit_test_util::AddFeat(cds, entry);
19360 
19361  // add intron
19362  CRef<CSeq_feat> intron(new CSeq_feat());
19363  intron->SetData().SetImp().SetKey("intron");
19364  intron->SetLocation().SetInt().SetFrom(16);
19365  intron->SetLocation().SetInt().SetTo(18);
19366  intron->SetLocation().SetInt().SetId().Assign(*id);
19367  unit_test_util::AddFeat(intron, entry);
19368 
19370 
       // Direct checks of the validator helper predicates on this CDS.
19371  BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
19372  BOOST_CHECK_EQUAL(validator::HasNoStop(*cds, &scope), true);
19373 
       // Baseline: nothing is pseudo, so two ShortIntron warnings are expected.
19374  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "StartCodon",
19375  "Illegal start codon used. Wrong genetic code [0] or protein should be partial"));
19376  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoStop",
19377  "Missing stop codon"));
19378  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
19379  "Splice donor consensus (GT) not found after exon ending at position 16 of lcl|good"));
19380  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
19381  "Splice acceptor consensus (AG) not found before exon starting at position 20 of lcl|good"));
19382  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingCDSproduct",
19383  "Expected CDS product absent"));
19384  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortIntron",
19385  "Introns should be at least 10 nt long"));
19386  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortIntron",
19387  "Introns at positions 16-20 should be at least 10 nt long"));
19388  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
19389  "Splice donor consensus (GT) not found at start of intron, position 17 of lcl|good"));
19390  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
19391  "Splice acceptor consensus (AG) not found at end of intron, position 19 of lcl|good"));
19392  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoProtein", "No protein Bioseq given"));
19393  // AddChromosomeNoLocation(expected_errors, entry);
19394  eval = validator.Validate(seh, options);
19395  CheckErrors(*eval, expected_errors);
19396 
19397  // set CDS pseudo, one ShortIntron error should go away
19398  cds->SetPseudo(true);
19399  CLEAR_ERRORS
19400  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortIntron",
19401  "Introns should be at least 10 nt long"));
19402  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
19403  "Splice donor consensus (GT) not found at start of intron, position 17 of lcl|good"));
19404  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
19405  "Splice acceptor consensus (AG) not found at end of intron, position 19 of lcl|good"));
19406  // AddChromosomeNoLocation(expected_errors, entry);
19407 
19408 
19409  eval = validator.Validate(seh, options);
19410  CheckErrors(*eval, expected_errors);
19411 
19412  // make cds not pseudo, intron pseudo, should still get one ShortIntron error
19413  cds->ResetPseudo();
19414  intron->SetPseudo(true);
19415 
19416  BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
19417  BOOST_CHECK_EQUAL(validator::HasNoStop(*cds, &scope), true);
19418 
19419  CLEAR_ERRORS
19420  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "StartCodon",
19421  "Illegal start codon used. Wrong genetic code [0] or protein should be partial"));
19422  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoStop",
19423  "Missing stop codon"));
19424  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
19425  "Splice donor consensus (GT) not found after exon ending at position 16 of lcl|good"));
19426  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
19427  "Splice acceptor consensus (AG) not found before exon starting at position 20 of lcl|good"));
19428  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingCDSproduct",
19429  "Expected CDS product absent"));
19430  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortIntron",
19431  "Introns at positions 16-20 should be at least 10 nt long"));
19432  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
19433  "Splice donor consensus (GT) not found at start of intron, position 17 of lcl|good"));
19434  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
19435  "Splice acceptor consensus (AG) not found at end of intron, position 19 of lcl|good"));
19436  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoProtein", "No protein Bioseq given"));
19437  // AddChromosomeNoLocation(expected_errors, entry);
19438  eval = validator.Validate(seh, options);
19439  CheckErrors(*eval, expected_errors);
19440 
19441  // clear both pseudo, make gene pseudo, both errors should go away
19442  intron->ResetPseudo();
19443  gene->SetPseudo(true);
19444  CLEAR_ERRORS
19445 
19446  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
19447  "Splice donor consensus (GT) not found at start of intron, position 17 of lcl|good"));
19448  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
19449  "Splice acceptor consensus (AG) not found at end of intron, position 19 of lcl|good"));
19450  // AddChromosomeNoLocation(expected_errors, entry);
19451 
19452  eval = validator.Validate(seh, options);
19453  CheckErrors(*eval, expected_errors);
19454  CLEAR_ERRORS
19455 
19456  // clear all pseudos
19457  gene->ResetPseudo();
19458  // nonsense intron silences coding region shortintron message
       // The replacement sequence has "TAA" at 0-based positions 16-18, i.e. the
       // intron span, which matches the IntronIsStopCodon expectation below.
19459  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGGCCAAAATTGGTAAAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAA");
19460 
19461  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoProtein",
19462  "No protein Bioseq given"));
19463  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "IntronIsStopCodon",
19464  "Triplet intron encodes stop codon"));
19465  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "StartCodon",
19466  "Illegal start codon used. Wrong genetic code [0] or protein should be partial"));
19467  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoStop",
19468  "Missing stop codon"));
19469  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingCDSproduct",
19470  "Expected CDS product absent"));
19471  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortIntron",
19472  "Introns should be at least 10 nt long"));
19473  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
19474  "Splice donor consensus (GT) not found after exon ending at position 16 of lcl|good"));
19475  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
19476  "Splice acceptor consensus (AG) not found before exon starting at position 20 of lcl|good"));
19477  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
19478  "Splice donor consensus (GT) not found at start of intron, position 17 of lcl|good"));
19479  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
19480  "Splice acceptor consensus (AG) not found at end of intron, position 19 of lcl|good"));
19481  // AddChromosomeNoLocation(expected_errors, entry);
19482 
19483  eval = validator.Validate(seh, options);
19484  CheckErrors(*eval, expected_errors);
19485  CLEAR_ERRORS
19486 }
19487 
19488 
// Test_SEQ_FEAT_NeedsNote: removes the comment from a misc feature and expects a
// MiscFeatureNeedsNote warning on lcl|good.
// NOTE(review): listing lines 19491, 19492 and 19495 are absent from this
// extract; presumably they declare `entry`, `misc` and the scope/validator
// setup - confirm against the repository copy.
19489 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_NeedsNote)
19490 {
19493  misc->ResetComment();
19494 
19496 
19497  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MiscFeatureNeedsNote",
19498  "A note or other qualifier is required for a misc_feature"));
19499  // AddChromosomeNoLocation(expected_errors, entry);
19500  eval = validator.Validate(seh, options);
19501  CheckErrors(*eval, expected_errors);
19502 
19503  CLEAR_ERRORS
19504 }
19505 
19506 
// Test_SEQ_FEAT_RptUnitRangeProblem: turns a feature into a repeat_region with
// the qualifier rpt_unit_range="1..70" and expects a RptUnitRangeProblem warning
// saying the range is not within the sequence length.
// NOTE(review): listing lines 19509, 19510 and 19517 are absent from this
// extract; presumably they declare `entry`, `misc` and the scope/validator
// setup - confirm against the repository copy.
19507 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_RptUnitRangeProblem)
19508 {
19511  misc->SetData().SetImp().SetKey("repeat_region");
19512  CRef<CGb_qual> qual(new CGb_qual());
19513  qual->SetQual("rpt_unit_range");
19514  qual->SetVal("1..70");
19515  misc->SetQual().push_back(qual);
19516 
19518 
19519  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "RptUnitRangeProblem",
19520  "/rpt_unit_range is not within sequence length"));
19521  // AddChromosomeNoLocation(expected_errors, entry);
19522  eval = validator.Validate(seh, options);
19523  CheckErrors(*eval, expected_errors);
19524 
19525  CLEAR_ERRORS
19526 }
19527 
19528 
// Test_SEQ_FEAT_TooManyInferenceAccessions: adds 50 misc features, each with 10
// /inference qualifiers listing 10 accessions (500 qualifiers and 5000
// accessions in total). It validates with eVal_inference_accns added to the
// options and expects a TooManyInferenceAccessions info message quoting those
// totals.
// NOTE(review): listing lines 19531 and 19549 are absent from this extract;
// presumably they declare `entry` and perform the scope/validator setup -
// confirm against the repository copy.
19529 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TooManyInferenceAccessions)
19530 {
19532 
19533  for (int i = 0; i < 50; i++) {
19534  CRef<CSeq_feat> misc = unit_test_util::AddMiscFeature(entry, i + 10);
19535  for (int j = 0; j < 10; j++) {
19536  CRef<CGb_qual> qual(new CGb_qual());
19537  qual->SetQual("inference");
       // Value is the prefix followed by ten comma-separated "INSD:AY<number>.1"
       // accessions; the number is k + j * 100 + 123400.
19538  string val = "similar to DNA sequence:";
19539  for (int k = 0; k < 10; k++) {
19540  val += "INSD:AY" + NStr::IntToString(k + j * 100 + 123400) + ".1";
19541  if (k < 9) {
19542  val += ",";
19543  }
19544  }
19545  qual->SetVal(val);
19546  misc->SetQual().push_back(qual);
19547  }
19548  }
19550 
19551  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "TooManyInferenceAccessions",
19552  "Skipping validation of 500 /inference qualifiers with 5000 accessions"));
19553  // AddChromosomeNoLocation(expected_errors, entry);
19554  eval = validator.Validate(seh, options | CValidator::eVal_inference_accns);
19555  CheckErrors(*eval, expected_errors);
19556 
19557  CLEAR_ERRORS
19558 }
19559 
19560 
// Body of the alignment-building helper that the tests in this file call as
// BuildSetAlign(entry). It creates a one-segment Dense-seg alignment with one
// row per Seq-entry in entry's set. Each row gets a copy of that sequence's
// first Seq-id and a start of 0. The single segment length is the length of the
// last sequence visited. Returns the new alignment.
// NOTE(review): the signature (listing line 19561) and listing line 19564 are
// absent from this extract; the function name is inferred from its call sites -
// confirm against the repository copy.
19562 {
19563  CRef<CSeq_align> align(new CSeq_align());
19565  align->SetSegs().SetDenseg().SetNumseg(1);
19566 
       // dim counts the rows; len keeps the length of the most recent sequence.
19567  int dim = 0;
19568  int len = 0;
19569 
19570  FOR_EACH_SEQENTRY_ON_SEQSET(s, entry->GetSet()) {
19571  dim++;
19572  CRef<CSeq_id> id(new CSeq_id());
19573  id->Assign(*((*s)->GetSeq().GetId().front()));
19574  align->SetSegs().SetDenseg().SetIds().push_back(id);
19575  align->SetSegs().SetDenseg().SetStarts().push_back(0);
19576 
19577  len = (*s)->GetSeq().GetInst().GetLength();
19578  }
       // The same row count is recorded on the alignment and on the Dense-seg.
19579  align->SetDim(dim);
19580  align->SetSegs().SetDenseg().SetDim(dim);
19581  align->SetSegs().SetDenseg().SetLens().push_back(len);
19582 
19583  return align;
19584 }
19585 
19586 
// Test_SEQ_ALIGN_SeqIdProblem (CGenBankFixture): builds a set alignment, renames
// the last row's Seq-id to local "good4", attaches the alignment to the set, and
// expects FastaLike, SeqIdProblem and PercentIdentity diagnostics reported
// against lcl|good1.
// NOTE(review): listing lines 19589, 19596 and 19606 are absent from this
// extract; presumably they declare `entry` and perform the scope/validator
// setup - confirm against the repository copy.
19587 BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_SeqIdProblem, CGenBankFixture)
19588 {
19590  CRef<CSeq_annot> annot(new CSeq_annot());
19591  CRef<CSeq_align> align = BuildSetAlign(entry);
       // Point the last alignment row at an id the expected message reports as
       // not found.
19592  align->SetSegs().SetDenseg().SetIds().back()->SetLocal().SetStr("good4");
19593  annot->SetData().SetAlign().push_back(align);
19594  entry->SetSet().SetAnnot().push_back(annot);
19595 
19597 
19598 
19599  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "FastaLike",
19600  "Fasta: This may be a fasta-like alignment for SeqId: lcl|good1 in the context of good1"));
19601  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "SeqIdProblem",
19602  "SeqId: The sequence corresponding to SeqId lcl|good4 could not be found."));
19603  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "PercentIdentity",
19604  "PercentIdentity: This alignment has a percent identity of 0%"));
19605  // AddChromosomeNoLocation(expected_errors, entry);
19607  eval = validator.Validate(seh, options);
19608  CheckErrors(*eval, expected_errors);
19609  CLEAR_ERRORS
19610 }
19611 
19612 
// Test_SEQ_ALIGN_DensegLenStart (CGenBankFixture): builds a two-segment Dense-seg
// by hand. Segment 1 has length 5 with every row starting at 0; segment 2 has
// starts 5, 6, 5 and length 10. The test expects a DensegLenStart error naming
// lcl|good2 in segment 1.
// NOTE(review): listing lines 19615, 19618, 19643 and 19648 are absent from this
// extract; presumably they declare `entry` and perform the scope/validator
// setup - confirm against the repository copy.
19613 BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_DensegLenStart, CGenBankFixture)
19614 {
19616 
19617  CRef<CSeq_align> align(new CSeq_align());
19619  align->SetSegs().SetDenseg().SetNumseg(2);
19620 
19621  int dim = 0;
19622 
       // One row per Seq-entry in the set; first-segment start is 0 for each.
19623  FOR_EACH_SEQENTRY_ON_SEQSET(s, entry->GetSet()) {
19624  dim++;
19625  CRef<CSeq_id> id(new CSeq_id());
19626  id->Assign(*((*s)->GetSeq().GetId().front()));
19627  align->SetSegs().SetDenseg().SetIds().push_back(id);
19628  align->SetSegs().SetDenseg().SetStarts().push_back(0);
19629  }
19630  align->SetDim(dim);
19631  align->SetSegs().SetDenseg().SetDim(dim);
19632 
       // Segment 1 length, then the second segment's starts; the middle start
       // is 6 where the other two are 5.
19633  align->SetSegs().SetDenseg().SetLens().push_back(5);
19634  align->SetSegs().SetDenseg().SetStarts().push_back(5);
19635  align->SetSegs().SetDenseg().SetStarts().push_back(6);
19636  align->SetSegs().SetDenseg().SetStarts().push_back(5);
19637  align->SetSegs().SetDenseg().SetLens().push_back(10);
19638 
19639  CRef<CSeq_annot> annot(new CSeq_annot());
19640  annot->SetData().SetAlign().push_back(align);
19641  entry->SetSet().SetAnnot().push_back(annot);
19642 
19644 
19645  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "DensegLenStart",
19646  "Start/Length: There is a problem with sequence lcl|good2, in segment 1 (near sequence position 0), context good1: the segment is too long or short or the next segment has an incorrect start position"));
19647  // AddChromosomeNoLocation(expected_errors, entry);
19649  eval = validator.Validate(seh, options);
19650  CheckErrors(*eval, expected_errors);
19651  CLEAR_ERRORS
19652 }
19653 
19654 
// Takes the alignment from BuildSetAlign(entry), turns it into two segments
// (lengths 5 and 60) with every second-segment start at 5, and expects one
// SumLenStart error per sequence (lcl|good1, lcl|good2, lcl|good3) because the
// second segment claims residues past the end of each sequence.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
19655 BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_SumLenStart, CGenBankFixture)
19656 {
19658  CRef<CSeq_align> align = BuildSetAlign(entry);
19659  align->SetSegs().SetDenseg().SetNumseg(2);
19660  align->SetSegs().SetDenseg().SetLens()[0] = 5;
19661  align->SetSegs().SetDenseg().SetLens().push_back(60);
19662 
       // Starts for the added second segment, one push per expected row.
19663  align->SetSegs().SetDenseg().SetStarts().push_back(5);
19664  align->SetSegs().SetDenseg().SetStarts().push_back(5);
19665  align->SetSegs().SetDenseg().SetStarts().push_back(5);
19666 
19667  CRef<CSeq_annot> annot(new CSeq_annot());
19668  annot->SetData().SetAlign().push_back(align);
19669  entry->SetSet().SetAnnot().push_back(annot);
19670 
19672 
19673  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "SumLenStart",
19674  "Start: In sequence lcl|good1, segment 2 (near sequence position 5) context good1, the alignment claims to contain residue coordinates that are past the end of the sequence. Either the sequence is too short, or there are extra characters or formatting errors in the alignment"));
19675  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "SumLenStart",
19676  "Start: In sequence lcl|good2, segment 2 (near sequence position 5) context good1, the alignment claims to contain residue coordinates that are past the end of the sequence. Either the sequence is too short, or there are extra characters or formatting errors in the alignment"));
19677  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "SumLenStart",
19678  "Start: In sequence lcl|good3, segment 2 (near sequence position 5) context good1, the alignment claims to contain residue coordinates that are past the end of the sequence. Either the sequence is too short, or there are extra characters or formatting errors in the alignment"));
19679  // AddChromosomeNoLocation(expected_errors, entry);
19681  eval = validator.Validate(seh, options);
19682  CheckErrors(*eval, expected_errors);
19683  CLEAR_ERRORS
19684 }
19685 
19686 
// Sets the Dense-seg dim of the BuildSetAlign(entry) alignment to 4 without
// touching its ids or starts, and expects AlignDimSeqIdNotMatch plus
// SegsStartsMismatch (the expected message reports 3 starts against 4).
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
19687 BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_AlignDimSeqIdNotMatch, CGenBankFixture)
19688 {
19690  CRef<CSeq_align> align = BuildSetAlign(entry);
19691  align->SetSegs().SetDenseg().SetDim(4);
19692 
19693  CRef<CSeq_annot> annot(new CSeq_annot());
19694  annot->SetData().SetAlign().push_back(align);
19695  entry->SetSet().SetAnnot().push_back(annot);
19696 
       // NOTE(review): installs a diagnostics filter excluding "1207.5";
       // presumably this silences a toolkit message triggered by the
       // inconsistent Dense-seg. No reset is visible in this block - confirm
       // whether the filter is meant to persist for later tests.
19697  SetDiagFilter(eDiagFilter_All, "!(1207.5)");
19700 
19701  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "AlignDimSeqIdNotMatch",
19702  "SeqId: The Seqalign has more or fewer ids than the number of rows in the alignment (context good1). Look for possible formatting errors in the ids."));
19703  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "SegsStartsMismatch",
19704  "The number of Starts (3) does not match the expected size of dim * numseg (4)"));
19705  // AddChromosomeNoLocation(expected_errors, entry);
19707  eval = validator.Validate(seh, options);
19708  CheckErrors(*eval, expected_errors);
19709  CLEAR_ERRORS
19710 }
19711 
19712 
// Two-phase test for the FastaLike warning.
//  1. Reverse-complements the first member of the set, attaches the
//     BuildSetAlign(entry) alignment and expects FastaLike plus a 0%
//     PercentIdentity warning.
//  2. Splits the alignment into two segments (5 + 55) and turns one
//     first-segment start into a gap (-1); only PercentIdentity is then
//     expected.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
19713 BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_FastaLike, CGenBankFixture)
19714 {
19716  unit_test_util::RevComp(entry->SetSet().SetSeq_set().front());
19717  CRef<CSeq_align> align = BuildSetAlign(entry);
19718 
19719  CRef<CSeq_annot> annot(new CSeq_annot());
19720  annot->SetData().SetAlign().push_back(align);
19721  entry->SetSet().SetAnnot().push_back(annot);
19722 
19724 
19725  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "FastaLike",
19726  "Fasta: This may be a fasta-like alignment for SeqId: lcl|good1 in the context of good1"));
19727  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "PercentIdentity",
19728  "PercentIdentity: This alignment has a percent identity of 0%"));
19729  // AddChromosomeNoLocation(expected_errors, entry);
19731  eval = validator.Validate(seh, options);
19732  CheckErrors(*eval, expected_errors);
19733 
19734  // fasta like error should disappear if there are 5' gaps or internal gaps
19735  align->SetSegs().SetDenseg().SetNumseg(2);
19736  align->SetSegs().SetDenseg().SetLens()[0] = 5;
19737  align->SetSegs().SetDenseg().SetLens().push_back(55);
       // Index 2 of the existing starts becomes a gap (-1) in the first segment.
19738  align->SetSegs().SetDenseg().SetStarts()[2] = -1;
19739  align->SetSegs().SetDenseg().SetStarts().push_back(5);
19740  align->SetSegs().SetDenseg().SetStarts().push_back(5);
19741  align->SetSegs().SetDenseg().SetStarts().push_back(5);
19742 
19743  CLEAR_ERRORS
19744  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "PercentIdentity",
19745  "PercentIdentity: This alignment has a percent identity of 0%"));
19746  // AddChromosomeNoLocation(expected_errors, entry);
19747 
19748  eval = validator.Validate(seh, options);
19749  CheckErrors(*eval, expected_errors);
19750 
19751  CLEAR_ERRORS
19752 }
19753 
19754 
// Clears the segs of the BuildSetAlign(entry) alignment with ResetSegs() and
// expects a single NullSegs error, reported with an empty accession string.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
19755 BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_NullSegs, CGenBankFixture)
19756 {
19758  CRef<CSeq_align> align = BuildSetAlign(entry);
19759  align->ResetSegs();
19760 
19761  CRef<CSeq_annot> annot(new CSeq_annot());
19762  annot->SetData().SetAlign().push_back(align);
19763  entry->SetSet().SetAnnot().push_back(annot);
19764 
19766 
19767  expected_errors.push_back(new CExpectedError("", eDiag_Error, "NullSegs",
19768  "Segs: This alignment is missing all segments. This is a non-correctable error -- look for serious formatting problems."));
19769  // AddChromosomeNoLocation(expected_errors, entry);
19771  eval = validator.Validate(seh, options);
19772  CheckErrors(*eval, expected_errors);
19773 
19774  CLEAR_ERRORS
19775 }
19776 
19777 
// Reshapes the BuildSetAlign(entry) alignment into three segments (lengths
// 5, 10, 55) in which every start added for the second segment is -1 (gap),
// and expects a SegmentGap error for segment 2.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
19778 BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_SegmentGap, CGenBankFixture)
19779 {
19781  CRef<CSeq_align> align = BuildSetAlign(entry);
19782  align->SetSegs().SetDenseg().SetNumseg(3);
19783  align->SetSegs().SetDenseg().SetLens()[0] = 5;
19784  align->SetSegs().SetDenseg().SetLens().push_back(10);
19785  align->SetSegs().SetDenseg().SetLens().push_back(55);
       // Second segment: three gap starts (-1).
19786  align->SetSegs().SetDenseg().SetStarts().push_back(-1);
19787  align->SetSegs().SetDenseg().SetStarts().push_back(-1);
19788  align->SetSegs().SetDenseg().SetStarts().push_back(-1);
       // Third segment: three starts at position 5.
19789  align->SetSegs().SetDenseg().SetStarts().push_back(5);
19790  align->SetSegs().SetDenseg().SetStarts().push_back(5);
19791  align->SetSegs().SetDenseg().SetStarts().push_back(5);
19792 
19793  CRef<CSeq_annot> annot(new CSeq_annot());
19794  annot->SetData().SetAlign().push_back(align);
19795  entry->SetSet().SetAnnot().push_back(annot);
19796 
19798 
19799  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "SegmentGap",
19800  "Segs: Segment 2 (near alignment position 5) in the context of good1 contains only gaps. Each segment must contain at least one actual sequence -- look for columns with all gaps and delete them."));
19801  // AddChromosomeNoLocation(expected_errors, entry);
19803  eval = validator.Validate(seh, options);
19804  CheckErrors(*eval, expected_errors);
19805 
19806  CLEAR_ERRORS
19807 }
19808 
19809 
// Reduces the Dense-seg of the BuildSetAlign(entry) alignment to a single row
// (dim 1, two ids and two starts popped) and expects an AlignDimOne error.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
19810 BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_AlignDimOne, CGenBankFixture)
19811 {
19813  CRef<CSeq_align> align = BuildSetAlign(entry);
19814  align->SetSegs().SetDenseg().SetDim(1);
19815  align->SetSegs().SetDenseg().SetIds().pop_back();
19816  align->SetSegs().SetDenseg().SetIds().pop_back();
19817  align->SetSegs().SetDenseg().SetStarts().pop_back();
19818  align->SetSegs().SetDenseg().SetStarts().pop_back();
19819 
19820  CRef<CSeq_annot> annot(new CSeq_annot());
19821  annot->SetData().SetAlign().push_back(align);
19822  entry->SetSet().SetAnnot().push_back(annot);
19823 
19825 
19826  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "AlignDimOne",
19827  "Dim: This seqalign apparently has only one sequence. Each alignment must have at least two sequences. context lcl|good1"));
19828  // AddChromosomeNoLocation(expected_errors, entry);
19830  eval = validator.Validate(seh, options);
19831  CheckErrors(*eval, expected_errors);
19832 
19833  CLEAR_ERRORS
19834 }
19835 
19836 
19838 {
       // NOTE(review): the declaration line of this test is missing from the
       // listing; judging by the expected error code this is the Segtype
       // test - confirm against the full file.
       //
       // Attaches an alignment whose segs choice is Sparse and expects a
       // Segtype warning mentioning "segtype 7"; then switches the same
       // alignment to Spliced and expects the message with "segtype 6".
       // NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
       // `expected_errors` are not declared in the visible lines; presumably
       // they come from setup lines/macros missing from this listing.
19840  CRef<CSeq_align> align(new CSeq_align());
19841  align->SetSegs().SetSparse();
19842 
19843  CRef<CSeq_annot> annot(new CSeq_annot());
19844  annot->SetData().SetAlign().push_back(align);
19845  entry->SetSet().SetAnnot().push_back(annot);
19846 
19848 
19849  expected_errors.push_back(new CExpectedError("", eDiag_Warning, "Segtype",
19850  "Segs: This alignment has an undefined or unsupported Seqalign segtype 7 (alignment number 1)"));
19851  // AddChromosomeNoLocation(expected_errors, entry);
19853  eval = validator.Validate(seh, options);
19854  CheckErrors(*eval, expected_errors);
19855 
       // Same expected entry is reused; only its message text changes.
19856  align->SetSegs().SetSpliced();
19857  expected_errors[0]->SetErrMsg("Segs: This alignment has an undefined or unsupported Seqalign segtype 6 (alignment number 1)");
19858  eval = validator.Validate(seh, options);
19859  CheckErrors(*eval, expected_errors);
19860 
19861  CLEAR_ERRORS
19862 }
19863 
19864 
// Attaches the BuildSetAlign(entry) alignment in an annotation whose
// descriptor is a user object of type "Blast Type" and expects a BlastAligns
// error.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
19865 BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_BlastAligns, CGenBankFixture)
19866 {
19868  CRef<CSeq_align> align = BuildSetAlign(entry);
19869 
19870  CRef<CSeq_annot> annot(new CSeq_annot());
19871  annot->SetData().SetAlign().push_back(align);
19872 
       // Annot descriptor carrying the "Blast Type" user-object type string.
19873  CRef<CAnnotdesc> ad(new CAnnotdesc());
19874  ad->SetUser().SetType().SetStr("Blast Type");
19875  annot->SetDesc().Set().push_back(ad);
19876  entry->SetSet().SetAnnot().push_back(annot);
19877 
19879 
19880  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "BlastAligns",
19881  "Record contains BLAST alignments"));
19882  // AddChromosomeNoLocation(expected_errors, entry);
19884  eval = validator.Validate(seh, options);
19885  CheckErrors(*eval, expected_errors);
19886 
19887  CLEAR_ERRORS
19888 }
19889 
19890 
// Replaces the IUPACna data of the first set member with a mostly-C sequence,
// attaches the BuildSetAlign(entry) alignment, and expects FastaLike plus a
// PercentIdentity warning reporting 43%.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
19891 BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_PercentIdentity, CGenBankFixture)
19892 {
19894  entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTGGCCAAAATTGGCCAA");
19895  CRef<CSeq_align> align = BuildSetAlign(entry);
19896 
19897  CRef<CSeq_annot> annot(new CSeq_annot());
19898  annot->SetData().SetAlign().push_back(align);
19899 
19900  entry->SetSet().SetAnnot().push_back(annot);
19901 
19903 
19904  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "FastaLike",
19905  "Fasta: This may be a fasta-like alignment for SeqId: lcl|good1 in the context of good1"));
19906  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "PercentIdentity",
19907  "PercentIdentity: This alignment has a percent identity of 43%"));
19908  // AddChromosomeNoLocation(expected_errors, entry);
19910  eval = validator.Validate(seh, options);
19911  CheckErrors(*eval, expected_errors);
19912 
19913  CLEAR_ERRORS
19914 }
19915 
19916 
19918 {
       // NOTE(review): the signature line is missing from this listing. The
       // body reads `entry` and returns a CRef<CSeq_align>, and a later test
       // calls BuildSetDendiagAlign(entry), so this is presumably that
       // helper - confirm against the full file.
       //
       // Builds a Seq-align holding a single Dense-diag with one row per
       // member of entry's set, every row starting at 0.
19919  CRef<CSeq_align> align(new CSeq_align());
19921 
19922  CRef<CDense_diag> diag(new CDense_diag());
19923 
19924 
19925  int dim = 0;
19926  int len = 0;
19927 
19928  FOR_EACH_SEQENTRY_ON_SEQSET(s, entry->GetSet()) {
19929  dim++;
19930  CRef<CSeq_id> id(new CSeq_id());
19931  id->Assign(*((*s)->GetSeq().GetId().front()));
19932  diag->SetIds().push_back(id);
19933  diag->SetStarts().push_back(0);
19934 
       // Overwritten on every pass, so the value that survives the loop is
       // the length of the last set member visited.
19935  len = (*s)->GetSeq().GetInst().GetLength();
19936  }
19937  align->SetDim(dim);
19938  diag->SetDim(dim);
19939  diag->SetLen(len);
19940  align->SetSegs().SetDendiag().push_back(diag);
19941 
19942  return align;
19943 }
19944 
19945 
// Attaches the Dense-diag alignment from BuildSetDendiagAlign(entry) to the
// set and expects an UnexpectedAlignmentType error plus a 0% PercentIdentity
// warning.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
19946 BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_UnexpectedAlignmentType, CGenBankFixture)
19947 {
19949  CRef<CSeq_align> align = BuildSetDendiagAlign(entry);
19950 
19951  CRef<CSeq_annot> annot(new CSeq_annot());
19952  annot->SetData().SetAlign().push_back(align);
19953 
19954  entry->SetSet().SetAnnot().push_back(annot);
19955 
19957 
19958  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "UnexpectedAlignmentType",
19959  "UnexpectedAlignmentType: This is not a DenseSeg alignment."));
19960  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "PercentIdentity",
19961  "PercentIdentity: This alignment has a percent identity of 0%"));
19962  // AddChromosomeNoLocation(expected_errors, entry);
19964  eval = validator.Validate(seh, options);
19965  CheckErrors(*eval, expected_errors);
19966 
19967  CLEAR_ERRORS
19968 }
19969 
19970 
19972 {
       // NOTE(review): the signature line is missing from this listing. The
       // body uses `entry`, `offset` and `len` and returns a CRef<CSeq_graph>;
       // tests call BuildGoodByteGraph(entry) and
       // BuildGoodByteGraph(entry, offset, len), so this is presumably that
       // helper, with len presumably defaulting to kInvalidSeqPos - confirm.
       //
       // Builds a byte graph titled "Phrap Quality" on entry's first Seq-id,
       // covering [offset, offset + len - 1], holding `len` values of 40 with
       // max, min and axis all set to 40.
19973  CRef<CSeq_graph> graph(new CSeq_graph());
19974  graph->SetTitle("Phrap Quality");
       // Sentinel length: cover from `offset` to the end of the sequence.
19975  if (len == kInvalidSeqPos) {
19976  len = entry->GetSeq().GetInst().GetLength() - offset;
19977  }
19978  graph->SetNumval(len);
19979  graph->SetLoc().SetInt().SetFrom(offset);
       // NOTE(review): assumes len > 0; with len == 0 the To position would
       // fall below `offset`. Callers visible in this chunk pass non-zero
       // lengths or rely on the sentinel.
19980  graph->SetLoc().SetInt().SetTo(offset + len - 1);
19981  graph->SetLoc().SetInt().SetId().Assign(*(entry->GetSeq().GetId().front()));
19982 
19983  for (size_t pos = 0; pos < len; pos++) {
19984  graph->SetGraph().SetByte().SetValues().push_back(40);
19985  }
19986 
19987 
19988  graph->SetGraph().SetByte().SetMax(40);
19989  graph->SetGraph().SetByte().SetMin(40);
19990  graph->SetGraph().SetByte().SetAxis(40);
19991  return graph;
19992 }
19993 
19994 
// Two-phase test of the byte graph minimum.
//  1. min = -1: expects a GraphMin warning.
//  2. min = 101: expects the GraphMin warning with the new value plus a
//     GraphBelow warning counting 60 scores below the reported minimum.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
19995 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphMin)
19996 {
19998  CRef<CSeq_graph> graph = BuildGoodByteGraph(entry);
19999  graph->SetGraph().SetByte().SetMin(-1);
20000  CRef<CSeq_annot> annot(new CSeq_annot());
20001  annot->SetData().SetGraph().push_back(graph);
20002  entry->SetSeq().SetAnnot().push_back(annot);
20003 
20005 
20006  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphMin",
20007  "Graph min (-1) out of range"));
20008  // AddChromosomeNoLocation(expected_errors, entry);
20009  eval = validator.Validate(seh, options);
20010  CheckErrors(*eval, expected_errors);
20011 
       // Second phase: reuse the first expected entry with an updated message
       // and add the GraphBelow expectation.
20012  graph->SetGraph().SetByte().SetMin(101);
20013  expected_errors[0]->SetErrMsg("Graph min (101) out of range");
20014  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphBelow",
20015  "60 quality scores have values below the reported minimum or 0"));
20016  eval = validator.Validate(seh, options);
20017  CheckErrors(*eval, expected_errors);
20018 
20019  CLEAR_ERRORS
20020 }
20021 
20022 
// Two-phase test of the byte graph maximum.
//  1. max = -1: expects a GraphMax error and a GraphAbove warning counting 60
//     scores above the reported maximum.
//  2. max = 101: expects only GraphMax, now at warning severity.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20023 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphMax)
20024 {
20026  CRef<CSeq_graph> graph = BuildGoodByteGraph(entry);
20027  graph->SetGraph().SetByte().SetMax(-1);
20028  CRef<CSeq_annot> annot(new CSeq_annot());
20029  annot->SetData().SetGraph().push_back(graph);
20030  entry->SetSeq().SetAnnot().push_back(annot);
20031 
20033 
20034  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphMax",
20035  "Graph max (-1) out of range"));
20036  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphAbove",
20037  "60 quality scores have values above the reported maximum or 100"));
20038  // AddChromosomeNoLocation(expected_errors, entry);
20039  eval = validator.Validate(seh, options);
20040  CheckErrors(*eval, expected_errors);
20041 
       // The GraphAbove expectation is dropped for the second phase: the
       // object is freed and its slot set to nullptr instead of being erased.
       // NOTE(review): presumably CheckErrors and CLEAR_ERRORS tolerate null
       // slots - confirm.
20042  delete expected_errors[1];
20043  expected_errors[1] = nullptr;
20044 
20045  graph->SetGraph().SetByte().SetMax(101);
20046  expected_errors[0]->SetErrMsg("Graph max (101) out of range");
20047  expected_errors[0]->SetSeverity(eDiag_Warning);
20048  eval = validator.Validate(seh, options);
20049  CheckErrors(*eval, expected_errors);
20050 
20051  CLEAR_ERRORS
20052 }
20053 
20054 
// Sets the graph's numval to 40 while leaving its stored byte values alone,
// and expects GraphByteLen (40 vs 60) and GraphBioseqLen (40 vs 60) errors.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20055 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphByteLen)
20056 {
20058  CRef<CSeq_graph> graph = BuildGoodByteGraph(entry);
20059  graph->SetNumval(40);
20060  CRef<CSeq_annot> annot(new CSeq_annot());
20061  annot->SetData().SetGraph().push_back(graph);
20062  entry->SetSeq().SetAnnot().push_back(annot);
20063 
20065 
20066  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphByteLen",
20067  "SeqGraph (40) and ByteStore (60) length mismatch"));
20068  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphBioseqLen",
20069  "SeqGraph (40) and Bioseq (60) length mismatch"));
20070  // AddChromosomeNoLocation(expected_errors, entry);
20071  eval = validator.Validate(seh, options);
20072  CheckErrors(*eval, expected_errors);
20073 
20074  CLEAR_ERRORS
20075 }
20076 
20077 
// Adds three 20-value graphs in the order offset 20, offset 0, offset 40 and
// expects a GraphOutOfOrder warning.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20078 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphOutOfOrder)
20079 {
20081  CRef<CSeq_annot> annot(new CSeq_annot());
20082  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 20, 20));
20083  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 0, 20));
20084  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 40, 20));
20085  entry->SetSeq().SetAnnot().push_back(annot);
20086 
20088 
20089  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphOutOfOrder",
20090  "Graph components are out of order - may be a software bug"));
20091  // AddChromosomeNoLocation(expected_errors, entry);
20092  eval = validator.Validate(seh, options);
20093  CheckErrors(*eval, expected_errors);
20094 
20095  CLEAR_ERRORS
20096 }
20097 
20098 
// Adds graphs of 11 values at offset 0 and 12 values at offset 22, and
// expects GraphBioseqLen (23 vs 24), GraphSeqLitLen (11 vs 12) and
// GraphStopPhase (10 vs 11) errors. This test also covers GraphStopPhase.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20099 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphSeqLitLen)
20100 {
20102  CRef<CSeq_annot> annot(new CSeq_annot());
20103  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 0, 11));
20104  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 22, 12));
20105  entry->SetSeq().SetAnnot().push_back(annot);
20106 
20108 
20109  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphBioseqLen",
20110  "SeqGraph (23) and Bioseq (24) length mismatch"));
20111  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphSeqLitLen",
20112  "SeqGraph (11) and SeqLit (12) length mismatch"));
20113  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphStopPhase",
20114  "SeqGraph (10) and SeqLit (11) stop do not coincide"));
20115  // AddChromosomeNoLocation(expected_errors, entry);
20116  eval = validator.Validate(seh, options);
20117  CheckErrors(*eval, expected_errors);
20118 
20119  CLEAR_ERRORS
20120 }
20121 
20122 
// Turns the first delta component into a Seq-loc interval (AY123456, 0..11),
// adds graphs of 13 values at offset 0 and 12 values at offset 22, and expects
// GraphGapScore, GraphBioseqLen (25 vs 24) and a GraphSeqLocLen warning
// (13 vs 12).
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20123 BOOST_FIXTURE_TEST_CASE(Test_SEQ_GRAPH_GraphSeqLocLen, CGenBankFixture)
20124 {
20126  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetGenbank().SetAccession("AY123456");
20127  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
20128  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(11);
20129 
20130  CRef<CSeq_annot> annot(new CSeq_annot());
       // First graph is one value longer than the 12-base interval above.
20131  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 0, 13));
20132  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 22, 12));
20133  entry->SetSeq().SetAnnot().push_back(annot);
20134 
20136 
20137  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphGapScore",
20138  "1 gap bases have positive score value"));
20139  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphBioseqLen",
20140  "SeqGraph (25) and Bioseq (24) length mismatch"));
20141  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphSeqLocLen",
20142  "SeqGraph (13) and SeqLoc (12) length mismatch"));
20143  // AddChromosomeNoLocation(expected_errors, entry);
20144  eval = validator.Validate(seh, options);
20145  CheckErrors(*eval, expected_errors);
20146 
20147  CLEAR_ERRORS
20148 }
20149 
20150 
// Adds graphs of 12 values at offset 0 and 13 values at offset 21, and
// expects GraphGapScore, GraphBioseqLen (25 vs 24), GraphSeqLitLen (13 vs 12)
// and GraphStartPhase (21 vs 22) errors.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20151 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphStartPhase)
20152 {
20154  CRef<CSeq_annot> annot(new CSeq_annot());
20155  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 0, 12));
20156  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 21, 13));
20157  entry->SetSeq().SetAnnot().push_back(annot);
20158 
20160 
20161  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphGapScore",
20162  "1 gap bases have positive score value"));
20163  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphBioseqLen",
20164  "SeqGraph (25) and Bioseq (24) length mismatch"));
20165  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphSeqLitLen",
20166  "SeqGraph (13) and SeqLit (12) length mismatch"));
20167  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphStartPhase",
20168  "SeqGraph (21) and SeqLit (22) start do not coincide"));
20169  // AddChromosomeNoLocation(expected_errors, entry);
20170  eval = validator.Validate(seh, options);
20171  CheckErrors(*eval, expected_errors);
20172 
20173  CLEAR_ERRORS
20174 }
20175 
20176 // note - GraphStopPhase exercised in Test_SEQ_GRAPH_GraphSeqLitLen
20177 
20178 
// Adds three graphs (6 values at 0, 6 values at 6, 12 values at 22) and
// expects GraphDiffNumber (3 graphs vs 2 literals) together with the
// GraphSeqLitLen / GraphStartPhase / GraphStopPhase errors listed below.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20179 BOOST_FIXTURE_TEST_CASE(Test_SEQ_GRAPH_GraphDiffNumber, CGenBankFixture)
20180 {
20182 
20183  CRef<CSeq_annot> annot(new CSeq_annot());
20184  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 0, 6));
20185  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 6, 6));
20186  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 22, 12));
20187  entry->SetSeq().SetAnnot().push_back(annot);
20188 
20190 
20191  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphSeqLitLen",
20192  "SeqGraph (6) and SeqLit (12) length mismatch"));
20193  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphStopPhase",
20194  "SeqGraph (5) and SeqLit (11) stop do not coincide"));
20195  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphSeqLitLen",
20196  "SeqGraph (6) and SeqLit (12) length mismatch"));
20197  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphStartPhase",
20198  "SeqGraph (6) and SeqLit (22) start do not coincide"));
20199  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphStopPhase",
20200  "SeqGraph (11) and SeqLit (33) stop do not coincide"));
20201  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphDiffNumber",
20202  "Different number of SeqGraph (3) and SeqLit (2) components"));
20203  // AddChromosomeNoLocation(expected_errors, entry);
20204  eval = validator.Validate(seh, options);
20205  CheckErrors(*eval, expected_errors);
20206 
20207  CLEAR_ERRORS
20208 }
20209 
20210 
// Adds graphs at offsets 0 and 22 (12 values each); the last value of the
// second graph is replaced by 0 and its min set to 0. Expects a
// GraphACGTScore warning reporting one zero-scored base at position 34.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20211 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphACGTScore)
20212 {
20214  CRef<CSeq_annot> annot(new CSeq_annot());
20215  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 0, 12));
20216  CRef<CSeq_graph> graph = BuildGoodByteGraph(entry, 22, 12);
       // Swap the final score (40) for 0 and lower the declared minimum to 0.
20217  graph->SetGraph().SetByte().SetValues().pop_back();
20218  graph->SetGraph().SetByte().SetValues().push_back(0);
20219  graph->SetGraph().SetByte().SetMin(0);
20220  annot->SetData().SetGraph().push_back(graph);
20221 
20222  entry->SetSeq().SetAnnot().push_back(annot);
20223 
20225 
20226  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphACGTScore",
20227  "1 ACGT bases have zero score value - first one at position 34"));
20228  // AddChromosomeNoLocation(expected_errors, entry);
20229  eval = validator.Validate(seh, options);
20230  CheckErrors(*eval, expected_errors);
20231 
20232  CLEAR_ERRORS
20233 }
20234 
20235 
// Puts a single N into the last delta literal ("CCCATNATGATG"), adds two
// all-40 graphs (offsets 0 and 22, 12 values each), and expects a GraphNScore
// warning reporting one positively scored N at position 28.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20236 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphNScore)
20237 {
20239  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("CCCATNATGATG");
20240 
20241  CRef<CSeq_annot> annot(new CSeq_annot());
20242  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 0, 12));
20243  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 22, 12));
20244 
20245  entry->SetSeq().SetAnnot().push_back(annot);
20246 
20248 
20249  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphNScore",
20250  "1 N bases have positive score value - first one at position 28"));
20251  // AddChromosomeNoLocation(expected_errors, entry);
20252  eval = validator.Validate(seh, options);
20253  CheckErrors(*eval, expected_errors);
20254 
20255  CLEAR_ERRORS
20256 }
20257 
20258 
// Adds three graphs (12 values at 0, 10 values at 12, 12 values at 22); the
// middle graph is the extra one. Expects GraphGapScore for 10 gap bases plus
// the GraphSeqLitLen / GraphStartPhase / GraphStopPhase / GraphDiffNumber
// errors listed below.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20259 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphGapScore)
20260 {
20262 
20263  CRef<CSeq_annot> annot(new CSeq_annot());
20264  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 0, 12));
20265  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 12, 10));
20266  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 22, 12));
20267 
20268  entry->SetSeq().SetAnnot().push_back(annot);
20269 
20271 
20272  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphGapScore",
20273  "10 gap bases have positive score value"));
20274  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphSeqLitLen",
20275  "SeqGraph (10) and SeqLit (12) length mismatch"));
20276  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphStartPhase",
20277  "SeqGraph (12) and SeqLit (22) start do not coincide"));
20278  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphStopPhase",
20279  "SeqGraph (21) and SeqLit (33) stop do not coincide"));
20280  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphDiffNumber",
20281  "Different number of SeqGraph (3) and SeqLit (2) components"));
20282  // AddChromosomeNoLocation(expected_errors, entry);
20283  eval = validator.Validate(seh, options);
20284  CheckErrors(*eval, expected_errors);
20285 
20286  CLEAR_ERRORS
20287 }
20288 
20289 
// Adds a 31-value graph at offset 0 and a 30-value graph at offset 30, so
// both cover position 30. Expects GraphOverlap and GraphBioseqLen (61 vs 60).
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20290 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphOverlap)
20291 {
20293 
20294  CRef<CSeq_annot> annot(new CSeq_annot());
20295  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 0, 31));
20296  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 30, 30));
20297 
20298  entry->SetSeq().SetAnnot().push_back(annot);
20299 
20301 
20302  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphOverlap",
20303  "Graph components overlap, with multiple scores for a single base"));
20304  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphBioseqLen",
20305  "SeqGraph (61) and Bioseq (60) length mismatch"));
20306  // AddChromosomeNoLocation(expected_errors, entry);
20307  eval = validator.Validate(seh, options);
20308  CheckErrors(*eval, expected_errors);
20309 
20310  CLEAR_ERRORS
20311 }
20312 
20313 
// Points the graph location at local id "good2" while the graph stays
// attached to the entry's own sequence. Expects a GraphBioseqId warning
// (reported against lcl|good2) and a critical GraphPackagingProblem.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20314 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphBioseqId)
20315 {
20317 
20318  CRef<CSeq_annot> annot(new CSeq_annot());
20319  CRef<CSeq_graph> graph = BuildGoodByteGraph(entry);
20320  graph->SetLoc().SetInt().SetId().SetLocal().SetStr("good2");
20321  annot->SetData().SetGraph().push_back(graph);
20322  entry->SetSeq().SetAnnot().push_back(annot);
20323 
20325 
20326  expected_errors.push_back(new CExpectedError("lcl|good2", eDiag_Warning, "GraphBioseqId",
20327  "Bioseq not found for Graph location good2"));
20328  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "GraphPackagingProblem",
20329  "There is 1 mispackaged graph in this record."));
20330  // AddChromosomeNoLocation(expected_errors, entry);
20331  eval = validator.Validate(seh, options);
20332  CheckErrors(*eval, expected_errors);
20333 
20334  CLEAR_ERRORS
20335 }
20336 
20337 
// Adds a normal 12-value graph at offset 0 and a second 12-value graph at
// offset 22 whose scores are all rewritten to 0 (min set to 0). Expects a
// GraphACGTScoreMany warning: 12 bases (50.00%), first at position 23.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20338 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphACGTScoreMany)
20339 {
20341  CRef<CSeq_annot> annot(new CSeq_annot());
20342  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 0, 12));
20343  CRef<CSeq_graph> graph = BuildGoodByteGraph(entry, 22, 12);
       // Replace every score with 0, keeping the value count equal to numval.
20344  graph->SetGraph().SetByte().ResetValues();
20345  for (size_t i = 0; i < graph->GetNumval(); i++) {
20346  graph->SetGraph().SetByte().SetValues().push_back(0);
20347  }
20348  graph->SetGraph().SetByte().SetMin(0);
20349  annot->SetData().SetGraph().push_back(graph);
20350 
20351  entry->SetSeq().SetAnnot().push_back(annot);
20352 
20354 
20355  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphACGTScoreMany",
20356  "12 ACGT bases (50.00%) have zero score value - first one at position 23"));
20357  // AddChromosomeNoLocation(expected_errors, entry);
20358  eval = validator.Validate(seh, options);
20359  CheckErrors(*eval, expected_errors);
20360 
20361  CLEAR_ERRORS
20362 }
20363 
20364 
// Sets the last delta literal to "ANNNNNNTGATG" (six Ns), adds two all-40
// graphs (offsets 0 and 22, 12 values each), and expects a GraphNScoreMany
// warning: 6 N bases (25.00%), first at position 24.
// The HighNpercent expectations are commented out and the whole second
// scenario below is compiled out with #if 0.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20365 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphNScoreMany)
20366 {
20368  entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("ANNNNNNTGATG");
20369 
20370  CRef<CSeq_annot> annot(new CSeq_annot());
20371  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 0, 12));
20372  annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 22, 12));
20373 
20374  entry->SetSeq().SetAnnot().push_back(annot);
20375 
20377 
20378  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphNScoreMany",
20379  "6 N bases (25.00%) have positive score value - first one at position 24"));
20380  /*
20381  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
20382  "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
20383  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
20384  "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
20385  */
20386  // AddChromosomeNoLocation(expected_errors, entry);
20387 
20388  eval = validator.Validate(seh, options);
20389  CheckErrors(*eval, expected_errors);
20390 
20391  CLEAR_ERRORS
20392 
       // Disabled scenario (never compiled): rebuilds the first literal as a
       // 60-base sequence with 20 Ns in the middle, attaches one deliberately
       // inconsistent graph and lists the errors that were expected from it.
20393 #if 0
20394 
20395  scope.RemoveTopLevelSeqEntry(seh);
20396  CSeq_literal& first_part = entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral();
20397  first_part.SetSeq_data().SetIupacna().Set("AAAAAAAAAAAAAAAAAAAANNNNNNNNNNNNNNNNNNNNTTTTTTTTTTTTTTTTTTTT");
20398  first_part.SetLength(60);
20399  entry->SetSeq().SetInst().SetLength(82);
20400  entry->SetSeq().ResetAnnot();
20401  CRef<CSeq_graph> bad_graph = BuildGoodByteGraph(entry, 0, 79);
20402  CSeq_graph_Base::C_Graph::TByte& bytes = bad_graph->SetGraph().SetByte();
20403  bytes.ResetValues();
20404  for (size_t pos = 0; pos < 20; pos++) {
20405  bytes.SetValues().push_back(0);
20406  }
20407  for (size_t pos = 20; pos < 40; pos++) {
20408  bytes.SetValues().push_back(114);
20409  }
20410  for (size_t pos = 40; pos < 70; pos++) {
20411  bytes.SetValues().push_back(21);
20412  }
20413  bytes.SetMax(-1);
20414  bytes.SetMin(0);
20415  bytes.SetAxis(5);
20416  CRef<CSeq_annot> annot2(new CSeq_annot());
20417  annot2->SetData().SetGraph().push_back(bad_graph);
20418  entry->SetSeq().SetAnnot().push_back(annot2);
20419 
20420  seh = scope.AddTopLevelSeqEntry(*entry);
20421 
20422  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphBioseqLen", "SeqGraph(79) and Bioseq(72) length mismatch"));
20423  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphMax", "Graph max(-1) out of range"));
20424  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphByteLen", "SeqGraph(79) and ByteStore(70) length mismatch"));
20425  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphACGTScoreMany", "23 ACGT bases(29.11%) have zero score value - first one at position 1"));
20426  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphNScoreMany", "20 N bases(25.32%) have positive score value - first one at position 21"));
20427  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphGapScore", "10 gap bases have positive score value"));
20428  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphAbove", "79 quality scores have values above the reported maximum or 100"));
20429  eval = validator.Validate(seh, options);
20430  CheckErrors(*eval, expected_errors);
20431 
20432  CLEAR_ERRORS
20433 #endif
20434 }
20435 
20436 
// Moves the graph location's To position to 61 and expects a GraphLocInvalid
// error whose message shows the location as lcl|good:1-62.
// NOTE(review): `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines; presumably they
// come from setup lines/macros missing from this listing - confirm.
20437 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphLocInvalid_1)
20438 {
20440 
20441  CRef<CSeq_annot> annot(new CSeq_annot());
20442  CRef<CSeq_graph> graph = BuildGoodByteGraph(entry);
20443  graph->SetLoc().SetInt().SetTo(61);
20444  annot->SetData().SetGraph().push_back(graph);
20445  entry->SetSeq().SetAnnot().push_back(annot);
20446 
20448 
20449  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphLocInvalid",
20450  "SeqGraph location (lcl|good:1-62) is invalid"));
20451  // AddChromosomeNoLocation(expected_errors, entry);
20452  eval = validator.Validate(seh, options);
20453  CheckErrors(*eval, expected_errors);
20454  CLEAR_ERRORS
20455 }
20456 
20457 
// Checks that a Seq-graph whose location has been reset produces a
// GraphLocInvalid error (with an empty accession and "Unknown" location text)
// plus a critical GraphPackagingProblem on "lcl|good".
// NOTE(review): this listing skips numbers 20460 and 20468, so the
// declarations of `entry`, `seh`, `eval`, `validator`, `options` and
// `expected_errors` are not visible here - confirm against the real source.
20458  BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphLocInvalid_2)
20459 {
20461 
20462  CRef<CSeq_annot> annot(new CSeq_annot());
20463  CRef<CSeq_graph> graph = BuildGoodByteGraph(entry);
       // Remove the location entirely before attaching the graph.
20464  graph->ResetLoc();
20465  annot->SetData().SetGraph().push_back(graph);
20466  entry->SetSeq().SetAnnot().push_back(annot);
20467 
20469 
       // The first expected error carries an empty accession string.
20470  expected_errors.push_back(new CExpectedError("", eDiag_Error, "GraphLocInvalid",
20471  "SeqGraph location (Unknown) is invalid"));
20472  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "GraphPackagingProblem",
20473  "There is 1 mispackaged graph in this record."));
20474  // AddChromosomeNoLocation(expected_errors, entry);
20475  eval = validator.Validate(seh, options);
20476  CheckErrors(*eval, expected_errors);
20477 
20478  CLEAR_ERRORS
20479 }
20480 
20481 
// Checks that a Seq-annot whose data choice is set to `ids` is reported with
// an AnnotIDs error on "lcl|good".
// NOTE(review): this listing skips numbers 20484 and 20489, so the
// declarations of `entry`, `seh`, `eval`, `validator`, `options` and
// `expected_errors` are not visible here - confirm against the real source.
20482  BOOST_AUTO_TEST_CASE(Test_SEQ_ANNOT_AnnotIDs)
20483 {
20485  CRef<CSeq_annot> annot(new CSeq_annot());
       // Selecting the `ids` variant is all that is done to the annotation.
20486  annot->SetData().SetIds();
20487  entry->SetSeq().SetAnnot().push_back(annot);
20488 
20490 
20491  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "AnnotIDs",
20492  "Record contains Seq-annot.data.ids"));
20493  // AddChromosomeNoLocation(expected_errors, entry);
20494  eval = validator.Validate(seh, options);
20495  CheckErrors(*eval, expected_errors);
20496 
20497  CLEAR_ERRORS
20498 }
20499 
20500 
// Checks that a Seq-annot whose data choice is set to `locs` is reported with
// an AnnotLOCs error on "lcl|good".
// NOTE(review): this listing skips numbers 20503 and 20508, so the
// declarations of `entry`, `seh`, `eval`, `validator`, `options` and
// `expected_errors` are not visible here - confirm against the real source.
20501  BOOST_AUTO_TEST_CASE(Test_SEQ_ANNOT_AnnotLOCs)
20502 {
20504  CRef<CSeq_annot> annot(new CSeq_annot());
       // Selecting the `locs` variant is all that is done to the annotation.
20505  annot->SetData().SetLocs();
20506  entry->SetSeq().SetAnnot().push_back(annot);
20507 
20509 
20510  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "AnnotLOCs",
20511  "Record contains Seq-annot.data.locs"));
20512  // AddChromosomeNoLocation(expected_errors, entry);
20513  eval = validator.Validate(seh, options);
20514  CheckErrors(*eval, expected_errors);
20515 
20516  CLEAR_ERRORS
20517 }
20518 
20519 
// Checks that a "gene_synonym" GenBank qualifier placed on a CDS feature is
// reported with a WrongQualOnCDS warning on "lcl|nuc".
// NOTE(review): this listing skips numbers 20522, 20523 and 20527, so the
// declarations of `cds`, `seh`, `eval`, `validator`, `options` and
// `expected_errors` are not visible here - confirm against the real source.
20520  BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_WrongQualOnCDS)
20521 {
       // Attach the qualifier under test to the coding-region feature.
20524  CRef<CGb_qual> qual(new CGb_qual("gene_synonym", "anything"));
20525  cds->SetQual().push_back(qual);
20526 
20528 
20529  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "WrongQualOnCDS",
20530  "gene_synonym should not be a gbqual on a CDS feature"));
20531  // AddChromosomeNoLocation(expected_errors, entry);
20532  eval = validator.Validate(seh, options);
20533  CheckErrors(*eval, expected_errors);
20534 
20535  CLEAR_ERRORS
20536 }
20537 
20538 
20539 BOOST_AUTO_TEST_CASE(Test_FixLatLonFormat)
20540 {
20541  string to_fix;
20542  string fixed;
20543 
20544 
20545  bool format_correct;
20546  bool precision_correct;
20547  bool lat_in_range;
20548  bool lon_in_range;
20549  double lat_value;
20550  double lon_value;
20551 
20552  CSubSource::IsCorrectLatLonFormat("53.43.20 N 7.43.20 E", format_correct, precision_correct,
20553  lat_in_range, lon_in_range,
20554  lat_value, lon_value);
20555  BOOST_CHECK(!format_correct);
20556 
20557 }
20558 
20559 
// Exercises CSubSource::ValidateLatLonCountry on three country/lat-lon pairs.
// Each case checks the reported error code, the returned message, and the
// value left in `latlon` after the call (which differs from the input, so the
// call rewrites it).
// NOTE(review): this listing skips number 20565, so the declaration of
// `errcode` is not visible here - confirm against the real source.
20560  BOOST_AUTO_TEST_CASE(Test_FixLatLonCountry)
20561 {
20562  string latlon;
20563  string country;
20564  string error;
20566 
       // Case 1: the longitude hemisphere is expected to change from E to W.
20567  latlon = "35 N 80 E";
20568  country = "USA";
20569  error = CSubSource::ValidateLatLonCountry(country, latlon, false, errcode);
20570  BOOST_CHECK_EQUAL(errcode, CSubSource::eLatLonCountryErr_Value);
20571  BOOST_CHECK_EQUAL(error, "Longitude should be set to W (western hemisphere)");
20572  BOOST_CHECK_EQUAL(latlon, "35.00 N 80.00 W");
20573 
       // Case 2: the latitude hemisphere is expected to change from N to S.
20574  latlon = "25 N 47 E";
20575  country = "Madagascar";
20576  error = CSubSource::ValidateLatLonCountry(country, latlon, false, errcode);
20577  BOOST_CHECK_EQUAL(errcode, CSubSource::eLatLonCountryErr_Value);
20578  BOOST_CHECK_EQUAL(error, "Latitude should be set to S (southern hemisphere)");
20579  BOOST_CHECK_EQUAL(latlon, "25.00 S 47.00 E");
20580 
       // Case 3: latitude and longitude are expected to be swapped.
20581  latlon = "15 N 47 E";
20582  country = "Austria";
20583  error = CSubSource::ValidateLatLonCountry(country, latlon, false, errcode);
20584  BOOST_CHECK_EQUAL(errcode, CSubSource::eLatLonCountryErr_Value);
20585  BOOST_CHECK_EQUAL(error, "Latitude and longitude values appear to be exchanged");
20586  BOOST_CHECK_EQUAL(latlon, "47.00 N 15.00 E");
20587 
20588 }
20589 
20590 
// Builds a three-exon coding region whose middle exon is 15 bases long and
// checks that validation reports a ShortExon warning for it on "lcl|nuc".
// NOTE(review): this listing skips numbers 20593, 20596 and 20650, so the
// declarations of `entry`, `cds`, `seh`, `eval`, `validator`, `options` and
// `expected_errors` are not visible here - confirm against the real source.
20591  BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ShortExon)
20592 {
       // First and last members of the set are used as the nucleotide and
       // protein sequences respectively.
20594  CRef<CSeq_entry> nseq = entry->SetSet().SetSeq_set().front();
20595  CRef<CSeq_entry> pseq = entry->SetSet().SetSeq_set().back();
20597  CRef<CSeq_feat> prot = pseq->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
20598 
       // Sequence building blocks.
20599  string start = "ATG";
20600  string stop = "TAA";
20601  string splice_left = "GT";
20602  string splice_right = "AG";
20603  string fifteen = "CCCAGAAAAACAGGT";
20604 
       // Lengths: first_exon 18, intron 19, second_exon 15, third_exon 18.
20605  string first_exon = start + fifteen;
20606  string intron = splice_left + fifteen + splice_right;
20607  string second_exon = fifteen;
20608  string third_exon = fifteen + stop;
20609 
20610  string nuc_str = first_exon + intron + second_exon + intron + third_exon;
20611  nseq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set(nuc_str);
20612  nseq->SetSeq().SetInst().SetLength(nuc_str.length());
20613 
       // `offset` walks along nuc_str; each exon interval is [from, to] with
       // 0-based inclusive coordinates.  Exon 1 covers 0-17.
20614  CRef<CSeq_loc> loc1(new CSeq_loc());
20615  loc1->SetInt().SetId().SetLocal().SetStr("nuc");
20616  loc1->SetInt().SetFrom(0);
20617  TSeqPos offset = first_exon.length();
20618  loc1->SetInt().SetTo(offset - 1);
20619 
       // Exon 2 covers 37-51, which the expected message prints as 38-52.
20620  offset += intron.length();
20621  CRef<CSeq_loc> loc2(new CSeq_loc());
20622  loc2->SetInt().SetId().SetLocal().SetStr("nuc");
20623  loc2->SetInt().SetFrom(offset);
20624  offset += second_exon.length();
20625  loc2->SetInt().SetTo(offset - 1);
20626 
20627 
       // Exon 3 covers 71-88.
20628  offset += intron.length();
20629  CRef<CSeq_loc> loc3(new CSeq_loc());
20630  loc3->SetInt().SetId().SetLocal().SetStr("nuc");
20631  loc3->SetInt().SetFrom(offset);
20632  offset += third_exon.length();
20633  loc3->SetInt().SetTo(offset - 1);
20634 
       // The CDS location becomes a mix of the three exon intervals.
20635  cds->SetLocation().SetMix().Set().push_back(loc1);
20636  cds->SetLocation().SetMix().Set().push_back(loc2);
20637  cds->SetLocation().SetMix().Set().push_back(loc3);
20638 
       // Translate the spliced exons and drop a trailing "*" if present, then
       // store the result as the protein sequence and resize the protein
       // feature to match.
20639  string loc_str = first_exon + second_exon + third_exon;
20640  string prot_str;
20641  CSeqTranslator::Translate(loc_str, prot_str);
20642  if (NStr::EndsWith(prot_str, "*")) {
20643  prot_str = prot_str.substr(0, prot_str.length() - 1);
20644  }
20645  pseq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set(prot_str);
20646  pseq->SetSeq().SetInst().SetLength(prot_str.length());
20647 
20648  prot->SetLocation().SetInt().SetTo(prot_str.length() - 1);
20649 
20651 
20652  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "ShortExon",
20653  "Internal coding region exon is too short at position 38-52"));
20654  // AddChromosomeNoLocation(expected_errors, entry);
20655  eval = validator.Validate(seh, options);
20656  CheckErrors(*eval, expected_errors);
20657 
20658  CLEAR_ERRORS
20659 }
20660 
20661 
// Adds a second protein feature (different name, starting at position 1) to
// the protein sequence and checks that validation reports the
// ExtraProteinFeature error twice on "lcl|prot".
// NOTE(review): this listing skips numbers 20664 and 20670, so the
// declarations of `entry`, `seh`, `eval`, `validator`, `options` and
// `expected_errors` are not visible here - confirm against the real source.
20662  BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ExtraProteinFeature)
20663 {
20665  CRef<CSeq_entry> pseq = entry->SetSet().SetSeq_set().back();
20666  CRef<CSeq_feat> second_prot = AddProtFeat(pseq);
20667  second_prot->SetData().SetProt().SetName().front() = "different name";
20668  second_prot->SetLocation().SetInt().SetFrom(1);
20669 
20671 
       // Two identical errors are expected - presumably one per protein
       // feature; TODO confirm.
20672  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "ExtraProteinFeature",
20673  "Protein sequence has multiple unprocessed protein features"));
20674  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "ExtraProteinFeature",
20675  "Protein sequence has multiple unprocessed protein features"));
20676  // AddChromosomeNoLocation(expected_errors, entry);
20677  eval = validator.Validate(seh, options);
20678  CheckErrors(*eval, expected_errors);
20679 
20680  CLEAR_ERRORS
20681 }
20682 
20683 
// Table of expectations for CSubSource::FixDateFormat, with one check each for
// CSubSource::GetCollectionDateProblem and CSubSource::IsCorrectDateFormat.
// An expected value of "" means FixDateFormat is expected to return an empty
// string for that input.
20684  BOOST_AUTO_TEST_CASE(Test_FixFormatDate)
20685 {
20686  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("999"), "");
20687  BOOST_CHECK_EQUAL(CSubSource::GetCollectionDateProblem("999"), "Collection_date format is not in DD-Mmm-YYYY format");
20688 
20689  //ISO dates are fine as they are
20690  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2014-08-10T12:23:30Z"), "2014-08-10T12:23:30Z");
20691  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2014-08-10T12:23Z"), "2014-08-10T12:23Z");
20692  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2014-08-10T12Z"), "2014-08-10T12Z");
20693  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2014-08-10T12+00:00"), "2014-08-10T12+00:00");
20694 
       // Free text mentioning two dates: flagged as bad format and not fixable.
20695  bool bad_format = false;
20696  bool in_future = false;
20697  CSubSource::IsCorrectDateFormat("collection date: Nov-2010 and Dec-2012", bad_format, in_future);
20698  BOOST_CHECK_EQUAL(true, bad_format);
20699  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("collection date: Nov-2010 and Dec-2012"), "");
20700 
       // Assorted real-world inputs.  In the expectations below, a number
       // paired only with a month is expanded to a four-digit year (e.g.
       // "1-Apr" -> "Apr-2001"), and digit-only strings yield "".
       // NOTE(review): the two-digit-year expectations expand 24 to 2024
       // ("July 24", "September 24") but 25 and 27 to 1925/1927 ("June 25",
       // "August 27").  That cutoff may depend on the current date - confirm
       // these expectations cannot go stale over time.
20701  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("20-12-2014"), "20-Dec-2014");
20702  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Dec-12-2014"), "12-Dec-2014");
20703  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Sep-11"), "Sep-2011");
20704  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("missing"), "");
20705  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("n/a"), "");
20706  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("10-Apr-93"), "10-Apr-1993");
20707  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("1-Apr"), "Apr-2001");
20708  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("#Date"), "");
20709  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("05122011"), "");
20710  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("08-Mar"), "Mar-2008");
20711  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("08022011"), "");
20712  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("1-May"), "May-2001");
20713  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("10-Apr"), "Apr-2010");
20714  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("10-Dec"), "Dec-2010");
20715  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("10-May"), "May-2010");
20716  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("10-Nov"), "Nov-2010");
20717  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("10022011"), "");
20718  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("10082010"), "");
20719  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("1082009"), "");
20720  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("11-Sep"), "Sep-2011");
20721  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("12-Apr"), "Apr-2012");
20722  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("12-Aug"), "Aug-2012");
20723  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("12-Dec"), "Dec-2012");
20724  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("12-Feb"), "Feb-2012");
20725  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("12-Jun"), "Jun-2012");
20726  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("12-Nov"), "Nov-2012");
20727  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("12-Oct"), "Oct-2012");
20728  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("13072010"), "");
20729  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("14-Apr-97"), "14-Apr-1997");
20730  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("14092010"), "");
20731  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("14122011"), "");
20732  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("15/05/98"), "15-May-1998");
20733  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("15072010"), "");
20734  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("15082010"), "");
20735  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("17-Mar-96"), "17-Mar-1996");
20736  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("17062011"), "");
20737  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("19-Jul-99"), "19-Jul-1999");
20738  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("19-Sep-97"), "19-Sep-1997");
20739  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("19012012"), "");
20740  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2-Aug"), "Aug-2002");
20741  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2-Jan-98"), "02-Jan-1998");
20742  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("20-Jun-91"), "20-Jun-1991");
20743  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("20009-04-14"), "");
20744  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("20072010"), "");
20745  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("20082010"), "");
20746  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("20090415"), "");
20747  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("202008-01-26"), "");
20748  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("202008-01-27"), "");
20749  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("202008-08-25"), "");
20750  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("21-Mar-96"), "21-Mar-1996");
20751  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2209"), "");
20752  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("23-Oct-94"), "23-Oct-1994");
20753  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("25-Apr-20010"), "");
20754  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("25-Jun-99"), "25-Jun-1999");
20755  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("25012012"), "");
20756  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("26-Apr-20010"), "");
20757  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("26-Feb-51"), "26-Feb-1951");
20758  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("27072010"), "");
20759  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("29-Apr-20010"), "");
20760  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("29-May-98"), "29-May-1998");
20761  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("29-Sep-94"), "29-Sep-1994");
20762  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("3-Jan"), "Jan-2003");
20763  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("3-Mar-93"), "03-Mar-1993");
20764  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("3082010"), "");
20765  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("31082010"), "");
20766  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39259"), "");
20767  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39517"), "");
20768  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39681"), "");
20769  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39762"), "");
20770  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39846"), "");
20771  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39855"), "");
20772  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39873"), "");
20773  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39898"), "");
20774  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39903"), "");
20775  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39910"), "");
20776  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39917"), "");
20777  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39926"), "");
20778  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39980"), "");
20779  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39982"), "");
20780  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("4-Feb"), "Feb-2004");
20781  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40010"), "");
20782  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40035"), "");
20783  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40057"), "");
20784  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40070"), "");
20785  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40087"), "");
20786  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40093"), "");
20787  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40313"), "");
20788  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40359"), "");
20789  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40360"), "");
20790  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40361"), "");
20791  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40367"), "");
20792  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40368"), "");
20793  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40370"), "");
20794  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40379"), "");
20795  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40428"), "");
20796  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40995"), "");
20797  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Oct-20006"), "");
20798  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Sep"), "Sep-2006");
20799  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("7-Dec"), "Dec-2007");
20800  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("8-Jul"), "Jul-2008");
20801  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("8-Sep-99"), "08-Sep-1999");
20802  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("9-Jul"), "Jul-2009");
20803  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("9-Jul-84"), "09-Jul-1984");
20804  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("9-Jun"), "Jun-2009");
20805  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("9-Sep"), "Sep-2009");
20806  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Apr-01"), "Apr-2001");
20807  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Apr-10"), "Apr-2010");
20808  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Aug-05"), "Aug-2005");
20809  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Aug-08"), "Aug-2008");
20810  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Aug-12"), "Aug-2012");
20811  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("August 13"), "Aug-2013");
20812  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("August 27"), "Aug-1927");
20813  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Dec-05"), "Dec-2005");
20814  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Dec-12"), "Dec-2012");
20815  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Dec-98"), "Dec-1998");
20816  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Feb-12"), "Feb-2012");
20817  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Feb-13"), "Feb-2013");
20818  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jan-06"), "Jan-2006");
20819  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jan-13"), "Jan-2013");
20820  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jan-96"), "Jan-1996");
20821  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jul-04"), "Jul-2004");
20822  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jul-08"), "Jul-2008");
20823  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("July 24"), "Jul-2024");
20824  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("July 9"), "Jul-2009");
20825  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jun-05"), "Jun-2005");
20826  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jun-08"), "Jun-2008");
20827  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jun-10"), "Jun-2010");
20828  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jun-12"), "Jun-2012");
20829  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jun-67"), "Jun-1967");
20830  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jun-80"), "Jun-1980");
20831  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("June 11"), "Jun-2011");
20832  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("June 25"), "Jun-1925");
20833  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Mar-02"), "Mar-2002");
20834  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Mar-05"), "Mar-2005");
20835  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Mar-09"), "Mar-2009");
20836  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Mar-10"), "Mar-2010");
20837  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Mar-11"), "Mar-2011");
20838  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Mar-12"), "Mar-2012");
20839  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("May 21"), "May-2021");
20840  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("May 7"), "May-2007");
20841  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("May-05"), "May-2005");
20842  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("May-08"), "May-2008");
20843  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("May-09"), "May-2009");
20844  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("May-10"), "May-2010");
20845  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("May-11"), "May-2011");
20846  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Nov-10"), "Nov-2010");
20847  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Nov-11"), "Nov-2011");
20848  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Oct-05"), "Oct-2005");
20849  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Oct-10"), "Oct-2010");
20850  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("October 8"), "Oct-2008");
20851  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Sep-05"), "Sep-2005");
20852  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Sep-08"), "Sep-2008");
20853  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Sep-09"), "Sep-2009");
20854  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Sep-12"), "Sep-2012");
20855  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Sep-93"), "Sep-1993");
20856  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("September 10"), "Sep-2010");
20857  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("September 24"), "Sep-2024");
20858  // fix leading/trailing spaces
       // (only a leading space is exercised by the check below)
20859  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat(" 2010-03-01"), "2010-03-01");
20860 
20861  // ISO Format dates are not ambiguous
20862  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2010-03-01"), "2010-03-01");
20863 
20864  // if one token is NOT zero-padded and less than 10, and the other is either
20865  // 10 or more or IS zero-padded, then the token that is not padded and less
20866  // than 10 is the day, and the other is the year, to which we should add 2000
20867  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Apr-04"), "06-Apr-2004");
20868  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Aug-09"), "06-Aug-2009");
20869  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Feb-08"), "06-Feb-2008");
20870  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Jan-11"), "06-Jan-2011");
20871  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Jun-11"), "06-Jun-2011");
20872  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Jun-12"), "06-Jun-2012");
20873  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-May-03"), "06-May-2003");
20874  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Nov-08"), "06-Nov-2008");
20875  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Oct-09"), "06-Oct-2009");
20876 
20877 
20878  // check for days not in month
       // (includes leap-year cases: 29-Feb is kept for 2012 and 2000 but
       // rejected for 2013 and 1900)
20879  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("31-Jun-2013"), "");
20880  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("29-Feb-2013"), "");
20881  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("29-Feb-2012"), "29-Feb-2012");
20882  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("29-Feb-2000"), "29-Feb-2000");
20883  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("29-Feb-1900"), "");
20884  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("01/01/1900"), "01-Jan-1900");
20885  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("04/04/2013"), "04-Apr-2013");
20886  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("11/11/2003"), "11-Nov-2003");
20887 
20888  // look for "named numbers"
20889  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6th July 2010"), "06-Jul-2010");
20890  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("February 24th, 2012"), "24-Feb-2012");
20891  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("1st December 2012"), "01-Dec-2012");
20892  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2nd December 2012"), "02-Dec-2012");
20893  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("3rd December 2012"), "03-Dec-2012");
20894 
20895  // unusual delimiters
20896  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("July-15_2011"), "15-Jul-2011");
20897  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("03-Aug=2011"), "03-Aug-2011");
20898  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jul=2010"), "Jul-2010");
20899  BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("30.12.1998"), "30-Dec-1998");
20900 
20901 }
20902 
20903 
20904 BOOST_AUTO_TEST_CASE(Test_DetectDateFormat)
20905 {
20906  bool ambiguous;
20907  bool day_first;
20908 
20909  CSubSource::DetectDateFormat("1-1-2010", ambiguous, day_first);
20910  BOOST_CHECK_EQUAL(ambiguous, true);
20911 
20912  CSubSource::DetectDateFormat("1-6-2010", ambiguous, day_first);
20913  BOOST_CHECK_EQUAL(ambiguous, true);
20914 
20915  CSubSource::DetectDateFormat("7-15-2010", ambiguous, day_first);
20916  BOOST_CHECK_EQUAL(ambiguous, false);
20917  BOOST_CHECK_EQUAL(day_first, false);
20918 
20919  CSubSource::DetectDateFormat("2010 8 24", ambiguous, day_first);
20920  BOOST_CHECK_EQUAL(ambiguous, false);
20921  BOOST_CHECK_EQUAL(day_first, false);
20922 
20923  CSubSource::DetectDateFormat("31-5-2008", ambiguous, day_first);
20924  BOOST_CHECK_EQUAL(ambiguous, false);
20925  BOOST_CHECK_EQUAL(day_first, true);
20926 
20927 }
20928 
20929 
// Helper: runs CCountries::USAStateCleanup on `before` and checks that the
// returned string equals `after` and that the cleanup type it reports equals
// `expected` (both compared as int).
// NOTE(review): this listing skips number 20932, so the declaration of
// `type` is not visible here - confirm against the real source.
20930  static void s_USAStateTest(string before, string after, CCountries::EStateCleanup expected)
20931 {
20933  string result = CCountries::USAStateCleanup(before, type);
20934  BOOST_CHECK_EQUAL(result, after);
20935  BOOST_CHECK_EQUAL((int) type, (int) expected);
20936 }
20937 
20938 
// Exercises CCountries::USAStateCleanup through s_USAStateTest in two phases:
// first without any entries in `exm`, then again after `exm` has been filled,
// where several inputs are expected to give a different string and/or a
// different cleanup type than in the first phase.
// NOTE(review): this listing skips numbers 20971 and 20983, so the
// declaration of `exm` and whatever follows the assignments (presumably the
// call that installs the map) are not visible here - confirm against the real
// source.
20939  BOOST_AUTO_TEST_CASE(Test_USAStateCleanup)
20940 {
       // Phase 1: expectations before any `exm` entries are assigned.
20941  s_USAStateTest("Puerto Rico: San Juan", "USA: Puerto Rico, San Juan", CCountries::e_Corrected );
20942  s_USAStateTest("USA: Puerto Rico", "USA: Puerto Rico", CCountries::e_Valid );
20943  s_USAStateTest("USA: Puerto Rico, Florida", "USA: Puerto Rico, Florida", CCountries::e_Ambiguous );
20944  s_USAStateTest("USA: Florida, Puerto Rico", "USA: Florida, Puerto Rico", CCountries::e_Ambiguous );
20945 
20946  s_USAStateTest("USA: Bethesda, State Of maryland", "USA: Maryland, Bethesda", CCountries::e_Corrected );
20947  s_USAStateTest("USA:NY", "USA: New York", CCountries::e_Corrected );
20948  s_USAStateTest("USA: Delaware, county South carolina", "USA: Delaware, county South carolina", CCountries::e_Valid );
20949  s_USAStateTest("USA:LA, EastBatonRougeParish", "USA: Louisiana, East Baton Rouge Parish", CCountries::e_Corrected );
20950  s_USAStateTest("USA: DeSoto Parish, Louisiana", "USA: Louisiana, DeSoto Parish", CCountries::e_Corrected );
20951  s_USAStateTest("USA: Napa, Solano, Yolo, Marin Counties, CA", "USA: California, Napa, Solano, Yolo, Marin Counties", CCountries::e_Corrected );
20952  s_USAStateTest("USA: Montana, Maine", "USA: Montana, Maine", CCountries::e_Ambiguous );
20953  s_USAStateTest("USA: San Diego County, CA", "USA: California, San Diego County", CCountries::e_Corrected );
20954  s_USAStateTest("USA: Madison", "USA: Madison", CCountries::e_Missing );
20955  s_USAStateTest("USA", "USA", CCountries::e_Valid );
20956 
20957  s_USAStateTest("USA: Arkansas, Washington", "USA: Arkansas, Washington", CCountries::e_Ambiguous );
20958  s_USAStateTest("USA: Washington, Arkansas", "USA: Washington, Arkansas", CCountries::e_Ambiguous );
20959  s_USAStateTest("USA: AR, Washington", "USA: Arkansas, Washington", CCountries::e_Ambiguous );
20960  s_USAStateTest("USA: Washington, AR", "USA: Washington, Arkansas", CCountries::e_Ambiguous );
20961  s_USAStateTest("USA: Wisconsin, Oregon", "USA: Wisconsin, Oregon", CCountries::e_Ambiguous );
20962 
       // NOTE(review): the next four checks repeat the first four checks of
       // this test verbatim.
20963  s_USAStateTest("Puerto Rico: San Juan", "USA: Puerto Rico, San Juan", CCountries::e_Corrected );
20964  s_USAStateTest("USA: Puerto Rico", "USA: Puerto Rico", CCountries::e_Valid );
20965  s_USAStateTest("USA: Puerto Rico, Florida", "USA: Puerto Rico, Florida", CCountries::e_Ambiguous );
20966  s_USAStateTest("USA: Florida, Puerto Rico", "USA: Florida, Puerto Rico", CCountries::e_Ambiguous );
20967 
20968  s_USAStateTest("USA:Los Angeles", "USA: Los Angeles", CCountries::e_Missing );
20969  s_USAStateTest("USA:Hayward", "USA: Hayward", CCountries::e_Missing );
20970 
       // Entries assigned to `exm`: key is the string to match, value is the
       // replacement.  Note the keys differ in spacing after "USA:"
       // ("USA: Los Angeles" vs "USA:Hayward").
20972  exm["USA: Washington, Arkansas"] = "USA: Arkansas, Washington";
20973  // self-entry is needed for converting e_Ambiguous to e_Valid (from full name) or e_Corrected (from abbreviation)
20974  exm["USA: Arkansas, Washington"] = "USA: Arkansas, Washington";
20975  exm["USA: Puerto Rico, Florida"] = "USA: Puerto Rico, Florida";
20976  exm["USA: Florida, Puerto Rico"] = "USA: Puerto Rico, Florida";
20977  exm["USA: Los Angeles"] = "USA: California, Los Angeles";
20978  exm["USA:Hayward"] = "USA: California, Hayward";
20979  // exm["USA:PR"] = "USA: Puerto Rico";
20980  // exm["USA:GU"] = "USA: Guam";
20981  // exm["USA:VI"] = "USA: US Virgin Islands";
20982  // exm["USA:AS"] = "USA: American Samoa";
20984 
       // Phase 2: the same inputs as above, now expected to resolve through
       // the `exm` entries; inputs without an entry keep their phase-1 result.
20985  s_USAStateTest("USA: Arkansas, Washington", "USA: Arkansas, Washington", CCountries::e_Valid );
20986  s_USAStateTest("USA: Washington, Arkansas", "USA: Arkansas, Washington", CCountries::e_Corrected );
20987  s_USAStateTest("USA: AR, Washington", "USA: Arkansas, Washington", CCountries::e_Corrected );
20988  s_USAStateTest("USA: Washington, AR", "USA: Arkansas, Washington", CCountries::e_Corrected );
20989  s_USAStateTest("USA: Wisconsin, Oregon", "USA: Wisconsin, Oregon", CCountries::e_Ambiguous );
20990 
20991  s_USAStateTest("Puerto Rico: San Juan", "USA: Puerto Rico, San Juan", CCountries::e_Corrected );
20992  s_USAStateTest("USA: Puerto Rico", "USA: Puerto Rico", CCountries::e_Valid );
20993  s_USAStateTest("USA: Puerto Rico, Florida", "USA: Puerto Rico, Florida", CCountries::e_Valid );
20994  s_USAStateTest("USA: Florida, Puerto Rico", "USA: Puerto Rico, Florida", CCountries::e_Corrected );
20995 
20996  s_USAStateTest("USA:Los Angeles", "USA: California, Los Angeles", CCountries::e_Corrected );
20997  s_USAStateTest("USA:Hayward", "USA: California, Hayward", CCountries::e_Corrected );
20998  s_USAStateTest("USA: Springfield", "USA: Springfield", CCountries::e_Missing );
20999 
       // A territory abbreviation and two non-USA inputs.
21000  s_USAStateTest("USA:GU", "USA: Guam", CCountries::e_Corrected );
21001  s_USAStateTest("Belize", "Belize", CCountries::e_NotUSA );
21002  s_USAStateTest("France: Paris", "France: Paris", CCountries::e_NotUSA );
21003 }
21004 
21005 
21006 BOOST_AUTO_TEST_CASE(Test_NewFixCountry)
21007 {
21008  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Russia, Tatarstan, Kazan"), "Russia: Tatarstan, Kazan");
21009  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Egypt: Red Sea, Ras Mohamed, Sinai"), "Egypt: Red Sea, Ras Mohamed, Sinai");
21010  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Kenya."), "Kenya");
21011  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("U.S.A."), "USA");
21012  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("\"United Kingdom: Scotland, Edinburgh\""), "United Kingdom: Scotland, Edinburgh");
21013  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("1896"), "");
21014  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Anderson, Mesa Verde, Colorado"), "");
21015  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Ansirabe"), "");
21016  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Antarctic Territory Claimed by Australia"), "");
21017  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Ari Ksatr"), "");
21018  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Australia: south-western australia"), "Australia: south-western australia");
21019  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Auwahi, Maui"), "");
21020  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Between Liberia and Ivory Coast"), "");
21021  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Caroline Island, Leticia"), "");
21022  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Catalina Island, California"), "");
21023  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Chia-i"), "");
21024  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Congo"), "");
21025  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Cousin Island"), "");
21026  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Czechoslovakia"), "Czechoslovakia");
21027  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("DE"), "");
21028  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("France: North East France Nievre-Morvan Breuil Chenue forest"), "France: North East France Nievre-Morvan Breuil Chenue forest");
21029  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Great Britain"), "United Kingdom: Great Britain");
21030  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Greenland: Saqqaq Culture site Qeqertasussuk, north-western Greenland"), "Greenland: Saqqaq Culture site Qeqertasussuk, north-western Greenland");
21031  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Guadaloupe Island"), "");
21032  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Hawaii"), "USA: Hawaii");
21033  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Hamoa Bay, Maui, Hawaii, USA"), "USA: Hamoa Bay, Maui, Hawaii");
21034  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Hortus Leiden, the Netherlands"), "Netherlands: Hortus Leiden");
21035  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Hortus, Leiden, the Netherlands"), "Netherlands: Hortus, Leiden");
21036  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Joffreville"), "");
21037  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Korea"), "Korea");
21038  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Kuala Belalong, Ulu Temburong National Park"), "");
21039  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Lake Fryxell"), "");
21040  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Luxemburg"), "Luxembourg");
21041  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Mediterranean Sea, Spain"), "");
21042  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Mexico. Loreto Bay, Gulf of California."), "Mexico: Loreto Bay, Gulf of California");
21043  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Meyendel, the Netherlands"), "Netherlands: Meyendel");
21044  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("micronesia, federated States of"), "Micronesia, Federated States of");
21045  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Micronesia"), "Micronesia, Federated States of");
21046  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Mount St. Helena, California"), "");
21047  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Nanyuki"), "");
21048  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Netherland"), "Netherlands");
21049  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("New Guinea"), "Papua New Guinea");
21050  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("North Sea, Netherlands"), "");
21051  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Noumea"), "");
21052  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Roosendaal, De Moeren, the Netherlands"), "Netherlands: Roosendaal, De Moeren");
21053  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("SPAIN (orig)"), "Spain: (orig)");
21054  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("San Tome and Principe Island (1998)"), "Sao Tome and Principe: (1998)");
21055  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Scotland"), "United Kingdom: Scotland");
21056  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("USA (orig)"), "USA: (orig)");
21057  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("USA: Boqueron National Wildlife Refuge, Puerto Rico"), "USA: Boqueron National Wildlife Refuge, Puerto Rico");
21058  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("USA: hypersaline sediment collected at Bitter Lake, New Mexico"), "USA: hypersaline sediment collected at Bitter Lake, New Mexico");
21059  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Wales"), "United Kingdom: Wales");
21060  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("West Germany"), "Germany: West Germany");
21061  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("West Lobe Bonney"), "");
21062  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Wissenkerke, Keihoogteweg, the Netherlands"), "Netherlands: Wissenkerke, Keihoogteweg");
21063  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Wolfskill Orchand, Winters, California"), "");
21064  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Yun Shui"), "");
21065  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("USSR: Kazakhstan, Kurtu"), "USSR: Kazakhstan, Kurtu");
21066  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("USA:"), "USA");
21067  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("south sudan"), "South Sudan");
21068  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("UK: Whiteford Burrows, Gower, Wales"), "United Kingdom: Whiteford Burrows, Gower, Wales");
21069  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Whiteford Burrows, Gower, Wales"), "United Kingdom: Wales, Whiteford Burrows, Gower");
21070  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Burma"), "Myanmar");
21071  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Siam"), "Thailand");
21072  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("AA:BB:CC"), "");
21073  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("AA:BB:Southern China"), "");
21074  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("UK: Whiteford Burrows: Gower: Wales"), "United Kingdom: Whiteford Burrows, Gower, Wales");
21075  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("USA:DE:Dover"), "USA:DE,Dover");
21076  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Burma:A:B"), "Burma:A,B");
21077  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Puerto Rico"), "Puerto Rico");
21078  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Puerto Rico", true), "USA: Puerto Rico");
21079  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Puerto Rico, San Juan", true), "USA: Puerto Rico, San Juan");
21080  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Guam", true), "USA: Guam");
21081  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("American Samoa", true), "USA: American Samoa");
21082  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Virgin Islands", true), "USA: US Virgin Islands");
21083  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("puerto rico"), "Puerto Rico");
21084  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("puerto rico", true), "USA: Puerto Rico");
21085  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("guam"), "Guam");
21086  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("guam", true), "USA: Guam");
21087  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("United States: Georgia"), "USA: Georgia");
21088  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("United States: Georgia", true), "USA: Georgia");
21089  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("United States: Guam"), "USA: Guam");
21090  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("United States: Guam", true), "USA: Guam");
21091  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Georgia"), "Georgia");
21092  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Georgia", true), "Georgia");
21093  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Georgia: Tbilisi"), "Georgia: Tbilisi");
21094  BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Georgia: Tbilisi", true), "Georgia: Tbilisi");
21095 }
21096 
21097 
// Exercises COrgMod::FixStructuredVoucher on a series of voucher strings,
// checking both the returned flag and the resulting value of `val`.
// NOTE(review): the embedded listing numbers skip 21102 and 21107, so whatever
// stood between the first two assignments and their checks is not visible here.
21098 BOOST_AUTO_TEST_CASE(Fix_Structured_Voucher)
21099 {
21100  //removed while issues with updating list are sorted out
21101  string val = "USNM<USA>:12345";
21103  BOOST_CHECK_EQUAL(val, "USNM<USA>:12345");
21104 
21105  // can't fix, needs country code
21106  val = "ABS<CHN>:12345";
21108  BOOST_CHECK_EQUAL(val, "ABS<CHN>:12345");
21109 
21110  // removed while structure-fixing questions are considered
21111  // add structure when space instead of colon
21112  val = "AMNH 12345";
21113  BOOST_CHECK_EQUAL(COrgMod::FixStructuredVoucher(val, "s"), true);
21114  BOOST_CHECK_EQUAL(val, "AMNH:12345");
21115 
21116  // add structure when letters and numbers
21117  val = "ABB666";
21118  BOOST_CHECK_EQUAL(COrgMod::FixStructuredVoucher(val, "c"), true);
21119  BOOST_CHECK_EQUAL(val, "ABB:666");
21120 
21121  // can also fix biomaterial
21122  val = "CNWGRGL123";
21123  BOOST_CHECK_EQUAL(COrgMod::FixStructuredVoucher(val, "b"), true);
21124  BOOST_CHECK_EQUAL(val, "CNWGRGL:123");
21125 
21126  // will not fix for too short code
21127  val = "A12345";
21128  BOOST_CHECK_EQUAL(COrgMod::FixStructuredVoucher(val, "s"), false);
21129  BOOST_CHECK_EQUAL(val, "A12345");
21130 
21131 
21132  // if institution code in parentheses at end of unstructured value, reorder
21133  // GB-6454
21134  val = "M.Riewe 182 (CAS)";
21135  BOOST_CHECK_EQUAL(COrgMod::FixStructuredVoucher(val, "s"), true);
21136  BOOST_CHECK_EQUAL(val, "CAS:M.Riewe 182");
21137 
21138  // don't fix if value in parentheses is not an institution code
21139  val = "L.R. Xu 0081 (WUG)";
21140  BOOST_CHECK_EQUAL(COrgMod::FixStructuredVoucher(val, "s"), false);
21141  BOOST_CHECK_EQUAL(val, "L.R. Xu 0081 (WUG)");
21142 
21143 
21144 }
21145 
21146 
// Runs CheckBioseqEndsForNAndGap on a sequence whose IUPAC nucleotide data is
// replaced five times, and checks the reported end-N classification and the
// begin/end ambiguity flags after each call.
// NOTE(review): every group checks begin_n/end_n twice and never checks
// begin_gap/end_gap; the repeated pair may have been meant for the gap outputs
// - confirm against the intended expectations before changing.
// NOTE(review): listing lines 21149, 21152 and 21157-21160 (entry/object-manager
// setup and the declarations of the end-type variables) are missing from this
// extract.
21147 BOOST_AUTO_TEST_CASE(Test_CheckEnds)
21148 {
       // Case 1: runs of N at both ends, total length 62.
21150  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("NNNNNNNNNNAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCCAANNNNNNNNNN");
21151  entry->SetSeq().SetInst().SetLength(62);
21153  CScope scope(*objmgr);
21154  scope.AddDefaults();
21155  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
21156 
21161  bool begin_ambig = false, end_ambig = false;
21162 
21163  CBioseq_Handle bsh = seh.GetSeq();
21164  CheckBioseqEndsForNAndGap(bsh, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
21165  BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_All);
21166  BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_All);
21167  BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_All);
21168  BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_All);
21169  BOOST_CHECK_EQUAL(begin_ambig, true);
21170  BOOST_CHECK_EQUAL(end_ambig, true);
21171 
       // Case 2: shorter N runs at the ends, total length 60; the entry is
       // removed from the scope, edited, and re-added before re-checking.
21172  scope.RemoveTopLevelSeqEntry(seh);
21173  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("NNNNNNNNNAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCCAANNNNNNNNN");
21174  entry->SetSeq().SetInst().SetLength(60);
21175  seh = scope.AddTopLevelSeqEntry(*entry);
21176 
21177  bsh = seh.GetSeq();
21178  CheckBioseqEndsForNAndGap(bsh, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
21179  BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_Last);
21180  BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_Last);
21181  BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_Last);
21182  BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_Last);
21183  BOOST_CHECK_EQUAL(begin_ambig, true);
21184  BOOST_CHECK_EQUAL(end_ambig, true);
21185 
       // Case 3: no N characters at all, length 42.
21186  scope.RemoveTopLevelSeqEntry(seh);
21187  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCCAA");
21188  entry->SetSeq().SetInst().SetLength(42);
21189  seh = scope.AddTopLevelSeqEntry(*entry);
21190 
21191  bsh = seh.GetSeq();
21192  CheckBioseqEndsForNAndGap(bsh, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
21193  BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_None);
21194  BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_None);
21195  BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_None);
21196  BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_None);
21197  BOOST_CHECK_EQUAL(begin_ambig, false);
21198  BOOST_CHECK_EQUAL(end_ambig, false);
21199 
       // Case 4: Ns scattered near both ends, but the first and last residues
       // are not N; the ambiguity flags are expected to be set.
21200  scope.RemoveTopLevelSeqEntry(seh);
21201  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ANANTNNNCAAAATTGGCCAAAATTGGCCAAAANTNNCNCNA");
21202  entry->SetSeq().SetInst().SetLength(42);
21203  seh = scope.AddTopLevelSeqEntry(*entry);
21204 
21205  bsh = seh.GetSeq();
21206  CheckBioseqEndsForNAndGap(bsh, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
21207  BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_None);
21208  BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_None);
21209  BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_None);
21210  BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_None);
21211  BOOST_CHECK_EQUAL(begin_ambig, true);
21212  BOOST_CHECK_EQUAL(end_ambig, true);
21213 
       // Case 5: the scattered Ns sit behind a GTGTG prefix/suffix, length 52;
       // the ambiguity flags are still expected to be set.
21214  scope.RemoveTopLevelSeqEntry(seh);
21215  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("GTGTGANANTNNNCNNNNNTGGCCAAAATTGGCCAAAANTNNCNCNAGTGTG");
21216  entry->SetSeq().SetInst().SetLength(52);
21217  seh = scope.AddTopLevelSeqEntry(*entry);
21218 
21219  bsh = seh.GetSeq();
21220  CheckBioseqEndsForNAndGap(bsh, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
21221  BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_None);
21222  BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_None);
21223  BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_None);
21224  BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_None);
21225  BOOST_CHECK_EQUAL(begin_ambig, true);
21226  BOOST_CHECK_EQUAL(end_ambig, true);
21227 
21228 }
21229 
21230 
21232 {
       // Sets a nat_host OrgMod of "Jatropha cf." on the entry, validates it, and
       // compares the result with expected_errors (no expectations are added in
       // the visible lines).
       // NOTE(review): listing lines 21231 (test-case header), 21233 and 21236
       // (setup) are missing from this extract.
21234  SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Jatropha cf.");
21235 
21237 
21238  // AddChromosomeNoLocation(expected_errors, entry);
21239  eval = validator.Validate(seh, options);
21240  CheckErrors(*eval, expected_errors);
21241 
21242  CLEAR_ERRORS
21243 }
21244 
21245 
21247 {
       // Adds a create_date (Jun 12, 1998) that is later than the update_date
       // (Jun 11, 1998) to the set, gives the first sequence in the set a GI and
       // a gb|U54469.1 id, and expects an InconsistentDates warning on both the
       // nucleotide and the protein.
       // NOTE(review): listing lines 21246 (test-case header), 21248 and 21267
       // (setup) are missing from this extract.
21249  CRef<CSeqdesc> create_date(new CSeqdesc());
21250  create_date->SetCreate_date().SetStd().SetMonth(6);
21251  create_date->SetCreate_date().SetStd().SetDay(12);
21252  create_date->SetCreate_date().SetStd().SetYear(1998);
21253  entry->SetSet().SetDescr().Set().push_back(create_date);
21254  CRef<CSeqdesc> update_date(new CSeqdesc());
21255  update_date->SetUpdate_date().SetStd().SetMonth(6);
21256  update_date->SetUpdate_date().SetStd().SetDay(11);
21257  update_date->SetUpdate_date().SetStd().SetYear(1998);
21258  entry->SetSet().SetDescr().Set().push_back(update_date);
21259 
21260  CRef<CSeq_entry> nuc = entry->SetSet().SetSeq_set().front();
21261  CRef<CSeq_id> gi_id(new CSeq_id());
21262  gi_id->SetGi(GI_CONST(1322283));
21263  nuc->SetSeq().SetId().push_front(gi_id);
21264  CRef<CSeq_id> accv_id(new CSeq_id("gb|U54469.1"));
       // Pushed to the front last, so the accession.version id precedes the GI.
21265  nuc->SetSeq().SetId().push_front(accv_id);
21266 
21268 
21269  expected_errors.push_back(new CExpectedError("gb|U54469.1|", eDiag_Warning, "InconsistentDates",
21270  "Inconsistent create_date [Jun 12, 1998] and update_date [Jun 11, 1998]"));
21271  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "InconsistentDates",
21272  "Inconsistent create_date [Jun 12, 1998] and update_date [Jun 11, 1998]"));
21273  // AddChromosomeNoLocation(expected_errors, entry);
21274 
21275  eval = validator.Validate(seh, options);
21276  CheckErrors(*eval, expected_errors);
21277 
21278  CLEAR_ERRORS
21279 }
21280 
21281 
// Adds two user descriptors built from the same CGenomeAssemblyComment to one
// sequence and expects a MultipleStrucComms error for the duplicated
// Genome-Assembly-Data prefix.
// NOTE(review): listing lines 21285 and 21300 (entry construction and
// validator setup) are missing from this extract.
21282 BOOST_AUTO_TEST_CASE(Test_SQD_1470)
21283 {
21284  // prepare entry
21286  edit::CGenomeAssemblyComment gac1;
21287  gac1.SetAssemblyMethodProgram("a");
21288  gac1.SetAssemblyMethodVersion("1");
21289  gac1.SetGenomeCoverage("3x");
21290  gac1.SetSequencingTechnology("foo");
21291 
21292  CRef<CSeqdesc> sd1(new CSeqdesc());
21293  sd1->SetUser(*(gac1.MakeUserObject()));
21294  entry->SetSeq().SetDescr().Set().push_back(sd1);
21295 
       // Second descriptor built from the same comment object as the first.
21296  CRef<CSeqdesc> sd2(new CSeqdesc());
21297  sd2->SetUser(*(gac1.MakeUserObject()));
21298  entry->SetSeq().SetDescr().Set().push_back(sd2);
21299 
21301 
21302  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MultipleStrucComms",
21303  "Multiple structured comments with prefix ##Genome-Assembly-Data-START##"));
21304  // AddChromosomeNoLocation(expected_errors, entry);
21305 
21306  eval = validator.Validate(seh, options);
21307  CheckErrors(*eval, expected_errors);
21308 
21309  CLEAR_ERRORS
21310 }
21311 
21312 
// Reverse-complements the entry, marks the nucleotide as TSA RNA, and expects
// the CDSonMinusStrandTranscribedRNA warning both from the full Validate()
// pass and from the dedicated GetTSACDSOnMinusStrandErrors() call.
// NOTE(review): listing lines 21316, 21318, 21320 and 21323 (entry/nentry
// construction and validator setup) are missing from this extract.
21313 BOOST_AUTO_TEST_CASE(Test_SQD_1309)
21314 {
21315  // prepare entry
21317  unit_test_util::RevComp(entry);
21319  SetTech(nentry, CMolInfo::eTech_tsa);
21321  nentry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
21322 
21324 
21325  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSonMinusStrandTranscribedRNA",
21326  "Coding region on TSA transcribed RNA should not be on the minus strand"));
21327  // AddChromosomeNoLocation(expected_errors, entry);
21328 
21329  eval = validator.Validate(seh, options);
21330  CheckErrors(*eval, expected_errors);
21331 
21332  CLEAR_ERRORS
21333 
       // Same expectation, this time through the targeted entry point.
21334  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSonMinusStrandTranscribedRNA",
21335  "Coding region on TSA transcribed RNA should not be on the minus strand"));
21336  eval = validator.GetTSACDSOnMinusStrandErrors(seh);
21337  CheckErrors(*eval, expected_errors);
21338 
21339  CLEAR_ERRORS
21340 }
21341 
21342 
// Gives a coding region the comment "ambiguity in stop codon" and validates in
// three stages: as-is (BadCDScomment expected), after adding a terminal code
// break, and after replacing the nucleotide sequence with one containing an N.
// NOTE(review): after the first CLEAR_ERRORS no new expectations are pushed in
// the visible lines, so the later CheckErrors calls compare against whatever
// CLEAR_ERRORS leaves in expected_errors - confirm against the macro.
// NOTE(review): listing lines 21346, 21347, 21349 and 21350 (entry, scope and
// cds setup) are missing from this extract.
21343 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadCDScomment)
21344 {
21345  // prepare entry
21348 
21351  cds->SetComment("ambiguity in stop codon");
21352 
21353  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "BadCDScomment",
21354  "Feature comment indicates ambiguity in stop codon but no ambiguities are present in stop codon."));
21355  // AddChromosomeNoLocation(expected_errors, entry);
21356 
21357  eval = validator.Validate(seh, options);
21358  CheckErrors(*eval, expected_errors);
21359 
21360  CLEAR_ERRORS
21361 
       // Stage 2: add a terminal code break, then re-register the entry so the
       // scope sees the edited feature.
21362  edit::AddTerminalCodeBreak(*cds, seh.GetScope());
21363  scope.RemoveTopLevelSeqEntry(seh);
21364  seh = scope.AddTopLevelSeqEntry(*entry);
21365  // AddChromosomeNoLocation(expected_errors, entry);
21366  eval = validator.Validate(seh, options);
21367  CheckErrors(*eval, expected_errors);
21368 
       // Stage 3: swap in a nucleotide sequence that contains an N.
21369  scope.RemoveTopLevelSeqEntry(seh);
21370  nentry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATAAACTNAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
21371  seh = scope.AddTopLevelSeqEntry(*entry);
21372 
21373 // Error below is not expected anymore since VR-110 issue fixed:
21374 //
21375 // expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryTranslExcept",
21376 // "Unexpected transl_except * at position 9 just past end of protein"));
21377 
21378  eval = validator.Validate(seh, options);
21379  CheckErrors(*eval, expected_errors);
21380 
21381  CLEAR_ERRORS
21382 }
21383 
21384 
// Builds three features whose intervals carry the same limit fuzz (tl or tr)
// on both ends, including a packed-int location with two such intervals, and
// expects six InvalidFuzz errors from validation.
// NOTE(review): listing lines 21388, 21389, 21393, 21398 and 21411 (entry and
// misc1/misc2/misc3 feature construction, validator setup) are missing from
// this extract.
21385 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_InvalidFuzz)
21386 {
21387  // prepare entry
       // misc1: 'tl' fuzz on both ends.
21390  misc1->SetLocation().SetInt().SetFuzz_from().SetLim(CInt_fuzz::eLim_tl);
21391  misc1->SetLocation().SetInt().SetFuzz_to().SetLim(CInt_fuzz::eLim_tl);
21392 
       // misc2: starts at 5, 'tr' fuzz on both ends.
21394  misc2->SetLocation().SetInt().SetFrom(5);
21395  misc2->SetLocation().SetInt().SetFuzz_from().SetLim(CInt_fuzz::eLim_tr);
21396  misc2->SetLocation().SetInt().SetFuzz_to().SetLim(CInt_fuzz::eLim_tr);
21397 
       // misc3: packed-int location of two intervals, one 'tl'/'tl' and one 'tr'/'tr'.
21399  CRef<CSeq_id> id(new CSeq_id());
21400  id->Assign(*(entry->GetSeq().GetId().front()));
21401  CRef<CSeq_interval> int1(new CSeq_interval(*id, 0, 5));
21402  int1->SetFuzz_from().SetLim(CInt_fuzz::eLim_tl);
21403  int1->SetFuzz_to().SetLim(CInt_fuzz::eLim_tl);
21404  CRef<CSeq_interval> int2(new CSeq_interval(*id, 10, 15));
21405  int2->SetFuzz_from().SetLim(CInt_fuzz::eLim_tr);
21406  int2->SetFuzz_to().SetLim(CInt_fuzz::eLim_tr);
21407 
21408  misc3->SetLocation().SetPacked_int().Set().push_back(int1);
21409  misc3->SetLocation().SetPacked_int().Set().push_back(int2);
21410 
21412 
21413  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFuzz",
21414  "Should not specify 'space to left' for both ends of interval"));
21415  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFuzz",
21416  "Should not specify 'space to right' for both ends of interval"));
21417  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFuzz",
21418  "Should not specify 'space to left' for both ends of interval"));
21419  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFuzz",
21420  "Should not specify 'space to right' for both ends of interval"));
21421  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFuzz",
21422  "Should not specify 'space to left' at first position of non-circular sequence"));
21423  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFuzz",
21424  "Should not specify 'space to left' at first position of non-circular sequence"));
21425  // AddChromosomeNoLocation(expected_errors, entry);
21426 
21427  eval = validator.Validate(seh, options);
21428  CheckErrors(*eval, expected_errors);
21429 
21430  CLEAR_ERRORS
21431 }
21432 
21433 
// Checks that COrgMod::IsCultureCollectionValid returns the "should be
// structured" message for a free-text value.
21434 BOOST_AUTO_TEST_CASE(Test_SQD_1532)
21435 {
21436  BOOST_CHECK_EQUAL(COrgMod::IsCultureCollectionValid("50% TSB + 2mM Cr(VI)"), "Culture_collection should be structured, but is not");
21437 }
21438 
21439 
// Checks CSubSource::IsValidSexQualifierValue on accepted and rejected values,
// and CSubSource::FixSexQualifierValue on abbreviated and reordered inputs.
21440 BOOST_AUTO_TEST_CASE(Test_SexQualifiers)
21441 {
       // Validity checks; "llama" is the only value expected to be rejected.
21442  BOOST_CHECK_EQUAL(CSubSource::IsValidSexQualifierValue("M"), true);
21443  BOOST_CHECK_EQUAL(CSubSource::FixSexQualifierValue("M"), "male");
21444  BOOST_CHECK_EQUAL(CSubSource::IsValidSexQualifierValue("Male"), true);
21445  BOOST_CHECK_EQUAL(CSubSource::IsValidSexQualifierValue("male"), true);
21446  BOOST_CHECK_EQUAL(CSubSource::IsValidSexQualifierValue("llama"), false);
21447  BOOST_CHECK_EQUAL(CSubSource::IsValidSexQualifierValue("m/f"), true);
21448  BOOST_CHECK_EQUAL(CSubSource::IsValidSexQualifierValue("pooled males and females"), true);
21449  BOOST_CHECK_EQUAL(CSubSource::IsValidSexQualifierValue("pooled male and female"), true);
21450  BOOST_CHECK_EQUAL(CSubSource::IsValidSexQualifierValue("mixed"), true);
21451 
       // Expected rewrites for slash-separated and parenthesised inputs.
21452  BOOST_CHECK_EQUAL(CSubSource::FixSexQualifierValue("m/f"), "male and female");
21453  BOOST_CHECK_EQUAL(CSubSource::FixSexQualifierValue("m/f/neuter"), "male, female, and neuter");
21454  BOOST_CHECK_EQUAL(CSubSource::FixSexQualifierValue("male and female (pooled)"), "pooled male and female");
21455 
21456 }
21457 
21458 
// Checks that CBioSource::SetDisableStrainForwarding(true) stores the
// "nomodforward" attribute on the org name, and that the getter reflects both
// the enabled and the disabled state.
21459 BOOST_AUTO_TEST_CASE(TEST_DisableStrainForwarding)
21460 {
21461  CBioSource src;
21462 
21463  src.SetDisableStrainForwarding(true);
21464  BOOST_CHECK_EQUAL(src.GetOrg().GetOrgname().GetAttrib(), "nomodforward");
21465  BOOST_CHECK_EQUAL(src.GetDisableStrainForwarding(), true);
21466  src.SetDisableStrainForwarding(false);
21467  BOOST_CHECK_EQUAL(src.GetDisableStrainForwarding(), false);
21468 }
21469 
21470 
21472 {
       // Replaces the sequence data with 30 Ns and expects a Critical "AllNs"
       // error from validation.
       // NOTE(review): listing lines 21471 (test-case header), 21473 and 21477
       // (entry and validator setup) are missing from this extract.
21474  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNN");
21475  entry->SetSeq().SetInst().SetLength(30);
21476 
21478 
21479  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "AllNs",
21480  "Sequence is all Ns"));
21481  // AddChromosomeNoLocation(expected_errors, entry);
21482 
21483  eval = validator.Validate(seh, options);
21484  CheckErrors(*eval, expected_errors);
21485 
21486  CLEAR_ERRORS
21487 }
21488 
21489 
// Sets a series of raw names on one CSubSource, calls AutoFix() after each,
// and checks the normalised name.
// NOTE(review): listing lines 21494, 21499, 21504, 21509 and 21514 are missing
// from this extract; presumably each sets the subtype for the group that
// follows - confirm against the full source.
21490 BOOST_AUTO_TEST_CASE(Test_SubSourceAutofix)
21491 {
21492  CRef<CSubSource> ss(new CSubSource());
21493 
       // Place name followed by country.
21495  ss->SetName("Maryland, USA");
21496  ss->AutoFix();
21497  BOOST_CHECK_EQUAL(ss->GetName(), "USA: Maryland");
21498 
       // Numeric date.
21500  ss->SetName("1-14-97");
21501  ss->AutoFix();
21502  BOOST_CHECK_EQUAL(ss->GetName(), "14-Jan-1997");
21503 
       // Labelled latitude/longitude (the input spelling is part of the test data).
21505  ss->SetName("Lattitude: 25.790544; longitude: -80.214930");
21506  ss->AutoFix();
21507  BOOST_CHECK_EQUAL(ss->GetName(), "25.790544 N 80.214930 W");
21508 
       // Slash-separated sex values.
21510  ss->SetName("m/f/neuter");
21511  ss->AutoFix();
21512  BOOST_CHECK_EQUAL(ss->GetName(), "male, female, and neuter");
21513 
       // Value given in feet, expected back in metres.
21515  ss->SetName("123 ft.");
21516  ss->AutoFix();
21517  BOOST_CHECK_EQUAL(ss->GetName(), "37 m");
21518 
21519 }
21520 
21521 
// Checks COrgMod::AutoFix() for strain values (a space is inserted in
// "ATCC1234", "DSM 567" is left unchanged) and for nat_host ("human" becomes
// "Homo sapiens").
21522 BOOST_AUTO_TEST_CASE(Test_OrgModAutofix)
21523 {
21524  CRef<COrgMod> om(new COrgMod());
21525  om->SetSubtype(COrgMod::eSubtype_strain);
21526  om->SetSubname("ATCC1234");
21527  om->AutoFix();
21528  BOOST_CHECK_EQUAL(om->GetSubname(), "ATCC 1234");
       // Already spaced: expected to come back unchanged.
21529  om->SetSubname("DSM 567");
21530  om->AutoFix();
21531  BOOST_CHECK_EQUAL(om->GetSubname(), "DSM 567");
21532 
21533  om->SetSubtype(COrgMod::eSubtype_nat_host);
21534  om->SetSubname("human");
21535  om->AutoFix();
21536  BOOST_CHECK_EQUAL(om->GetSubname(), "Homo sapiens");
21537 }
21538 
21539 
// Checks RemoveCultureNotes() first on a standalone CSubSource and then
// through a CBioSource that holds the same subsource.
// NOTE(review): listing line 21543 is missing from this extract; presumably it
// sets the subsource subtype - confirm against the full source.
21540 BOOST_AUTO_TEST_CASE(Test_RmCultureNotes)
21541 {
21542  CRef<CSubSource> ss(new CSubSource());
21544  ss->SetName("a; [mixed bacterial source]; b");
21545  ss->RemoveCultureNotes();
21546  BOOST_CHECK_EQUAL(ss->GetName(), "a; b");
21547  ss->SetName("[uncultured (using species-specific primers) bacterial source]");
21548  ss->RemoveCultureNotes();
21549  BOOST_CHECK_EQUAL(ss->GetName(), "amplified with species-specific primers");
       // A name made up only of removable notes is expected to end up unset.
21550  ss->SetName("[BankIt_uncultured16S_wizard]; [universal primers]; [tgge]");
21551  ss->RemoveCultureNotes();
21552  BOOST_CHECK_EQUAL(ss->IsSetName(), false);
21553  ss->SetName("[BankIt_uncultured16S_wizard]; [species_specific primers]; [dgge]");
21554  ss->RemoveCultureNotes();
21555  BOOST_CHECK_EQUAL(ss->GetName(), "amplified with species-specific primers");
21556 
       // Same subsource object, now attached to a CBioSource; the second case
       // expects the subtype list itself to become unset.
21557  CRef<CBioSource> src(new CBioSource());
21558  ss->SetName("a; [mixed bacterial source]; b");
21559  src->SetSubtype().push_back(ss);
21560  src->RemoveCultureNotes();
21561  BOOST_CHECK_EQUAL(ss->GetName(), "a; b");
21562  ss->SetName("[BankIt_uncultured16S_wizard]; [universal primers]; [tgge]");
21563  src->RemoveCultureNotes();
21564  BOOST_CHECK_EQUAL(src->IsSetSubtype(), false);
21565 }
21566 
21567 
21569 {
       // Validates an entry four times, expecting a MultipleSourceQualifiers
       // warning for duplicated segment, collected_by, identified_by and
       // collection_date qualifiers in turn; the segment case also expects two
       // NonViralSegment warnings.
       // NOTE(review): the test-case header (21568) and many setup lines are
       // missing from this extract (listing numbers skip before and after each
       // Validate/CheckErrors pair), so the statements that add and remove the
       // duplicated qualifiers are not visible here.
21570  // prepare entry
21572 
21574 
21575  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NonViralSegment",
21576  "Non-viral source feature should not have a segment qualifier"));
21577  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NonViralSegment",
21578  "Non-viral source feature should not have a segment qualifier"));
21579 
21580  expected_errors.push_back(new CExpectedError("lcl|good",
21581  eDiag_Warning,
21582  "MultipleSourceQualifiers",
21583  "Multiple segment qualifiers present"));
21584  // AddChromosomeNoLocation(expected_errors, entry);
21585 
21586  // Multiple segment qualifiers
21589 
21590  eval = validator.Validate(seh, options);
21591  CheckErrors(*eval, expected_errors);
21593 
21594  CLEAR_ERRORS
21595 
       // One expectation is re-created here; its message is overwritten with
       // SetErrMsg before each of the remaining validations.
21596  expected_errors.push_back(new CExpectedError("lcl|good",
21597  eDiag_Warning,
21598  "MultipleSourceQualifiers",
21599  "Multiple segment qualifiers present"));
21600 
21601  // Multiple collected_by qualifiers
21604  expected_errors[0]->SetErrMsg("Multiple collected_by qualifiers present");
21605 
21606  eval = validator.Validate(seh, options);
21607  CheckErrors(*eval, expected_errors);
21609 
21610  // Multiple identified_by qualifiers
21613  expected_errors[0]->SetErrMsg("Multiple identified_by qualifiers present");
21614 
21615  eval = validator.Validate(seh, options);
21616  CheckErrors(*eval, expected_errors);
21618 
21619  // Multiple collection_date qualifiers
21622  expected_errors[0]->SetErrMsg("Multiple collection_date qualifiers present");
21623 
21624  eval = validator.Validate(seh, options);
21625  CheckErrors(*eval, expected_errors);
21627 
21628  CLEAR_ERRORS
21629 }
21630 
21631 
21633 {
       // Builds a nuc-prot set whose protein contains internal stops, validates
       // it, then adds a ValidationSuppression user-object descriptor and
       // re-validates three more times with a progressively shorter list of
       // expected errors.
       // NOTE(review): listing lines 21632 (test-case header), 21634, 21636,
       // 21640, 21663, 21679 and 21693 are missing from this extract, so the
       // setup and the statements that register each suppressed error are not
       // visible here.
21635 
21637 
21638  entry->SetSet().SetSeq_set().back()->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MP*K*E*N");
21639  entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("GTGCCCTAAAAATAAGAGTAAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
21641  cds->SetExcept(true);
21642  cds->SetExcept_text("unclassified translation discrepancy");
21643 
       // Direct helper checks before running the full validator.
21644  BOOST_CHECK_EQUAL(validator::HasStopInProtein(*cds, scope), true);
21645  BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
21646 
21647  // list of expected errors
21648  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein", "[3] termination symbols in protein sequence (gene? - fake protein name)"));
21649  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "ExceptionProblem", "unclassified translation discrepancy is not a legal exception explanation"));
21650  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InternalStop", "3 internal stops (and illegal start codon). Genetic code [0]"));
21651  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
21652  "CDS has unnecessary translated product replaced exception"));
21653  // AddChromosomeNoLocation(expected_errors, entry);
21654 
21655  eval = validator.Validate(seh, options);
21656  CheckErrors(*eval, expected_errors);
21657 
21658  CLEAR_ERRORS
21659 
21660  // now suppress an error
21661  CRef<CSeqdesc> suppress(new CSeqdesc());
21662  suppress->SetUser().SetObjectType(CUser_object::eObjectType_ValidationSuppression);
21664  entry->SetSet().SetDescr().Set().push_back(suppress);
21665 
       // InternalStop is no longer in the expected list from here on.
21666  // list of expected errors
21667  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein", "[3] termination symbols in protein sequence (gene? - fake protein name)"));
21668  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "ExceptionProblem", "unclassified translation discrepancy is not a legal exception explanation"));
21669  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
21670  "CDS has unnecessary translated product replaced exception"));
21671  // AddChromosomeNoLocation(expected_errors, entry);
21672 
21673  eval = validator.Validate(seh, options);
21674  CheckErrors(*eval, expected_errors);
21675 
21676  CLEAR_ERRORS
21677 
21678  // suppress two errors
21680 
       // ExceptionProblem is additionally dropped from the expected list.
21681  // list of expected errors
21682  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein", "[3] termination symbols in protein sequence (gene? - fake protein name)"));
21683  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
21684  "CDS has unnecessary translated product replaced exception"));
21685  // AddChromosomeNoLocation(expected_errors, entry);
21686 
21687  eval = validator.Validate(seh, options);
21688  CheckErrors(*eval, expected_errors);
21689 
21690  CLEAR_ERRORS
21691 
21692  // suppress three errors
21694 
       // Only UnnecessaryException is still expected.
21695  // list of expected errors
21696  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
21697  "CDS has unnecessary translated product replaced exception"));
21698  // AddChromosomeNoLocation(expected_errors, entry);
21699 
21700  eval = validator.Validate(seh, options);
21701  CheckErrors(*eval, expected_errors);
21702 
21703  CLEAR_ERRORS
21704 }
21705 
// Checks CBioSource::RemoveLineageSourceNotes(): it is expected to do nothing
// while no taxid is set, to remove the "other" subsource and orgmod notes once
// a taxid is present, and to report false on a further call.
// NOTE(review): mod_c is constructed but never pushed onto bsrc in this test,
// so the final call and loop run against the already-cleaned source. Confirm
// whether mod_c was meant to be added before the call.
21706 BOOST_AUTO_TEST_CASE(Test_RemoveLineageSourceNotes)
21707 {
21708  CRef<CBioSource> bsrc(new CBioSource);
21709  bsrc->SetOrg().SetTaxname("Influenza A virus");
21710  bsrc->SetOrg().SetOrgname().SetLineage("Viruses; ssRNA negative-strand viruses; Orthomyxoviridae; Influenzavirus A");
21711 
21712  CRef<CSubSource> subsrc(new CSubSource(CSubSource::eSubtype_other, "Organism: viruses"));
21713  bsrc->SetSubtype().push_back(subsrc);
21714  CRef<COrgMod> mod_a(new COrgMod(COrgMod::eSubtype_strain, "virus strain"));
21715  CRef<COrgMod> mod_b(new COrgMod(COrgMod::eSubtype_other, "note: influenza A"));
21716  bsrc->SetOrg().SetOrgname().SetMod().push_back(mod_a);
21717  bsrc->SetOrg().SetOrgname().SetMod().push_back(mod_b);
21718 
21719  bool removed = bsrc->RemoveLineageSourceNotes();
21720  BOOST_CHECK_EQUAL(removed, false); // it won't remove the notes as there is no taxid
21721  bsrc->SetOrg().SetTaxId(TAX_ID_CONST(11320));
21722 
21723  removed = bsrc->RemoveLineageSourceNotes();
21724  BOOST_CHECK_EQUAL(removed, true);
21725  BOOST_CHECK_EQUAL(bsrc->IsSetSubtype(), false);
       // No orgmod of subtype "other" may remain after the removal.
21726  FOR_EACH_ORGMOD_ON_BIOSOURCE( orgmod, *bsrc) {
21727  if ((*orgmod)->IsSetSubtype()) {
21728  BOOST_CHECK_EQUAL((*orgmod)->GetSubtype() == COrgMod::eSubtype_other, false);
21729  }
21730  }
21731 
21732  CRef<COrgMod> mod_c(new COrgMod(COrgMod::eSubtype_other, "domain: unknown domain"));
21733  removed = bsrc->RemoveLineageSourceNotes();
21734  BOOST_CHECK_EQUAL(removed, false);
       // Any non-strain orgmod found here is required to be of subtype "other".
21735  FOR_EACH_ORGMOD_ON_BIOSOURCE( orgmod, *bsrc) {
21736  if ((*orgmod)->IsSetSubtype() && (*orgmod)->GetSubtype() != COrgMod::eSubtype_strain) {
21737  BOOST_CHECK_EQUAL((*orgmod)->GetSubtype() == COrgMod::eSubtype_other, true);
21738  }
21739  }
21740 }
21741 
21742 
21744 {
       // Round-trips CGb_qual::BuildExperiment / ParseExperiment for the four
       // combinations of empty/non-empty category and DOI, checking the built
       // string ("CATEGORY:text[DOI]") and the parsed components.
       // NOTE(review): listing line 21743 (test-case header) is missing from
       // this extract.
21745  string orig = CGb_qual::BuildExperiment("", "experiment", "");
21746  BOOST_CHECK_EQUAL(orig, "experiment");
21747 
21748  string experiment;
21749  string category;
21750  string doi;
21751 
21752  CGb_qual::ParseExperiment(orig, category, experiment, doi);
21753  BOOST_CHECK_EQUAL(category, "");
21754  BOOST_CHECK_EQUAL(experiment, "experiment");
21755  BOOST_CHECK_EQUAL(doi, "");
21756 
       // DOI only.
21757  orig = CGb_qual::BuildExperiment("", "experiment2", "DOI");
21758  BOOST_CHECK_EQUAL(orig, "experiment2[DOI]");
21759  CGb_qual::ParseExperiment(orig, category, experiment, doi);
21760  BOOST_CHECK_EQUAL(category, "");
21761  BOOST_CHECK_EQUAL(experiment, "experiment2");
21762  BOOST_CHECK_EQUAL(doi, "DOI");
21763 
       // Category only.
21764  orig = CGb_qual::BuildExperiment("COORDINATES", "experiment3", "");
21765  BOOST_CHECK_EQUAL(orig, "COORDINATES:experiment3");
21766  CGb_qual::ParseExperiment(orig, category, experiment, doi);
21767  BOOST_CHECK_EQUAL(category, "COORDINATES");
21768  BOOST_CHECK_EQUAL(experiment, "experiment3");
21769  BOOST_CHECK_EQUAL(doi, "");
21770 
       // Category and DOI.
21771  orig = CGb_qual::BuildExperiment("EXISTENCE", "experiment4", "DOI2");
21772  BOOST_CHECK_EQUAL(orig, "EXISTENCE:experiment4[DOI2]");
21773  CGb_qual::ParseExperiment(orig, category, experiment, doi);
21774  BOOST_CHECK_EQUAL(category, "EXISTENCE");
21775  BOOST_CHECK_EQUAL(experiment, "experiment4");
21776  BOOST_CHECK_EQUAL(doi, "DOI2");
21777 
21778 }
21779 
21780 
// Checks the contaminated-cell-line lookup: first directly through
// CSubSource::CheckCellLine, then through the validator, which is expected
// to report SuspectedContaminatedCellLine as a warning.
// NOTE(review): listing lines 21793, 21797 and 21799 are not visible in this
// excerpt; `entry`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are presumably introduced there (or by a setup macro).
21781 BOOST_AUTO_TEST_CASE(Test_SQD_2036)
21782 {
       // a known cell line / organism pair yields a message
21783  string msg = CSubSource::CheckCellLine("222", "Homo sapiens");
21784  BOOST_CHECK_EQUAL(msg, "The International Cell Line Authentication Committee database indicates that 222 from Homo sapiens is known to be contaminated by PA1 from Human. Please see http://iclac.org/databases/cross-contaminations/ for more information and references.");
21785 
       // a different cell line for the same organism yields an empty message
21786  msg = CSubSource::CheckCellLine("223", "Homo sapiens");
21787  BOOST_CHECK_EQUAL(msg, "");
21788 
       // the same cell line with a different organism yields an empty message
21789  msg = CSubSource::CheckCellLine("222", "Canis familiaris");
21790  BOOST_CHECK_EQUAL(msg, "");
21791 
21792  // prepare entry
21794  unit_test_util::SetTaxname(entry, "Cavia porcellus");
       // NOTE(review): presumably the call with 0 clears the existing taxon id
       // before 10141 is set - confirm against unit_test_util::SetTaxon
21795  unit_test_util::SetTaxon(entry, 0);
21796  unit_test_util::SetTaxon(entry, 10141);
21798 
21800 
       // NOTE(review): the expected message names cell line "GPS-M"; the line
       // that sets it on the entry is not visible here (presumably 21797)
21801  expected_errors.push_back(new CExpectedError("lcl|good",
21802  eDiag_Warning,
21803  "SuspectedContaminatedCellLine",
21804  "The International Cell Line Authentication Committee database indicates that GPS-M from Cavia porcellus is known to be contaminated by Strain L-M from Mouse. Please see http://iclac.org/databases/cross-contaminations/ for more information and references."));
21805  // AddChromosomeNoLocation(expected_errors, entry);
21806 
21807  eval = validator.Validate(seh, options);
21808  CheckErrors(*eval, expected_errors);
21809  CLEAR_ERRORS
21810 }
21811 
21812 
// Validates a set built from two member entries and expects the
// "no publications" (NoPubFound) and "no submission citation"
// (MissingPubRequirement) messages.
// NOTE(review): the declaration line (21813) and lines 21815-21818, 21820,
// 21824, 21838 and 21845 are not visible in this excerpt; `e1`, `e2`, `seh`,
// `validator`, `options`, `eval` and `expected_errors` come from there.
21814 {
21819  CRef<CSeq_entry> entry(new CSeq_entry());
21821  entry->SetSet().SetSeq_set().push_back(e1);
21822  entry->SetSet().SetSeq_set().push_back(e2);
21823 
21825 
21826  expected_errors.push_back(new CExpectedError("lcl|good",
21827  eDiag_Error,
21828  "NoPubFound",
21829  "No publications anywhere on this entire record."));
21830  expected_errors.push_back(new CExpectedError("lcl|good",
21831  eDiag_Info,
21832  "MissingPubRequirement",
21833  "No submission citation anywhere on this entire record."));
21834  // AddChromosomeNoLocation(expected_errors, entry);
21835 
21836  eval = validator.Validate(seh, options);
21837  CheckErrors(*eval, expected_errors);
       // second pass: the MissingPubRequirement expectation is raised to Error.
       // NOTE(review): the statement that triggers this (line 21838) is not
       // visible - presumably an option change; confirm
21839  expected_errors[1]->SetSeverity(eDiag_Error);
21840  eval = validator.Validate(seh, options);
21841  CheckErrors(*eval, expected_errors);
21842 
21843  CLEAR_ERRORS
21844 
       // third pass: no expectations are pushed before this validation; the
       // change made on the missing line 21845 is not visible here
21846  // AddChromosomeNoLocation(expected_errors, entry);
21847  eval = validator.Validate(seh, options);
21848  CheckErrors(*eval, expected_errors);
21849 
21850  CLEAR_ERRORS
21851 }
21852 
21853 
// Checks that a misc_feature and a repeat_region without comments are
// reported (MiscFeatureNeedsNote / RepeatRegionNeedsNote), and how the
// expected severity changes between the three validation passes.
// NOTE(review): the declaration line (21854) and lines 21856, 21860, 21863
// and 21878 are not visible in this excerpt; `entry`, `misc`, `scope`, `seh`
// and the validation objects are introduced there.
21855 {
       // turn an added misc feature into a repeat_region with no comment
21857  CRef<CSeq_feat> repeat_region = unit_test_util::AddMiscFeature(entry);
21858  repeat_region->SetData().SetImp().SetKey("repeat_region");
21859  repeat_region->ResetComment();
21861  misc->ResetComment();
21862 
21864 
21865  expected_errors.push_back(new CExpectedError("lcl|good",
21866  eDiag_Warning,
21867  "MiscFeatureNeedsNote",
21868  "A note or other qualifier is required for a misc_feature"));
21869  expected_errors.push_back(new CExpectedError("lcl|good",
21870  eDiag_Warning,
21871  "RepeatRegionNeedsNote",
21872  "repeat_region has no qualifiers"));
21873  // AddChromosomeNoLocation(expected_errors, entry);
21874 
21875  eval = validator.Validate(seh, options);
21876  CheckErrors(*eval, expected_errors);
21877  // bump to error for -U
       // NOTE(review): the option change itself (line 21878) is not visible here
21879  expected_errors[0]->SetSeverity(eDiag_Error);
21880  expected_errors[1]->SetSeverity(eDiag_Error);
21881  eval = validator.Validate(seh, options);
21882  CheckErrors(*eval, expected_errors);
21883 
21884  // only warning for EMBL/DDBJ
       // the entry is taken out of the scope, given an additional EMBL id and
       // re-added, so the expected errors are reported against that accession
21885  scope.RemoveTopLevelSeqEntry(seh);
21886  CRef<CSeq_id> other_acc(new CSeq_id());
21887  other_acc->SetEmbl().SetAccession("HE717023");
21888  other_acc->SetEmbl().SetVersion(1);
21889  entry->SetSeq().SetId().push_back(other_acc);
21890  seh = scope.AddTopLevelSeqEntry(*entry);
21891  expected_errors[0]->SetSeverity(eDiag_Warning);
21892  expected_errors[1]->SetSeverity(eDiag_Warning);
21893  ChangeErrorAcc(expected_errors, "emb|HE717023.1|");
21894  eval = validator.Validate(seh, options);
21895  CheckErrors(*eval, expected_errors);
21896 
21897  CLEAR_ERRORS
21898 }
21899 
21900 
// Exercises feature::IsLocationInFrame against a coding region: a copy of
// the CDS location is trimmed at either end one base at a time, moved past
// the CDS, and finally compared with a CDS that has a mix location.
// NOTE(review): listing lines 21903-21904, 21906, 21931 and 21934 are not
// visible in this excerpt; `entry`, `cds` and `scope` are introduced on the
// first of these, and the last two presumably held the checks for the
// "both ends trimmed" cases.
21901 BOOST_AUTO_TEST_CASE(Test_IsLocationInFrame)
21902 {
21905 
       // unlike most tests in this file, `scope` is used through `->` here
21907  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry);
21908 
21909  CSeq_feat_Handle fh = scope->GetSeq_featHandle(*cds);
21910  CRef<CSeq_loc> loc(new CSeq_loc());
21911  loc->Assign(cds->GetLocation());
21912 
       // identical location: in frame; start moved by 1 or 2: bad start;
       // start moved by 3: in frame again
21913  BOOST_CHECK_EQUAL(feature::eLocationInFrame_InFrame, feature::IsLocationInFrame(fh, *loc));
21914  loc->SetInt().SetFrom(loc->GetInt().GetFrom() + 1);
21915  BOOST_CHECK_EQUAL(feature::eLocationInFrame_BadStart, feature::IsLocationInFrame(fh, *loc));
21916  loc->SetInt().SetFrom(loc->GetInt().GetFrom() + 1);
21917  BOOST_CHECK_EQUAL(feature::eLocationInFrame_BadStart, feature::IsLocationInFrame(fh, *loc));
21918  loc->SetInt().SetFrom(loc->GetInt().GetFrom() + 1);
21919  BOOST_CHECK_EQUAL(feature::eLocationInFrame_InFrame, feature::IsLocationInFrame(fh, *loc));
       // same pattern at the other end: stop moved by 1 or 2: bad stop;
       // stop moved by 3: in frame
21920  loc->Assign(cds->GetLocation());
21921  loc->SetInt().SetTo(loc->GetInt().GetTo() - 1);
21922  BOOST_CHECK_EQUAL(feature::eLocationInFrame_BadStop, feature::IsLocationInFrame(fh, *loc));
21923  loc->SetInt().SetTo(loc->GetInt().GetTo() - 1);
21924  BOOST_CHECK_EQUAL(feature::eLocationInFrame_BadStop, feature::IsLocationInFrame(fh, *loc));
21925  loc->SetInt().SetTo(loc->GetInt().GetTo() - 1);
21926  BOOST_CHECK_EQUAL(feature::eLocationInFrame_InFrame, feature::IsLocationInFrame(fh, *loc));
21927 
       // both ends trimmed by one, then by two; the corresponding checks
       // (lines 21931 and 21934) are not visible in this excerpt
21928  loc->Assign(cds->GetLocation());
21929  loc->SetInt().SetFrom(loc->GetInt().GetFrom() + 1);
21930  loc->SetInt().SetTo(loc->GetInt().GetTo() - 1);
21932  loc->SetInt().SetFrom(loc->GetInt().GetFrom() + 1);
21933  loc->SetInt().SetTo(loc->GetInt().GetTo() - 1);
21935 
       // a three-base interval starting one position past the CDS stop
21936  loc->SetInt().SetFrom(cds->GetLocation().GetStop(eExtreme_Biological) + 1);
21937  loc->SetInt().SetTo(loc->GetInt().GetFrom() + 2);
21938  BOOST_CHECK_EQUAL(feature::eLocationInFrame_NotIn, feature::IsLocationInFrame(fh, *loc));
21939 
       // give the CDS a mix location and test a single interval running from
       // its biological start to its biological stop
21940  CRef<CSeq_id> loc_id(new CSeq_id());
21941  loc_id->Assign(loc->GetInt().GetId());
21942  cds->SetLocation().Assign(*(unit_test_util::MakeMixLoc(loc_id)));
21943  loc->SetInt().SetFrom(cds->GetLocation().GetStart(eExtreme_Biological));
21944  loc->SetInt().SetTo(cds->GetLocation().GetStop(eExtreme_Biological));
21945  BOOST_CHECK_EQUAL(feature::eLocationInFrame_NotIn, feature::IsLocationInFrame(fh, *loc));
21946 }
21947 
// Builds a reply holding a single CT3Reply whose error message is `message`
// and returns it.
// NOTE(review): the function's declaration line (21948) and lines 21950 and
// 21952 are not visible in this excerpt; `reply` and `message` are
// introduced there.
21949 {
21951  CRef<CT3Reply> t3reply(new CT3Reply);
21953  t3reply->SetError().SetMessage(message);
21954  reply->SetReply().push_back(t3reply);
21955  return reply;
21956 }
21957 
21958 
21959 //removed until issues with caching and mocking service can be resolved
// Feeds the validator a list of 50 prepared taxonomy replies and expects a
// single TaxonomyServiceProblem error.
// NOTE(review): listing lines 21962, 21966 and 21970 are not visible in this
// excerpt; the entry/validator setup, the construction of `reply` inside the
// loop, and the hand-over of `replies` to the mock service presumably
// happen there.
21960 BOOST_AUTO_TEST_CASE(Test_Empty_Taxon_Reply)
21961 {
21963 
21964  CMockTaxon::TReplies replies;
21965  for (size_t i = 0; i < 50; i++) {
21967  replies.push_back(reply);
21968  }
21969 
21971 
21972  eval = validator.Validate(seh, options);
21973 
21974  expected_errors.push_back(new CExpectedError("lcl|good",
21975  eDiag_Error,
21976  "TaxonomyServiceProblem",
21977  "Taxonomy service connection failure"));
21978  // AddChromosomeNoLocation(expected_errors, entry);
21979 
21980  CheckErrors(*eval, expected_errors);
21981 
21982  CLEAR_ERRORS
21983 }
21984 
21985 
// Gives the nucleotide a GenBank accession of the form "ABCD123456789" and
// expects InconsistentMolInfoTechnique ("WGS accession should have
// Mol-info.tech of wgs"); a second pass checks the TLS case.
// NOTE(review): the declaration line (21986) and lines 21988, 21995 and
// 22005 are not visible in this excerpt.
21987 {
21989  string id_str = "ABCD123456789";
21990  CRef<CSeq_id> id(new CSeq_id());
21991  id->SetGenbank().SetAccession(id_str);
21992 
21993  unit_test_util::ChangeNucId(entry, id);
21994 
21996 
       // the expected accession label is built from the same id string
21997  expected_errors.push_back(new CExpectedError("gb|"+id_str+"|", eDiag_Error, "InconsistentMolInfoTechnique", "WGS accession should have Mol-info.tech of wgs"));
21998  // AddChromosomeNoLocation(expected_errors, entry);
21999  eval = validator.Validate(seh, options);
22000  CheckErrors(*eval, expected_errors);
22001 
22002  // error suppressed for TLS
       // NOTE(review): the statement that switches the entry to TLS
       // (line 22005) is not visible here
22003  CLEAR_ERRORS
22004  // AddChromosomeNoLocation(expected_errors, entry);
22006  eval = validator.Validate(seh, options);
22007  CheckErrors(*eval, expected_errors);
22008  CLEAR_ERRORS
22009 }
22010 
22011 
// Puts the text "RefSeq" into the protein product name and into a title
// descriptor on the nucleotide, and expects one RefSeqInText error for each.
// NOTE(review): the declaration line (22012) and lines 22014, 22018 and
// 22021 are not visible in this excerpt; `entry` and `nuc` are introduced
// there.
22013 {
22015  unit_test_util::SetNucProtSetProductName(entry, "This product name contains RefSeq");
22016  CRef<CSeqdesc> defline(new CSeqdesc());
22017  defline->SetTitle("This title contains RefSeq");
22019  nuc->SetSeq().SetDescr().Set().push_back(defline);
22020 
22022 
22023  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "RefSeqInText", "Protein name contains 'RefSeq'"));
22024  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "RefSeqInText", "Definition line contains 'RefSeq'"));
22025  // AddChromosomeNoLocation(expected_errors, entry);
22026  eval = validator.Validate(seh, options);
22027  CheckErrors(*eval, expected_errors);
22028 
22029  CLEAR_ERRORS
22030 }
22031 
22032 
// Validates the same entry four times and expects OrgModValueInvalid each
// time, with the message naming the strain value 'yes', 'NO', '-' and
// 'microbial' in turn.
// NOTE(review): the declaration line (22033) and the lines that set each
// strain value (22035-22036, 22045-22046, 22051-22052, 22057-22058) are not
// visible in this excerpt; only the expected messages are.
22034 {
22037 
22039 
22040  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "OrgModValueInvalid", "Orgmod.strain should not be 'yes'"));
22041  // AddChromosomeNoLocation(expected_errors, entry);
22042  eval = validator.Validate(seh, options);
22043  CheckErrors(*eval, expected_errors);
22044 
       // the single expectation is reused; only its message changes per value
22047  expected_errors[0]->SetErrMsg("Orgmod.strain should not be 'NO'");
22048  eval = validator.Validate(seh, options);
22049  CheckErrors(*eval, expected_errors);
22050 
22053  expected_errors[0]->SetErrMsg("Orgmod.strain should not be '-'");
22054  eval = validator.Validate(seh, options);
22055  CheckErrors(*eval, expected_errors);
22056 
22059  expected_errors[0]->SetErrMsg("Orgmod.strain should not be 'microbial'");
22060  eval = validator.Validate(seh, options);
22061  CheckErrors(*eval, expected_errors);
22062 
22063  CLEAR_ERRORS
22064 }
22065 
// Adds a gene whose mix location has one interval on each of three
// different sequences (good1, good2, good3) and expects a BadLocation
// error; the second pass checks that the error is not expected once the set
// is an organelle small genome set.
// NOTE(review): listing lines 22068, 22091, 22102, 22104 and 22106 are not
// visible in this excerpt; `entry`, the validation setup, the class change
// and the body of the title-removal loop are on those lines.
22066 BOOST_AUTO_TEST_CASE(Test_BadLocation)
22067 {
22069 
22070  CRef<CSeq_feat> gene(new CSeq_feat());
22071  gene->SetData().SetGene().SetLocus("badguy");
       // three identical 0..10 intervals, each on a different local id
22072  CRef<CSeq_loc> loc1(new CSeq_loc());
22073  loc1->SetInt().SetFrom(0);
22074  loc1->SetInt().SetTo(10);
22075  loc1->SetInt().SetId().SetLocal().SetStr("good1");
22076  CRef<CSeq_loc> loc2(new CSeq_loc());
22077  loc2->SetInt().SetFrom(0);
22078  loc2->SetInt().SetTo(10);
22079  loc2->SetInt().SetId().SetLocal().SetStr("good2");
22080  CRef<CSeq_loc> loc3(new CSeq_loc());
22081  loc3->SetInt().SetFrom(0);
22082  loc3->SetInt().SetTo(10);
22083  loc3->SetInt().SetId().SetLocal().SetStr("good3");
22084 
22085  gene->SetLocation().SetMix().Set().push_back(loc1);
22086  gene->SetLocation().SetMix().Set().push_back(loc2);
22087  gene->SetLocation().SetMix().Set().push_back(loc3);
22088 
       // the gene is attached to the first member of the set
22089  unit_test_util::AddFeat(gene, entry->SetSet().SetSeq_set().front());
22090 
22092 
22093  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "BadLocation",
22094  "Feature location intervals should all be on the same sequence"));
22095  // AddChromosomeNoLocation(expected_errors, entry);
22096  eval = validator.Validate(seh, options);
22097  CheckErrors(*eval, expected_errors);
22098 
22099  CLEAR_ERRORS
22100 
22101  // error goes away if organelle small genome set
22103  // remove title, not appropriate for small genome set
       // NOTE(review): the loop body (line 22106) is not visible here
22105  for (auto& s : entry->SetSet().SetSeq_set()) {
22107  }
22108  // AddChromosomeNoLocation(expected_errors, entry);
22109  eval = validator.Validate(seh, options);
22110  CheckErrors(*eval, expected_errors);
22111 
22112  CLEAR_ERRORS
22113 }
22114 
22115 
// Checks the partial-mismatch messages for gene and mRNA when the coding
// region of a nuc-prot set is made 5' partial, 3' partial, or both.
// NOTE(review): the declaration line (22116) and lines 22118-22122,
// 22124-22126 and 22128 are not visible in this excerpt; `entry`, `mrna`,
// `prot`, `scope`, `seh` and the validation objects are introduced there.
22117 {
       // the mRNA is named after the protein's first name
22123  mrna->SetData().SetRna().SetExt().SetName(prot->GetData().GetProt().GetName().front());
22127 
       // baseline validation before any partial flags are changed
22129  // AddChromosomeNoLocation(expected_errors, entry);
22130  eval = validator.Validate(seh, options);
22131  CheckErrors(*eval, expected_errors);
22132 
       // SetNucProtSetPartials(entry, true, false): 5' mismatch messages expected
       // (the entry is removed from the scope while it is edited, then re-added)
22133  scope.RemoveTopLevelSeqEntry(seh);
22134  unit_test_util::SetNucProtSetPartials(entry, true, false);
22135  seh = scope.AddTopLevelSeqEntry(*entry);
22136 
22137  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch5Prime",
22138  "gene should not be 5' complete if coding region is 5' partial"));
22139  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch5Prime",
22140  "mRNA should not be 5' complete if coding region is 5' partial"));
22141  eval = validator.Validate(seh, options);
22142  CheckErrors(*eval, expected_errors);
22143  CLEAR_ERRORS
22144 
       // SetNucProtSetPartials(entry, false, true): 3' mismatch messages, plus
       // the splice-consensus warning and the stop-codon error
22145  scope.RemoveTopLevelSeqEntry(seh);
22146  unit_test_util::SetNucProtSetPartials(entry, false, true);
22147  seh = scope.AddTopLevelSeqEntry(*entry);
22148 
22149  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch3Prime",
22150  "gene should not be 3' complete if coding region is 3' partial"));
22151  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch3Prime",
22152  "mRNA should not be 3' complete if coding region is 3' partial"));
22153  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus3Prime",
22154  "3' partial is not at end of sequence, gap, or consensus splice site"));
22155  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
22156  "Got stop codon, but 3'end is labeled partial"));
22157  // AddChromosomeNoLocation(expected_errors, entry);
22158 
22159  eval = validator.Validate(seh, options);
22160  CheckErrors(*eval, expected_errors);
22161 
22162  CLEAR_ERRORS
22163 
       // SetNucProtSetPartials(entry, true, true): the union of the two
       // previous expectation lists
22164  scope.RemoveTopLevelSeqEntry(seh);
22165  unit_test_util::SetNucProtSetPartials(entry, true, true);
22166  seh = scope.AddTopLevelSeqEntry(*entry);
22167 
22168  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch5Prime",
22169  "gene should not be 5' complete if coding region is 5' partial"));
22170  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch5Prime",
22171  "mRNA should not be 5' complete if coding region is 5' partial"));
22172  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch3Prime",
22173  "gene should not be 3' complete if coding region is 3' partial"));
22174  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch3Prime",
22175  "mRNA should not be 3' complete if coding region is 3' partial"));
22176  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus3Prime",
22177  "3' partial is not at end of sequence, gap, or consensus splice site"));
22178  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
22179  "Got stop codon, but 3'end is labeled partial"));
22180  // AddChromosomeNoLocation(expected_errors, entry);
22181  eval = validator.Validate(seh, options);
22182  CheckErrors(*eval, expected_errors);
22183 
22184  CLEAR_ERRORS
22185 }
22186 
22187 
// Checks that the host "Atlantic white-sided dolphin" is accepted as valid
// and left unchanged by FixSpecificHost, then validates an entry carrying it
// as a nat_host modifier; no expectations are pushed before the validation.
// NOTE(review): the declaration line (22188) and lines 22198 and 22201 are
// not visible in this excerpt.
22189 {
       // NOTE(review): `host` is not referenced in the lines visible here;
       // the checks below repeat the literal instead
22190  string host = "Atlantic white-sided dolphin";
22191  string error_msg;
22192 
22193  BOOST_CHECK_EQUAL(true, IsSpecificHostValid("Exotic creature", error_msg)); // RW-1491
22194  BOOST_CHECK_EQUAL("Atlantic white-sided dolphin", FixSpecificHost("Atlantic white-sided dolphin"));
22195  BOOST_CHECK_EQUAL(true, IsSpecificHostValid("Atlantic white-sided dolphin", error_msg));
22196 
22197 
22199  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Atlantic white-sided dolphin");
22200 
22202 
22203  // AddChromosomeNoLocation(expected_errors, entry);
22204  eval = validator.Validate(seh, options);
22205  CheckErrors(*eval, expected_errors);
22206 
22207  CLEAR_ERRORS
22208 }
22209 
22210 
// Expects a TitleNotAppropriateForSet error ("Only Pop/Phy/Mut/Eco sets
// should have titles") reported against lcl|good1.
// NOTE(review): listing lines 22213-22214 and 22216 are not visible in this
// excerpt; the entry construction and validation setup are on those lines.
22211 BOOST_AUTO_TEST_CASE(TEST_TitleNotAppropriateForSet)
22212 {
22215 
22217 
       // here the validation runs first and the expectation is added afterwards
22218  eval = validator.Validate(seh, options);
22219  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "TitleNotAppropriateForSet",
22220  "Only Pop/Phy/Mut/Eco sets should have titles"));
22221  // AddChromosomeNoLocation(expected_errors, entry);
22222  CheckErrors(*eval, expected_errors);
22223 
22224  CLEAR_ERRORS
22225 }
22226 
22227 
// Builds a genome-assembly structured comment with an "Assembly Name" field
// and validates it with several values: "valid value", "not,valid" (two
// BadStrucCommInvalidFieldValue messages expected), and two further names
// for which no expectations are pushed.
// NOTE(review): the declaration line (22228) and lines 22230 and 22243 are
// not visible in this excerpt.
22229 {
22231  CRef<CUser_object> user = edit::CGenomeAssemblyComment::MakeEmptyUserObject();
22232  edit::CGenomeAssemblyComment::SetAssemblyMethod(*user, "x v. y");
       // the Assembly Name field is added by hand rather than through a
       // CGenomeAssemblyComment setter
22233  CRef<CUser_field> assembly_name(new CUser_field());
22234  assembly_name->SetLabel().SetStr("Assembly Name");
22235  assembly_name->SetData().SetStr("valid value");
22236  user->SetData().push_back(assembly_name);
22237  edit::CGenomeAssemblyComment::SetGenomeCoverage(*user, "2x");
22238  edit::CGenomeAssemblyComment::SetSequencingTechnology(*user, "z");
22239  CRef<CSeqdesc> desc(new CSeqdesc());
       // the descriptor receives a copy of `user`, so it is re-assigned after
       // every change to `assembly_name` below
22240  desc->SetUser().Assign(*user);
22241  entry->SetSeq().SetDescr().Set().push_back(desc);
22242 
22244 
22245  // AddChromosomeNoLocation(expected_errors, entry);
22246  eval = validator.Validate(seh, options);
22247  CheckErrors(*eval, expected_errors);
22248 
22249  assembly_name->SetData().SetStr("not,valid");
22250  desc->SetUser().Assign(*user);
22251  eval = validator.Validate(seh, options);
22252  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info,
22253  "BadStrucCommInvalidFieldValue",
22254  "Structured Comment invalid; the field value and/or name are incorrect"));
22255  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
22256  "BadStrucCommInvalidFieldValue",
22257  "not,valid is not a valid value for Assembly Name"));
22258  CheckErrors(*eval, expected_errors);
22259 
22260  CLEAR_ERRORS
22261 
22262  // AddChromosomeNoLocation(expected_errors, entry);
22263  assembly_name->SetData().SetStr("Ec2009C-3227");
22264  desc->SetUser().Assign(*user);
22265  eval = validator.Validate(seh, options);
22266  CheckErrors(*eval, expected_errors);
22267 
22268  assembly_name->SetData().SetStr("Anop_step_SDA-500_V1");
22269  desc->SetUser().Assign(*user);
22270  eval = validator.Validate(seh, options);
22271  CheckErrors(*eval, expected_errors);
22272 
22273  CLEAR_ERRORS
22274 }
22275 
22276 
// Adds a gene with two intervals (0-5 and 10-15) and a mobile_element
// feature at 6-9, i.e. exactly in the gap between them, then validates;
// no expectations are pushed before the validation.
// NOTE(review): the declaration line (22277) and lines 22279 and 22303 are
// not visible in this excerpt.
22278 {
22280  CRef<CSeq_feat> gene(new CSeq_feat());
22281  gene->SetData().SetGene().SetLocus("a");
22282  CRef<CSeq_loc> int1(new CSeq_loc());
22283  int1->SetInt().SetId().Assign(*(entry->GetSeq().GetId().front()));
22284  int1->SetInt().SetFrom(0);
22285  int1->SetInt().SetTo(5);
22286  CRef<CSeq_loc> int2(new CSeq_loc());
22287  int2->SetInt().SetId().Assign(*(entry->GetSeq().GetId().front()));
22288  int2->SetInt().SetFrom(10);
22289  int2->SetInt().SetTo(15);
22290  gene->SetLocation().SetMix().Set().push_back(int1);
22291  gene->SetLocation().SetMix().Set().push_back(int2);
22292  unit_test_util::AddFeat(gene, entry);
22293 
22294  CRef<CSeq_feat> mobile_element(new CSeq_feat());
22295  mobile_element->SetData().SetImp().SetKey("mobile_element");
22296  mobile_element->SetLocation().SetInt().SetId().Assign(*(entry->GetSeq().GetId().front()));
22297  mobile_element->SetLocation().SetInt().SetFrom(6);
22298  mobile_element->SetLocation().SetInt().SetTo(9);
22299  CRef<CGb_qual> qual(new CGb_qual("mobile_element_type", "superintegron"));
22300  mobile_element->SetQual().push_back(qual);
22301  unit_test_util::AddFeat(mobile_element, entry);
22302 
22304 
22305  // AddChromosomeNoLocation(expected_errors, entry);
22306  eval = validator.Validate(seh, options);
22307  CheckErrors(*eval, expected_errors);
22308 
22309  CLEAR_ERRORS
22310 }
22311 
22312 
// Marks a gene as a "trans-splicing" exception and expects the
// BadTranssplicedInterval warning ("Trans-spliced feature should have
// multiple intervals").
// NOTE(review): the declaration line (22313) and lines 22315-22316 and
// 22321 are not visible in this excerpt; `entry` and `gene` are introduced
// there.
22314 {
22317  gene->SetData().SetGene().SetLocus("X");
22318  gene->SetExcept(true);
22319  gene->SetExcept_text("trans-splicing");
22320 
22322 
22323  eval = validator.Validate(seh, options);
22324  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
22325  "BadTranssplicedInterval",
22326  "Trans-spliced feature should have multiple intervals"));
22327  // AddChromosomeNoLocation(expected_errors, entry);
22328  CheckErrors(*eval, expected_errors);
22329 
22330  CLEAR_ERRORS
22331 }
22332 
22333 
// Checks the recombination_class qualifier on a misc_recomb feature:
// 'other' with a comment, 'other' without a comment (error expected), and
// the value "mitotic".
// NOTE(review): the declaration line (22334) and lines 22336-22337 and
// 22342 are not visible in this excerpt; `entry` and `recomb` are
// introduced there.
22335 {
22338  recomb->SetData().SetImp().SetKey("misc_recomb");
22339  CRef<CGb_qual> qual(new CGb_qual("recombination_class", "other"));
22340  recomb->SetQual().push_back(qual);
22341 
22343 
22344  // first check ok because recomb has comment
22345  // AddChromosomeNoLocation(expected_errors, entry);
22346  eval = validator.Validate(seh, options);
22347  CheckErrors(*eval, expected_errors);
22348 
22349  // error because 'other' and no comment
22350  recomb->ResetComment();
22351  eval = validator.Validate(seh, options);
22352  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
22353  "RecombinationClassOtherNeedsNote",
22354  "The recombination_class 'other' is missing the required /note"));
22355  CheckErrors(*eval, expected_errors);
22356 
22357  // info because not other and not valid
22358  // removed per VR-770
22359  // qual->SetVal("not a valid recombination class");
22360  // expected_errors[0]->SetErrMsg("'not a valid recombination class' is not a legal value for recombination_class");
22361  // expected_errors[0]->SetSeverity(eDiag_Info);
22362  // eval = validator.Validate(seh, options);
22363  // CheckErrors(*eval, expected_errors);
22364 
22365  CLEAR_ERRORS
22366 
22367  // no error because legal
       // (the comment removed above is not restored for this pass)
22368  qual->SetVal("mitotic");
22369  // AddChromosomeNoLocation(expected_errors, entry);
22370  eval = validator.Validate(seh, options);
22371  CheckErrors(*eval, expected_errors);
22372 
22373  CLEAR_ERRORS
22374 }
22375 
22376 
22377 void AddOrgmod(COrg_ref& org, const string& val, COrgMod::ESubtype subtype)
22378 {
22379  CRef<COrgMod> om(new COrgMod(subtype, val));
22380  org.SetOrgname().SetMod().push_back(om);
22381 }
22382 
22383 
22384 void AddOrgmodDescriptor(CRef<CSeq_entry> entry, const string& val, COrgMod::ESubtype subtype)
22385 {
22386  CRef<CSeqdesc> src_desc(new CSeqdesc());
22387  // should look up
22388  src_desc->SetSource().SetOrg().SetTaxname("Influenza A virus");
22389  AddOrgmod(src_desc->SetSource().SetOrg(), val, subtype);
22390  entry->SetDescr().Set().push_back(src_desc);
22391 }
22392 
// Sets up a biosource feature with taxname "Influenza virus A" and a single
// organism modifier of the given subtype and value.
// The word order of the taxname differs from the one used in
// AddOrgmodDescriptor ("Influenza A virus").
// NOTE(review): presumably this difference is deliberate - a later test in
// this file remarks that the feature taxnames cannot be recognized; confirm.
// NOTE(review): listing line 22395, which introduces `src_feat`, is not
// visible in this excerpt.
22393 void AddOrgmodFeat(CRef<CSeq_entry> entry, const string& val, COrgMod::ESubtype subtype)
22394 {
22396  src_feat->SetData().SetBiosrc().SetOrg().SetTaxname("Influenza virus A");
22397  AddOrgmod(src_feat->SetData().SetBiosrc().SetOrg(), val, subtype);
22398 }
22399 
// List of (host value as entered, host value expected after the fix) pairs
// used by the bulk specific-host tests.
using THostStringsVector = vector<pair<string, string>>;
22401 
22402 
// For every (input, expected) pair in `test_values`: adds a biosource
// feature carrying the input as nat_host, builds a stand-alone org-ref to be
// adjusted, sends the specific-host lookup request, applies the reply and
// checks that each adjusted org-ref's first modifier equals the expected
// value.
// NOTE(review): the function's declaration line (22403) and lines 22405,
// 22411, 22415, 22423 and 22427 are not visible in this excerpt; `entry`,
// `om`, `tval` and `taxon3` are introduced there.
22404 {
22406 
       // `to_adjust` is modified by the reply; `original` keeps untouched copies
22407  vector<CRef<COrg_ref>> original;
22408  vector<CRef<COrg_ref>> to_adjust;
22409 
22410  for (const auto& it : test_values) {
22412  AddOrgmodFeat(entry, it.first, COrgMod::eSubtype_nat_host);
22413  CRef<COrg_ref> org(new COrg_ref());
22414  org->SetTaxname("foo");
22416  org->SetOrgname().SetMod().push_back(om);
22417  to_adjust.push_back(org);
22418  CRef<COrg_ref> cpy(new COrg_ref());
22419  cpy->Assign(*org);
22420  original.push_back(cpy);
22421  }
22422 
22424  tval.Init(*entry);
22425  vector<CRef<COrg_ref>> org_rq_list = tval.GetSpecificHostLookupRequest(true);
22426 
       // one reply is expected per request
22428  CRef<CTaxon3_reply> reply = taxon3.SendOrgRefList(org_rq_list);
22429  BOOST_CHECK_EQUAL(reply->GetReply().size(), org_rq_list.size());
22430 
22431  BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithSpecificHostReply(org_rq_list, *reply, to_adjust), true);
22432 
       // walk the adjusted and original org-refs in parallel; the expected
       // value is found by looking up the original text in test_values
22433  vector<CRef<COrg_ref>>::const_iterator org = to_adjust.begin();
22434  vector<CRef<COrg_ref>>::const_iterator cpy = original.begin();
22435  while (org != to_adjust.cend()) {
22436  const string& before = (*cpy)->GetOrgname().GetMod().front()->GetSubname();
22437  const string& after = (*org)->GetOrgname().GetMod().front()->GetSubname();
22438  THostStringsVector::const_iterator tvit = test_values.cbegin();
22439  while (tvit != test_values.cend() && !NStr::Equal(tvit->first, before)) {
22440  ++tvit;
22441  }
22442 
       // NOTE(review): if `before` matched no entry, tvit would equal cend()
       // here and be dereferenced; whether that can happen depends on how
       // `om` is built on the line not visible above - confirm
22443  BOOST_CHECK_EQUAL(after, tvit->second);
22444  ++org;
22445  ++cpy;
       // NOTE(review): tvit is re-declared at the top of each iteration, so
       // this increment has no visible effect
22446  ++tvit;
22447  }
22448 }
22449 
22450 BOOST_AUTO_TEST_CASE(Test_SQD_4354)
22451 {
22452  THostStringsVector test_values;
22453  test_values.push_back(make_pair("Zymomonas anaerobia", "Zymomonas mobilis"));
22454  TestBulkSpecificHostFixList(test_values);
22455 
22456  test_values.clear();
22457  test_values.push_back(make_pair("Zymononas mobilis", "Zymomonas mobilis"));
22458  TestBulkSpecificHostFixList(test_values);
22459 }
22460 
22461 
22462 BOOST_AUTO_TEST_CASE(Test_BulkSpecificHostFix)
22463 {
22465 
22466  THostStringsVector test_values;
22467  test_values.push_back(make_pair("Homo supiens", "Homo supiens")); // non-fixable spelling problem
22468  test_values.push_back(make_pair("HUMAN", "Homo sapiens"));
22469  TestBulkSpecificHostFixList(test_values);
22470  test_values.push_back(make_pair("Homo sapiens", "Homo sapiens"));
22471  TestBulkSpecificHostFixList(test_values);
22472  test_values.push_back(make_pair("Gallus Gallus", "Gallus gallus"));
22473  TestBulkSpecificHostFixList(test_values);
22474  test_values.push_back(make_pair("Conservemos nuestros", "Conservemos nuestros")); // non-fixable spelling problem
22475  TestBulkSpecificHostFixList(test_values);
22476  test_values.push_back(make_pair("Pinus sp.", "Pinus sp.")); // ambiguous
22477  TestBulkSpecificHostFixList(test_values);
22478  test_values.push_back(make_pair("Eschericia coli", "Escherichia coli")); // fixable spelling problem
22479  TestBulkSpecificHostFixList(test_values);
22480  test_values.push_back(make_pair("Avian", "Avian"));
22481  TestBulkSpecificHostFixList(test_values);
22482  test_values.push_back(make_pair("Bovine", "Bovine"));
22483  TestBulkSpecificHostFixList(test_values);
22484  test_values.push_back(make_pair("Pig", "Pig"));
22485  TestBulkSpecificHostFixList(test_values);
22486  test_values.push_back(make_pair(" Chicken", "Chicken")); // truncate space
22487  TestBulkSpecificHostFixList(test_values);
22488  test_values.push_back(make_pair("Homo sapiens; sex: female", "Homo sapiens; sex: female"));
22489  TestBulkSpecificHostFixList(test_values);
22490  test_values.push_back(make_pair("Atlantic white-sided dolphin", "Atlantic white-sided dolphin"));
22491  TestBulkSpecificHostFixList(test_values);
22492  test_values.push_back(make_pair("Zymomonas anaerobia", "Zymomonas mobilis"));
22493  TestBulkSpecificHostFixList(test_values);
22494 
22495  vector<CRef<COrg_ref>> to_adjust;
22496  vector<CRef<COrg_ref>> original;
22497 
22498  for (const auto& it : test_values) {
22500  AddOrgmodFeat(entry, it.first, COrgMod::eSubtype_nat_host);
22501  CRef<COrg_ref> org(new COrg_ref());
22502  org->SetTaxname("foo");
22504  org->SetOrgname().SetMod().push_back(om);
22505  to_adjust.push_back(org);
22506  CRef<COrg_ref> cpy(new COrg_ref());
22507  cpy->Assign(*org);
22508  original.push_back(cpy);
22509  }
22510  string error_message;
22511 
22513  tval.Init(*entry);
22514  vector<CRef<COrg_ref>> org_rq_list = tval.GetSpecificHostLookupRequest(true);
22515  // don't create update requests for single-word values
22516  // Homo sapiens is ignored because "HUMAN" already corrects to it
22517  BOOST_CHECK_EQUAL(org_rq_list.size(), test_values.size() - 6);
22518 
22520  CRef<CTaxon3_reply> reply = taxon3.SendOrgRefList(org_rq_list);
22521 
22522  BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithSpecificHostReply(org_rq_list, *reply, to_adjust), true);
22523 
22524  vector<CRef<COrg_ref>>::const_iterator org = to_adjust.begin();
22525  vector<CRef<COrg_ref>>::const_iterator cpy = original.begin();
22526  while (org != to_adjust.cend()) {
22527  const string& before = (*cpy)->GetOrgname().GetMod().front()->GetSubname();
22528  const string& after = (*org)->GetOrgname().GetMod().front()->GetSubname();
22529  THostStringsVector::const_iterator tvit = test_values.cbegin();
22530  while (tvit != test_values.cend() && !NStr::Equal(tvit->first, before)) {
22531  ++tvit;
22532  }
22533 
22534  BOOST_CHECK_EQUAL(after, tvit->second);
22535  ++org;
22536  ++cpy;
22537  ++tvit;
22538  }
22539 
22540  CRef<COrg_ref> test_src(new COrg_ref());
22541  AddOrgmod(*test_src, "Conservemos nuestros", COrgMod::eSubtype_nat_host); // don't change because bad
22542  AddOrgmod(*test_src, "Pinus sp.", COrgMod::eSubtype_nat_host); // don't change because ambivalent
22543  AddOrgmod(*test_src, "Eschericia coli", COrgMod::eSubtype_nat_host); // change because spelling
22544 
22545  to_adjust.clear();
22546  to_adjust.push_back(test_src);
22547  BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithSpecificHostReply(org_rq_list, *reply, to_adjust), true);
22548  COrgName::TMod::const_iterator m = test_src->GetOrgname().GetMod().begin();
22549  BOOST_CHECK_EQUAL((*m)->GetSubname(), "Conservemos nuestros");
22550  ++m;
22551  BOOST_CHECK_EQUAL((*m)->GetSubname(), "Pinus sp.");
22552  ++m;
22553  BOOST_CHECK_EQUAL((*m)->GetSubname(), "Escherichia coli");
22554  // already fixed all problems, don't fix again
22555  BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithSpecificHostReply(org_rq_list, *reply, to_adjust), false);
22556  m = test_src->GetOrgname().GetMod().begin();
22557  BOOST_CHECK_EQUAL((*m)->GetSubname(), "Conservemos nuestros");
22558  ++m;
22559  BOOST_CHECK_EQUAL((*m)->GetSubname(), "Pinus sp.");
22560  ++m;
22561  BOOST_CHECK_EQUAL((*m)->GetSubname(), "Escherichia coli");
22562 
22563  vector<CRef<COrg_ref>> original_orgs = tval.GetTaxonomyLookupRequest();
22564  vector<CRef<COrg_ref>> edited_orgs = tval.GetTaxonomyLookupRequest();
22565  CRef<CTaxon3_reply> lookup_reply = taxon3.SendOrgRefList(original_orgs);
22566  BOOST_CHECK_EQUAL(lookup_reply->GetReply().size(), original_orgs.size());
22567  BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithTaxLookupReply(*lookup_reply, edited_orgs, error_message), true);
22568  // second time should produce no additional changes
22569  BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithTaxLookupReply(*lookup_reply, edited_orgs, error_message), false);
22570  vector<CRef<COrg_ref>> spec_host_rq = tval.GetSpecificHostLookupRequest(true);
22571  CRef<CTaxon3_reply> spec_host_reply = taxon3.SendOrgRefList(spec_host_rq);
22572  BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithSpecificHostReply(org_rq_list, *spec_host_reply, edited_orgs), true);
22573  // second time should produce no additional changes
22574  BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithSpecificHostReply(org_rq_list, *spec_host_reply, edited_orgs), false);
22575 
22576  size_t num_descs = tval.NumDescs();
22577  size_t num_updated_descs = 0;
22578  for (size_t n = 0; n < num_descs; n++) {
22579  if (!original_orgs[n]->Equals(*(edited_orgs[n]))) {
22580  CConstRef<CSeqdesc> desc = tval.GetDesc(n);
22581  CRef<CSeqdesc> new_desc(new CSeqdesc());
22582  new_desc->Assign(*desc);
22583  new_desc->SetSource().SetOrg().Assign(*(edited_orgs[n]));
22584  num_updated_descs++;
22585  }
22586  }
22587  // we expect that all descs will be updated, because they have a recognizable taxname but none of the other data
22588  BOOST_CHECK_EQUAL(num_updated_descs, num_descs);
22589 
22590  size_t num_updated_feats = 0;
22591  for (size_t n = 0; n < tval.NumFeats(); n++) {
22592  if (!original_orgs[n + num_descs]->Equals(*edited_orgs[n + num_descs])) {
22593  CConstRef<CSeq_feat> feat = tval.GetFeat(n);
22594  CRef<CSeq_feat> new_feat(new CSeq_feat());
22595  new_feat->Assign(*feat);
22596  new_feat->SetData().SetBiosrc().SetOrg().Assign(*(edited_orgs[n]));
22597  num_updated_feats++;
22598  }
22599  }
22600  // only five of the feats will be updated, because their taxnames cannot be
22601  // recognized, and only five of the specific hosts are altered.
22602  BOOST_CHECK_EQUAL(num_updated_feats, (size_t)5);
22603 }
22604 
22605 
22607 {
       // Exercises tval.AdjustOrgRefsWithTaxLookupReply() on two hand-built organisms.
       // NOTE(review): the test-case header (listing line 22606) and the declarations of
       // `tval` and `taxon3` (listing lines 22633/22635) are not visible in this excerpt.
22608  CRef<COrg_ref> org(new COrg_ref());
22609 
22610  org->SetTaxname("Dickeya dadantii subsp. dieffenbachiae");
22611  CRef<CDbtag> dbtag(new CDbtag());
22612  dbtag->SetDb("taxon");
22613  dbtag->SetTag().SetId(204040);
22614  org->SetDb().push_back(dbtag);
22615  org->SetOrgname().SetName().SetBinomial().SetGenus("Dickeya");
22616  org->SetOrgname().SetName().SetBinomial().SetSpecies("dadantii");
22617  org->SetOrgname().SetName().SetBinomial().SetSubspecies("dieffenbachiae");
22618  org->SetOrgname().SetMod().push_back(CRef<COrgMod>(new COrgMod(COrgMod::eSubtype_strain, "PA1")));
22619  org->SetOrgname().SetMod().push_back(CRef<COrgMod>(new COrgMod(COrgMod::eSubtype_nat_host, "Phalaenopsis sp. (orchid)")));
22620  org->SetOrgname().SetMod().push_back(CRef<COrgMod>(new COrgMod(COrgMod::eSubtype_sub_species, "dieffenbachiae")));
22621  org->SetOrgname().SetLineage("Bacteria; Proteobacteria; Gammaproteobacteria");
22622  org->SetOrgname().SetGcode(11);
22623  org->SetOrgname().SetDiv("BCT");
22624 
       // org_rq is what gets sent for lookup; edited_orgs holds a separate copy (cpy)
       // whose taxname is checked after the adjust call.
22625  vector<CRef<COrg_ref>> org_rq;
22626  org_rq.push_back(org);
22627 
22628  vector<CRef<COrg_ref>> edited_orgs;
22629  CRef<COrg_ref> cpy(new COrg_ref());
22630  cpy->Assign(*org);
22631  edited_orgs.push_back(cpy);
22632 
22634 
22636 
22637  CRef<CTaxon3_reply> org_reply = taxon3.SendOrgRefList(org_rq);
22638  string error_message;
22639  BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithTaxLookupReply(*org_reply, edited_orgs, error_message), true);
22640  BOOST_CHECK_EQUAL(cpy->GetTaxname(), "Dickeya fangzhongdai");
22641 
       // Second scenario: org is reset and refilled in place (org_rq still holds the same
       // CRef), and the existing dbtag object is reused with a different taxon id.
22642  org->Reset();
22643  org->SetTaxname("Alnus cordata");
22644  dbtag->SetTag().SetId(109058);
22645  org->SetDb().push_back(dbtag);
22646  org->SetOrgname().SetName().SetBinomial().SetGenus("Alnus");
22647  org->SetOrgname().SetName().SetBinomial().SetSpecies("cordata");
22648  org->SetOrgname().SetAttrib("specified");
22649  org->SetOrgname().SetMod().push_back(CRef<COrgMod>(new COrgMod(COrgMod::eSubtype_isolate, "AZ12-2")));
22650  org->SetOrgname().SetMod().push_back(CRef<COrgMod>(new COrgMod(COrgMod::eSubtype_sub_species, "Alnus cordata AZ12-2 chloroplast, complete genome")));
22651  org->SetOrgname().SetLineage("Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; Spermatophyta; Magnoliophyta; eudicotyledons; Gunneridae; Pentapetalae; rosids; fabids; Fagales; Betulaceae; Alnus");
22652  org->SetOrgname().SetGcode(1);
22653  org->SetOrgname().SetMgcode(1);
22654  org->SetOrgname().SetDiv("PLN");
22655  org->SetOrgname().SetPgcode(11);
22656 
22657  cpy->Assign(*org);
22658 
22659  org_reply = taxon3.SendOrgRefList(org_rq);
       // The three-argument call is expected to report no change; the call with the
       // extra trailing `true` argument is expected to report a change and rewrite the taxname.
22660  BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithTaxLookupReply(*org_reply, edited_orgs, error_message), false);
22661  BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithTaxLookupReply(*org_reply, edited_orgs, error_message, true), true);
22662  BOOST_CHECK_EQUAL(cpy->GetTaxname(), "Alnus cordata subsp. Alnus cordata AZ12-2 chloroplast, complete genome");
22663 
22664 }
22665 
22666 
22667 BOOST_AUTO_TEST_CASE(Test_BulkSpecificHostFixIncremental)
22668 {
       // Feeds specific-host lookups to tval in chunks and then checks that each
       // host string in to_adjust ends up as the paired expected value.
       // NOTE(review): the declarations of `entry`, `tval` and `taxon3` (listing lines
       // 22669, 22688, 22697, 22704) are not visible in this excerpt.
22670 
       // Each pair is (host value as entered, host value expected after adjustment).
22671  THostStringsVector test_values;
22672  test_values.push_back(make_pair("Homo supiens", "Homo supiens")); // non-fixable spelling problem
22673  test_values.push_back(make_pair("HUMAN", "Homo sapiens"));
22674  test_values.push_back(make_pair("Homo sapiens", "Homo sapiens"));
22675  test_values.push_back(make_pair("Pinus sp.", "Pinus sp.")); // ambiguous
22676  test_values.push_back(make_pair("Gallus Gallus", "Gallus gallus"));
22677  test_values.push_back(make_pair("Eschericia coli", "Escherichia coli")); // fixable spelling problem
22678  test_values.push_back(make_pair("Avian", "Avian"));
22679  test_values.push_back(make_pair("Bovine", "Bovine"));
22680  test_values.push_back(make_pair("Pig", "Pig"));
22681  test_values.push_back(make_pair(" Chicken", "Chicken")); // truncate space
22682  test_values.push_back(make_pair("Homo sapiens; sex: female", "Homo sapiens; sex: female"));
22683  test_values.push_back(make_pair("Atlantic white-sided dolphin", "Atlantic white-sided dolphin"));
22684 
22685  vector<CRef<COrg_ref>> to_adjust;
22686 
       // For every input host: add it to the entry as a feature, and also build a
       // standalone COrg_ref (taxname "foo") carrying the same host for later adjustment.
22687  for (const auto& it : test_values) {
22689  AddOrgmodFeat(entry, it.first, COrgMod::eSubtype_nat_host);
22690  CRef<COrg_ref> org(new COrg_ref());
22691  org->SetTaxname("foo");
22692  AddOrgmod(*org, it.first, COrgMod::eSubtype_nat_host);
22693  to_adjust.push_back(org);
22694  }
22695  string error_message;
22696 
22698  tval.Init(*entry);
22699  vector<CRef<COrg_ref>> spec_host_rq = tval.GetSpecificHostLookupRequest(true);
22700  // don't create update requests for single-word values
22701  // Homo sapiens is ignored because "HUMAN" already corrects to it
22702  BOOST_CHECK_EQUAL(spec_host_rq.size(), test_values.size() - 6);
22703 
22705 
       // Send the request list in slices of at most chunk_size; each incremental
       // update is expected to return an empty string.
22706  size_t chunk_size = 3;
22707  size_t i = 0;
22708  while (i < spec_host_rq.size()) {
22709  size_t len = min(chunk_size, spec_host_rq.size() - i);
22710  vector<CRef<COrg_ref>> tmp_rq(spec_host_rq.begin() + i, spec_host_rq.begin() + i + len);
22711  CRef<CTaxon3_reply> tmp_spec_host_reply = taxon3.SendOrgRefList(tmp_rq);
22712  BOOST_CHECK_EQUAL(tval.IncrementalSpecificHostMapUpdate(tmp_rq, *tmp_spec_host_reply), kEmptyStr);
22713  i += chunk_size;
22714  }
22715 
22716  BOOST_CHECK_EQUAL(tval.IsSpecificHostMapUpdateComplete(), true);
22717 
22718  BOOST_CHECK_EQUAL(tval.AdjustOrgRefsForSpecificHosts(to_adjust), true);
22719 
       // to_adjust and test_values were filled in the same order, so walk them in step
       // and compare the first OrgMod's subname with the expected host value.
22720  vector<CRef<COrg_ref>>::iterator org = to_adjust.begin();
22721  THostStringsVector::iterator tvit = test_values.begin();
22722  while (org != to_adjust.end()) {
22723  BOOST_CHECK_EQUAL((*org)->GetOrgname().GetMod().front()->GetSubname(), tvit->second);
22724  ++org;
22725  ++tvit;
22726  }
22727 
22728 }
22729 
22730 
22731 void AddStrainDescriptor(CSeq_entry& entry, const string& taxname, const string& strain, const string& lineage, TTaxId taxID)
22732 {
22733  CRef<CSeqdesc> src_desc(new CSeqdesc());
22734  // should look up
22735  src_desc->SetSource().SetOrg().SetTaxname(taxname);
22736  AddOrgmod(src_desc->SetSource().SetOrg(), strain, COrgMod::eSubtype_strain);
22737  src_desc->SetSource().SetOrg().SetOrgname().SetLineage(lineage);
22738  if (taxID != ZERO_TAX_ID) {
22739  unit_test_util::SetTaxon(src_desc->SetSource(), taxID);
22740  }
22741  entry.SetDescr().Set().push_back(src_desc);
22742 }
22743 
22744 
22745 void TestOneStrain(const string& taxname, const string& strain, const string& lineage, TTaxId taxID, bool expect_err)
22746 {
       // Validates one entry whose only source descriptor is built by AddStrainDescriptor
       // from the arguments, and checks for StrainContainsTaxInfo iff expect_err is true.
       // NOTE(review): the lines declaring `entry` and the validation fixtures
       // (`expected_errors`, `seh`, `validator`, `options`, `eval`; listing lines
       // 22747 and 22752) are not visible in this excerpt.
22748  CBioseq::TDescr::Tdata& cont = entry->SetSeq().SetDescr().Set();
       // Drop every existing source descriptor so the one added below is the only one.
22749  cont.remove_if(
22750  [](CSeqdesc* it) { return (it->IsSource()); });
22751  AddStrainDescriptor(*entry, taxname, strain, lineage, taxID); // expect no report
22753 
       // NOTE(review): AddStrainDescriptor compares against ZERO_TAX_ID while this
       // compares against literal 0 — consider using ZERO_TAX_ID here for consistency.
22754  if (taxID == 0) {
22755  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NoTaxonID",
22756  "BioSource is missing taxon ID"));
22757  }
22758  if (expect_err) {
22759  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "StrainContainsTaxInfo",
22760  "Strain '" + strain + "' contains taxonomic name information"));
22761  }
22762 
22763  eval = validator.Validate(seh, options);
22764  CheckErrors(*eval, expected_errors);
22765 
22766  CLEAR_ERRORS
22767 }
22768 
22769 
22770 BOOST_AUTO_TEST_CASE(Test_BulkStrainIncremental)
22771 {
       // Part 1: feeds strain lookups to tval in chunks and checks the map completes.
       // Part 2: runs individual strain/taxname combinations through TestOneStrain.
       // NOTE(review): the declarations of `entry`, `tval` and `taxon3` (listing lines
       // 22772, 22782, 22788) are not visible in this excerpt.
22773 
22774  AddStrainDescriptor(*entry, "Gorilla gorilla", "abc", "xyz", TAX_ID_CONST(9593)); // expect no report
22775  AddStrainDescriptor(*entry, "Gorilla gorilla", "Aeromonas punctata", "xyz", TAX_ID_CONST(9593)); // expect a report
22776  AddStrainDescriptor(*entry, "Gorilla gorilla", "Klebsiella_quasipneumoniae", "xyz", TAX_ID_CONST(9593)); // expect a report
22777  AddStrainDescriptor(*entry, "Bacillus sp.", "cereus", "xyz", TAX_ID_CONST(1409));
22778  AddStrainDescriptor(*entry, "Hippopotamus amphibius", "giraffe cow", "xyz", TAX_ID_CONST(9833)); // no error - giraffe looks up but is not in taxname
22779 
22780  string error_message;
22781 
22783  tval.Init(*entry);
22784 
22785  vector<CRef<COrg_ref>> strain_rq = tval.GetStrainLookupRequest();
22786  BOOST_CHECK_EQUAL(strain_rq.size(), (size_t)9);
22787 
22789 
       // Send the request list in slices of at most chunk_size; each incremental
       // update is expected to return an empty string.
22790  size_t chunk_size = 3;
22791  size_t i = 0;
22792  while (i < strain_rq.size()) {
22793  size_t len = min(chunk_size, strain_rq.size() - i);
22794  vector<CRef<COrg_ref>> tmp_rq(strain_rq.begin() + i, strain_rq.begin() + i + len);
22795  CRef<CTaxon3_reply> tmp_strain_reply = taxon3.SendOrgRefList(tmp_rq);
22796  BOOST_CHECK_EQUAL(tval.IncrementalStrainMapUpdate(tmp_rq, *tmp_strain_reply), kEmptyStr);
22797  i += chunk_size;
22798  }
22799 
22800  BOOST_CHECK_EQUAL(tval.IsStrainMapUpdateComplete(), true);
22801 
22802  // NOTE(review): an earlier comment here read "commented out until TM-725 is resolved", but the calls below are active - confirm and update
22803  TestOneStrain("Hippopotamus amphibius", "giraffe cow", "xyz", TAX_ID_CONST(9833), false); // no error - giraffe looks up but is not in taxname
22804  TestOneStrain("Gorilla gorilla", "abc", "xyz", TAX_ID_CONST(9593), false);
22805  TestOneStrain("Gorilla gorilla", "Aeromonas punctata", "xyz", TAX_ID_CONST(9593), true);
22806 
       // Same taxname and strain, differing only in taxon id: no strain error is
       // expected with an id, one is expected with ZERO_TAX_ID.
22807  TestOneStrain("Gorilla gorilla", "Klebsiella_quasipneumoniae", "xyz", TAX_ID_CONST(9593), false);
22808  TestOneStrain("Gorilla gorilla", "Klebsiella_quasipneumoniae", "xyz", ZERO_TAX_ID, true);
22809 
22810  TestOneStrain("Bacillus sp.", "cereus", "xyz", TAX_ID_CONST(1409), true);
22811 
       // Cases where the "Aeromonas punctata" strain is expected NOT to be reported
       // (different taxname, or "viroid" lineage), plus an unrelated strain string.
22812  TestOneStrain("Ralstonia phage phiRSL1", "Aeromonas punctata", "xyz", TAX_ID_CONST(1980924), false);
22813  TestOneStrain("Gorilla gorilla", "Aeromonas punctata", "viroid", TAX_ID_CONST(9593), false);
22814  TestOneStrain("Acetobacter sp.", "DsW_063", "Bacteria", TAX_ID_CONST(440), false);
22815 }
22816 
22817 
22819 {
       // Single-case strain check: strain "P22" on "Cystobasidium minutum" is expected
       // to produce no StrainContainsTaxInfo error.
       // NOTE(review): the test-case header (listing line 22818) is not visible here.
22820  TestOneStrain("Cystobasidium minutum", "P22", "xyz", TAX_ID_CONST(29899), false);
22821 }
22822 
22824 {
       // Adds a code-break whose location is marked partial at the stop end and expects
       // the validator to report TranslExceptIsPartial.
       // NOTE(review): the test-case header and the lines declaring `cds`, `seh`,
       // `expected_errors`, etc. (listing lines 22823, 22825-22826, 22835) are not visible.
22827 
22828  CRef<CCode_break> codebreak(new CCode_break());
22829  codebreak->SetLoc().SetInt().SetId().SetLocal().SetStr("nuc");
22830  codebreak->SetLoc().SetInt().SetFrom(24);
22831  codebreak->SetLoc().SetInt().SetTo(26);
22832  codebreak->SetLoc().SetPartialStop(true, eExtreme_Positional);
22833  cds->SetData().SetCdregion().SetCode_break().push_back(codebreak);
22834 
22836 
22837  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "TranslExceptIsPartial",
22838  "Translation exception locations should not be partial"));
22839  // AddChromosomeNoLocation(expected_errors, entry);
22840  eval = validator.Validate(seh, options);
22841  CheckErrors(*eval, expected_errors);
22842  CLEAR_ERRORS
22843 }
22844 
22845 
22847 {
       // Gives an exon feature a "number" qualifier containing a space ("group I") and
       // expects the validator to report InvalidNumberQualifier.
       // NOTE(review): the test-case header and the lines declaring `exon`, `seh`,
       // `expected_errors`, etc. (listing lines 22846, 22848-22849, 22853) are not visible.
22850  exon->SetData().SetImp().SetKey("exon");
22851  exon->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("number", "group I")));
22852 
22854 
22855  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidNumberQualifier",
22856  "Number qualifiers should not contain spaces"));
22857  // AddChromosomeNoLocation(expected_errors, entry);
22858  eval = validator.Validate(seh, options);
22859  CheckErrors(*eval, expected_errors);
22860  CLEAR_ERRORS
22861 }
22862 
22863 
22865 {
       // Walks through fuzz settings at the first/last position of a sequence, for both
       // interval and point locations, and checks when InvalidFuzz is (not) reported.
       // NOTE(review): the test-case header and the lines declaring `entry`, `feat`,
       // `scope`, `seh`, `expected_errors`, etc. (listing lines 22864, 22866-22867,
       // 22872) are not visible in this excerpt.
22868  feat->SetLocation().SetInt().SetFrom(0);
22869  feat->SetLocation().SetInt().SetFuzz_from().SetLim(CInt_fuzz::eLim_tl);
22870  feat->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength() - 1);
22871 
22873 
       // Case 1: interval covering the whole sequence, 'tl' fuzz on the start.
22874  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFuzz",
22875  "Should not specify 'space to left' at first position of non-circular sequence"));
22876  // AddChromosomeNoLocation(expected_errors, entry);
22877  eval = validator.Validate(seh, options);
22878  CheckErrors(*eval, expected_errors);
22879 
22880  CLEAR_ERRORS
22881 
       // Case 2: 'tr' fuzz on the start.
22882  scope.RemoveTopLevelSeqEntry(seh);
22883  feat->SetLocation().SetInt().SetFuzz_from().SetLim(CInt_fuzz::eLim_tr);
22884  seh = scope.AddTopLevelSeqEntry(*entry);
22885  // not an error
22886  // AddChromosomeNoLocation(expected_errors, entry);
22887  eval = validator.Validate(seh, options);
22888  CheckErrors(*eval, expected_errors);
22889 
22890  CLEAR_ERRORS
22891 
       // Case 3: no start fuzz, 'tr' fuzz on the end.
22892  scope.RemoveTopLevelSeqEntry(seh);
22893  feat->SetLocation().SetInt().ResetFuzz_from();
22894  feat->SetLocation().SetInt().SetFuzz_to().SetLim(CInt_fuzz::eLim_tr);
22895  seh = scope.AddTopLevelSeqEntry(*entry);
22896  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFuzz",
22897  "Should not specify 'space to right' at last position of non-circular sequence"));
22898  // AddChromosomeNoLocation(expected_errors, entry);
22899  eval = validator.Validate(seh, options);
22900  CheckErrors(*eval, expected_errors);
22901 
22902  CLEAR_ERRORS
22903  //suppress if circular
22904  scope.RemoveTopLevelSeqEntry(seh);
22905  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
22906  seh = scope.AddTopLevelSeqEntry(*entry);
22907  eval = validator.Validate(seh, options);
22908  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CompleteCircleProblem", "Circular topology without complete flag set"));
22909  // AddChromosomeNoLocation(expected_errors, entry);
22910  CheckErrors(*eval, expected_errors);
22911 
       // No CLEAR_ERRORS here: the CompleteCircleProblem expectation above is reused
       // for the point-location check that follows (topology is still circular).
22912  // also suppress for point
22913  scope.RemoveTopLevelSeqEntry(seh);
22914  feat->SetLocation().SetPnt().SetId().Assign(*(entry->GetSeq().GetId().front()));
22915  feat->SetLocation().SetPnt().SetPoint(0);
22916  feat->SetLocation().SetPnt().SetFuzz().SetLim(CInt_fuzz::eLim_tl);
22917  seh = scope.AddTopLevelSeqEntry(*entry);
22918  eval = validator.Validate(seh, options);
22919  CheckErrors(*eval, expected_errors);
22920  CLEAR_ERRORS
22921 
       // Back to the default topology: the 'tl' point at position 0 is reported again.
22922  scope.RemoveTopLevelSeqEntry(seh);
22923  entry->SetSeq().SetInst().ResetTopology();
22924  seh = scope.AddTopLevelSeqEntry(*entry);
22925  eval = validator.Validate(seh, options);
22926  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFuzz",
22927  "Should not specify 'space to left' at first position of non-circular sequence"));
22928  // AddChromosomeNoLocation(expected_errors, entry);
22929  CheckErrors(*eval, expected_errors);
22930 
22931  CLEAR_ERRORS
22932 
       // Point at the last position with 'tr' fuzz.
22933  scope.RemoveTopLevelSeqEntry(seh);
22934  feat->SetLocation().SetPnt().SetPoint(entry->GetSeq().GetInst().GetLength() - 1);
22935  feat->SetLocation().SetPnt().SetFuzz().SetLim(CInt_fuzz::eLim_tr);
22936  seh = scope.AddTopLevelSeqEntry(*entry);
22937  eval = validator.Validate(seh, options);
22938  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFuzz",
22939  "Should not specify 'space to right' at last position of non-circular sequence"));
22940  // AddChromosomeNoLocation(expected_errors, entry);
22941  CheckErrors(*eval, expected_errors);
22942 
22943  CLEAR_ERRORS
22944 }
22945 
22946 
22948 {
       // Points the first delta segment at positions 0-11 of gb|AY123456| and expects
       // FarLocationExcludesFeatures; then switches the id to a RefSeq-style accession
       // and expects that error to be absent.
       // NOTE(review): the test-case header and the lines declaring `entry`, `scope`,
       // `seh`, `expected_errors`, and the user field `f` (listing lines 22947, 22949,
       // 22953, 22955, 22970) are not visible in this excerpt.
22950  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetId().SetGenbank().SetAccession("AY123456");
22951  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetFrom(0);
22952  entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt().SetTo(11);
22954 
22956 
22957  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
22958  "FarLocationExcludesFeatures",
22959  "Scaffold points to some but not all of gb|AY123456|, excluded portion contains features"));
22960  AddChromosomeNoLocation(expected_errors, entry);
22961  eval = validator.Validate(seh, options);
22962  CheckErrors(*eval, expected_errors);
22963  CLEAR_ERRORS
22964 
22965  // suppress error if RefSeq
22966  scope.RemoveTopLevelSeqEntry(seh);
22967  entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_00000001");
       // Attach a DBLink user-object descriptor with BioSample and BioProject fields.
22968  CRef<CSeqdesc> biosample(new CSeqdesc());
22969  biosample->SetUser().SetType().SetStr("DBLink");
22971  f->SetLabel().SetStr("BioSample");
22972  f->SetData().SetStr("SAME0001");
22973  biosample->SetUser().SetData().push_back(f);
22974  CRef<CUser_field> f2(new CUser_field());
22975  f2->SetLabel().SetStr("BioProject");
22976  f2->SetData().SetStrs().push_back("PRJNA12345");
22977  biosample->SetUser().SetData().push_back(f2);
22978  entry->SetSeq().SetDescr().Set().push_back(biosample);
22979 
22980  seh = scope.AddTopLevelSeqEntry(*entry);
22981  AddChromosomeNoLocation(expected_errors, entry);
22982  eval = validator.Validate(seh, options);
22983  CheckErrors(*eval, expected_errors);
22984 
22985  CLEAR_ERRORS
22986 }
22987 
22988 
22990 {
       // Expects two BadPlasmidChromosomeLinkageName errors, for the names '_abc' and '*123'.
       // NOTE(review): the test-case header and the setup that assigns those names
       // (listing lines 22989, 22991-22994, 22996) are not visible in this excerpt.
22995 
22997 
22998  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
22999  "BadPlasmidChromosomeLinkageName",
23000  "Problematic plasmid/chromosome/linkage group name '_abc'"));
23001  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
23002  "BadPlasmidChromosomeLinkageName",
23003  "Problematic plasmid/chromosome/linkage group name '*123'"));
23004  eval = validator.Validate(seh, options);
23005  CheckErrors(*eval, expected_errors);
23006  CLEAR_ERRORS
23007 }
23008 
23009 
23011 {
       // Checks that the host value "Rhesus monkey" is left unchanged by FixSpecificHost,
       // is accepted by IsSpecificHostValid, and yields no validator errors as a nat_host.
       // NOTE(review): the test-case header and the lines declaring `entry`, `seh`,
       // `expected_errors`, etc. (listing lines 23010, 23019, 23022) are not visible.
       // NOTE(review): the local `host` below is not used in the visible lines.
23012  string host = "Rhesus monkey";
23013  string error_msg;
23014 
23015  BOOST_CHECK_EQUAL("Rhesus monkey", FixSpecificHost("Rhesus monkey"));
23016  BOOST_CHECK_EQUAL(true, IsSpecificHostValid("Rhesus monkey", error_msg));
23017 
23018 
23020  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Rhesus monkey");
23021 
23023 
       // No expectations are pushed, so validation must produce no errors.
23024  // AddChromosomeNoLocation(expected_errors, entry);
23025  eval = validator.Validate(seh, options);
23026  CheckErrors(*eval, expected_errors);
23027 
23028  CLEAR_ERRORS
23029 }
23030 
23031 #if 0
23032 // Disabled via #if 0: checks the "Taxname does not match orgname" message for each
       // COrgName::C_Name variant (binomial, virus, hybrid, named hybrid, partial).
       // NOTE(review): listing lines 23035 and 23043 (declaring `entry`, `seh`,
       // `expected_errors`, etc.) are not visible in this excerpt.
23033 BOOST_AUTO_TEST_CASE(Test_VR_723)
23034 {
       // Locate the BioSource among the entry's descriptors (the last one found wins).
23036  CRef<CBioSource> src;
23037  for (auto& it : entry->SetSeq().SetDescr().Set()) {
23038  if (it->IsSource()) {
23039  src.Reset(&(it->SetSource()));
23040  }
23041  }
23042  COrgName::C_Name& orgname = src->SetOrg().SetOrgname().SetName();
23044 
23045  // binomial
23046  orgname.SetBinomial().SetGenus("Sebaea");
23047  orgname.SetBinomial().SetSpecies("microphylla");
23048  eval = validator.Validate(seh, options);
23049  CheckErrors(*eval, expected_errors);
23050 
23051  orgname.SetBinomial().SetGenus("x");
23052  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
23053  "BioSourceInconsistency",
23054  "Taxname does not match orgname ('Sebaea microphylla', 'x microphylla')"));
23055  eval = validator.Validate(seh, options);
23056  CheckErrors(*eval, expected_errors);
23057 
       // The single expected error is reused; only its message text is updated per case.
23058  orgname.SetBinomial().SetSpecies("y");
23059  expected_errors[0]->SetErrMsg("Taxname does not match orgname ('Sebaea microphylla', 'x y')");
23060  eval = validator.Validate(seh, options);
23061  CheckErrors(*eval, expected_errors);
23062 
23063  orgname.SetBinomial().SetSubspecies("z");
23064  expected_errors[0]->SetErrMsg("Taxname does not match orgname ('Sebaea microphylla', 'x y subsp. z')");
23065  eval = validator.Validate(seh, options);
23066  CheckErrors(*eval, expected_errors);
23067 
23068  // virus
23069  orgname.SetVirus("x");
23070  expected_errors[0]->SetErrMsg("Taxname does not match orgname ('Sebaea microphylla', 'x')");
23071  eval = validator.Validate(seh, options);
23072  CheckErrors(*eval, expected_errors);
23073 
23074  CLEAR_ERRORS
23075  orgname.SetVirus("Sebaea microphylla");
23076  eval = validator.Validate(seh, options);
23077  CheckErrors(*eval, expected_errors);
23078 
23079  // hybrid
23080  CRef<COrgName> org1(new COrgName());
23081  org1->SetName().SetBinomial().SetSpecies("z");
23082  org1->SetName().SetBinomial().SetGenus("x");
23083  CRef<COrgName> org2(new COrgName());
23084  org2->SetName().SetBinomial().SetGenus("y");
23085  org2->SetName().SetBinomial().SetSpecies("z");
23086  orgname.SetHybrid().Set().push_back(org1);
23087  orgname.SetHybrid().Set().push_back(org2);
23088  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
23089  "BioSourceInconsistency",
23090  "Taxname does not match orgname ('Sebaea microphylla', 'x z')"));
23091  eval = validator.Validate(seh, options);
23092  CheckErrors(*eval, expected_errors);
23093 
       // Once the second hybrid member matches the taxname, no error is expected.
23094  org2->SetName().SetBinomial().SetGenus("Sebaea");
23095  org2->SetName().SetBinomial().SetSpecies("microphylla");
23096  CLEAR_ERRORS
23097  eval = validator.Validate(seh, options);
23098  CheckErrors(*eval, expected_errors);
23099 
23100  // named hybrid
23101  orgname.SetNamedhybrid().SetGenus("Sebaea");
23102  orgname.SetNamedhybrid().SetSpecies("microphylla");
23103  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
23104  "BioSourceInconsistency",
23105  "Taxname does not match orgname ('Sebaea microphylla', 'Sebaea x microphylla')"));
23106 
23107  eval = validator.Validate(seh, options);
23108  CheckErrors(*eval, expected_errors);
23109 
23110  CLEAR_ERRORS
23111  orgname.SetNamedhybrid().SetGenus("x");
23112  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
23113  "BioSourceInconsistency",
23114  "Taxname does not match orgname ('Sebaea microphylla', 'x x microphylla')"));
23115  eval = validator.Validate(seh, options);
23116  CheckErrors(*eval, expected_errors);
23117 
23118  // partial
23119  CRef<CTaxElement> elem1(new CTaxElement());
23120  elem1->SetFixed_level(CTaxElement::eFixed_level_class);
23121  elem1->SetName("x");
23122  orgname.SetPartial().Set().push_back(elem1);
23123  expected_errors[0]->SetErrMsg("Taxname does not match orgname ('Sebaea microphylla', 'x')");
23124  eval = validator.Validate(seh, options);
23125  CheckErrors(*eval, expected_errors);
23126 
23127  CRef<CTaxElement> elem2(new CTaxElement());
23128  elem2->SetFixed_level(CTaxElement::eFixed_level_family);
23129  elem2->SetName("Sebaea microphylla");
23130  orgname.SetPartial().Set().push_back(elem2);
23131  CLEAR_ERRORS
23132  eval = validator.Validate(seh, options);
23133  CheckErrors(*eval, expected_errors);
23134 }
23135 #endif
23136 
23137 
23139 {
       // Checks the errors reported for Seq-ids that are stripped during ID load:
       // first as the only id on a Bioseq, then as the id used by a feature location.
       // NOTE(review): the test-case header and the line declaring `scope`, `seh`,
       // `expected_errors`, etc. (listing lines 23138, 23144) are not visible here.
23140  CRef<CSeq_entry> entry = BuildGoodSeq();
23141  entry->SetSeq().SetId().front()->SetGeneral().SetDb("NCBIFILE");
23142  entry->SetSeq().SetId().front()->SetGeneral().SetTag().SetStr("x");
23143 
23145 
       // Case 1: the gnl|NCBIFILE|x id is the only id on the Bioseq.
23146  expected_errors.push_back(new CExpectedError("gnl|NCBIFILE|x", eDiag_Critical,
23147  "NoIdOnBioseq",
23148  "The only ids on this Bioseq will be stripped during ID load"));
23149  // AddChromosomeNoLocation(expected_errors, entry);
23150  eval = validator.Validate(seh, options);
23151  CheckErrors(*eval, expected_errors);
23152  CLEAR_ERRORS
23153 
       // Case 2: after adding a local id, no error is expected.
23154  scope.RemoveTopLevelSeqEntry(seh);
23155  CRef<CSeq_id> other_id(new CSeq_id());
23156  other_id->SetLocal().SetStr("x");
23157  entry->SetSeq().SetId().push_back(other_id);
23158  seh = scope.AddTopLevelSeqEntry(*entry);
23159  // AddChromosomeNoLocation(expected_errors, entry);
23160  eval = validator.Validate(seh, options);
23161  CheckErrors(*eval, expected_errors);
23162 
23163  CLEAR_ERRORS
23164 
       // Case 3: a misc feature located on an added gnl|BankIt|x id.
23165  scope.RemoveTopLevelSeqEntry(seh);
23166  CRef<CSeq_id> bankit(new CSeq_id());
23167  bankit->SetGeneral().SetDb("BankIt");
23168  bankit->SetGeneral().SetTag().SetStr("x");
23169  entry->SetSeq().SetId().push_back(bankit);
23170  CRef<CSeq_feat> misc = AddMiscFeature(entry);
23171  misc->SetLocation().SetInt().SetId().Assign(*bankit);
23172  seh = scope.AddTopLevelSeqEntry(*entry);
23173 
23174  expected_errors.push_back(new CExpectedError("lcl|x", eDiag_Critical,
23175  "BadSeqIdFormat",
23176  "Feature locations should not use Seq-ids that will be stripped during ID load"));
23177  // AddChromosomeNoLocation(expected_errors, entry);
23178 
23179  eval = validator.Validate(seh, options);
23180  CheckErrors(*eval, expected_errors);
23181  CLEAR_ERRORS
23182 }
23183 
23184 
23186 {
       // A location on eNa_strand_both is accepted for a misc feature but reported as
       // BothStrands once the same feature is turned into an exon.
       // NOTE(review): the test-case header and the line declaring `scope`, `seh`,
       // `expected_errors`, etc. (listing lines 23185, 23191) are not visible here.
23187  CRef<CSeq_entry> entry = BuildGoodSeq();
23188  CRef<CSeq_feat> f = AddMiscFeature(entry);
23189  f->SetLocation().SetInt().SetStrand(eNa_strand_both);
23190 
23192 
23193  // expect no errors for misc_feat
23194  // AddChromosomeNoLocation(expected_errors, entry);
23195  eval = validator.Validate(seh, options);
23196  CheckErrors(*eval, expected_errors);
23197  CLEAR_ERRORS
23198 
23199  scope.RemoveTopLevelSeqEntry(seh);
23200  f->SetData().SetImp().SetKey("exon");
23201 
23202  seh = scope.AddTopLevelSeqEntry(*entry);
23203  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
23204  "BothStrands",
23205  "exon may not be on both (forward) strands"));
23206  // AddChromosomeNoLocation(expected_errors, entry);
23207  eval = validator.Validate(seh, options);
23208  CheckErrors(*eval, expected_errors);
23209  CLEAR_ERRORS
23210 }
23211 
23212 
23213 
23214 void TestOnePlasmid(const string& plasmid_name, bool expect_error)
23215 {
       // Validates an entry for the given plasmid name and expects a
       // BadPlasmidChromosomeLinkageName error iff expect_error is true.
       // NOTE(review): the setup lines that build the entry and apply plasmid_name
       // (listing lines 23216-23219) are not visible in this excerpt.
23220 
23221  if (expect_error) {
23222  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadPlasmidChromosomeLinkageName",
23223  "Problematic plasmid/chromosome/linkage group name '" + plasmid_name + "'"));
23224  }
23225  // AddChromosomeNoLocation(expected_errors, entry);
23226  eval = validator.Validate(seh, options);
23227  CheckErrors(*eval, expected_errors);
23228  CLEAR_ERRORS
23229 }
23230 
23231 
23233 {
       // Runs TestOnePlasmid over names expected to be rejected and names expected to pass.
       // NOTE(review): the test-case header (listing line 23232) is not visible here.
23234  TestOnePlasmid("plasmid", true);
23235  TestOnePlasmid("Sebaea microphylla", true);
23236 
23237  // these values are ok
23238  TestOnePlasmid("megaplasmid", false);
23239  TestOnePlasmid("2micron", false);
23240  TestOnePlasmid("psomething", false);
23241  TestOnePlasmid("unnamed", false);
23242  TestOnePlasmid("unnamed2", false);
23243  TestOnePlasmid("unnamed234", false);
23244 }
23245 
23246 
23248 {
       // IsLikelyTaxname is expected to accept the two binomial-style names and reject
       // the common name.
       // NOTE(review): the test-case header (listing line 23247) is not visible here.
23249  BOOST_CHECK_EQUAL(IsLikelyTaxname("Convolvulus sindicus"), true);
23250  BOOST_CHECK_EQUAL(IsLikelyTaxname("Lasiurus scindicus"), true);
23251  BOOST_CHECK_EQUAL(IsLikelyTaxname("Atlantic white-sided dolphin"), false);
23252 }
23253 
23254 
23255 BOOST_AUTO_TEST_CASE(Test_TripletEncodesStopCodon)
23256 {
       // Builds a four-interval CDS whose intervals are separated by three-base gaps and
       // checks both GetNonsenseIntrons() and the full validator output.
       // NOTE(review): the lines declaring `nuc`, `cds`, `scope`, `seh`,
       // `expected_errors`, etc. (listing lines 23257-23259, 23273) are not visible.
23260 
23261  nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGATAAACAGAGATATAATAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
23262  CRef<CSeq_id> id = nuc->SetSeq().SetId().front();
23263  // first two "introns" are stop codons, third is not
       // The gaps between the intervals below are positions 9-11, 21-23 and 45-47.
23264  CRef<CSeq_loc> int1(new CSeq_loc(*id, 0, 8));
23265  CRef<CSeq_loc> int2(new CSeq_loc(*id, 12, 20));
23266  CRef<CSeq_loc> int3(new CSeq_loc(*id, 24, 44));
23267  CRef<CSeq_loc> int4(new CSeq_loc(*id, 48, 59));
23268  cds->SetLocation().SetMix().Set().push_back(int1);
23269  cds->SetLocation().SetMix().Set().push_back(int2);
23270  cds->SetLocation().SetMix().Set().push_back(int3);
23271  cds->SetLocation().SetMix().Set().push_back(int4);
23272 
23274 
       // Direct check: exactly the first two gaps are returned, in order.
23275  vector<CRef<CSeq_loc>> nonsense = CCDSTranslationProblems::GetNonsenseIntrons(*cds, scope);
23276  BOOST_CHECK_EQUAL(nonsense.size(), (size_t)2);
23277  BOOST_CHECK_EQUAL(nonsense.front()->GetInt().GetFrom(), (size_t)9);
23278  BOOST_CHECK_EQUAL(nonsense.front()->GetInt().GetTo(), (size_t)11);
23279  BOOST_CHECK_EQUAL(nonsense.back()->GetInt().GetFrom(), (size_t)21);
23280  BOOST_CHECK_EQUAL(nonsense.back()->GetInt().GetTo(), (size_t)23);
23281 
       // Validator check: two IntronIsStopCodon errors plus the other errors this
       // artificial CDS is expected to trigger.
23282  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "IntronIsStopCodon",
23283  "Triplet intron encodes stop codon"));
23284  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "IntronIsStopCodon",
23285  "Triplet intron encodes stop codon"));
23286  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "ShortExon", "Internal coding region exon is too short at position 13-21"));
23287  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "InternalStop", "2 internal stops. Genetic code [0]"));
23288  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NoStop", "Missing stop codon"));
23289  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "TransLen", "Given protein length [8] does not match translation length [17]"));
23290  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor", "Splice donor consensus (GT) not found after exon ending at position 9 of lcl|nuc"));
23291  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor", "Splice donor consensus (GT) not found after exon ending at position 21 of lcl|nuc"));
23292  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor", "Splice donor consensus (GT) not found after exon ending at position 45 of lcl|nuc"));
23293  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusAcceptor", "Splice acceptor consensus (AG) not found before exon starting at position 13 of lcl|nuc"));
23294  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusAcceptor", "Splice acceptor consensus (AG) not found before exon starting at position 25 of lcl|nuc"));
23295  // AddChromosomeNoLocation(expected_errors, entry);
23296  eval = validator.Validate(seh, options);
23297  CheckErrors(*eval, expected_errors);
23298  CLEAR_ERRORS
23299 }
23300 
23302 {
       // Builds a delta-sequence protein with a gap segment in the middle and expects
       // the validator to report ProteinShouldNotHaveGaps.
       // NOTE(review): the test-case header, the lines that fill `mpdesc`, and the line
       // declaring `seh`, `expected_errors`, etc. (listing lines 23301, 23320-23321,
       // 23337) are not visible in this excerpt.
23303  // make protein
23304  CRef<CBioseq> pseq(new CBioseq());
23305  pseq->SetInst().SetMol(CSeq_inst::eMol_aa);
23306  pseq->SetInst().SetRepr(CSeq_inst::eRepr_delta);
       // Segments: "MPRK" (4) + gap (10) + "TEIN" (4) = total length 18.
23307  pseq->SetInst().SetExt().SetDelta().AddLiteral("MPRK", CSeq_inst::eMol_aa);
23308  CRef<CDelta_seq> gap_seg(new CDelta_seq());
23309  gap_seg->SetLiteral().SetSeq_data().SetGap();
23310  gap_seg->SetLiteral().SetLength(10);
23311  pseq->SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
23312  pseq->SetInst().SetExt().SetDelta().AddLiteral("TEIN", CSeq_inst::eMol_aa);
23313  pseq->SetInst().SetLength(18);
23314 
23315  CRef<CSeq_id> pid(new CSeq_id());
23316  pid->SetLocal().SetStr("prot");
23317  pseq->SetId().push_back(pid);
23318 
23319  CRef<CSeqdesc> mpdesc(new CSeqdesc());
23322  pseq->SetDescr().Set().push_back(mpdesc);
23323 
23324  CRef<CSeq_entry> entry(new CSeq_entry());
23325  entry->SetSeq(*pseq);
23326 
23327  AddGoodSource(entry);
23328  AddGoodPub(entry);
23329 
       // Protein feature spanning the whole sequence (positions 0-17).
23330  CRef<CSeq_feat> feat(new CSeq_feat());
23331  feat->SetData().SetProt().SetName().push_back("fake protein name");
23332  feat->SetLocation().SetInt().SetId().SetLocal().SetStr("prot");
23333  feat->SetLocation().SetInt().SetFrom(0);
23334  feat->SetLocation().SetInt().SetTo(17);
23335  AddFeat(feat, entry);
23336 
23338 
23339  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "ProteinShouldNotHaveGaps", "Protein sequences should not have gaps"));
23340  // AddChromosomeNoLocation(expected_errors, entry);
23341  eval = validator.Validate(seh, options);
23342  CheckErrors(*eval, expected_errors);
23343 
23344  CLEAR_ERRORS
23345 }
23346 
23347 
23348 void CheckLocalId(const string& id, const string& badchar)
23349 {
       // Sets the local id of a good sequence to `id` and expects a BadSeqIdCharacter
       // warning naming `badchar` as the offending character.
       // NOTE(review): the line declaring `seh`, `expected_errors`, etc. (listing line
       // 23352) is not visible in this excerpt.
23350  CRef<CSeq_entry> entry = BuildGoodSeq();
23351  entry->SetSeq().SetId().front()->SetLocal().SetStr(id);
23353 
23354  expected_errors.push_back(new CExpectedError("lcl|" + id, eDiag_Warning, "BadSeqIdCharacter",
23355  "Bad character '" + badchar + "' in local ID '" + id + "'"));
23356  // AddChromosomeNoLocation(expected_errors, entry);
23357  eval = validator.Validate(seh, options);
23358  CheckErrors(*eval, expected_errors);
23359 
23360  CLEAR_ERRORS
23361 }
23362 
23363 
23365 {
       // A '|' inside a local id is expected to be reported as a bad character.
       // NOTE(review): the test-case header (listing line 23364) is not visible here.
23366  CheckLocalId("abc|def", "|");
23367 }
23368 
23369 
23370 BOOST_AUTO_TEST_CASE(Test_IsDateInPast)
23371 {
       // Checks IsDateInPast() at day, month and year granularity relative to a
       // reference date.
23372  CRef<CDate> date(new CDate());
       // A default-constructed CDate is expected not to count as "in the past".
23373  BOOST_CHECK_EQUAL(IsDateInPast(*date), false);
23374 
       // NOTE(review): listing line 23375 is not visible in this excerpt; presumably it
       // sets `date` to the current date, since the variables below are named curr_* — confirm.
23376  BOOST_CHECK_EQUAL(IsDateInPast(*date), false);
23377  auto curr_day = date->GetStd().GetDay();
       // The guards keep the shifted day within 1..28.
23378  if (curr_day < 28) {
23379  date->SetStd().SetDay(curr_day + 1);
23380  BOOST_CHECK_EQUAL(IsDateInPast(*date), false);
23381  }
23382  if (curr_day > 1) {
23383  date->SetStd().SetDay(curr_day - 1);
23384  BOOST_CHECK_EQUAL(IsDateInPast(*date), true);
23385  }
       // With the day removed, the date is compared at month granularity.
23386  date->SetStd().ResetDay();
23387  BOOST_CHECK_EQUAL(IsDateInPast(*date), false);
23388 
       // NOTE(review): these guards (`< 11`, `!= 0`) look written for a 0-based month;
       // if GetMonth() is 1-based, a January date would be set to month 0 below — confirm.
23389  auto curr_month = date->GetStd().GetMonth();
23390  if (curr_month < 11) {
23391  date->SetStd().SetMonth(curr_month + 1);
23392  BOOST_CHECK_EQUAL(IsDateInPast(*date), false);
23393  }
23394  if (curr_month != 0) {
23395  date->SetStd().SetMonth(curr_month - 1);
23396  BOOST_CHECK_EQUAL(IsDateInPast(*date), true);
23397  }
       // With the month removed as well, only the year remains.
23398  date->SetStd().ResetMonth();
23399  BOOST_CHECK_EQUAL(IsDateInPast(*date), false);
23400 
23401  auto curr_year = date->GetStd().GetYear();
23402  date->SetStd().SetYear(curr_year + 1);
23403  BOOST_CHECK_EQUAL(IsDateInPast(*date), false);
23404  date->SetStd().SetYear(curr_year - 1);
23405  BOOST_CHECK_EQUAL(IsDateInPast(*date), true);
23406 }
23407 
23408 
23409 void AddYear(CDate& add_date)
23410 {
23411  CTime t(add_date.GetStd().GetYear(), add_date.GetStd().GetMonth(), add_date.GetStd().GetDay());
23412  t.AddYear();
23413  CDate new_date(t);
23414  add_date.Assign(new_date);
23415 }
23416 
23417 
23418 void AddMonth(CDate& add_date)
23419 {
23420  CTime t(add_date.GetStd().GetYear(), add_date.GetStd().GetMonth(), add_date.GetStd().GetDay());
23421  t.AddMonth();
23422  CDate new_date(t);
23423  add_date.Assign(new_date);
23424 }
23425 
23426 
23427 void AddDay(CDate& add_date)
23428 {
23429  CTime t(add_date.GetStd().GetYear(), add_date.GetStd().GetMonth(), add_date.GetStd().GetDay());
23430  t.AddDay();
23431  CDate new_date(t);
23432  add_date.Assign(new_date);
23433 }
23434 
23435 
23437 {
       // Sets the submission citation date one year, one month and one day
       // after today and expects the "Submission citation date is in the
       // future" BadDate warning each time; with today's date no expected
       // error is registered.
       // NOTE(review): the declarations of `entry`, `seh` and `eval` are not
       // among the visible lines — confirm where they come from.
23439 
23440  // find sub pub and other pub
23441  CRef<CPub> subpub;
23442  for (auto& it : entry->SetSeq().SetDescr().Set()) {
23443  if (it->IsPub()) {
23444  if (it->GetPub().GetPub().Get().front()->IsSub()) {
23445  subpub = it->SetPub().SetPub().Set().front();
23446  }
23447  }
23448  }
23449 
23451 
23452  time_t time_now = time(NULL);
23453  CDate today(time_now);
23454  CDate future(time_now);
23455 
       // today + 1 year
23456  AddYear(future);
23457  subpub->SetSub().SetDate().Assign(future);
23458  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDate",
23459  "Submission citation date is in the future"));
23460  // AddChromosomeNoLocation(expected_errors, entry);
23461  eval = validator.Validate(seh, options);
23462  CheckErrors(*eval, expected_errors);
23463 
       // today + 1 month: same expected warning
23464  future.Assign(today);
23465  AddMonth(future);
23466  subpub->SetSub().SetDate().Assign(future);
23467  eval = validator.Validate(seh, options);
23468  CheckErrors(*eval, expected_errors);
23469 
       // today + 1 day: same expected warning
23470  future.Assign(today);
23471  AddDay(future);
23472  subpub->SetSub().SetDate().Assign(future);
23473  eval = validator.Validate(seh, options);
23474  CheckErrors(*eval, expected_errors);
23475 
23476  CLEAR_ERRORS
23477 
       // today's date: nothing is added to expected_errors before the check
23478  subpub->SetSub().SetDate().Assign(today);
23479  eval = validator.Validate(seh, options);
23480  // AddChromosomeNoLocation(expected_errors, entry);
23481  CheckErrors(*eval, expected_errors);
23482 
23483  CLEAR_ERRORS
23484 }
23485 
23486 
// Checks the InconsistentPseudogeneValue warnings for "pseudogene" qualifiers
// on a CDS, an mRNA and a gene: no error while the gene carries the qualifier,
// a "gene does not" warning when only the CDS and/or mRNA carry it, and a
// "Different pseudogene values" warning when the values disagree.
// NOTE(review): the lines creating `entry`, `cds`, `mrna`, `gene` and the
// validation setup are not among the visible lines — confirm.
23487 BOOST_AUTO_TEST_CASE(Test_InconsistentPseudogeneValue)
23488 {
23490 
23492  cds->SetData().SetCdregion();
23493  cds->ResetComment();
23494  cds->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("pseudogene", "unitary")));
23495 
23497  mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
23498  mrna->ResetComment();
23499  mrna->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("pseudogene", "unitary")));
23500 
23502  gene->SetData().SetGene().SetLocus("x");
23503  gene->ResetComment();
23504  gene->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("pseudogene", "unitary")));
23505 
23506  // no errors, all features have matching pseudogene values
23508 
23509  eval = validator.Validate(seh, options);
23510  // AddChromosomeNoLocation(expected_errors, entry);
23511  CheckErrors(*eval, expected_errors);
23512 
23513  // no errors if cds has no pseudogene but mrna and gene do
23514  cds->ResetQual();
23515  eval = validator.Validate(seh, options);
23516  CheckErrors(*eval, expected_errors);
23517 
23518  // no errors if mrna and cds have no pseudogene but gene does
23519  mrna->ResetQual();
23520  eval = validator.Validate(seh, options);
23521  CheckErrors(*eval, expected_errors);
23522 
23523  // no errors if mrna has no pseudogene but cds and gene do
23524  cds->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("pseudogene", "unitary")));
23525  eval = validator.Validate(seh, options);
23526  CheckErrors(*eval, expected_errors);
23527 
23528  // error if cds has pseudogene but gene does not
23529  gene->ResetQual();
23530  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23531  "InconsistentPseudogeneValue",
23532  "CDS has pseudogene qualifier, gene does not"));
23533  eval = validator.Validate(seh, options);
23534  CheckErrors(*eval, expected_errors);
23535 
23536  // also error if mRNA has pseudogene but gene does not
23537  mrna->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("pseudogene", "unitary")));
23538  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23539  "InconsistentPseudogeneValue",
23540  "mRNA has pseudogene qualifier, gene does not"));
23541  eval = validator.Validate(seh, options);
23542  CheckErrors(*eval, expected_errors);
23543 
23544  CLEAR_ERRORS
23545 
23546  // different errors when pseudogene values conflict
       // gene = allelic, CDS = unitary (unchanged), mRNA = processed
23547  gene->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("pseudogene", "allelic")));
23548  mrna->SetQual().front()->SetVal("processed");
23549  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23550  "InconsistentPseudogeneValue",
23551  "Different pseudogene values on CDS (unitary) and gene (allelic)"));
23552  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23553  "InconsistentPseudogeneValue",
23554  "Different pseudogene values on mRNA (processed) and gene (allelic)"));
23555  // AddChromosomeNoLocation(expected_errors, entry);
23556 
23557  eval = validator.Validate(seh, options);
23558  CheckErrors(*eval, expected_errors);
23559 
23560  CLEAR_ERRORS
23561 }
23562 
23563 
// Checks the InvalidPseudoQualifier warning on a gene's "pseudogene"
// qualifier: an empty value (which is also expected to raise
// InvalidPunctuation) and the value "abc".
// NOTE(review): the lines creating `entry`, `gene` and the validation setup
// are not among the visible lines — confirm.
23564 BOOST_AUTO_TEST_CASE(Test_InvalidPseudoQualifier)
23565 {
23567 
23569  gene->SetData().SetGene().SetLocus("x");
23570  gene->ResetComment();
23571  gene->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("pseudogene", "")));
23572 
23574 
       // empty qualifier value: two warnings expected
23575  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23576  "InvalidPseudoQualifier",
23577  "/pseudogene value should not be empty"));
23578  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23579  "InvalidPunctuation",
23580  "Qualifier other than replace has just quotation marks"));
23581  // AddChromosomeNoLocation(expected_errors, entry);
23582  eval = validator.Validate(seh, options);
23583  CheckErrors(*eval, expected_errors);
23584 
23585  CLEAR_ERRORS
23586 
       // value "abc": the warning quotes the offending value
23587  gene->SetQual().front()->SetVal("abc");
23588  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23589  "InvalidPseudoQualifier",
23590  "/pseudogene value should not be 'abc'"));
23591  // AddChromosomeNoLocation(expected_errors, entry);
23592  eval = validator.Validate(seh, options);
23593  CheckErrors(*eval, expected_errors);
23594 
23595  CLEAR_ERRORS
23596 }
23597 
23598 
// Checks the InvalidRptUnitRange warning on a repeat_region feature: the
// rpt_unit_range values "x" and "a..b" are expected to be reported, while
// "1..5" is validated with no expected error registered.
// NOTE(review): the lines creating `entry`, `rpt` and the validation setup are
// not among the visible lines — confirm.
23599 BOOST_AUTO_TEST_CASE(Test_InvalidRptUnitRange)
23600 {
23602 
23604  rpt->SetData().SetImp().SetKey("repeat_region");
23605  rpt->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("rpt_unit_range", "x")));
23606 
23608 
23609  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23610  "InvalidRptUnitRange",
23611  "/rpt_unit_range is not a base range"));
23612  // AddChromosomeNoLocation(expected_errors, entry);
23613  eval = validator.Validate(seh, options);
23614  CheckErrors(*eval, expected_errors);
23615 
       // range syntax with non-numeric endpoints: same expected warning
23616  rpt->SetQual().front()->SetVal("a..b");
23617  eval = validator.Validate(seh, options);
23618  CheckErrors(*eval, expected_errors);
23619 
23620  CLEAR_ERRORS
23621 
       // numeric range: no expected error is added before the check
23622  rpt->SetQual().front()->SetVal("1..5");
23623  // AddChromosomeNoLocation(expected_errors, entry);
23624  eval = validator.Validate(seh, options);
23625  CheckErrors(*eval, expected_errors);
23626 
23627  CLEAR_ERRORS
23628 }
23629 
23630 
// Checks the InvalidRptUnitSeqCharacters warning on a repeat_region feature:
// the rpt_unit_seq value "x..y" is expected to be reported, while "(atgc)" is
// validated with no expected error registered.
// NOTE(review): the lines creating `entry`, `rpt` and the validation setup are
// not among the visible lines — confirm.
23631 BOOST_AUTO_TEST_CASE(Test_InvalidRptUnitSeqCharacters)
23632 {
23634 
23636  rpt->SetData().SetImp().SetKey("repeat_region");
23637  rpt->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("rpt_unit_seq", "x..y")));
23638 
23640 
23641  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23642  "InvalidRptUnitSeqCharacters",
23643  "/rpt_unit_seq has illegal characters"));
23644  // AddChromosomeNoLocation(expected_errors, entry);
23645  eval = validator.Validate(seh, options);
23646  CheckErrors(*eval, expected_errors);
23647 
23648  CLEAR_ERRORS
23649 
       // "(atgc)": no expected error is added before the check
23650  rpt->SetQual().front()->SetVal("(atgc)");
23651  // AddChromosomeNoLocation(expected_errors, entry);
23652  eval = validator.Validate(seh, options);
23653  CheckErrors(*eval, expected_errors);
23654 
23655  CLEAR_ERRORS
23656 }
23657 
23658 
// Checks the MismatchedAllele warning: an rRNA feature carries the qualifier
// allele "x" while the gene feature `gene1` has allele "y".
// NOTE(review): the lines creating `entry`, `rna`, `gene1` and the validation
// setup are not among the visible lines — confirm.
23659 BOOST_AUTO_TEST_CASE(Test_MismatchedAllele)
23660 {
23662 
23664  rna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
23665  rna->SetData().SetRna().SetExt().SetName("16S ribosomal RNA");
23666  rna->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("allele", "x")));
23668  unit_test_util::AddFeat(gene1, entry);
23669  gene1->SetData().SetGene().SetAllele("y");
23670 
23672 
23673  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23674  "MismatchedAllele",
23675  "Mismatched allele qualifier on gene (y) and feature (x)"));
23676  // AddChromosomeNoLocation(expected_errors, entry);
23677  eval = validator.Validate(seh, options);
23678  CheckErrors(*eval, expected_errors);
23679 
23680  CLEAR_ERRORS
23681 }
23682 
23683 
// Checks the InvalidAlleleDuplicates warning: an rRNA feature carries the
// qualifier allele "x" and the gene feature `gene1` has the same allele "x".
// NOTE(review): the lines creating `entry`, `rna`, `gene1` and the validation
// setup are not among the visible lines — confirm.
23684 BOOST_AUTO_TEST_CASE(Test_InvalidAlleleDuplicates)
23685 {
23687 
23689  rna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
23690  rna->SetData().SetRna().SetExt().SetName("16S ribosomal RNA");
23691  rna->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("allele", "x")));
23693  unit_test_util::AddFeat(gene1, entry);
23694  gene1->SetData().SetGene().SetAllele("x");
23695 
23697 
23698  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23699  "InvalidAlleleDuplicates",
23700  "Redundant allele qualifier (x) on gene and feature"));
23701  // AddChromosomeNoLocation(expected_errors, entry);
23702  eval = validator.Validate(seh, options);
23703  CheckErrors(*eval, expected_errors);
23704 
23705  CLEAR_ERRORS
23706 }
23707 
23708 
// Checks the InvalidOperonMatchesGene warning: an operon feature with operon
// qualifier "x" and a gene feature with locus "x".
// NOTE(review): the lines creating `entry`, `operon`, `gene` and the validation
// setup are not among the visible lines — confirm.
23709 BOOST_AUTO_TEST_CASE(Test_InvalidOperonMatchesGene)
23710 {
23713  operon->SetData().SetImp().SetKey("operon");
23714  operon->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("operon", "x")));
23715 
23717  unit_test_util::AddFeat(gene, entry);
23718  gene->SetData().SetGene().SetLocus("x");
23719 
23721 
23722  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23723  "InvalidOperonMatchesGene",
23724  "Operon is same as gene - x"));
23725  // AddChromosomeNoLocation(expected_errors, entry);
23726  eval = validator.Validate(seh, options);
23727  CheckErrors(*eval, expected_errors);
23728 
23729  CLEAR_ERRORS
23730 }
23731 
23732 
// Checks the InvalidCompareRefSeqAccession error: a variation feature with
// compare qualifier "NC_000001.1" on a sequence that also carries the
// Seq-id "AY123456.1" (errors are expected on "gb|AY123456.1|").
// NOTE(review): the lines creating `entry`, `var` and the validation setup are
// not among the visible lines — confirm.
23733 BOOST_AUTO_TEST_CASE(Test_InvalidCompareRefSeqAccession)
23734 {
23736  entry->SetSeq().SetId().push_back(CRef<CSeq_id>(new CSeq_id("AY123456.1")));
23738  var->SetData().SetImp().SetKey("variation");
23739  var->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("compare", "NC_000001.1")));
23740 
23742 
23743  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error,
23744  "InvalidCompareRefSeqAccession",
23745  "RefSeq accession NC_000001.1 cannot be used for qualifier compare"));
23746  // AddChromosomeNoLocation(expected_errors, "gb|AY123456.1|");
23747  eval = validator.Validate(seh, options);
23748  CheckErrors(*eval, expected_errors);
23749 
23750  CLEAR_ERRORS
23751 }
23752 
23753 
// Checks the InvalidCompareMissingVersion error: a variation feature with
// compare qualifier "NC_000001" (no version suffix) on a sequence that also
// carries the Seq-id "AY123456.1".
// NOTE(review): the lines creating `entry`, `var` and the validation setup are
// not among the visible lines — confirm.
23754 BOOST_AUTO_TEST_CASE(Test_InvalidCompareMissingVersion)
23755 {
23757  entry->SetSeq().SetId().push_back(CRef<CSeq_id>(new CSeq_id("AY123456.1")));
23759  var->SetData().SetImp().SetKey("variation");
23760  var->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("compare", "NC_000001")));
23761 
23763 
23764  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error,
23765  "InvalidCompareMissingVersion",
23766  "NC_000001 accession missing version for qualifier compare"));
23767  // AddChromosomeNoLocation(expected_errors, "gb|AY123456.1|");
23768  eval = validator.Validate(seh, options);
23769  CheckErrors(*eval, expected_errors);
23770 
23771  CLEAR_ERRORS
23772 }
23773 
23774 
// Checks the InvalidCompareBadAccession error: a variation feature with
// compare qualifier "x_y" on a sequence that also carries the Seq-id
// "AY123456.1".
// NOTE(review): the lines creating `entry`, `var` and the validation setup are
// not among the visible lines — confirm.
23775 BOOST_AUTO_TEST_CASE(Test_InvalidCompareBadAccession)
23776 {
23778  entry->SetSeq().SetId().push_back(CRef<CSeq_id>(new CSeq_id("AY123456.1")));
23780  var->SetData().SetImp().SetKey("variation");
23781  var->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("compare", "x_y")));
23782 
23784 
23785  expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error,
23786  "InvalidCompareBadAccession",
23787  "x_y is not a legal accession for qualifier compare"));
23788  // AddChromosomeNoLocation(expected_errors, "gb|AY123456.1|");
23789  eval = validator.Validate(seh, options);
23790  CheckErrors(*eval, expected_errors);
23791 
23792  CLEAR_ERRORS
23793 }
23794 
23795 
// Checks the RegulatoryClassOtherNeedsNote error: a regulatory feature with
// regulatory_class "other" is validated first as built (no expected error)
// and again after its comment is reset (error expected).
// NOTE(review): the lines creating `entry`, `reg` and the validation setup are
// not among the visible lines — confirm.
23796 BOOST_AUTO_TEST_CASE(Test_RegulatoryClassOtherNeedsNote)
23797 {
23800  reg->SetData().SetImp().SetKey("regulatory");
23801  CRef<CGb_qual> qual(new CGb_qual("regulatory_class", "other"));
23802  reg->SetQual().push_back(qual);
23803 
23805 
23806  // first check ok because the regulatory feature still has a comment
23807  // AddChromosomeNoLocation(expected_errors, entry);
23808  eval = validator.Validate(seh, options);
23809  CheckErrors(*eval, expected_errors);
23810 
23811  // error because 'other' and no comment
23812  reg->ResetComment();
23813  eval = validator.Validate(seh, options);
23814  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
23815  "RegulatoryClassOtherNeedsNote",
23816  "The regulatory_class 'other' is missing the required /note"));
23817  CheckErrors(*eval, expected_errors);
23818 
23819  CLEAR_ERRORS
23820 }
23821 
23822 
// Checks the UnparsedtRNAAnticodon error: a tRNA feature (amino acid 'A')
// carrying a GenBank qualifier "anticodon" with value "other".
// NOTE(review): the lines creating `entry`, `trna` and the validation setup
// are not among the visible lines — confirm.
23823 BOOST_AUTO_TEST_CASE(Test_UnparsedtRNAAnticodon)
23824 {
23827  trna->SetData().SetRna().SetType(CRNA_ref::eType_tRNA);
23828  trna->SetData().SetRna().SetExt().SetTRNA().SetAa().SetNcbieaa('A');
23829  trna->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("anticodon", "other")));
23830 
23832 
23833  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
23834  "UnparsedtRNAAnticodon",
23835  "Unparsed anticodon qualifier in tRNA"));
23836  // AddChromosomeNoLocation(expected_errors, entry);
23837  eval = validator.Validate(seh, options);
23838  CheckErrors(*eval, expected_errors);
23839 
23840  CLEAR_ERRORS
23841 }
23842 
23843 
// Checks the UnparsedtRNAProduct error: a tRNA feature (amino acid 'A')
// carrying a GenBank qualifier "product" with value "other".
// NOTE(review): the lines creating `entry`, `trna` and the validation setup
// are not among the visible lines — confirm.
23844 BOOST_AUTO_TEST_CASE(Test_UnparsedtRNAProduct)
23845 {
23848  trna->SetData().SetRna().SetType(CRNA_ref::eType_tRNA);
23849  trna->SetData().SetRna().SetExt().SetTRNA().SetAa().SetNcbieaa('A');
23850  trna->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("product", "other")));
23851 
23853 
23854  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
23855  "UnparsedtRNAProduct",
23856  "Unparsed product qualifier in tRNA"));
23857  // AddChromosomeNoLocation(expected_errors, entry);
23858  eval = validator.Validate(seh, options);
23859  CheckErrors(*eval, expected_errors);
23860 
23861  CLEAR_ERRORS
23862 }
23863 
23864 
// Checks the rRNADoesNotHaveProduct warning: a feature whose RNA type is set
// to rRNA, with no name set in the visible lines, is expected to be reported
// as "rRNA has no name".
// NOTE(review): the lines creating `entry`, `rrna` and the validation setup
// are not among the visible lines — confirm.
23865 BOOST_AUTO_TEST_CASE(Test_rRNADoesNotHaveProduct)
23866 {
23869  rrna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
23870 
23872 
23873  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23874  "rRNADoesNotHaveProduct",
23875  "rRNA has no name"));
23876  // AddChromosomeNoLocation(expected_errors, entry);
23877  eval = validator.Validate(seh, options);
23878  CheckErrors(*eval, expected_errors);
23879 
23880  CLEAR_ERRORS
23881 }
23882 
23883 
// Checks the MobileElementInvalidQualifier warning on a repeat_region
// feature: mobile_element "foo" is expected to be reported, while "integron"
// is validated with no expected error registered.
// NOTE(review): the lines creating `entry`, `misc` and the validation setup
// are not among the visible lines — confirm.
23884 BOOST_AUTO_TEST_CASE(Test_MobileElementInvalidQualifier)
23885 {
23888  misc->SetData().SetImp().SetKey("repeat_region");
23889  misc->AddQualifier("mobile_element", "foo");
23890 
23892 
23893  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MobileElementInvalidQualifier",
23894  "foo is not a legal value for qualifier mobile_element"));
23895  // AddChromosomeNoLocation(expected_errors, entry);
23896  eval = validator.Validate(seh, options);
23897  CheckErrors(*eval, expected_errors);
23898 
23899  CLEAR_ERRORS
23900 
       // "integron": no expected error is added before the check
23901  misc->SetQual().front()->SetVal("integron");
23902  // AddChromosomeNoLocation(expected_errors, entry);
23903  eval = validator.Validate(seh, options);
23904  CheckErrors(*eval, expected_errors);
23905 
23906  CLEAR_ERRORS
23907 }
23908 
23909 
// Checks the InvalidReplace error for the "replace" qualifier of a
// misc_difference feature: "123" is expected to be reported with the
// nucleotide wording, "aaccttgg" is validated with no expected error, and
// "123" on a feature added to `prot` is expected to be reported on
// "lcl|prot" with the amino-acid wording.
// NOTE(review): the lines creating `entry`, `misc`, `prot` and the validation
// setup are not among the visible lines — confirm.
23910 BOOST_AUTO_TEST_CASE(Test_InvalidReplace)
23911 {
23914  misc->SetData().SetImp().SetKey("misc_difference");
23915  misc->AddQualifier("replace", "123");
23916 
23918 
23919  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidReplace",
23920  "123 is not a legal value for qualifier replace - should only be composed of acgtmrwsykvhdbn nucleotide bases"));
23921  // AddChromosomeNoLocation(expected_errors, entry);
23922  eval = validator.Validate(seh, options);
23923  CheckErrors(*eval, expected_errors);
23924 
23925  CLEAR_ERRORS
23926 
       // nucleotide letters only: no expected error is added before the check
23927  misc->SetQual().front()->SetVal("aaccttgg");
23928  eval = validator.Validate(seh, options);
23929  // AddChromosomeNoLocation(expected_errors, entry);
23930  CheckErrors(*eval, expected_errors);
23931 
23932  CLEAR_ERRORS
23933 
       // Swap the entry in the scope and repeat with a feature on `prot`.
23934  scope.RemoveTopLevelSeqEntry(seh);
23937 
23938  misc = unit_test_util::AddMiscFeature(prot, prot->GetSeq().GetLength() - 1);
23939  misc->SetData().SetImp().SetKey("misc_difference");
23940  misc->AddQualifier("replace", "123");
23941  seh = scope.AddTopLevelSeqEntry(*entry);
23942 
23943  expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "InvalidReplace",
23944  "123 is not a legal value for qualifier replace - should only be composed of acdefghiklmnpqrstuvwy* amino acids"));
23945  // AddChromosomeNoLocation(expected_errors, entry);
23946  eval = validator.Validate(seh, options);
23947  CheckErrors(*eval, expected_errors);
23948 
23949  CLEAR_ERRORS
23950 }
23951 
23952 
// Checks the InvalidVariationReplace error for the "replace" qualifier of a
// variation feature: "123" is expected to be reported, while "aaccttgg" is
// validated with no expected error registered.
// NOTE(review): the lines creating `entry`, `misc` and the validation setup
// are not among the visible lines — confirm.
23953 BOOST_AUTO_TEST_CASE(Test_InvalidVariationReplace)
23954 {
23957  misc->SetData().SetImp().SetKey("variation");
23958  misc->AddQualifier("replace", "123");
23959 
23961 
23962  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidVariationReplace",
23963  "123 is not a legal value for qualifier replace - should only be composed of acgt unambiguous nucleotide bases"));
23964  // AddChromosomeNoLocation(expected_errors, entry);
23965  eval = validator.Validate(seh, options);
23966  CheckErrors(*eval, expected_errors);
23967 
23968  CLEAR_ERRORS
23969 
       // "aaccttgg": no expected error is added before the check
23970  misc->SetQual().front()->SetVal("aaccttgg");
23971  // AddChromosomeNoLocation(expected_errors, entry);
23972  eval = validator.Validate(seh, options);
23973  CheckErrors(*eval, expected_errors);
23974 
23975  CLEAR_ERRORS
23976 }
23977 
23978 
// Checks the Info-level InvalidProductOnGene message: a gene feature with
// locus "x" that carries a "product" qualifier.
// NOTE(review): the lines creating `entry`, `gene` and the validation setup
// are not among the visible lines — confirm.
23979 BOOST_AUTO_TEST_CASE(Test_InvalidProductOnGene)
23980 {
23983  gene->SetData().SetGene().SetLocus("x");
23984  gene->AddQualifier("product", "hypothetical protein");
23985 
23987 
23988  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "InvalidProductOnGene",
23989  "A product qualifier is not used on a gene feature"));
23990  // AddChromosomeNoLocation(expected_errors, entry);
23991  eval = validator.Validate(seh, options);
23992  CheckErrors(*eval, expected_errors);
23993 
23994  CLEAR_ERRORS
23995 }
23996 
23997 
// Checks the InvalidCodonStart warning: a coding region carrying a GenBank
// qualifier codon_start with the value "z"; the warning is expected on
// "lcl|nuc".
// NOTE(review): the lines creating `entry`, `cds` and the validation setup are
// not among the visible lines — confirm.
23998 BOOST_AUTO_TEST_CASE(Test_InvalidCodonStart)
23999 {
24002  cds->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("codon_start", "z")));
24004 
24005  expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InvalidCodonStart",
24006  "codon_start value should be 1, 2, or 3"));
24007  // AddChromosomeNoLocation(expected_errors, entry);
24008  eval = validator.Validate(seh, options);
24009  CheckErrors(*eval, expected_errors);
24010 
24011  CLEAR_ERRORS
24012 }
24013 
24014 
// Fixture test (CGenBankFixture) for the InconsistentBioSources_ConLocation
// warning: the first and last delta segments are pointed at GenBank
// accessions AY123456 and AY123457 (interval 0..99 each) and the sequence
// length is set to 210.
// NOTE(review): the lines creating `entry` and the validation setup are not
// among the visible lines — confirm.
24015 BOOST_FIXTURE_TEST_CASE(Test_InconsistentBioSources_ConLocation, CGenBankFixture)
24016 {
24019  CSeq_loc& l1 = entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc();
24020  l1.SetInt().SetId().SetGenbank().SetAccession("AY123456");
24021  l1.SetInt().SetFrom(0);
24022  l1.SetInt().SetTo(99);
24023  CSeq_loc& l2 = entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLoc();
24024  l2.SetInt().SetId().SetGenbank().SetAccession("AY123457");
24025  l2.SetInt().SetFrom(0);
24026  l2.SetInt().SetTo(99);
24027 
24028  entry->SetSeq().SetInst().SetLength(210);
24029 
24031 
24032  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentBioSources_ConLocation",
24033  "Genome difference between parent and component"));
24034  // AddChromosomeNoLocation(expected_errors, entry);
24035  eval = validator.Validate(seh, options);
24036  CheckErrors(*eval, expected_errors);
24037 
24038  CLEAR_ERRORS
24039 }
24040 
24041 
// Helper: places an rRNA feature ("16S ribosomal RNA") at loc1 and a tRNA
// feature (amino acid 'A') at loc2, validates, and checks the result.
//   loc1       - location copied onto the rRNA feature
//   loc2       - location copied onto the tRNA feature
//   expect_err - when true, a Warning-level BadRRNAcomponentOverlapTRNA
//                ("tRNA-rRNA overlap") is expected; otherwise no expected
//                error is registered
// NOTE(review): the lines creating `entry`, `rrna`, `trna` and the validation
// setup are not among the visible lines — confirm.
24042 void TestOverlappingRNAFeatures(const CSeq_loc& loc1, const CSeq_loc& loc2, bool expect_err)
24043 {
24046  rrna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
24047  rrna->SetData().SetRna().SetExt().SetName("16S ribosomal RNA");
24048  rrna->SetLocation().Assign(loc1);
24049 
24051  trna->SetData().SetRna().SetType(CRNA_ref::eType_tRNA);
24052  trna->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('A');
24053  trna->SetLocation().Assign(loc2);
24054 
24056 
24057  if (expect_err) {
24058  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadRRNAcomponentOverlapTRNA",
24059  "tRNA-rRNA overlap"));
24060  }
24061  // AddChromosomeNoLocation(expected_errors, entry);
24062  eval = validator.Validate(seh, options);
24063  CheckErrors(*eval, expected_errors);
24064 
24065  CLEAR_ERRORS
24066 }
24067 
24068 
24069 BOOST_AUTO_TEST_CASE(Test_BADRRNAcomponentOverlapTRNA)
24070 {
24071  CRef<CSeq_loc> loc1(new CSeq_loc());
24072  loc1->SetInt().SetId().SetLocal().SetStr("good");
24073  loc1->SetInt().SetFrom(0);
24074  loc1->SetInt().SetTo(10);
24075 
24076  CRef<CSeq_loc> loc2(new CSeq_loc());
24077  loc2->Assign(*loc1);
24078 
24079  TestOverlappingRNAFeatures(*loc1, *loc2, true);
24080 
24081  loc2->SetInt().SetFrom(6);
24082  loc2->SetInt().SetTo(16);
24083  TestOverlappingRNAFeatures(*loc1, *loc2, true);
24084 
24085  loc2->SetInt().SetFrom(7);
24086  loc2->SetInt().SetTo(17);
24087  TestOverlappingRNAFeatures(*loc1, *loc2, false);
24088 
24089  loc2->SetInt().SetFrom(11);
24090  loc2->SetInt().SetTo(17);
24091  TestOverlappingRNAFeatures(*loc1, *loc2, false);
24092 
24093 }
24094 
24095 
24097 {
       // A circular sequence with lineage "Metazoan" and declared length
       // 110000 is expected to produce MitoMetazoanTooLong plus
       // SeqDataLenWrong; after the length is changed to 64000 only
       // SeqDataLenWrong is expected.
       // NOTE(review): some setup statements between the visible lines
       // (including the declarations of `seh` and `eval`) are not shown — confirm.
24098  const string cMitoMezoMsg = "Mitochondrial Metazoan sequences should be less than 65000 bp";
24099 
24100  CRef<CSeq_entry> entry = BuildGoodSeq();
24101  SetLineage(entry, "Metazoan");
24103  entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
24104  entry->SetSeq().SetInst().SetLength(110000);
24107  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
24108  "MitoMetazoanTooLong", cMitoMezoMsg));
24109  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
24110  "SeqDataLenWrong", "Bioseq.seq_data too short [60] for given length [110000]"));
24111  eval = validator.Validate(seh, options);
24112  CheckErrors(*eval, expected_errors);
24113 
24114  CLEAR_ERRORS
24115 
24116 
24117  // for RW-991
       // The entry is removed from the scope, shortened, and re-added before
       // validating again.
24118  scope.RemoveTopLevelSeqEntry(seh);
24119  entry->SetSeq().SetInst().SetLength(64000);
24120  seh = scope.AddTopLevelSeqEntry(*entry);
24121  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
24122  "SeqDataLenWrong", "Bioseq.seq_data too short [60] for given length [64000]"));
24123  eval = validator.Validate(seh, options);
24124  CheckErrors(*eval, expected_errors);
24125 
24126  CLEAR_ERRORS
24127 }
24128 
24129 
24130 
24132 {
       // Builds and returns a local Seq-id whose string is "nuc_" followed by
       // num + 1.
24133  CRef<CSeq_id> n1(new CSeq_id());
24134  n1->SetLocal().SetStr("nuc_" + NStr::NumericToString(num + 1));
24135  return n1;
24136 }
24137 
24138 
24140 {
       // Sets `code_id` on `ce`, appends `ce` to `code`, and installs `code` as
       // the genetic code of the coding region in `cds`.
       // NOTE(review): the declarations of `ce` and `code` are not among the
       // visible lines — confirm their types.
24142  ce->SetId(code_id);
24144  code->Set().push_back(ce);
24145  cds.SetData().SetCdregion().SetCode(*code);
24146 }
24147 
24148 
24150 {
       // Creates a new Seq-entry and appends `num_np` members (`np1`) to its
       // Seq_set; each member's `cds` gets genetic code 11 and its protein id
       // is changed to the local id "prot_<i + 1>". Returns the entry.
       // NOTE(review): the statements creating `np1` and `cds` are not among
       // the visible lines — confirm.
24151  CRef<CSeq_entry> entry(new CSeq_entry());
24153 
24154  for (size_t i = 0; i < num_np; i++) {
24157  AddGeneticCode(*cds, 11);
24161  CRef<CSeq_id> p1(new CSeq_id());
24162  p1->SetLocal().SetStr("prot_" + NStr::NumericToString(i + 1));
24163  ChangeNucProtSetProteinId(np1, p1);
24164  entry->SetSet().SetSeq_set().push_back(np1);
24165  }
24166  return entry;
24167 }
24168 
24169 
// Builds a coding region (genetic code 11) whose mix location has two
// intervals on different nucleotide ids and whose product is the first id of
// `prot`, then attaches the feature and the protein to members of `entry`.
//   entry    - set whose Seq_set members are walked
//   cdr1_num - passed to MakeSmallGenomeSetNucId for the first interval (0..10)
//   cdr2_num - passed to MakeSmallGenomeSetNucId for the second interval (11..26)
//   cdr_pos  - 1-based position of the member whose first annot receives the feature
//   p_pos    - 1-based position of the member whose Seq_set receives `prot`
// NOTE(review): the declaration of `prot` is not among the visible lines — confirm.
24170 void AddCdregionToSmallGenomeSet(CRef<CSeq_entry> entry, size_t cdr1_num, size_t cdr2_num, size_t cdr_pos, size_t p_pos)
24171 {
24172  CRef<CSeq_feat> cdregion(new CSeq_feat());
24173  AddGeneticCode(*cdregion, 11);
24174  CRef<CSeq_loc> loc1(new CSeq_loc());
24175  loc1->SetInt().SetFrom(0);
24176  loc1->SetInt().SetTo(10);
24177  CRef<CSeq_id> n1 = MakeSmallGenomeSetNucId(cdr1_num);
24178  loc1->SetInt().SetId().Assign(*n1);
24179  CRef<CSeq_loc> loc2(new CSeq_loc());
24180  loc2->SetInt().SetFrom(11);
24181  loc2->SetInt().SetTo(26);
24182  CRef<CSeq_id> n2 = MakeSmallGenomeSetNucId(cdr2_num);
24183  loc2->SetInt().SetId().Assign(*n2);
24184 
24185  cdregion->SetLocation().SetMix().Set().push_back(loc1);
24186  cdregion->SetLocation().SetMix().Set().push_back(loc2);
24187 
24189  cdregion->SetProduct().SetWhole().Assign(*(prot->GetSeq().GetId().front()));
24190 
       // `offset` counts members starting at 1, so cdr_pos and p_pos are 1-based.
24191  auto it = entry->SetSet().SetSeq_set().begin();
24192  size_t offset = 1;
24193  while (it != entry->SetSet().SetSeq_set().end()) {
24194  if (offset == cdr_pos) {
24195  (*it)->SetSet().SetAnnot().front()->SetData().SetFtable().push_back(cdregion);
24196  }
24197  if (offset == p_pos) {
24198  (*it)->SetSet().SetSeq_set().push_back(prot);
24199  }
24200  it++;
24201  offset++;
24202  }
24203 }
24204 
24205 
24206 // If we have a small genome set, then a feature could legitimately
24207 // have a location with intervals on multiple sequences.
24208 // This should not trigger the CDSproductPackagingProblem error as long
24209 // as the protein sequence is packaged in the same nuc-prot set as one
24210 // of the nucleotide sequences that the coding region is located on
24212 {
       // Validates a small genome set with no added coding region, then with
       // coding regions added at positions (1, 1), (2, 2) and (2, 3); only the
       // last combination has an expected CDSproductPackagingProblem warning.
       // NOTE(review): the lines creating `entry`, `seh` and `eval` are not
       // among the visible lines — confirm.
24214 
24216 
24217  // no errors with no trans-spliced coding region
24218  eval = validator.Validate(seh, options);
24219  // AddChromosomeNoLocation(expected_errors, entry);
24220  CheckErrors(*eval, expected_errors);
24221 
24222  // first combination should not generate errors
24223  scope.RemoveTopLevelSeqEntry(seh);
24224  AddCdregionToSmallGenomeSet(entry, 0, 1, 1, 1);
24225  seh = scope.AddTopLevelSeqEntry(*entry);
24226  eval = validator.Validate(seh, options);
24227  CheckErrors(*eval, expected_errors);
24228 
24229  // second combination should not generate errors
24230  scope.RemoveTopLevelSeqEntry(seh);
24231  entry = BuildSmallGenomeSet(3);
24232  AddCdregionToSmallGenomeSet(entry, 0, 1, 2, 2);
24233  seh = scope.AddTopLevelSeqEntry(*entry);
24234  eval = validator.Validate(seh, options);
24235  CheckErrors(*eval, expected_errors);
24236 
24237  // third combination should produce an error because
24238  // protein on wrong sequence
24239  scope.RemoveTopLevelSeqEntry(seh);
24240  entry = BuildSmallGenomeSet(3);
24241  AddCdregionToSmallGenomeSet(entry, 0, 1, 2, 3);
24242  seh = scope.AddTopLevelSeqEntry(*entry);
24243 
24244  expected_errors.push_back(new CExpectedError("", eDiag_Warning,
24245  "CDSproductPackagingProblem",
24246  "Protein product not packaged in nuc-prot set with nucleotide in small genome set"));
24247 
24248  eval = validator.Validate(seh, options);
24249  CheckErrors(*eval, expected_errors);
24250 
24251 
24252  CLEAR_ERRORS
24253 }
24254 
24255 
// Checks the BadKeywordUnverified warning: a good sequence given the GenBank
// keyword "BARCODE" plus an extra descriptor `desc` is expected to be
// reported as having both BARCODE and UNVERIFIED keywords.
// NOTE(review): the lines that fill in `desc` and the validation setup are not
// among the visible lines; presumably `desc` marks the record as unverified —
// confirm.
24256 BOOST_AUTO_TEST_CASE(Test_BadKeywordUnverified)
24257 {
24258  CRef<CSeq_entry> entry = BuildGoodSeq();
24259  AddGenbankKeyword(entry, "BARCODE");
24261  CRef<CSeqdesc> desc(new CSeqdesc());
24263  entry->SetSeq().SetDescr().Set().push_back(desc);
24264 
24266 
24267  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
24268  "BadKeywordUnverified",
24269  "Sequence has both BARCODE and UNVERIFIED keywords"));
24270  // AddChromosomeNoLocation(expected_errors, entry);
24271 
24272  eval = validator.Validate(seh, options);
24273  CheckErrors(*eval, expected_errors);
24274 
24275  CLEAR_ERRORS
24276 }
24277 
24278 
// Builds a good sequence with taxname "BOLD bacterium sp. zz" and a
// structured comment carrying "Barcode Index Number" = "xxx". The only
// expected error is the OrganismNotFound warning (see the VR-843 note below).
// NOTE(review): the validation setup line is not among the visible lines — confirm.
24279 BOOST_AUTO_TEST_CASE(Test_BINDoesNotMatch)
24280 {
24281  CRef<CSeq_entry> entry = BuildGoodSeq();
24282  SetTaxname(entry, "BOLD bacterium sp. zz");
24283  CRef<CUser_object> sc = edit::CStructuredCommentField::MakeUserObject("International Barcode of Life (iBOL)Data");
24284  CRef<CSeqdesc> desc(new CSeqdesc());
24285  desc->SetUser().Assign(*sc);
24286 
24287  CRef<CUser_field> uf(new CUser_field());
24288  uf->SetLabel().SetStr("Barcode Index Number");
24289  uf->SetData().SetStr("xxx");
24290  desc->SetUser().SetData().push_back(uf);
24291  entry->SetSeq().SetDescr().Set().push_back(desc);
24292 
24294 
24295  // error was removed per VR-843
24296  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
24297  "OrganismNotFound", "Organism not found in taxonomy database"));
24298  // AddChromosomeNoLocation(expected_errors, entry);
24299 
24300  eval = validator.Validate(seh, options);
24301  CheckErrors(*eval, expected_errors);
24302 
24303  CLEAR_ERRORS
24304 }
24305 
24306 
24307 void AddStrsField(CUser_object& user, const string& label, const string& val)
24308 {
24309  CRef<CUser_field> uf(new CUser_field());
24310  uf->SetLabel().SetStr(label);
24311  uf->SetData().SetStrs().push_back(val);
24312  user.SetData().push_back(uf);
24313 }
24314 
24315 
// Attaches three user-object descriptors to a good sequence and checks the
// DBLink-related errors listed below:
//   db1 - Assembly "ZZZ", BioProject "XXX", a mis-capitalized
//         "Sequence read archive" field and an "unknown" field
//   db2 - Assembly "YYY"
//   db3 - no fields set in the visible lines
// NOTE(review): the statements that initialize each descriptor's user object
// and the validation setup are not among the visible lines — confirm.
24316 BOOST_AUTO_TEST_CASE(Test_BadDBLink)
24317 {
24318  CRef<CSeq_entry> entry = BuildGoodSeq();
24319  CRef<CSeqdesc> db1(new CSeqdesc());
24321  edit::CDBLink::SetAssembly(db1->SetUser(), "ZZZ");
24322  edit::CDBLink::SetBioProject(db1->SetUser(), "XXX");
24323  // for bad capitalization
24324  AddStrsField(db1->SetUser(), "Sequence read archive", "AAA");
24325  // for unknown field
24326  AddStrsField(db1->SetUser(), "unknown", "BBB");
24327 
24328  entry->SetSeq().SetDescr().Set().push_back(db1);
24329 
24330  CRef<CSeqdesc> db2(new CSeqdesc());
24332  edit::CDBLink::SetAssembly(db2->SetUser(), "YYY");
24333  entry->SetSeq().SetDescr().Set().push_back(db2);
24334 
24335  CRef<CSeqdesc> db3(new CSeqdesc());
24337  entry->SetSeq().SetDescr().Set().push_back(db3);
24338 
24340 
24341  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
24342  "MultipleDBLinkObjects", "3 DBLink user objects apply to a Bioseq"));
24343  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
24344  "DBLinkBadAssembly",
24345  "Assembly entries appear in 2 DBLink user objects"));
24346  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
24347  "DBLinkBadFormat",
24348  "Unrecognized entries appear in 1 DBLink user object"));
24349  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
24350  "DBLinkBadBioProject", "Bad BioProject format - XXX"));
24351  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
24352  "DBLinkBadSRAaccession", "Bad Sequence Read Archive format - AAA"));
24353  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
24354  "DBLinkBadCapitalization", "Bad DBLink capitalization - Sequence read archive"));
24355  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
24356  "DBLinkMissingUserObject", "DBLink user object descriptor is empty"));
24357  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
24358  "UserObjectNoData", "User object with no data"));
24359  // AddChromosomeNoLocation(expected_errors, entry);
24360 
24361  eval = validator.Validate(seh, options);
24362  CheckErrors(*eval, expected_errors);
24363 
24364  CLEAR_ERRORS
24365 }
24366 
24367 
// Checks the DBLinkOnSet error: a descriptor whose user object has BioSample
// "SAMN1234" is pushed onto the descriptors of the set in `entry`; the error
// is expected on "lcl|good1".
// NOTE(review): the lines creating `entry`, initializing the user object and
// setting up validation are not among the visible lines — confirm.
24368 BOOST_AUTO_TEST_CASE(Test_DBLinkOnSet)
24369 {
24371  CRef<CSeqdesc> db1(new CSeqdesc());
24373  edit::CDBLink::SetBioSample(db1->SetUser(), "SAMN1234");
24374 
24375  entry->SetSet().SetDescr().Set().push_back(db1);
24376 
24378 
24379  expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error,
24380  "DBLinkOnSet", "DBLink user object should not be on this set"));
24381  // AddChromosomeNoLocation(expected_errors, entry);
24382  eval = validator.Validate(seh, options);
24383  CheckErrors(*eval, expected_errors);
24384 
24385  CLEAR_ERRORS
24386 }
24387 
24388 
// Checks the AssemblyGapFeatureProblem warning: an assembly_gap feature at
// 12..21 with estimated_length "10" and gap_type "fragment" is expected to be
// reported as belonging only on a contig record.
// NOTE(review): the lines creating `entry` and the validation setup are not
// among the visible lines — confirm.
24389 BOOST_AUTO_TEST_CASE(Test_AssemblyGapFeatureProblem)
24390 {
24392  CRef<CSeq_feat> assembly_gap = AddMiscFeature(entry);
24393  assembly_gap->SetData().SetImp().SetKey("assembly_gap");
24394  assembly_gap->SetLocation().SetInt().SetFrom(12);
24395  assembly_gap->SetLocation().SetInt().SetTo(21);
24396  assembly_gap->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("estimated_length", "10")));
24397  assembly_gap->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("gap_type", "fragment")));
24398 
24400 
24401  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
24402  "AssemblyGapFeatureProblem", "An assembly_gap feature should only be on a contig record"));
24403  // AddChromosomeNoLocation(expected_errors, entry);
24404  eval = validator.Validate(seh, options);
24405  CheckErrors(*eval, expected_errors);
24406 
24407  CLEAR_ERRORS
24408 }
24409 
24410 
24411 void MakeLeft(CSeq_loc& loc)
24412 {
24413  loc.SetInt().SetFrom(0);
24414  loc.SetInt().SetTo(5);
24415 }
24416 
24417 void MakeRight(CSeq_loc& loc, TSeqPos stop)
24418 {
24419  loc.SetInt().SetFrom(stop - 6);
24420  loc.SetInt().SetTo(stop - 1);
24421 }
24422 
// TestUTRPair: builds a good sequence carrying a 5'UTR and a 3'UTR import
// feature at opposite ends and validates it.
//   add_gene - when true, also adds a gene (locus "x") whose interval ends at
//              `stop`; only in this case is a NoCDSbetweenUTRs warning expected.
//   is_minus - when true, the 5'UTR is placed at the right end and the 3'UTR
//              at the left end, and all added features are put on the minus
//              strand; otherwise the 5'UTR is at the left and the 3'UTR at
//              the right.
// NOTE(review): `expected_errors`, `eval`, `validator`, `seh` and `options`
// are not declared in the visible lines -- presumably set up by a line elided
// from this listing; confirm.
24423 void TestUTRPair(bool add_gene, bool is_minus)
24424 {
24425  CRef<CSeq_entry> entry = BuildGoodSeq();
24426  TSeqPos stop = entry->GetSeq().GetLength() - 1;
24427  if (add_gene) {
24428  CRef<CSeq_feat> gene = AddMiscFeature(entry);
24429  gene->ResetComment();
24430  gene->SetData().SetGene().SetLocus("x");
24431  gene->SetLocation().SetInt().SetTo(stop);
24432  if (is_minus) {
24433  gene->SetLocation().SetInt().SetStrand(eNa_strand_minus);
24434  }
24435  }
24436 
24437  CRef<CSeq_feat> utr5 = AddMiscFeature(entry);
24438  utr5->ResetComment();
24439  utr5->SetData().SetImp().SetKey("5'UTR");
24440  if (is_minus) {
24441  MakeRight(utr5->SetLocation(), stop);
24442  utr5->SetLocation().SetInt().SetStrand(eNa_strand_minus);
24443  } else {
24444  MakeLeft(utr5->SetLocation());
24445  }
24446 
24447  CRef<CSeq_feat> utr3 = AddMiscFeature(entry);
24448  utr3->ResetComment();
24449  utr3->SetData().SetImp().SetKey("3'UTR");
24450  if (is_minus) {
24451  MakeLeft(utr3->SetLocation());
24452  utr3->SetLocation().SetInt().SetStrand(eNa_strand_minus);
24453  } else {
24454  MakeRight(utr3->SetLocation(), stop);
24455  }
24456 
24458 
24459  if (add_gene) {
24460  if (is_minus) {
24461  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
24462  "NoCDSbetweenUTRs", "CDS not between 5'UTR and 3'UTR on minus strand"));
24463  } else {
24464  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
24465  "NoCDSbetweenUTRs", "CDS not between 5'UTR and 3'UTR on plus strand"));
24466  }
24467  }
24468  // AddChromosomeNoLocation(expected_errors, entry);
24469 
24470  eval = validator.Validate(seh, options);
24471  CheckErrors(*eval, expected_errors);
24472 
24473  CLEAR_ERRORS
24474 }
24475 
24476 BOOST_AUTO_TEST_CASE(Test_NoCDSbetweenUTRs)
24477 {
24478  TestUTRPair(false, false);
24479  TestUTRPair(false, true);
24480  TestUTRPair(true, false);
24481  TestUTRPair(true, true);
24482 }
24483 
// Test_FormatBadSpecificHostAlternateName: sets the nat_host org-mod to the
// spelling "Gromphadorina portentosa", expects a BadSpecificHost warning that
// names "Gromphadorhina portentosa" as the replacement, and then checks that
// CValidErrorFormat::FormatForSubmitterReport renders the last reported error
// as "<id>\t<old> should be <new>".
// NOTE(review): `objmgr`, `scope`, `seh`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines -- presumably set
// up by a line elided from this listing; confirm.
24484 BOOST_AUTO_TEST_CASE(Test_FormatBadSpecificHostAlternateName)
24485 {
24486  CRef<CSeq_entry> entry = BuildGoodSeq();
24487  SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Gromphadorina portentosa");
24488 
24490 
24491  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadSpecificHost",
24492  "Specific host value is alternate name: Gromphadorina portentosa should be Gromphadorhina portentosa"));
24493  // AddChromosomeNoLocation(expected_errors, entry);
24494 
24495  eval = validator.Validate(seh, options);
24496  CheckErrors(*eval, expected_errors);
24497 
24498  CValidErrorFormat format(*objmgr);
24499  string val = format.FormatForSubmitterReport(*(eval->GetErrs().back()), scope);
24500  BOOST_CHECK_EQUAL(val, "lcl|good\tGromphadorina portentosa should be Gromphadorhina portentosa");
24501 
24502  CLEAR_ERRORS
24503 }
24504 
24506 {
24507  CRef<CSeq_entry> entry = BuildGoodSeq();
24509  rna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
24510  rna->SetData().SetRna().SetExt().SetName("23S ribosomal RNA");
24511  rna->SetProduct().SetWhole().SetGi(GI_CONST(507148189));
24512 
24514 
24515  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TranscriptLen",
24516  "Transcript length [11] less than (far) product length [3132], and tail < 95% polyA"));
24517  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TranscriptMismatches",
24518  "There are 7 mismatches out of 11 bases between the transcript and (far) product sequence"));
24519  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "RnaProductMismatch",
24520  "Type of RNA does not match MolInfo of product Bioseq"));
24521  // AddChromosomeNoLocation(expected_errors, entry);
24522 
24523  eval = validator.Validate(seh, options);
24524  CheckErrors(*eval, expected_errors);
24525  CLEAR_ERRORS
24526 }
24527 
24528 
// Test_ExceptionRequiresLocusTag: adds a gene feature with locus "x" (no
// locus_tag is set here) flagged with the exception text
// "gene split at contig boundary", and expects an ExceptionRequiresLocusTag
// warning against lcl|good.
// NOTE(review): `expected_errors`, `eval`, `validator`, `seh` and `options`
// are not declared in the visible lines -- presumably set up by a line elided
// from this listing; confirm.
24529 BOOST_AUTO_TEST_CASE(Test_ExceptionRequiresLocusTag)
24530 {
24531  CRef<CSeq_entry> entry = BuildGoodSeq();
24532  CRef<CSeq_feat> gene = AddMiscFeature(entry);
24533  gene->SetData().SetGene().SetLocus("x");
24534  gene->SetExcept(true);
24535  gene->SetExcept_text("gene split at contig boundary");
24536 
24538 
24539  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
24540  "ExceptionRequiresLocusTag",
24541  "Gene has split exception but no locus_tag"));
24542  // AddChromosomeNoLocation(expected_errors, entry);
24543  eval = validator.Validate(seh, options);
24544  CheckErrors(*eval, expected_errors);
24545  CLEAR_ERRORS
24546 }
24547 
24548 
24550 {
24551  CRef<CSeq_submit> ss(new CSeq_submit());
24552  ss->SetSub().SetTool("Geneious");
24554  ss->SetSub().SetCit().SetAuthors().SetNames().SetStd().push_back(author);
24555  ss->SetSub().SetCit().SetAuthors().SetAffil().SetStd().SetAffil("some affiliation");
24556  ss->SetSub().SetCit().SetAuthors().SetAffil().SetStd().SetCountry("Russia");
24557 
24558  ss->SetSub().SetCit().SetDate().SetStd().SetYear(2009);
24559  ss->SetSub().SetCit().SetDate().SetStd().SetMonth(12);
24560  ss->SetSub().SetCit().SetDate().SetStd().SetDay(31);
24561 
24562  return ss;
24563 }
24564 
24565 
// Test_Geneious: gives a gene feature a two-part mix location whose first
// interval (0..0) is on the minus strand while the last interval (9..10) has
// no strand set here, puts the entry into the Seq-submit `ss`, validates the
// Seq-submit, and expects a MixedStrand warning against lcl|good.
// NOTE(review): `ss`, `entry`, `scope`, `validator`, `options`, `eval` and
// `expected_errors` are not declared in the visible lines -- presumably set
// up by lines elided from this listing; confirm.
24566 BOOST_AUTO_TEST_CASE(Test_Geneious)
24567 {
24569 
24571  CRef<CSeq_feat> gene = AddMiscFeature(entry);
24572  CRef<CSeq_loc> gene_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
24573  gene_loc->SetMix().Set().front()->SetInt().SetFrom(0);
24574  gene_loc->SetMix().Set().front()->SetInt().SetTo(0);
24575  gene_loc->SetMix().Set().front()->SetInt().SetStrand(eNa_strand_minus);
24576  gene_loc->SetMix().Set().back()->SetInt().SetFrom(9);
24577  gene_loc->SetMix().Set().back()->SetInt().SetTo(10);
24578  gene->SetLocation().Assign(*gene_loc);
24579 
24580  ss->SetData().SetEntrys().push_back(entry);
24581 
24583 
24584  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MixedStrand",
24585  "Location: Mixed strands in SeqLoc [(lcl|good:c1-1, 10-11)]"));
24586  // AddChromosomeNoLocation(expected_errors, entry);
24587 
24589  eval = validator.Validate(*ss, &scope, options);
24590 
24591  CheckErrors(*eval, expected_errors);
24592 
24593  CLEAR_ERRORS
24594 }
24595 
24596 
24597 // From VR-793
24598 // A. For segment, endogenous_virus_name:
24599 // 1. Must begin with a letter or number
24600 // 2. Spaces and other printable characters are permitted
24601 // 3. Must not be empty, must not be longer than 240 characters
24602 // B. For chromosome, linkage_group and plasmid_name values:
24603 // 4. Must begin with a letter or number
24604 // 5. Must not be empty (not currently true), must not be longer than 32 characters
24605 // 6. Must not contain <tab>
24606 // 7. Spaces and other printable characters are permitted
24607 // 8. Must not contain the word "plasmid" (ignoring case)
24608 // 9. Must not contain the word "chromosome" (ignoring case)
24609 // 10. Must not contain the phrase "linkage group" (ignoring case)
24610 // 11. Must not contain the series of letters "chr" (ignoring case)
24611 // 12. Must not contain the taxname (ignoring case)
24612 // 14. Must not contain the genus (ignoring case)
24613 // 15. Must not contain the species (ignoring case)
24614 // 16. Must not contain the series of letters "chrm" (ignoring case)
24615 // 17. Must not contain the series of letters "chrom" (ignoring case)
24616 // 18. Must not contain the phrase "linkage-group" (ignoring case)
24617 // C. For plasmid_name values:
24618 // 19. Exception- megaplasmid is legal
24619 // D. plastid_name is obsolete so no value is legal.
24620 // 20. digits or numerals: Plastid name subsource contains unrecognized value
24621 // 21. organelle: Plastid name subsource chloroplast but not chloroplast location
24622 
// TestOneReplicon: sets one sub-source qualifier on every source descriptor of
// a freshly built good sequence and validates the result.
//   subtype  - sub-source subtype to set; an existing sub-source of that
//              subtype is renamed, otherwise a new one is appended.
//   val      - value stored as the sub-source name.
//   err_code - expected error code; a blank string means no error is expected
//              for this value.
//   sev, msg - severity and message of the expected error (only used when
//              err_code is not blank).
// For plasmid_name the source genome is also set to plasmid. For segment an
// additional NonViralSegment warning is always expected.
// NOTE(review): `expected_errors`, `eval`, `validator`, `seh` and `options`
// are not declared in the visible lines -- presumably set up by a line elided
// from this listing; confirm.
24623 void TestOneReplicon(CSubSource::ESubtype subtype, const string& val, const string& err_code, EDiagSev sev, const string& msg)
24624 {
24625  CRef<CSeq_entry> entry = BuildGoodSeq();
24626  for (auto it : entry->SetSeq().SetDescr().Set()) {
24627  if (it->IsSource()) {
24628  bool found = false;
24629  for (auto sit : it->SetSource().SetSubtype()) {
24630  if (sit->GetSubtype() == subtype) {
24631  sit->SetName(val);
24632  found = true;
24633  break;
24634  }
24635  }
24636  if (!found) {
24637  CRef<CSubSource> ss(new CSubSource(subtype, val));
24638  it->SetSource().SetSubtype().push_back(ss);
24639  }
24640  if (subtype == CSubSource::eSubtype_plasmid_name) {
24641  it->SetSource().SetGenome(CBioSource::eGenome_plasmid);
24642  }
24643  }
24644  }
24645 
24647 
24648  if (!NStr::IsBlank(err_code)) {
24649  expected_errors.push_back(new CExpectedError("lcl|good", sev, err_code, msg));
24650  }
24651 
24652  if (subtype == CSubSource::eSubtype_segment) {
24653  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NonViralSegment",
24654  "Non-viral source feature should not have a segment qualifier"));
24655  }
24656 
24657  eval = validator.Validate(seh, options);
24658 
24659  CheckErrors(*eval, expected_errors);
24660 
24661  CLEAR_ERRORS
24662 }
24663 
24664 
24665 void TestAlwaysBadReplicon(const string& val)
24666 {
24667  TestOneReplicon(CSubSource::eSubtype_chromosome, val, "BadPlasmidChromosomeLinkageName",
24668  eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + val + "'");
24669  TestOneReplicon(CSubSource::eSubtype_linkage_group, val, "BadPlasmidChromosomeLinkageName",
24670  eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + val + "'");
24671  TestOneReplicon(CSubSource::eSubtype_plasmid_name, val, "BadPlasmidChromosomeLinkageName",
24672  eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + val + "'");
24673  TestOneReplicon(CSubSource::eSubtype_segment, val, "BadTextInSourceQualifier", eDiag_Error, "segment value should start with letter or number");
24674  TestOneReplicon(CSubSource::eSubtype_endogenous_virus_name, val, "BadTextInSourceQualifier", eDiag_Error, "endogenous-virus-name value should start with letter or number");
24675 }
24676 
24677 
24678 void TestAlwaysGoodReplicon(const string& val)
24679 {
24685 }
24686 
24687 
24688 void TestRepliconTaxname(CSubSource::ESubtype subtype, bool expect_errs)
24689 {
24690  TestOneReplicon(subtype, "Sebaea microphylla",
24691  expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24692  eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'Sebaea microphylla'");
24693 
24694  TestOneReplicon(subtype, "Sebaea",
24695  expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24696  eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'Sebaea'");
24697 
24698  TestOneReplicon(subtype, "microphylla",
24699  expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24700  eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'microphylla'");
24701 
24702 }
24703 
24704 void TestRepliconForbiddenWords(CSubSource::ESubtype subtype, bool expect_errs)
24705 {
24706  TestOneReplicon(subtype, "some CHROMOSOME",
24707  expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24708  eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'some CHROMOSOME'");
24709 
24710  TestOneReplicon(subtype, "linkage group x",
24711  expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24712  eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'linkage group x'");
24713 
24714  TestOneReplicon(subtype, "linkage-group x",
24715  expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24716  eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'linkage-group x'");
24717 
24718  TestOneReplicon(subtype, "linkage_group x",
24719  expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24720  eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'linkage_group x'");
24721 
24722  TestOneReplicon(subtype, "chry",
24723  expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24724  eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'chry'");
24725 
24726  TestOneReplicon(subtype, "chrm",
24727  expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24728  eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'chrm'");
24729 
24730  TestOneReplicon(subtype, "CHROM",
24731  expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24732  eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'CHROM'");
24733 
24734  TestOneReplicon(subtype, "PLASMID",
24735  expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24736  eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'PLASMID'");
24737 
24738 }
24739 
24740 
24742 {
24743  // blanks are always bad
24745  TestOneReplicon(CSubSource::eSubtype_plastid_name, "", "BadPlastidName", eDiag_Warning, "Plastid name subsource contains unrecognized value");
24746  TestOneReplicon(CSubSource::eSubtype_transposon_name, "", "ObsoleteSourceQual", eDiag_Warning, "Transposon name and insertion sequence name are no longer legal qualifiers");
24747  TestOneReplicon(CSubSource::eSubtype_insertion_seq_name, "", "ObsoleteSourceQual", eDiag_Warning, "Transposon name and insertion sequence name are no longer legal qualifiers");
24748 
24749  // must start with letter or number
24750  TestAlwaysBadReplicon(".2");
24751 
24752  // unprintable characters bad
24753  TestAlwaysBadReplicon("a\tb");
24754 
24755  // just letters ok
24757 
24758  // spaces ok
24759  TestAlwaysGoodReplicon("x y");
24760 
24761  const string kMoreThan240 = "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z";
24762  const string kMoreThan32 = "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z";
24763 
24764  // segment
24766  TestOneReplicon(CSubSource::eSubtype_segment, kMoreThan240, "BadTextInSourceQualifier", eDiag_Error, "segment value should start with letter or number");
24769 
24770  // endogenous virus name
24772  TestOneReplicon(CSubSource::eSubtype_endogenous_virus_name, kMoreThan240, "BadTextInSourceQualifier", eDiag_Error, "endogenous-virus-name value should start with letter or number");
24775 
24776  // chromosome
24777  TestOneReplicon(CSubSource::eSubtype_chromosome, kMoreThan32, "BadPlasmidChromosomeLinkageName",
24778  eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + kMoreThan32 + "'");
24779  TestOneReplicon(CSubSource::eSubtype_chromosome, kMoreThan240, "BadPlasmidChromosomeLinkageName",
24780  eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + kMoreThan240 + "'");
24781  TestOneReplicon(CSubSource::eSubtype_chromosome, "LG 123", "BadPlasmidChromosomeLinkageName",
24782  eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'LG 123'");
24785 
24786  // linkage-group
24787  TestOneReplicon(CSubSource::eSubtype_linkage_group, kMoreThan32, "BadPlasmidChromosomeLinkageName",
24788  eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + kMoreThan32 + "'");
24789  TestOneReplicon(CSubSource::eSubtype_linkage_group, kMoreThan240, "BadPlasmidChromosomeLinkageName",
24790  eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + kMoreThan240 + "'");
24793 
24794  // plasmid-name
24796  TestOneReplicon(CSubSource::eSubtype_plasmid_name, kMoreThan32, "BadPlasmidChromosomeLinkageName",
24797  eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + kMoreThan32 + "'");
24798  TestOneReplicon(CSubSource::eSubtype_plasmid_name, kMoreThan240, "BadPlasmidChromosomeLinkageName",
24799  eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + kMoreThan240 + "'");
24802 
24803 // TestOneReplicon(CSubSource::eSubtype_plasmid_name, "pCHRO.01", "", eDiag_Info, "");
24804 
24805 }
24806 
24807 
24808 void CheckHost(const CBioseq& seq, const string& host)
24809 {
24810  bool found_host = false;
24811  BOOST_CHECK_EQUAL(seq.IsSetDescr(), true);
24812  if (!seq.IsSetDescr()) {
24813  return;
24814  }
24815  for (auto d : seq.GetDescr().Get()) {
24816  if (d->IsSource() && d->GetSource().IsSetOrgMod()) {
24817  for (auto om : d->GetSource().GetOrg().GetOrgname().GetMod()) {
24818  if (om->IsSetSubtype() && om->GetSubtype() == COrgMod::eSubtype_nat_host) {
24819  BOOST_CHECK_EQUAL(host, om->IsSetSubname() ? om->GetSubname() : kEmptyStr);
24820  found_host = true;
24821  }
24822  }
24823  }
24824  }
24825  BOOST_CHECK_EQUAL(found_host, true);
24826 }
24827 
24828 
// CheckOneSpecificHost: runs CTaxValidationAndCleanup::DoTaxonomyUpdate on a
// good sequence (second argument true), requires it to return true, and then
// uses CheckHost to verify that the nat_host value on the entry equals newval.
// NOTE(review): `orig` and `objmgr` are not set up in the visible lines --
// presumably elided lines store `orig` as the entry's nat_host value and
// obtain the object manager; confirm against the full file.
24829 void CheckOneSpecificHost(const string& orig, const string& newval)
24830 {
24831  CRef<CSeq_entry> entry = BuildGoodSeq();
24834  CScope scope(*objmgr);
24835  scope.AddDefaults();
24836  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
24837 
24838  validator::CTaxValidationAndCleanup tval;
24839 
24840  BOOST_CHECK_EQUAL(tval.DoTaxonomyUpdate(seh, true), true);
24841  CheckHost(entry->GetSeq(), newval);
24842 }
24843 
24844 
24846 {
24847  CheckOneSpecificHost("Canis familiaris", "Canis lupus familiaris");
24848  CheckOneSpecificHost("Canis familiaris; some other information", "Canis familiaris; some other information");
24849  CheckOneSpecificHost("Hordeum spontaneum cultivar test", "Hordeum spontaneum cultivar test");
24850 }
24851 
24852 
// Test_BIOS_1527: FixSpecificHost must map "Acropora tumida" to
// "Acropora valida" and "Acroptilon repens" to "Leuzea repens".
24853 BOOST_AUTO_TEST_CASE(Test_BIOS_1527)
24854 {
24855  BOOST_CHECK_EQUAL("Acropora valida", FixSpecificHost("Acropora tumida"));
24856  BOOST_CHECK_EQUAL("Leuzea repens", FixSpecificHost("Acroptilon repens"));
24857 }
24858 
24859 
24861 {
24862  CRef<CSeq_entry> entry = BuildGoodSeq();
24863  SetLineage(entry, "Viroids;");
24865 
24867 
24868  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidTissueType",
24869  "Viroid has unexpected tissue-type qualifier"));
24870  // AddChromosomeNoLocation(expected_errors, entry);
24871 
24872  eval = validator.Validate(seh, options);
24873 
24874  CheckErrors(*eval, expected_errors);
24875 
24876  CLEAR_ERRORS
24877 }
24878 
24879 
24881 {
24882  // prepare entry
24884  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, "Denmark: Aarhus Bay Station M5");
24885  unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "56.1033 N 10.4578 E");
24886 
24888 
24889  eval = validator.Validate(seh, options);
24890  // AddChromosomeNoLocation(expected_errors, entry);
24891  CheckErrors(*eval, expected_errors);
24892 
24894  string latlon = "56.1033 N 10.4578 E";
24895  string retval = CSubSource::ValidateLatLonCountry("Denmark: Aarhus Bay Station M5", latlon, true, lcErr);
24896  string enumval = lcErr == CSubSource::eLatLonCountryErr_Value ? "eLatLonCountryErr_Value" : "not error enum";
24897  BOOST_CHECK_EQUAL(retval, kEmptyStr);
24898 
24899  CLEAR_ERRORS
24900 }
24901 
24902 
24904 {
24906  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_type_material, "allotype of Bonnetina tenuiverpis");
24907 
24909 
24910  eval = validator.Validate(seh, options);
24911  // AddChromosomeNoLocation(expected_errors, entry);
24912  CheckErrors(*eval, expected_errors);
24913 
24914  CLEAR_ERRORS
24915 }
24916 
24917 
// TestNewAccessionOnNuc: adds a GenBank Seq-id with the given accession to a
// good nucleotide sequence and validates it twice.
//   accession   - accession string to attach.
//   is_prot_acc - when true, a BadSeqIdFormat error is expected in both passes.
//   is_wgs      - whether the accession is treated as a WGS accession.
// Pass 1 (tech unchanged): InconsistentMolInfoTechnique is expected when
// is_wgs is true.
// Pass 2 (tech set to wgs): InconsistentWGSFlags is expected when is_wgs is
// false; AddChromosomeNoLocation is also called on the expected-error list
// (presumably it appends a further expected error -- confirm).
// NOTE(review): `expected_errors`, `eval`, `validator`, `scope`, `seh` and
// `options` are not declared in the visible lines -- presumably set up by a
// line elided from this listing; confirm.
24918 void TestNewAccessionOnNuc(const string& accession, bool is_prot_acc, bool is_wgs)
24919 {
24920  CRef<CSeq_entry> entry = BuildGoodSeq();
24921  CRef<CSeq_id> new_id(new CSeq_id());
24922  new_id->SetGenbank().SetAccession(accession);
24923  entry->SetSeq().SetId().push_back(new_id);
24924 
24926 
24927  string acc_str = "gb|" + accession + "|";
24928  if (is_wgs) {
24929  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error,
24930  "InconsistentMolInfoTechnique",
24931  "WGS accession should have Mol-info.tech of wgs"));
24932  }
24933  if (is_prot_acc) {
24934  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "BadSeqIdFormat", "Bad accession " + accession));
24935  }
24936  // AddChromosomeNoLocation(expected_errors, acc_str);
24937  eval = validator.Validate(seh, options);
24938  CheckErrors(*eval, expected_errors);
24939 
24940  CLEAR_ERRORS
24941 
24942  scope.RemoveTopLevelSeqEntry(seh);
24943  SetTech(entry, CMolInfo::eTech_wgs);
24944  seh = scope.AddTopLevelSeqEntry(*entry);
24945  if (is_prot_acc) {
24946  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "BadSeqIdFormat", "Bad accession " + accession));
24947  }
24948  if (!is_wgs) {
24949  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "InconsistentWGSFlags", "Mol-info.tech of wgs should have WGS accession"));
24950  }
24951  AddChromosomeNoLocation(expected_errors, acc_str);
24952  eval = validator.Validate(seh, options);
24953  CheckErrors(*eval, expected_errors);
24954 
24955  CLEAR_ERRORS
24956 }
24957 
24958 
// TestNewAccessionOnStandaloneProt: adds a GenBank Seq-id with the given
// accession to `entry` and validates it. An OrphanedProtein error is always
// expected; a BadSeqIdFormat error is additionally expected when is_nuc_acc
// is true.
// NOTE(review): `is_wgs` is not referenced in the visible lines. The
// declaration of `entry` and the validator setup are also not visible --
// presumably elided from this listing; confirm.
24959 void TestNewAccessionOnStandaloneProt(const string& accession, bool is_nuc_acc, bool is_wgs)
24960 {
24962  CRef<CSeq_id> new_id(new CSeq_id());
24963  new_id->SetGenbank().SetAccession(accession);
24964  entry->SetSeq().SetId().push_back(new_id);
24965 
24967 
24968  string acc_str = "gb|" + accession + "|";
24969  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "OrphanedProtein", "Orphaned stand-alone protein"));
24970  if (is_nuc_acc) {
24971  expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "BadSeqIdFormat", "Bad accession " + accession));
24972  }
24973  // AddChromosomeNoLocation(expected_errors, acc_str);
24974  eval = validator.Validate(seh, options);
24975  CheckErrors(*eval, expected_errors);
24976 
24977  CLEAR_ERRORS
24978 }
24979 
24980 
// TestNewAccessionOnNucProt: creates GenBank Seq-ids for a nucleotide
// accession (n_acc) and a protein accession (p_acc) and validates the entry.
// The only expected error is InconsistentMolInfoTechnique on "gb|<n_acc>|",
// and only when is_wgs is true.
// NOTE(review): the lines that build the entry, attach `nid`/`pid` to it and
// set up the validator are not visible -- presumably elided from this
// listing; confirm against the full file.
24981 void TestNewAccessionOnNucProt(const string& n_acc, const string& p_acc, bool is_wgs)
24982 {
24984  CRef<CSeq_id> nid(new CSeq_id());
24985  nid->SetGenbank().SetAccession(n_acc);
24987  CRef<CSeq_id> pid(new CSeq_id());
24988  pid->SetGenbank().SetAccession(p_acc);
24991 
24992  if (is_wgs) {
24993  expected_errors.push_back(new CExpectedError("gb|" + n_acc + "|", eDiag_Error, "InconsistentMolInfoTechnique", "WGS accession should have Mol-info.tech of wgs"));
24994  }
24995  eval = validator.Validate(seh, options);
24996  // AddChromosomeNoLocation(expected_errors, entry);
24997  CheckErrors(*eval, expected_errors);
24998 
24999  CLEAR_ERRORS
25000 }
25001 
25002 
// TestNewAccessionAsInference: adds a misc feature carrying the inference
// qualifier "similar to DNA sequence:INSD:<acc>.1" and validates the entry.
// No expected error is added in the visible lines, i.e. the accession is
// expected to be accepted in an inference qualifier.
// NOTE(review): `expected_errors`, `eval`, `validator`, `seh` and `options`
// are not declared in the visible lines -- presumably set up by a line elided
// from this listing; confirm.
25003 void TestNewAccessionAsInference(const string& acc)
25004 {
25005  CRef<CSeq_entry> entry = BuildGoodSeq();
25006  CRef<CSeq_feat> misc = AddMiscFeature(entry);
25007  misc->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("inference", "similar to DNA sequence:INSD:" + acc + ".1")));
25008 
25010 
25011  eval = validator.Validate(seh, options);
25012  // AddChromosomeNoLocation(expected_errors, entry);
25013  CheckErrors(*eval, expected_errors);
25014 
25015  CLEAR_ERRORS
25016 }
25017 
25018 
25019 BOOST_AUTO_TEST_CASE(Test_SQD_4560)
25020 {
25021  // new accession formats
25022  TestNewAccessionOnNuc("AAAAAB010000001", false, true);
25023  TestNewAccessionOnNuc("AA12345678", false, false);
25024  TestNewAccessionOnNuc("EAA0000015", true, true);
25025  TestNewAccessionOnStandaloneProt("AAAAAB010000001", true, true);
25026  TestNewAccessionOnStandaloneProt("AA12345678", true, false);
25027  TestNewAccessionOnStandaloneProt("EAA0000015", false, true);
25028 
25029  TestNewAccessionOnNucProt("AAAAAB010000001", "EAA0000015", true);
25030  TestNewAccessionOnNucProt("AA12345678", "EAA0000015", false);
25031 
25032  TestNewAccessionAsInference("AAAAAB010000001");
25033  TestNewAccessionAsInference("AA12345678");
25034  TestNewAccessionAsInference("EAA0000015");
25035 }
25036 
25038 {
25039  BOOST_CHECK_EQUAL("unclassified sequences", validator::FixSpecificHost("unclassified sequences"));
25040 }
25041 
25042 
25044 {
25046  unit_test_util::SetTaxname(entry, "Phascolarctobacterium sp.");
25047  unit_test_util::SetTaxon(entry, 0);
25048  unit_test_util::SetTaxon(entry, 2049039);
25051  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Homo sapiens");
25054  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_metagenome_source, "human gut metagenome");
25056 
25058 
25059  eval = validator.Validate(seh, options);
25060  // AddChromosomeNoLocation(expected_errors, entry);
25061  CheckErrors(*eval, expected_errors);
25062 
25063  CLEAR_ERRORS
25064 
25066  eval = validator.Validate(seh, options);
25067  // AddChromosomeNoLocation(expected_errors, entry);
25068  CheckErrors(*eval, expected_errors);
25069 
25070  CLEAR_ERRORS
25071 }
25072 
25074 {
25076  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_serotype, "an innocuous value");
25078 
25079  // no errors, because not salmonella
25080  eval = validator.Validate(seh, options);
25081  CheckErrors(*eval, expected_errors);
25082 
25083  CLEAR_ERRORS
25084 
25085  // no salmonella errors because not first word
25086  unit_test_util::SetTaxname(entry, "Badforyou Salmonella");
25087  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrganismNotFound",
25088  "Organism not found in taxonomy database"));
25089  eval = validator.Validate(seh, options);
25090  CheckErrors(*eval, expected_errors);
25091 
25092  CLEAR_ERRORS
25093 
25094  unit_test_util::SetTaxname(entry, "Salmonella");
25095  eval = validator.Validate(seh, options);
25096 
25097  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
25098  "TaxonomyLookupProblem", "Organism name is 'Salmonella', taxonomy ID should be '590' but is '592768'"));
25099  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
25100  "TaxonomyIsSpeciesProblem", "Taxonomy lookup reports is_species_level FALSE"));
25101  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadOrgMod", "Salmonella organisms should use serovar instead of serotype."));
25102  CheckErrors(*eval, expected_errors);
25103 
25104  CLEAR_ERRORS
25105 
25106  unit_test_util::SetTaxname(entry, "Salmonella badforyou");
25107  eval = validator.Validate(seh, options);
25108  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrganismNotFound",
25109  "Organism not found in taxonomy database"));
25110  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadOrgMod", "Salmonella organisms should use serovar instead of serotype."));
25111  CheckErrors(*eval, expected_errors);
25112 
25113  CLEAR_ERRORS
25114 
25115  scope.RemoveTopLevelSeqEntry(seh);
25116  entry->SetSeq().SetId().push_back(unit_test_util::BuildRefSeqId());
25117  seh = scope.AddTopLevelSeqEntry(*entry);
25118  eval = validator.Validate(seh, options);
25119  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Warning, "OrganismNotFound",
25120  "Organism not found in taxonomy database"));
25121  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "BadOrgMod", "Salmonella organisms should use serovar instead of serotype."));
25122  CheckErrors(*eval, expected_errors);
25123 
25124  CLEAR_ERRORS
25125 
25126  // presence of serovar
25127  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_serovar, "different value");
25128  eval = validator.Validate(seh, options);
25129  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Warning, "OrganismNotFound",
25130  "Organism not found in taxonomy database"));
25131  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "BadOrgMod", "Salmonella organisms should use serovar instead of serotype."));
25132  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Warning, "BadOrgMod", "Salmonella organism name should contain the serovar value."));
25133  CheckErrors(*eval, expected_errors);
25134 
25135  CLEAR_ERRORS
25136 
25137  unit_test_util::SetTaxname(entry, "Salmonella badforyou different value");
25138  eval = validator.Validate(seh, options);
25139  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Warning, "OrganismNotFound",
25140  "Organism not found in taxonomy database"));
25141  expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "BadOrgMod", "Salmonella organisms should use serovar instead of serotype."));
25142  CheckErrors(*eval, expected_errors);
25143 
25144  CLEAR_ERRORS
25145 }
25146 
25147 
25149 {
25151  unit_test_util::SetTaxname(entry, "Streptococcus agalactiae NEM316");
25152  unit_test_util::SetTaxon(entry, 0);
25153  unit_test_util::SetTaxon(entry, 211110);
25154  unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_serovar, "an innocuous value");
25156 
25157  // no errors, because not salmonella
25158  eval = validator.Validate(seh, options);
25159  CheckErrors(*eval, expected_errors);
25160 
25161  CLEAR_ERRORS
25162 
25163  unit_test_util::SetTaxname(entry, "Salmonella badforyou");
25164  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrganismNotFound",
25165  "Organism not found in taxonomy database"));
25166  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadOrgMod", "Salmonella organism name should contain the serovar value."));
25167  eval = validator.Validate(seh, options);
25168  CheckErrors(*eval, expected_errors);
25169 
25170  CLEAR_ERRORS
25171 }
25172 
25173 
25174 
25175 BOOST_AUTO_TEST_CASE(Test_SEQ_RW_1753)
25176 {
25177  CRef<CSeq_annot> annot(new CSeq_annot());
25178  annot->SetData().SetLocs();
25179 
25180  auto pObjMgr = CObjectManager::GetInstance();
25181  CScope scope(*pObjMgr);
25182  auto sah = scope.AddSeq_annot(*annot);
25183 
25184  vector<CExpectedError*> expected_errors;
25185  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "AnnotLOCs",
25186  "Record contains Seq-annot.data.locs"));
25187 
25188  unsigned int options{0};
25189  auto pErrors = Ref(new CValidError());
25190  CValidator(*pObjMgr).Validate(sah, options, *pErrors);
25191  CheckErrors(*pErrors, expected_errors);
25192 
25193  CLEAR_ERRORS
25194 }
25195 
25196 #if 0
// Test_TM_897 (currently compiled out by the surrounding #if 0): points the
// NI_SERVICE_NAME_TAXON3 environment setting at "TaxService3v4test", sets the
// taxname to the misspelling "Salmonela enterica", and expects an
// OrganismNotFound warning whose message lists possible matches.
// NOTE(review): `env`, `expected_errors`, `eval`, `validator`, `seh` and
// `options` are not declared in the visible lines -- presumably set up by
// lines elided from this listing; confirm.
25197 BOOST_AUTO_TEST_CASE(Test_TM_897)
25198 {
25200  env.Set("NI_SERVICE_NAME_TAXON3", "TaxService3v4test");
25201 
25202  CRef<CSeq_entry> entry = BuildGoodSeq();
25203  SetTaxname(entry, "Salmonela enterica");
25205 
25206  expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
25207  "OrganismNotFound",
25208  "Organism not found. Possible matches|Salmonella enterica|Salmonella enterica V|Salmonella enterica subsp. V"));
25209 
25210  eval = validator.Validate(seh, options);
25211  CheckErrors(*eval, expected_errors);
25212 
25213 
25214  CLEAR_ERRORS
25215 }
25216 #endif
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eErr_SEQ_FEAT_NotSpliceConsensusDonor
@ eErr_SEQ_FEAT_InternalStop
@ eErr_SEQ_INST_StopInProtein
@ eErr_SEQ_FEAT_ExceptionProblem
int debug_mode
Definition: build_db.cpp:63
bool AddTerminalCodeBreak(CSeq_feat &cds, CScope &scope)
Definition: cds_fix.cpp:162
CAnnotdesc –.
Definition: Annotdesc.hpp:66
CArgs –.
Definition: ncbiargs.hpp:379
CAuthor –.
Definition: Author.hpp:59
bool RemoveLineageSourceNotes()
Definition: BioSource.cpp:1519
bool GetDisableStrainForwarding() const
Definition: BioSource.cpp:647
void RemoveCultureNotes(bool is_species_level=true)
Definition: BioSource.cpp:1428
void SetDisableStrainForwarding(bool val)
Definition: BioSource.cpp:621
CBioseq_Handle –.
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
CByte_graph –.
Definition: Byte_graph.hpp:66
static vector< CRef< CSeq_loc > > GetNonsenseIntrons(const CSeq_feat &feat, CScope &scope)
CCode_break –.
Definition: Code_break.hpp:66
static string NewFixCountry(const string &input, bool us_territories=false)
Definition: SubSource.cpp:4662
static string USAStateCleanup(const string &country)
Definition: SubSource.cpp:4654
static void LoadUSAExceptionMap(const TUsaExceptionMap &exceptions)
Definition: SubSource.cpp:4590
Definition: Date.hpp:53
@ ePrecision_day
Definition: Date.hpp:58
Definition: Dbtag.hpp:53
CDelta_seq –.
Definition: Delta_seq.hpp:66
bool Match(const CValidErrItem &err_item, bool ignore_severity=false)
CExpectedError(string accession, EDiagSev severity, string err_code, string err_msg)
void Print(void) const
static void PrintSeenError(const CValidErrItem &err_item)
void Test(const CValidErrItem &err_item)
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: gbloader.cpp:366
@Gb_qual.hpp User-defined methods of the data storage class.
Definition: Gb_qual.hpp:61
static void ParseExperiment(const string &orig, string &category, string &experiment, string &doi)
Definition: Gb_qual.cpp:83
static string BuildExperiment(const string &category, const string &experiment, const string &doi)
Definition: Gb_qual.cpp:114
list< CRef< CTaxon3_reply > > TReplies
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:244
CNcbiEnvironment –.
Definition: ncbienv.hpp:110
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
static bool FixStructuredVoucher(string &val, const string &voucher_type)
Definition: OrgMod.cpp:662
static string IsCultureCollectionValid(const string &culture_collection)
Definition: OrgMod.cpp:424
C_Name –.
Definition: OrgName_.hpp:98
static bool TrimJunk(string &seq)
static bool IsValid(const string &seq, char &bad_ch)
static bool Fixi(string &seq)
CPCRPrimer –.
Definition: PCRPrimer.hpp:66
CPCRReactionSet –.
CPCRReaction –.
Definition: PCRReaction.hpp:66
Definition: Pub.hpp:56
CScope –.
Definition: scope.hpp:92
CSeqFeatXref –.
Definition: SeqFeatXref.hpp:66
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
void SetDescr(CSeq_descr &value)
Definition: Seq_entry.cpp:134
bool IsSetDescr(void) const
Definition: Seq_entry.cpp:106
CSeq_feat_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
void AddQualifier(const string &qual_name, const string &qual_val)
Add a qualifier to this feature.
Definition: Seq_feat.cpp:291
void SetGeneXref(CGene_ref &value)
Definition: Seq_feat.cpp:192
bool AddSeqFeatXref(const CSeqFeatXref::TId &id)
Definition: Seq_feat.cpp:279
static TSeqPos Convert(const CSeq_data &in_seq, CSeq_data *out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx=0, TSeqPos uLength=0, bool bAmbig=false, Uint4 seed=17734276)
static string GetCollectionDateProblem(const string &date_string)
static bool NCBI_UseGeoLocNameForCountry(void)
Definition: SubSource.cpp:92
static string FixSexQualifierValue(const string &value)
Definition: SubSource.cpp:2525
static bool IsValidSexQualifierValue(const string &value)
Definition: SubSource.cpp:2486
@ eLatLonCountryErr_Value
Definition: SubSource.hpp:194
static string AutoFix(TSubtype subtype, const string &value)
Definition: SubSource.cpp:5075
static string ValidateLatLonCountry(const string &countryname, string &lat_lon, bool check_state, ELatLonCountryErr &errcode)
Definition: SubSource.cpp:2099
static string FixDateFormat(const string &orig_date)
Attempt to fix the format of the date. Returns a blank if the format of the date cannot be determined.
Definition: SubSource.cpp:618
static string CheckCellLine(const string &cell_line, const string &organism)
Definition: SubSource.cpp:2953
static void IsCorrectLatLonFormat(string lat_lon, bool &format_correct, bool &precision_correct, bool &lat_in_range, bool &lon_in_range, double &lat_value, double &lon_value)
Definition: SubSource.cpp:1235
static string FixAltitude(const string &value)
Definition: SubSource.cpp:2688
static void RemoveCultureNotes(string &value, bool is_species_level=true)
Definition: SubSource.cpp:5190
static void IsCorrectDateFormat(const string &date_string, bool &bad_format, bool &in_future)
Definition: SubSource.cpp:452
static void DetectDateFormat(const string &orig_date, bool &ambiguous, bool &day_first)
Definition: SubSource.cpp:1174
CT3Reply –.
Definition: T3Reply.hpp:66
CTaxElement –.
Definition: TaxElement.hpp:66
CConstRef< CSeq_feat > GetFeat(size_t num) const
vector< CRef< COrg_ref > > GetTaxonomyLookupRequest() const
bool AdjustOrgRefsForSpecificHosts(vector< CRef< COrg_ref > > org_refs)
CConstRef< CSeqdesc > GetDesc(size_t num) const
bool AdjustOrgRefsWithSpecificHostReply(vector< CRef< COrg_ref >> requests, const CTaxon3_reply &reply, vector< CRef< COrg_ref >> org_refs)
bool AdjustOrgRefsWithTaxLookupReply(const CTaxon3_reply &reply, vector< CRef< COrg_ref > > org_refs, string &error_message, bool use_error_orgrefs=false) const
vector< CRef< COrg_ref > > GetStrainLookupRequest()
string IncrementalSpecificHostMapUpdate(const vector< CRef< COrg_ref > > &input, const CTaxon3_reply &reply)
string IncrementalStrainMapUpdate(const vector< CRef< COrg_ref > > &input, const CTaxon3_reply &reply, TTaxId descTaxID=ZERO_TAX_ID)
vector< CRef< COrg_ref > > GetSpecificHostLookupRequest(bool for_fix)
void Init(const CSeq_entry &se)
CTaxon3_reply –.
virtual CRef< CTaxon3_reply > SendOrgRefList(const vector< CRef< COrg_ref > > &list, COrg_ref::fOrgref_parts result_parts=COrg_ref::eOrgref_default, fT3reply_parts t3result_parts=eT3reply_default)
Definition: taxon3.cpp:190
CTime –.
Definition: ncbitime.hpp:296
C_E –.
Definition: Title_.hpp:96
void SetObjectType(EObjectType obj_type)
@ eRefGeneTrackingStatus_INFERRED
void SetRefGeneTrackingStatus(ERefGeneTrackingStatus status)
@ eObjectType_RefGeneTracking
@ eObjectType_StructuredComment
@ eObjectType_ValidationSuppression
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
@ eParse_String
Add string even if all numbers.
Definition: User_object.hpp:61
static const string & ConvertSeverity(EDiagSev sev)
static const string & ConvertErrCode(unsigned int)
const string GetErrCode() const
EDiagSev GetSeverity() const
static void AddSuppression(CUser_object &user, CValidErrItem::TErrIndex err_code)
CRef< CValidError > Validate(const CSeq_entry &se, CScope *scope=nullptr, Uint4 options=0)
Definition: validator.cpp:100
@ eVal_remote_fetch
Definition: validator.hpp:86
@ eVal_indexer_version
Definition: validator.hpp:91
@ eVal_do_rubisco_test
Definition: validator.hpp:90
@ eVal_collect_locus_tags
Definition: validator.hpp:102
@ eVal_validate_id_set
Definition: validator.hpp:85
@ eVal_inference_accns
Definition: validator.hpp:93
@ eVal_val_exons
Definition: validator.hpp:81
@ eVal_need_isojta
Definition: validator.hpp:84
@ eVal_use_entrez
Definition: validator.hpp:92
@ eVal_non_ascii
Definition: validator.hpp:78
@ eVal_genome_submission
Definition: validator.hpp:98
@ eVal_locus_tag_general_match
Definition: validator.hpp:89
@ eVal_val_align
Definition: validator.hpp:80
@ eVal_far_fetch_cds_products
Definition: validator.hpp:88
@ eVal_latlon_check_state
Definition: validator.hpp:96
@ eVal_far_fetch_mrna_products
Definition: validator.hpp:87
@ eVal_seqsubmit_parent
Definition: validator.hpp:83
Definition: set.hpp:45
size_type size() const
Definition: set.hpp:132
static const int chunk_size
char value[7]
Definition: config.c:431
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
set< CBioseq_Handle > ListOrphanProteins(CSeq_entry_Handle seh, bool force_refseq=false)
Definition: dup_feats.cpp:142
set< CSeq_feat_Handle > GetDuplicateFeaturesForRemoval(CSeq_entry_Handle seh)
Definition: dup_feats.cpp:62
void SetGoTermPMID(CUser_field &field, int pmid)
size_t CountProcessGoTerms(const CSeq_feat &feat)
void ClearGoTermPMID(CUser_field &field)
void SetGoTermText(CUser_field &field, const string &val)
void AddProcessGoTerm(CSeq_feat &feat, CRef< CUser_field > field)
void ClearGoTermEvidence(CUser_field &field)
void SetGoTermId(CUser_field &field, const string &val)
void AddGoTermEvidence(CUser_field &field, const string &val)
bool RemoveDuplicateGoTerms(CSeq_feat &feat)
#define ENTREZ_ID_CONST(id)
Definition: ncbimisc.hpp:1099
#define TAX_ID_CONST(id)
Definition: ncbimisc.hpp:1112
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:285
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
#define GI_CONST(gi)
Definition: ncbimisc.hpp:1087
#define ZERO_GI
Definition: ncbimisc.hpp:1088
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
void SetDiagFilter(EDiagFilter what, const char *filter_str)
Set diagnostic filter.
Definition: ncbidiag.cpp:7670
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
@ eDiag_Info
Informational message.
Definition: ncbidiag.hpp:651
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
@ eDiag_Fatal
Fatal error – guarantees exit (or abort)
Definition: ncbidiag.hpp:655
@ eDiag_Critical
Critical error message.
Definition: ncbidiag.hpp:654
@ eDiagFilter_All
for all non-FATAL
Definition: ncbidiag.hpp:2531
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
ESerialDataFormat
Data file format.
Definition: serialdef.hpp:71
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
const string AsFastaString(void) const
Definition: Seq_id.cpp:2265
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
void SetMix(TMix &v)
Definition: Seq_loc.hpp:987
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
void SetPartialStart(bool val, ESeqLocExtremes ext)
set / remove e_Lim fuzz on start or stop (lt/gt - indicating partial interval)
Definition: Seq_loc.cpp:3280
void SetPartialStop(bool val, ESeqLocExtremes ext)
Definition: Seq_loc.cpp:3313
void SetNull(void)
Override all setters to incorporate cache invalidation.
Definition: Seq_loc.hpp:960
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
static CObjectOStream * Open(ESerialDataFormat format, CNcbiOstream &outStream, bool deleteOutStream)
Create serial object writer and attach it to an output stream.
Definition: objostr.cpp:126
ELocationInFrame IsLocationInFrame(const CSeq_feat_Handle &cds, const CSeq_loc &loc)
Determines whether location loc is in frame with coding region cds.
Definition: feature.cpp:3818
@ eLocationInFrame_InFrame
Definition: feature.hpp:532
@ eLocationInFrame_BadStart
Definition: feature.hpp:533
@ eLocationInFrame_BadStop
Definition: feature.hpp:534
@ eLocationInFrame_BadStartAndStop
Definition: feature.hpp:535
@ eLocationInFrame_NotIn
Definition: feature.hpp:536
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
static void Translate(const string &seq, string &prot, const CGenetic_code *code, bool include_stop=true, bool remove_trailing_X=false, bool *alt_start=NULL, bool is_5prime_complete=true, bool is_3prime_complete=true)
Translate a string using a specified genetic code.
Definition: sequence.cpp:4095
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function)
Definition: sequence.hpp:101
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
CSeq_feat_Handle GetSeq_featHandle(const CSeq_feat &feat, EMissing action=eMissing_Default)
Definition: scope.cpp:200
CSeq_annot_Handle AddSeq_annot(CSeq_annot &annot, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add Seq-annot, return its CSeq_annot_Handle.
Definition: scope.cpp:538
void RemoveTopLevelSeqEntry(const CTSE_Handle &entry)
Revoke TSE previously added using AddTopLevelSeqEntry() or AddBioseq().
Definition: scope.cpp:376
TSeq GetSeq(void) const
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define NCBI_UNUSED
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2887
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
Definition: ncbistr.hpp:5109
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5384
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
#define NCBITEST_DISABLE(test_name)
Unconditionally disable test case.
Definition: test_boost.hpp:906
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
static const char label[]
TKeywords & SetKeywords(void)
Assign a value to Keywords data member.
Definition: GB_block_.hpp:532
void SetBook(TBook &value)
Assign a value to Book data member.
Definition: Cit_proc_.cpp:61
void SetTitle(TTitle &value)
Assign a value to Title data member.
Definition: Cit_art_.cpp:210
void SetDate(TDate &value)
Assign a value to Date data member.
Definition: Cit_sub_.cpp:101
void SetTitle(TTitle &value)
Assign a value to Title data member.
Definition: Cit_book_.cpp:62
void SetName(TName &value)
Assign a value to Name data member.
Definition: Author_.cpp:81
void SetFrom(TFrom &value)
Assign a value to From data member.
Definition: Cit_art_.cpp:248
void SetAuthors(TAuthors &value)
Assign a value to Authors data member.
Definition: Cit_art_.cpp:227
void SetDate(TDate &value)
Assign a value to Date data member.
Definition: Cit_gen_.cpp:116
void SetAuthors(TAuthors &value)
Assign a value to Authors data member.
Definition: Cit_sub_.cpp:74
void SetCit(TCit &value)
Assign a value to Cit data member.
Definition: Cit_let_.cpp:70
void SetCit(const TCit &value)
Assign a value to Cit data member.
Definition: Cit_gen_.hpp:597
void SetAuthors(TAuthors &value)
Assign a value to Authors data member.
Definition: Cit_book_.cpp:93
void ResetDate(void)
Reset Date data member.
Definition: Cit_gen_.cpp:111
void SetTitle(const TTitle &value)
Assign a value to Title data member.
Definition: Cit_gen_.hpp:942
void SetAuthors(TAuthors &value)
Assign a value to Authors data member.
Definition: Cit_gen_.cpp:64
@ ePubStatus_ppublish
published in print by publisher
Definition: PubStatus_.hpp:69
@ ePubStatus_aheadofprint
epublish, but will be followed by print
Definition: PubStatus_.hpp:75
@ ePubStatus_epublish
published electronically by publisher
Definition: PubStatus_.hpp:68
@ ePrepub_in_press
accepted, not published
Definition: Imprint_.hpp:96
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
Definition: SubSource_.hpp:319
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
void SetSeq(const TSeq &value)
Assign a value to Seq data member.
Definition: PCRPrimer_.hpp:220
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
TSubtype GetSubtype(void) const
Get the Subtype member data.
Definition: SubSource_.hpp:310
void SetForward(TForward &value)
Assign a value to Forward data member.
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: SubSource_.hpp:291
void SetPcr_primers(TPcr_primers &value)
Assign a value to Pcr_primers data member.
Definition: BioSource_.cpp:124
void SetReverse(TReverse &value)
Assign a value to Reverse data member.
void SetName(const TName &value)
Assign a value to Name data member.
Definition: PCRPrimer_.hpp:255
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
void SetName(const TName &value)
Assign a value to Name data member.
Definition: SubSource_.hpp:359
const TName & GetName(void) const
Get the Name member data.
Definition: SubSource_.hpp:350
Tdata & Set(void)
Assign a value to data member.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
Definition: SubSource_.hpp:338
TSubtype & SetSubtype(void)
Assign a value to Subtype data member.
Definition: BioSource_.hpp:545
@ eSubtype_collection_date
DD-MMM-YYYY format.
Definition: SubSource_.hpp:114
@ eSubtype_fwd_primer_seq
sequence (possibly more than one; semicolon-separated)
Definition: SubSource_.hpp:117
@ eSubtype_lat_lon
+/- decimal degrees
Definition: SubSource_.hpp:113
@ eSubtype_collected_by
name of person who collected the sample
Definition: SubSource_.hpp:115
@ eSubtype_rev_primer_seq
sequence (possibly more than one; semicolon-separated)
Definition: SubSource_.hpp:118
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
@ eSubtype_endogenous_virus_name
Definition: SubSource_.hpp:109
@ eSubtype_identified_by
name of person who identified the sample
Definition: SubSource_.hpp:116
@ eOrigin_synthetic
purely synthetic
Definition: BioSource_.hpp:134
@ eOrigin_mut
artificially mutagenized
Definition: BioSource_.hpp:132
@ eOrigin_artificial
artificially engineered
Definition: BioSource_.hpp:133
void SetSeason(const TSeason &value)
Assign a value to Season data member.
Definition: Date_std_.hpp:569
void SetYear(TYear value)
Assign a value to Year data member.
Definition: Date_std_.hpp:435
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
void SetMonth(TMonth value)
Assign a value to Month data member.
Definition: Date_std_.hpp:482
TStd & SetStd(void)
Select the variant.
Definition: Date_.cpp:115
void SetDay(TDay value)
Assign a value to Day data member.
Definition: Date_std_.hpp:529
TData & SetData(void)
Assign a value to Data data member.
void SetLabel(TLabel &value)
Assign a value to Label data member.
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
void ResetMonth(void)
Reset Month data member.
Definition: Date_std_.hpp:466
void ResetDay(void)
Reset Day data member.
Definition: Date_std_.hpp:513
void SetType(TType &value)
Assign a value to Type data member.
void ResetData(void)
Reset Data data member.
void SetData(TData &value)
Assign a value to Data data member.
TYear GetYear(void) const
Get the Year member data.
Definition: Date_std_.hpp:426
TMonth GetMonth(void) const
Get the Month member data.
Definition: Date_std_.hpp:473
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
TDay GetDay(void) const
Get the Day member data.
Definition: Date_std_.hpp:520
const TStd & GetStd(void) const
Get the variant data.
Definition: Date_.cpp:109
@ eLim_gt
greater than
Definition: Int_fuzz_.hpp:211
@ eLim_unk
unknown
Definition: Int_fuzz_.hpp:210
@ eLim_tl
space to left of position
Definition: Int_fuzz_.hpp:214
@ eLim_tr
space to right of position
Definition: Int_fuzz_.hpp:213
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
THybrid & SetHybrid(void)
Select the variant.
Definition: OrgName_.cpp:155
TNamedhybrid & SetNamedhybrid(void)
Select the variant.
Definition: OrgName_.cpp:177
TVirus & SetVirus(void)
Select the variant.
Definition: OrgName_.hpp:734
TDb & SetDb(void)
Assign a value to Db data member.
Definition: Org_ref_.hpp:497
virtual void Reset(void)
Reset the whole object.
Definition: Org_ref_.cpp:99
Tdata & Set(void)
Assign a value to data member.
void SetGenus(const TGenus &value)
Assign a value to Genus data member.
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
Definition: Org_ref_.hpp:381
void SetSubspecies(const TSubspecies &value)
Assign a value to Subspecies data member.
void SetSpecies(const TSpecies &value)
Assign a value to Species data member.
Tdata & Set(void)
Assign a value to data member.
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
TPartial & SetPartial(void)
Select the variant.
Definition: OrgName_.cpp:199
TBinomial & SetBinomial(void)
Select the variant.
Definition: OrgName_.cpp:127
const TAttrib & GetAttrib(void) const
Get the Attrib member data.
Definition: OrgName_.hpp:792
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSubtype_gb_synonym
used by taxonomy database
Definition: OrgMod_.hpp:117
@ eSubtype_pathovar
Definition: OrgMod_.hpp:94
@ eSubtype_other
ASN5: old-name (254) will be added to next spec.
Definition: OrgMod_.hpp:125
@ eSubtype_sub_species
Definition: OrgMod_.hpp:105
@ eSubtype_nat_host
natural host of this specimen
Definition: OrgMod_.hpp:104
@ eSubtype_cultivar
Definition: OrgMod_.hpp:93
@ eSubtype_variety
Definition: OrgMod_.hpp:89
@ eSubtype_strain
Definition: OrgMod_.hpp:85
@ eSubtype_metagenome_source
Definition: OrgMod_.hpp:120
@ eSubtype_biovar
Definition: OrgMod_.hpp:96
@ eSubtype_old_name
Definition: OrgMod_.hpp:124
@ eSubtype_synonym
Definition: OrgMod_.hpp:111
@ eSubtype_type_material
Definition: OrgMod_.hpp:121
@ eSubtype_acronym
Definition: OrgMod_.hpp:102
@ eSubtype_specimen_voucher
Definition: OrgMod_.hpp:106
@ eSubtype_serotype
Definition: OrgMod_.hpp:90
@ eSubtype_serovar
Definition: OrgMod_.hpp:92
@ eSubtype_bio_material
Definition: OrgMod_.hpp:119
@ eSubtype_culture_collection
Definition: OrgMod_.hpp:118
@ eSubtype_forma_specialis
Definition: OrgMod_.hpp:109
@ eSubtype_old_lineage
Definition: OrgMod_.hpp:123
@ eSubtype_isolate
Definition: OrgMod_.hpp:100
EProcessed
processing status
Definition: Prot_ref_.hpp:95
@ eProcessed_signal_peptide
Definition: Prot_ref_.hpp:99
@ eProcessed_transit_peptide
Definition: Prot_ref_.hpp:100
TProc & SetProc(void)
Select the variant.
Definition: Pub_.cpp:305
TPmid & SetPmid(void)
Select the variant.
Definition: Pub_.hpp:690
TMuid & SetMuid(void)
Select the variant.
Definition: Pub_.hpp:615
TBook & SetBook(void)
Select the variant.
Definition: Pub_.cpp:283
TEquiv & SetEquiv(void)
Select the variant.
Definition: Pub_.cpp:393
TMan & SetMan(void)
Select the variant.
Definition: Pub_.cpp:371
TSub & SetSub(void)
Select the variant.
Definition: Pub_.cpp:195
TGen & SetGen(void)
Select the variant.
Definition: Pub_.cpp:173
TMedline & SetMedline(void)
Select the variant.
Definition: Pub_.cpp:217
TArticle & SetArticle(void)
Select the variant.
Definition: Pub_.cpp:239
void ResetSegs(void)
Reset Segs data member.
Definition: Seq_align_.cpp:301
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
void SetDim(TDim value)
Assign a value to Dim data member.
Definition: Seq_align_.hpp:865
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:818
void SetAa(TAa &value)
Assign a value to Aa data member.
TXref & SetXref(void)
Assign a value to Xref data member.
Definition: Seq_feat_.hpp:1314
void SetQual(const TQual &value)
Assign a value to Qual data member.
Definition: Gb_qual_.hpp:221
void ResetTitle(void)
Reset Title data member.
Definition: Seq_feat_.cpp:142
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetComment(const TComment &value)
Assign a value to Comment data member.
Definition: Seq_feat_.hpp:1058
void ResetExcept(void)
Reset Except data member.
Definition: Seq_feat_.hpp:1002
void SetCit(TCit &value)
Assign a value to Cit data member.
Definition: Seq_feat_.cpp:170
void SetPartial(TPartial value)
Assign a value to Partial data member.
Definition: Seq_feat_.hpp:971
void SetProduct(TProduct &value)
Assign a value to Product data member.
Definition: Seq_feat_.cpp:110
const TId & GetId(void) const
Get the Id member data.
Definition: Seq_feat_.hpp:904
void ResetExcept_text(void)
Reset Except_text data member.
Definition: Seq_feat_.cpp:194
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
void SetExcept(TExcept value)
Assign a value to Except data member.
Definition: Seq_feat_.hpp:1018
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_feat_.cpp:153
void SetTitle(const TTitle &value)
Assign a value to Title data member.
Definition: Seq_feat_.hpp:1181
void ResetPseudo(void)
Reset Pseudo data member.
Definition: Seq_feat_.hpp:1358
void SetId(TId &value)
Assign a value to Id data member.
Definition: Seq_feat_.cpp:73
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void ResetComment(void)
Reset Comment data member.
Definition: Seq_feat_.cpp:99
void SetLoc(TLoc &value)
Assign a value to Loc data member.
void SetVal(const TVal &value)
Assign a value to Val data member.
Definition: Gb_qual_.hpp:268
void SetPseudo(TPseudo value)
Assign a value to Pseudo data member.
Definition: Seq_feat_.hpp:1374
void SetExcept_text(const TExcept_text &value)
Assign a value to Except_text data member.
Definition: Seq_feat_.hpp:1414
void ResetProduct(void)
Reset Product data member.
Definition: Seq_feat_.cpp:105
void ResetXref(void)
Reset Xref data member.
Definition: Seq_feat_.cpp:182
TQual & SetQual(void)
Assign a value to Qual data member.
Definition: Seq_feat_.hpp:1153
void ResetQual(void)
Reset Qual data member.
Definition: Seq_feat_.cpp:136
@ eFrame_three
reading frame
Definition: Cdregion_.hpp:98
void SetSeqid(TSeqid value)
Assign a value to Seqid data member.
TGeneral & SetGeneral(void)
Select the variant.
Definition: Seq_id_.cpp:375
void SetMol(const TMol &value)
Assign a value to Mol data member.
TPatent & SetPatent(void)
Select the variant.
Definition: Seq_id_.cpp:331
TEmbl & SetEmbl(void)
Select the variant.
Definition: Seq_id_.cpp:265
TOther & SetOther(void)
Select the variant.
Definition: Seq_id_.cpp:353
const TId & GetId(void) const
Get the Id member data.
void SetId(TId value)
Assign a value to Id data member.
void SetAccession(const TAccession &value)
Assign a value to Accession data member.
TFrom GetFrom(void) const
Get the From member data.
TTpe & SetTpe(void)
Select the variant.
Definition: Seq_id_.cpp:485
TTpg & SetTpg(void)
Select the variant.
Definition: Seq_id_.cpp:463
TPir & SetPir(void)
Select the variant.
Definition: Seq_id_.cpp:287
TGi & SetGi(void)
Select the variant.
Definition: Seq_id_.hpp:896
TTpd & SetTpd(void)
Select the variant.
Definition: Seq_id_.cpp:507
TGibbmt & SetGibbmt(void)
Select the variant.
Definition: Seq_id_.hpp:821
TGpipe & SetGpipe(void)
Select the variant.
Definition: Seq_id_.cpp:529
TDdbj & SetDdbj(void)
Select the variant.
Definition: Seq_id_.cpp:397
void SetFuzz_to(TFuzz_to &value)
Assign a value to Fuzz_to data member.
void SetFuzz_from(TFuzz_from &value)
Assign a value to Fuzz_from data member.
TLocal & SetLocal(void)
Select the variant.
Definition: Seq_id_.cpp:199
TGiim & SetGiim(void)
Select the variant.
Definition: Seq_id_.cpp:221
void SetDb(const TDb &value)
Assign a value to Db data member.
TPrf & SetPrf(void)
Select the variant.
Definition: Seq_id_.cpp:419
TTo GetTo(void) const
Get the To member data.
TGibbsq & SetGibbsq(void)
Select the variant.
Definition: Seq_id_.hpp:794
TGenbank & SetGenbank(void)
Select the variant.
Definition: Seq_id_.cpp:243
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
TSwissprot & SetSwissprot(void)
Select the variant.
Definition: Seq_id_.cpp:309
void SetCit(TCit &value)
Assign a value to Cit data member.
void SetVersion(TVersion value)
Assign a value to Version data member.
TPdb & SetPdb(void)
Select the variant.
Definition: Seq_id_.cpp:441
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_both_rev
in reverse orientation
Definition: Na_strand_.hpp:69
@ eNa_strand_both
in forward orientation
Definition: Na_strand_.hpp:68
void SetMin(TMin value)
Assign a value to Min data member.
void SetTitle(const TTitle &value)
Assign a value to Title data member.
Definition: Seq_graph_.hpp:784
void SetNumval(TNumval value)
Assign a value to Numval data member.
TValues & SetValues(void)
Assign a value to Values data member.
void SetGraph(TGraph &value)
Assign a value to Graph data member.
Definition: Seq_graph_.cpp:250
void SetMax(TMax value)
Assign a value to Max data member.
void ResetLoc(void)
Reset Loc data member.
Definition: Seq_graph_.cpp:215
void ResetValues(void)
Reset Values data member.
Definition: Byte_graph_.cpp:50
void SetLoc(TLoc &value)
Assign a value to Loc data member.
Definition: Seq_graph_.cpp:224
TNumval GetNumval(void) const
Get the Numval member data.
void SetAxis(TAxis value)
Assign a value to Axis data member.
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
const TDescr & GetDescr(void) const
Get the Descr member data.
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TClass GetClass(void) const
Get the Class member data.
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
void ResetAnnot(void)
Reset Annot data member.
bool IsSetSeq_set(void) const
Check if a value has been assigned to Seq_set data member.
bool IsSetDescr(void) const
Check if a value has been assigned to Descr data member.
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
void SetClass(TClass value)
Assign a value to Class data member.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
@ eClass_equiv
a set of equivalent maps or seqs
@ eClass_pop_set
population study
@ eClass_phy_set
phylogenetic study
@ eClass_conset
constructed sequence + parts
@ eClass_pir
converted pir
@ eClass_eco_set
ecological sample study
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
@ eClass_gibb
geninfo backbone
@ eClass_gen_prod_set
genomic products, chrom+mRNA+protein
@ eClass_pdb_entry
a complete PDB entry
@ eClass_genbank
converted genbank
@ eClass_swissprot
converted SWISSPROT
@ eClass_segset
segmented sequence + parts
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
@ eClass_pub_set
all the seqs from a single publication
TModif & SetModif(void)
Select the variant.
Definition: Seqdesc_.hpp:972
virtual void Reset(void)
Reset the whole object.
Definition: Seq_gap_.cpp:86
void SetCompleteness(TCompleteness value)
Assign a value to Completeness data member.
Definition: MolInfo_.hpp:600
void SetLength(TLength value)
Assign a value to Length data member.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TMethod & SetMethod(void)
Select the variant.
Definition: Seqdesc_.hpp:992
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
TPir & SetPir(void)
Select the variant.
Definition: Seqdesc_.cpp:318
void ResetId(void)
Reset Id data member.
Definition: Bioseq_.cpp:54
void SetPub(TPub &value)
Assign a value to Pub data member.
Definition: Pubdesc_.cpp:72
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
TMol_type & SetMol_type(void)
Select the variant.
Definition: Seqdesc_.hpp:945
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
TPub & SetPub(void)
Select the variant.
Definition: Seqdesc_.cpp:362
TPrf & SetPrf(void)
Select the variant.
Definition: Seqdesc_.cpp:522
TOrg & SetOrg(void)
Select the variant.
Definition: Seqdesc_.cpp:246
void SetDesc(TDesc &value)
Assign a value to Desc data member.
Definition: Seq_annot_.cpp:223
bool IsSource(void) const
Check if variant Source is selected.
Definition: Seqdesc_.hpp:1190
TGenbank & SetGenbank(void)
Select the variant.
Definition: Seqdesc_.cpp:340
TPdb & SetPdb(void)
Select the variant.
Definition: Seqdesc_.cpp:544
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
void ResetAnnot(void)
Reset Annot data member.
Definition: Bioseq_.cpp:91
TUser & SetUser(void)
Select the variant.
Definition: Annotdesc_.cpp:190
TSp & SetSp(void)
Select the variant.
Definition: Seqdesc_.cpp:412
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_gap_.hpp:291
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
TName & SetName(void)
Select the variant.
Definition: Seqdesc_.hpp:1019
TComment & SetComment(void)
Select the variant.
Definition: Seqdesc_.hpp:1065
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
void ResetLinkage(void)
Reset Linkage data member.
Definition: Seq_gap_.hpp:322
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
bool IsSetDescr(void) const
Descriptors. Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
void SetBiomol(TBiomol value)
Assign a value to Biomol data member.
Definition: MolInfo_.hpp:453
bool IsPub(void) const
Check if variant Pub is selected.
Definition: Seqdesc_.hpp:1096
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
void ResetLinkage_evidence(void)
Reset Linkage_evidence data member.
Definition: Seq_gap_.cpp:80
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
TEmbl & SetEmbl(void)
Select the variant.
Definition: Seqdesc_.cpp:456
void SetComment(const TComment &value)
Assign a value to Comment data member.
Definition: Pubdesc_.hpp:994
TLinkage_evidence & SetLinkage_evidence(void)
Assign a value to Linkage_evidence data member.
Definition: Seq_gap_.hpp:375
void SetLinkage(TLinkage value)
Assign a value to Linkage data member.
Definition: Seq_gap_.hpp:338
void ResetType(void)
Reset Type data member.
Definition: Seq_gap_.hpp:275
void SetTech(TTech value)
Assign a value to Tech data member.
Definition: MolInfo_.hpp:503
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
TMolinfo & SetMolinfo(void)
Select the variant.
Definition: Seqdesc_.cpp:594
TCreate_date & SetCreate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:478
TUpdate_date & SetUpdate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:500
TRegion & SetRegion(void)
Select the variant.
Definition: Seqdesc_.hpp:1115
@ eGIBB_method_other
@ eRepr_const
constructed sequence
Definition: Seq_inst_.hpp:96
@ eRepr_ref
reference to another sequence
Definition: Seq_inst_.hpp:97
@ eRepr_seg
segmented sequence
Definition: Seq_inst_.hpp:95
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_map
ordered map of any kind
Definition: Seq_inst_.hpp:99
@ eRepr_consen
consensus sequence or pattern
Definition: Seq_inst_.hpp:98
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eRepr_virtual
no seq data
Definition: Seq_inst_.hpp:93
@ eRepr_not_set
empty
Definition: Seq_inst_.hpp:92
@ eCompleteness_unknown
Definition: MolInfo_.hpp:155
@ eCompleteness_complete
complete biological entity
Definition: MolInfo_.hpp:156
@ eCompleteness_no_left
missing 5' or NH3 end
Definition: MolInfo_.hpp:158
@ eCompleteness_partial
partial but no details given
Definition: MolInfo_.hpp:157
@ eCompleteness_no_right
missing 3' or COOH end
Definition: MolInfo_.hpp:159
@ eCompleteness_no_ends
missing both ends
Definition: MolInfo_.hpp:160
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_physmap
from physical mapping techniques
Definition: MolInfo_.hpp:129
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_both
concept transl. w/ partial pept. seq.
Definition: MolInfo_.hpp:133
@ eTech_targeted
targeted locus sets/studies
Definition: MolInfo_.hpp:147
@ eTech_seq_pept_homol
sequenced peptide, ordered by homology
Definition: MolInfo_.hpp:135
@ eTech_composite_wgs_htgs
composite of WGS and HTGS
Definition: MolInfo_.hpp:145
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_seq_pept_overlap
sequenced peptide, ordered by overlap
Definition: MolInfo_.hpp:134
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_concept_trans
conceptual translation
Definition: MolInfo_.hpp:131
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_standard
standard sequencing
Definition: MolInfo_.hpp:124
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_seq_pept
peptide was sequenced
Definition: MolInfo_.hpp:132
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_barcode
barcode of life project
Definition: MolInfo_.hpp:144
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_derived
derived from other data, not a primary entity
Definition: MolInfo_.hpp:130
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ eTech_concept_trans_a
conceptual transl. supplied by author
Definition: MolInfo_.hpp:136
@ eTech_genemap
from genetic mapping techniques
Definition: MolInfo_.hpp:128
@ e_Ncbi2na
2 bit nucleic acid code
Definition: Seq_data_.hpp:106
@ e_Ncbi4na
4 bit nucleic acid code
Definition: Seq_data_.hpp:107
@ eBiomol_pre_RNA
precursor RNA of any sort really
Definition: MolInfo_.hpp:102
@ eBiomol_cRNA
viral RNA genome copy intermediate
Definition: MolInfo_.hpp:111
@ eBiomol_snoRNA
small nucleolar RNA
Definition: MolInfo_.hpp:112
@ eBiomol_genomic_mRNA
reported a mix of genomic and cdna sequence
Definition: MolInfo_.hpp:110
@ eBiomol_transcribed_RNA
transcribed RNA other than existing classes
Definition: MolInfo_.hpp:113
@ eBiomol_other_genetic
other genetic material
Definition: MolInfo_.hpp:109
@ eGIBB_mod_no_right
missing right end (3' or COOH)
Definition: GIBB_mod_.hpp:82
@ eGIBB_mod_complete
Definition: GIBB_mod_.hpp:76
@ eGIBB_mod_cyanelle
Definition: GIBB_mod_.hpp:72
@ eGIBB_mod_mitochondrial
Definition: GIBB_mod_.hpp:69
@ eGIBB_mod_other
Definition: GIBB_mod_.hpp:92
@ eGIBB_mod_dna
Definition: GIBB_mod_.hpp:65
@ eGIBB_mod_rna
Definition: GIBB_mod_.hpp:66
@ eGIBB_mod_partial
Definition: GIBB_mod_.hpp:75
@ eGIBB_mod_no_left
missing left end (5' for na, NH2 for aa)
Definition: GIBB_mod_.hpp:81
@ e_Org
if all from one organism
Definition: Seqdesc_.hpp:116
@ e_Pub
a reference to the publication
Definition: Seqdesc_.hpp:122
@ e_Mol_type
type of molecule
Definition: Seqdesc_.hpp:111
@ e_Method
sequencing method
Definition: Seqdesc_.hpp:113
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Modif
modifiers
Definition: Seqdesc_.hpp:112
@ e_Title
a title for this sequence
Definition: Seqdesc_.hpp:115
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
@ eType_fragment
Deprecated. Used only for AGP 1.1.
Definition: Seq_gap_.hpp:90
@ eTopology_tandem
some part of tandem repeat
Definition: Seq_inst_.hpp:125
@ eMol_not_set
> cdna = rna
Definition: Seq_inst_.hpp:109
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
@ eStrand_other
default ds for DNA, ss for RNA, pept
Definition: Seq_inst_.hpp:138
@ eStrand_ds
double strand
Definition: Seq_inst_.hpp:136
@ eStrand_ss
single strand
Definition: Seq_inst_.hpp:135
@ eGIBB_mol_snRNA
Definition: GIBB_mol_.hpp:71
@ eGIBB_mol_pre_mRNA
precursor RNA of any sort really
Definition: GIBB_mol_.hpp:67
@ eGIBB_mol_mRNA
Definition: GIBB_mol_.hpp:68
@ eGIBB_mol_unknown
Definition: GIBB_mol_.hpp:65
@ eGIBB_mol_genomic_mRNA
reported a mix of genomic and cdna sequence
Definition: GIBB_mol_.hpp:75
@ eGIBB_mol_other_genetic
other genetic material
Definition: GIBB_mol_.hpp:74
@ eGIBB_mol_other
Definition: GIBB_mol_.hpp:76
@ eGIBB_mol_tRNA
Definition: GIBB_mol_.hpp:70
@ eGIBB_mol_rRNA
Definition: GIBB_mol_.hpp:69
@ eGIBB_mol_peptide
Definition: GIBB_mol_.hpp:73
@ eGIBB_mol_scRNA
Definition: GIBB_mol_.hpp:72
@ eGIBB_mol_genomic
Definition: GIBB_mol_.hpp:66
void SetSub(TSub &value)
Assign a value to Sub data member.
void SetData(TData &value)
Assign a value to Data data member.
void SetLevel(TLevel value)
Assign a value to Level data member.
Definition: T3Error_.hpp:363
void SetMessage(const TMessage &value)
Assign a value to Message data member.
Definition: T3Error_.hpp:403
TError & SetError(void)
Select the variant.
Definition: T3Reply_.cpp:108
const TAccnver & GetAccnver(void) const
Get the Accnver member data.
const TMsg & GetMsg(void) const
Get the Msg member data.
unsigned int TErrIndex
where both are integers</td><td></td></tr><tr><td>tse</td><td>optional</td><td>String</td><td class="description">TSE option controls what blob is orig...
int i
yy_size_t n
int len
static void text(MDB_val *v)
Definition: mdb_dump.c:62
const struct ncbi::grid::netcache::search::fields::KEY key
static bool Equals(const CVariation::TPlacements &p1, const CVariation::TPlacements &p2)
Defines to provide correct exporting from DLLs in some configurations.
EIPRangeType t
Definition: ncbi_localip.c:101
const char * tag
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int toupper(Uchar c)
Definition: ncbictype.hpp:73
int islower(Uchar c)
Definition: ncbictype.hpp:66
int isupper(Uchar c)
Definition: ncbictype.hpp:70
T min(T x_, T y_)
static Format format
Definition: njn_ioutil.cpp:53
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
The Object manager core.
bool IsDateInPast(const CDate &date)
Definition: utilities.cpp:826
bool HasNoStop(const CSeq_feat &feat, CScope *scope)
Definition: utilities.cpp:2673
EBioseqEndIsType
Definition: utilities.hpp:156
@ eBioseqEndIsType_Last
Definition: utilities.hpp:158
@ eBioseqEndIsType_None
Definition: utilities.hpp:157
@ eBioseqEndIsType_All
Definition: utilities.hpp:159
void CheckBioseqEndsForNAndGap(const CBioseq_Handle &bsh, EBioseqEndIsType &begin_n, EBioseqEndIsType &begin_gap, EBioseqEndIsType &end_n, EBioseqEndIsType &end_gap, bool &begin_ambig, bool &end_ambig)
Definition: utilities.cpp:1422
bool DoesFeatureHaveUnnecessaryException(const CSeq_feat &feat, CScope &scope)
Definition: utilities.cpp:2939
bool HasInternalStop(const CSeq_feat &feat, CScope &scope, bool ignore_exceptions)
Definition: utilities.cpp:2416
bool IsLikelyTaxname(const string &val)
Definition: utilities.cpp:2150
string FixSpecificHost(const string &host)
Returns the corrected specific host if the specific host is invalid and can be corrected; returns an ...
Definition: utilities.cpp:2189
bool IsSpecificHostValid(const string &host, string &error_msg)
Returns true, with error_msg empty, if the specific host is valid; returns true and error_msg will b...
Definition: utilities.cpp:2182
bool HasStopInProtein(const CSeq_feat &feat, CScope &scope)
Definition: utilities.cpp:2519
bool HasBadStartCodon(const CSeq_feat &feat, CScope &scope, bool ignore_exceptions)
Definition: utilities.cpp:2362
static const char * expected[]
Definition: bcp.c:42
static char tmp[2048]
Definition: utf8.c:42
int offset
Definition: replacements.h:160
#define FOR_EACH_ORGMOD_ON_BIOSOURCE(Itr, Var)
FOR_EACH_ORGMOD_ON_BIOSOURCE EDIT_EACH_ORGMOD_ON_BIOSOURCE.
#define FOR_EACH_SEQENTRY_ON_SEQSET(Itr, Var)
FOR_EACH_SEQENTRY_ON_SEQSET EDIT_EACH_SEQENTRY_ON_SEQSET.
CRef< objects::CObjectManager > om
static const char * str(char *buf, int n)
Definition: stats.c:84
Definition: inftrees.h:24
Definition: type.c:6
Utility stuff for more convenient using of Boost.Test library.
else result
Definition: token2.c:20
static HENV env
Definition: transaction2.c:38
#define STANDARD_SETUP
void SetSynthetic_construct(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeq_entry > BuildGenProdSetNucProtSet(CRef< objects::CSeq_id > nuc_id, CRef< objects::CSeq_id > prot_id)
CRef< objects::CPub > BuildGoodCitGenPub(CRef< objects::CAuthor > author, int serial_number)
CRef< objects::CSeq_feat > MakeGeneForFeature(CRef< objects::CSeq_feat > feat)
CRef< objects::CAuthor > BuildGoodAuthor()
void SetDrosophila_melanogaster(CRef< objects::CSeq_entry > entry)
void SetTaxon(objects::CBioSource &src, size_t taxon)
void SetSubSource(objects::CBioSource &src, objects::CSubSource::TSubtype subtype, string val)
CRef< objects::CSeq_annot > BuildGoodGraphAnnot(string id)
void SetChromosome(objects::CBioSource &src, string chromosome)
void SetTech(CRef< objects::CSeq_entry > entry, objects::CMolInfo::TTech tech)
void ChangeNucId(CRef< objects::CSeq_entry > np_set, CRef< objects::CSeq_id > id)
CRef< objects::CSeq_feat > AddProtFeat(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeq_id > BuildRefSeqId(void)
CRef< objects::CPub > BuildGoodArticlePub()
void SetFocus(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeq_entry > GetNucProtSetFromGenProdSet(CRef< objects::CSeq_entry > entry)
void SetGenome(CRef< objects::CSeq_entry > entry, objects::CBioSource::TGenome genome)
void AddToDeltaSeq(CRef< objects::CSeq_entry > entry, string seq)
void SetDiv(CRef< objects::CSeq_entry > entry, string div)
void ChangeProtId(CRef< objects::CSeq_entry > np_set, CRef< objects::CSeq_id > id)
CRef< objects::CSeq_entry > MakeProteinForGoodNucProtSet(string id)
CRef< objects::CSeq_feat > GetCDSFromGoodNucProtSet(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeq_align > BuildGoodAlign()
void SetSebaea_microphylla(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeqdesc > BuildGoodPubSeqdesc()
CRef< objects::CPub > BuildGoodCitSubPub()
CRef< objects::CSeq_entry > BuildGoodProtSeq(void)
void ResetOrgname(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeq_feat > GetCDSFromGenProdSet(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeq_annot > AddFeat(CRef< objects::CSeq_feat > feat, CRef< objects::CSeq_entry > entry)
void SetTaxname(CRef< objects::CSeq_entry > entry, string taxname)
CRef< objects::CSeq_feat > GetmRNAFromGenProdSet(CRef< objects::CSeq_entry > entry)
void RetranslateCdsForNucProtSet(CRef< objects::CSeq_entry > entry, objects::CScope &scope)
void ChangeNucProtSetProteinId(CRef< objects::CSeq_entry > entry, CRef< objects::CSeq_id > id)
void SetCompleteness(CRef< objects::CSeq_entry > entry, objects::CMolInfo::TCompleteness completeness)
void SetNucProtSetPartials(CRef< objects::CSeq_entry > entry, bool partial5, bool partial3)
void SetOrigin(CRef< objects::CSeq_entry > entry, objects::CBioSource::TOrigin origin)
CRef< objects::CSeq_entry > BuildGoodSeq(void)
CRef< objects::CSeq_feat > MakeIntronForMixLoc(CRef< objects::CSeq_id > id)
void SetSpliceForMixLoc(objects::CBioseq &seq)
CRef< objects::CSeq_feat > AddMiscFeature(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeq_entry > BuildGoodEcoSet()
void SetTransgenic(objects::CBioSource &src, bool do_set)
void ChangeId(CRef< objects::CSeq_annot > annot, CRef< objects::CSeq_id > id)
void MakeNucProtSet3Partial(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeq_feat > GetProtFeatFromGoodNucProtSet(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeq_entry > GetProteinSequenceFromGoodNucProtSet(CRef< objects::CSeq_entry > entry)
void RemoveDescriptorType(CRef< objects::CSeq_entry > entry, objects::CSeqdesc::E_Choice desc_choice)
void SetBiomol(CRef< objects::CSeq_entry > entry, objects::CMolInfo::TBiomol biomol)
CRef< objects::CSeq_entry > BuildGoodNucProtSet(void)
CRef< objects::CSeq_feat > MakemRNAForCDS(CRef< objects::CSeq_feat > feat)
void SetOrgMod(objects::CBioSource &src, objects::COrgMod::TSubtype subtype, string val)
CRef< objects::CSeq_entry > GetNucleotideSequenceFromGoodNucProtSet(CRef< objects::CSeq_entry > entry)
void ClearFocus(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeq_loc > MakeMixLoc(CRef< objects::CSeq_id > id)
void SetNucProtSetProductName(CRef< objects::CSeq_entry > entry, string new_name)
void AddGoodPub(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeq_feat > MakeCDSForGoodNucProtSet(const string &nuc_id, const string &prot_id)
void AddGoodSource(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeq_feat > BuildtRNA(CRef< objects::CSeq_id > id)
void SetCommon(CRef< objects::CSeq_entry > entry, string common)
void RevComp(objects::CBioseq &bioseq)
void SetGcode(CRef< objects::CSeq_entry > entry, objects::COrgName::TGcode gcode)
CRef< objects::CSeq_feat > BuildGoodtRNA(CRef< objects::CSeq_id > id)
void ChangeNucProtSetNucId(CRef< objects::CSeq_entry > entry, CRef< objects::CSeq_id > id)
void AdjustProtFeatForNucProtSet(CRef< objects::CSeq_entry > entry)
void RemoveDeltaSeqGaps(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeq_entry > BuildGoodDeltaSeq(void)
CRef< objects::CSeq_feat > AddGoodSourceFeature(CRef< objects::CSeq_entry > entry)
CRef< objects::CSeq_entry > GetGenomicFromGenProdSet(CRef< objects::CSeq_entry > entry)
void SetDbxref(objects::CBioSource &src, string db, objects::CObject_id::TId id)
CRef< objects::CSeq_entry > BuildGoodGenProdSet()
void SetLineage(CRef< objects::CSeq_entry > entry, string lineage)
void RemoveDbxref(objects::CBioSource &src, string db, objects::CObject_id::TId id)
void TestRepliconForbiddenWords(CSubSource::ESubtype subtype, bool expect_errs)
static CRef< CSeq_align > BuildSetAlign(CRef< CSeq_entry > entry)
void AddYear(CDate &add_date)
void TestOneReplicon(CSubSource::ESubtype subtype, const string &val, const string &err_code, EDiagSev sev, const string &msg)
void TestSpecificHostNoError(const string &host)
void TestUTRPair(bool add_gene, bool is_minus)
#define INTERNAL_STOP_ERR
void TestOneLatLonCountry(const string &country, const string &lat_lon, const string &error, bool use_state=false, const string &err_code="LatLonCountry")
USING_SCOPE(objects)
void AddOrgmodDescriptor(CRef< CSeq_entry > entry, const string &val, COrgMod::ESubtype subtype)
void TestNewAccessionOnNucProt(const string &n_acc, const string &p_acc, bool is_wgs)
static void SetErrorsAccessions(vector< CExpectedError * > &expected_errors, string accession)
CRef< CSeq_submit > MakeGeneious()
static CRef< CSeq_entry > BuildGenProdSetBigNucProtSet(CRef< CSeq_id > nuc_id, CRef< CSeq_id > prot_id)
static bool s_debugMode
void TestOnePlasmid(const string &plasmid_name, bool expect_error)
NCBITEST_INIT_TREE()
static void AddCDSAndProtForBigGoodNucProtSet(CRef< CSeq_entry > entry, string nuc_id, string prot_id, TSeqPos offset)
#define NO_STOP_ERR
void CheckUnbalancedParenthesesSubSource(CSubSource::TSubtype subtype, const string &val)
bool s_ArePrimersUnique(const CPCRReactionSet &rset)
CRef< CSeq_entry > s_BuildBadEcNumberEntry()
void CheckStrings(const vector< string > &seen, const vector< string > &expected)
NCBITEST_INIT_CMDLINE(arg_desc)
void AddOrgmodFeat(CRef< CSeq_entry > entry, const string &val, COrgMod::ESubtype subtype)
void TestAlwaysGoodReplicon(const string &val)
#define START_CODON_AND_INT_STOP_ERR
void AddOrgmod(COrg_ref &org, const string &val, COrgMod::ESubtype subtype)
CRef< CSeq_feat > MakeGeneOntologyFeat(CRef< CUser_field > term1, CRef< CUser_field > term2)
void AddSgmlError(vector< CExpectedError * > &expected_errors, const string &valtype, const string &val)
static bool OrgModHasOtherRules(COrgMod::TSubtype subtype)
static void s_USAStateTest(string before, string after, CCountries::EStateCleanup expected)
void TestOneMiscPartial(CRef< CSeq_entry > entry, TSeqPos good_start, TSeqPos bad_start, TSeqPos good_stop, TSeqPos bad_stop, bool is_mrna)
void s_AddGeneralAndLocal(CBioseq &seq)
void CheckLocalId(const string &id, const string &badchar)
void SetUpMiscForPartialTest(CSeq_feat &feat, TSeqPos start, TSeqPos stop, bool pseudo)
void TestGoodNucId(const string &id_str)
static void AddRefGeneTrackingUserObject(CRef< CSeq_entry > entry)
void AddStrsField(CUser_object &user, const string &label, const string &val)
static CRef< CSeq_entry > BuildBigGoodNucProtSet(void)
void AddMonth(CDate &add_date)
static void MakeBadSeasonDate(CDate &date)
vector< pair< string, string > > THostStringsVector
CRef< CUser_field > MkField(const string &label, const string &val)
void TestOneOtherAcc(CRef< CSeq_id > other_acc, bool id_change, bool conflict, bool need_hist=false)
#define NO_SRC_ERR
void WriteErrors(const CValidError &eval, bool debug_mode)
void AddCdregionToSmallGenomeSet(CRef< CSeq_entry > entry, size_t cdr1_num, size_t cdr2_num, size_t cdr_pos, size_t p_pos)
void ChangeErrorAcc(vector< CExpectedError * > expected_errors, const string &acc)
static CRef< CSeq_entry > BuildGapFuzz100DeltaSeq(void)
#define test_undesired_protein_name(name)
void AddDay(CDate &add_date)
void TestAlwaysBadReplicon(const string &val)
void TestMultipleEquivBioSources(const string &lineage, TSeqPos first_end, TSeqPos second_start, bool expected)
void CheckGeneOntologyTermNotDuplicate(CRef< CSeq_feat > feat)
#define PROT_LEN_ERR
void CheckOneSpecificHost(const string &orig, const string &newval)
static void SetRefGeneTrackingStatus(CRef< CSeq_entry > entry, string status)
void TestDeltaTechAllowed(CMolInfo::TTech tech)
static void SetFeatureLocationBond(CRef< CSeq_feat > feat, string id, TSeqPos pt1, TSeqPos pt2)
void ShowOrgRef(const COrg_ref &org)
void TestConsultRequired(const string &taxname)
void TestNewAccessionOnStandaloneProt(const string &accession, bool is_nuc_acc, bool is_wgs)
static void AddGenbankKeyword(CRef< CSeq_entry > entry, string keyword)
void AdjustGap(CSeq_gap &gap, CSeq_gap::EType gap_type, bool is_linked, vector< CLinkage_evidence::EType > linkage_evidence)
static void AddTpaAssemblyUserObject(CRef< CSeq_entry > entry)
static CRef< CSeq_graph > BuildGoodByteGraph(CRef< CSeq_entry > entry, TSeqPos offset=0, TSeqPos len=kInvalidSeqPos)
static CRef< CSeq_entry > BuildGoodSpliceNucProtSet()
void TestGoodProtId(const string &id_str)
void CheckMiscPartialErrors(CRef< CSeq_entry > entry, bool expect_bad_5, bool expect_bad_3)
void TestStartGapSeg(CMolInfo::TTech tech)
void TestDeltaTechNotAllowed(CMolInfo::TTech tech)
static CRef< CSeq_entry > BuildGenProdSetWithBigProduct()
void CheckHost(const CBioseq &seq, const string &host)
#define TESTWGS(seh, entry)
#define EXCEPTION_PROBLEM_ERR
static string MakeWrongCap(const string &str)
void TestBulkSpecificHostFixList(const THostStringsVector &test_values)
void AddChromosomeNoLocation(vector< CExpectedError * > &expected_errors, const string &id)
static CRef< CSeq_entry > MakeGps(CRef< CSeq_entry > member)
BOOST_FIXTURE_TEST_CASE(Test_SEQ_INST_BadSeqIdFormat, CGenBankFixture)
void CheckErrors(const CValidError &eval, vector< CExpectedError * > &expected_errors)
void MakeLeft(CSeq_loc &loc)
void AddGeneticCode(CSeq_feat &cds, CGenetic_code::C_E::TId code_id)
void TestOneStrain(const string &taxname, const string &strain, const string &lineage, TTaxId taxID, bool expect_err)
void TestOneGeneralSeqId(const string &db, const string &tag, const string &errmsg)
void TestOverlappingRNAFeatures(const CSeq_loc &loc1, const CSeq_loc &loc2, bool expect_err)
void TestNewAccessionAsInference(const string &acc)
void TestOneLongGeneral(bool emb, bool err)
void TestBadProtId(const string &id_str)
CRef< CTaxon3_reply > s_CreateReplyWithMessage(const string &message)
static CRef< CSeq_align > BuildSetDendiagAlign(CRef< CSeq_entry > entry)
NCBITEST_AUTO_INIT()
CRef< CSeq_id > MakeSmallGenomeSetNucId(size_t num)
#define TESTPOPPHYMUTECO(seh, entry)
static void SetTitle(CRef< CSeq_entry > entry, string title)
#define test_gene_syn(name)
void CheckUnbalancedParenthesesOrgMod(COrgMod::TSubtype subtype, const string &val)
#define NO_PUB_ERR
CRef< CSeq_entry > BuildSmallGenomeSet(size_t num_np)
void TestRepliconTaxname(CSubSource::ESubtype subtype, bool expect_errs)
static void ChangeGoodNucProtSetIdToGenbankName(CRef< CSeq_entry > entry, string name)
void CheckGeneOntologyTermDuplicate(CRef< CSeq_feat > feat)
#define NO_SUB_ERR
void AddStrainDescriptor(CSeq_entry &entry, const string &taxname, const string &strain, const string &lineage, TTaxId taxID)
BOOST_AUTO_TEST_CASE(Test_Descr_MissingKeyword)
static bool SubSourceHasOtherRules(CSubSource::TSubtype subtype)
void MakeRight(CSeq_loc &loc, TSeqPos stop)
static bool IsProteinTech(CMolInfo::TTech tech)
const std::string sc_TestEntryCollidingLocusTags
static NCBI_UNUSED string ToAsn1(const CRef< CSeq_entry > &entry)
CRef< CUser_field > MakeStructuredCommentField(const string &label, const string &value)
void CreateReciprocalLinks(CSeq_feat &f1, CSeq_feat &f2)
static CRef< CUser_field > MakeGoTerm(string text="something", string evidence="some evidence")
void TestNewAccessionOnNuc(const string &accession, bool is_prot_acc, bool is_wgs)
#define STANDARD_SETUP_WITH_MOCK_TAXON(replies)
#define CLEAR_ERRORS
#define STANDARD_SETUP_NAME(entry_name)
void g_IgnoreDataFile(const string &pattern, bool do_ignore=true)
Ignore (or stop ignoring, depending on do_ignore) NCBI application data files matching the given patt...
Definition: util_misc.cpp:182
@ eSubmitterFormatErrorGroup_ConsensusSplice
static bool ambig(char c)
Modified on Wed Mar 27 11:20:58 2024 by modify_doxy.py rev. 669887