NCBI C++ ToolKit
agpconvert.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: agpconvert.cpp 92149 2020-12-22 17:10:32Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Josh Cherry, Michael Kornbluh
27  *
28  * File Description:
29  * Read an AGP file, build Seq-entry's or Seq-submit's,
30  * and do some validation. Wrapper over CAgpConverter.
31  *
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbienv.hpp>
37 #include <corelib/ncbiargs.hpp>
38 #include <corelib/ncbifile.hpp>
39 
41 
42 #include <serial/serial.hpp>
43 #include <serial/objostr.hpp>
44 #include <serial/objistr.hpp>
45 
47 #include <objects/general/Date.hpp>
52 #include <objects/seq/Bioseq.hpp>
53 #include <objects/seq/Seq_inst.hpp>
54 #include <objects/seq/Seqdesc.hpp>
58 #include <objects/seq/Seq_ext.hpp>
66 #include <corelib/ncbitime.hpp>
67 #include <corelib/ncbiexec.hpp>
75 #include <objects/pub/Pub.hpp>
77 #include <objects/seq/Pubdesc.hpp>
80 
81 #include <objmgr/scope.hpp>
82 
84 
85 #include <util/xregexp/regexp.hpp>
86 #include <util/format_guess.hpp>
87 
88 #include <algorithm>
89 
92 
93 /////////////////////////////////////////////////////////////////////////////
94 // Lookup tables
95 
96 namespace {
97 
98  // some arguments simply add a subsource.
99  // Adding more is very simple, just add them to sc_SimpleSubsource below
100  struct SSimpleSubsourceInfo {
101  const char * m_pchSynopsis;
102  const char * m_pchUsage;
103  CSubSource::ESubtype m_eSubtype;
104  };
105  typedef SStaticPair<const char*, SSimpleSubsourceInfo> TSimpleSubsource;
106  static const TSimpleSubsource sc_SimpleSubsource[] = {
107  {"cl", { "clone_lib", "Clone library (for BioSource.subtype)", CSubSource::eSubtype_clone_lib} },
108  {"cm", { "chromosome", "Chromosome (for BioSource.subtype)", CSubSource::eSubtype_chromosome} },
109  {"cn", { "clone", "Clone (for BioSource.subtype)", CSubSource::eSubtype_clone} },
110  {"ht", { "haplotype", "Haplotype (for BioSource.subtype)", CSubSource::eSubtype_haplotype} },
111  {"sc", { "source_comment", "Source comment (for BioSource.subtype = other)", CSubSource::eSubtype_other} },
112  {"sex", { "sex", "Sex/gender (for BioSource.subtype)", CSubSource::eSubtype_sex} }
113  };
115  DEFINE_STATIC_ARRAY_MAP(TSimpleSubsourceMap, sc_SimpleSubsourceMap, sc_SimpleSubsource );
116 }
117 
118 /////////////////////////////////////////////////////////////////////////////
119 // CAgpconvertApplication::
120 
121 
123 {
124 public:
125 
126  CAgpconvertApplication(void) ;
127 
128  virtual void Init(void);
129  virtual int Run(void);
130  virtual void Exit(void);
131 
132 private:
133 
135  {
136  public:
137  virtual void HandleError(CAgpConverter::EError eError, const string & sMessage ) const;
138  };
139 
141 
143  {
144  public:
145  // runs asnval on the file
146  virtual void Notify(const string & file);
147  };
148 
149 
150  // load the file specified via the -template arg
151  void x_LoadTemplate(
152  const string & sTemplateLocation,
153  CRef<CSeq_entry> & out_ent_templ,
154  CRef<CSeq_submit> & out_submit_templ);
155 
156  bool x_IsAnySimpleSubsourceArgSet(void);
157 
158  void x_HandleTaxArgs( CRef<CSeqdesc> source_desc );
159 };
160 
161 /////////////////////////////////////////////////////////////////////////////
162 // Constructor
163 
165  m_pCustomErrorHandler( new CCustomErrorHandler )
166 {
167 }
168 
169 /////////////////////////////////////////////////////////////////////////////
170 // Init test for all different types of arguments
171 
172 
174 {
176 
177  // Create command-line argument descriptions class
178  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
179 
180  // Specify USAGE context
181  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
182  "AGP file converter program");
183 
184  // Describe the expected command-line arguments
185 
186  arg_desc->SetCurrentGroup("INPUT");
187 
188  arg_desc->AddKey("template", "LOCATION",
189  "The filename of a template Seq-entry or Seq-submit or Bioseq in "
190  "either ASN.1 text or ASN.1 binary or XML (autodetected). A series of Seqdescs "
191  "may optionally follow the main ASN.1 object."
192  "Alternatively, if the LOCATION looks reasonably like a GenBank identifier and "
193  "doesn't exist as a file, the template is loaded from genbank instead.",
195  arg_desc->AddFlag("keeptemplateannots",
196  "Unless this flag is set, the annots from the template are removed");
197 
198  arg_desc->AddExtra
199  (1, 32766, "AGP files to process",
201 
202  arg_desc->SetCurrentGroup("OUTPUT");
203 
204  arg_desc->AddOptionalKey("outdir", "output_directory",
205  "Directory for output files "
206  "(defaults to current directory)",
208  arg_desc->AddOptionalKey("ofs", "ofs",
209  "Output filename suffix "
210  "(default is \".ent\" for Seq-entry "
211  "or \".sqn\" for Seq-submit",
213  arg_desc->AddFlag("stdout", "Write to stdout rather than files. This does not work for Seq-submits. "
214  "Implies -no_asnval.");
215  arg_desc->AddDefaultKey(
216  "output-type", "ASN_OBJECT_TYPE",
217  "This lets you force what kind of object is used for output. "
218  " Forcing may cause some data to be thrown out (Example: "
219  "if input is a Seq-submit and you force the output to be a "
220  "Seq-entry, then the Seq-submit's data will be disregarded)",
222  "AUTO" );
223  arg_desc->SetConstraint("output-type",
224  &(*new CArgAllow_Strings,
225  "AUTO", "Seq-entry"));
226 
227 
228  arg_desc->SetCurrentGroup("VALIDATION");
229 
230  arg_desc->AddOptionalKey("components", "components_file",
231  "Bioseq-set of components, used for "
232  "validation",
234  arg_desc->AddFlag("no_asnval",
235  "Do not validate using asnval");
236 
237  arg_desc->SetCurrentGroup("DESCRIPTORS");
238 
239  arg_desc->AddOptionalKey("dl", "definition_line",
240  "Definition line (title descriptor)",
242 
243  arg_desc->AddOptionalKey("nt", "tax_id",
244  "NCBI Taxonomy Database ID",
246  arg_desc->SetDependency("nt", CArgDescriptions::eExcludes, "chromosomes" );
247  arg_desc->AddOptionalKey("on", "org_name",
248  "Organism name",
250  arg_desc->SetDependency("on", CArgDescriptions::eExcludes, "chromosomes" );
251  arg_desc->AddOptionalKey("sn", "strain_name",
252  "Strain name",
254  arg_desc->SetDependency("sn", CArgDescriptions::eRequires, "on");
255  arg_desc->SetDependency("sn", CArgDescriptions::eExcludes, "chromosomes" );
256 
257  ITERATE( TSimpleSubsourceMap, simple_src_it, sc_SimpleSubsourceMap ) {
258  const string & sArgName = simple_src_it->first;
259  const SSimpleSubsourceInfo & info = simple_src_it->second;
260  arg_desc->AddOptionalKey(sArgName, info.m_pchSynopsis,
261  info.m_pchUsage,
263  arg_desc->SetDependency(sArgName, CArgDescriptions::eExcludes, "chromosomes" );
264  }
265 
266  arg_desc->SetCurrentGroup("SEQ-IDS");
267 
268  arg_desc->AddFlag("fasta_id", "Parse object ids (col. 1) "
269  "as fasta-style ids if they contain '|'");
270  arg_desc->AddDefaultKey("general_id", "general_db",
271  "if set to non-empty string, local ids for object seq-ids will "
272  "become general ids belonging to the given database",
274 
275  arg_desc->SetCurrentGroup("OTHER");
276 
277  arg_desc->AddFlag("fuzz100", "For gaps of length 100, "
278  "put an Int-fuzz = unk in the literal");
279 
280  arg_desc->AddOptionalKey("chromosomes", "chromosome_name_file",
281  "Mapping of col. 1 names to chromsome "
282  "names, for use as SubSource",
284  arg_desc->AddFlag("gap-info",
285  "Set Seq-gap (gap type and linkage) in delta sequence");
286  arg_desc->AddFlag("len-check",
287  "Die if AGP's length does not match the length of the original template.");
288 
289  // Setup arg.descriptions for this application
290  SetupArgDescriptions(arg_desc.release());
291 }
292 
293 
294 static TTaxId s_GetTaxid(const COrg_ref& org_ref) {
295  TTaxId taxid = ZERO_TAX_ID;
296  int count = 0;
297  ITERATE (COrg_ref::TDb, db_tag, org_ref.GetDb()) {
298  if ((*db_tag)->GetDb() == "taxon") {
299  count++;
300  taxid = TAX_ID_FROM(CObject_id::TId, (*db_tag)->GetTag().GetId());
301  }
302  }
303  if (count != 1) {
304  throw runtime_error("found " + NStr::IntToString(count) + " taxids; "
305  "expected exactly one");
306  }
307  return taxid;
308 }
309 
310 
311 // Helper for removing old-name OrgMod's
313 {
315  {
316  return mod->GetSubtype() == COrgMod::eSubtype_old_name;
317  }
318 };
319 
320 
321 /////////////////////////////////////////////////////////////////////////////
322 // Run
323 
324 
326 {
327  // Get arguments
328  const CArgs & args = GetArgs();
329 
330  // load template file
331  CRef<CSeq_entry> ent_templ;
332  CRef<CSeq_submit> submit_templ;
334  args["template"].AsString(),
335  ent_templ,
336  submit_templ );
337 
338  // don't use any annots in the template, unless specifically requested
339  if( ! args["keeptemplateannots"] ) {
340  ent_templ->SetSeq().ResetAnnot();
341  }
342 
343  // Deal with any descriptor info from command line
344  if (args["dl"]) {
345  const string& dl = args["dl"].AsString();
346  ITERATE (CSeq_descr::Tdata, desc,
347  ent_templ->GetSeq().GetDescr().Get()) {
348  if ((*desc)->IsTitle()) {
349  throw runtime_error("-dl given but template contains a title");
350  }
351  }
352  CRef<CSeqdesc> title_desc(new CSeqdesc);
353  title_desc->SetTitle(dl);
354  ent_templ->SetSeq().SetDescr().Set().push_back(title_desc);
355  }
356  if (args["nt"] || args["on"] || args["sn"] ||
358  {
359  // consistency checks
360  ITERATE (CSeq_descr::Tdata, desc,
361  ent_templ->GetSeq().GetDescr().Get()) {
362  if ((*desc)->IsSource()) {
363  throw runtime_error("BioSource specified on command line but "
364  "template contains BioSource");
365  }
366  }
367 
368  // build a BioSource desc and add it to template
369  CRef<CSeqdesc> source_desc(new CSeqdesc);
370 
371  // handle the simple subsource args, if any set
372  ITERATE( TSimpleSubsourceMap, simple_src_it, sc_SimpleSubsourceMap ) {
373  const string & sArgName = simple_src_it->first;
374  const SSimpleSubsourceInfo & info = simple_src_it->second;
375  if( args[sArgName] ) {
376  CRef<CSubSource> sub_source( new CSubSource );
377  sub_source->SetSubtype(info.m_eSubtype);
378  sub_source->SetName(args[sArgName].AsString());
379  source_desc->SetSource().SetSubtype().push_back(sub_source);
380  }
381  }
382 
383  // handle tax-related args, if any set
384  x_HandleTaxArgs( source_desc );
385 
386  ent_templ->SetSeq().SetDescr().Set().push_back(source_desc);
387  }
388 
389  CAgpConverter::TOutputFlags fAgpConvertOutputFlags = 0;
390  if( args["fuzz100"] ) {
391  fAgpConvertOutputFlags |= CAgpConverter::fOutputFlags_Fuzz100;
392  }
393  if( args["fasta_id"] ) {
394  fAgpConvertOutputFlags |= CAgpConverter::fOutputFlags_FastaId;
395  }
396  if( args["gap-info"] ) {
397  fAgpConvertOutputFlags |= CAgpConverter::fOutputFlags_SetGapInfo;
398  }
399  if( args["len-check"] ) {
400  fAgpConvertOutputFlags |= CAgpConverter::fOutputFlags_AGPLenMustMatchOrig;
401  }
402  CAgpConverter agpConvert(
403  CConstRef<CBioseq>( &ent_templ->GetSeq() ),
404  ( submit_templ->IsEntrys() ? &submit_templ->GetSub() : NULL ),
405  fAgpConvertOutputFlags,
407 
408  // add general_id transformer, if needed
409  const string & sGeneralIdDb = args["general_id"].AsString();
410  if( ! sGeneralIdDb.empty() ) {
411  class CLocalToGeneralIdTransformer :
413  {
414  public:
415  CLocalToGeneralIdTransformer(const string & sGeneralDb)
416  : m_sGeneralDb(sGeneralDb) { }
417 
418  virtual bool Transform(CRef<objects::CSeq_id> pSeqId) const
419  {
420  if( ! pSeqId || ! pSeqId->IsLocal() ) {
421  // only transform local ids
422  return false;
423  }
424  CRef<CSeq_id> pNewSeqId( new CSeq_id );
425  CDbtag & dbtag = pNewSeqId->SetGeneral();
426  dbtag.SetDb(m_sGeneralDb);
427  if( pSeqId->GetLocal().IsId() ) {
428  dbtag.SetTag().SetId( pSeqId->GetLocal().GetId() );
429  } else if( pSeqId->GetLocal().IsStr() ) {
430  dbtag.SetTag().SetStr( pSeqId->GetLocal().GetStr() );
431  } else {
432  return false;
433  }
434 
435  pSeqId->Assign( *pNewSeqId );
436  return true;
437  }
438 
439  private:
440  string m_sGeneralDb;
441  };
443  new CLocalToGeneralIdTransformer(sGeneralIdDb) );
444 
445  agpConvert.SetIdTransformer( pIdTransformer.GetPointer() );
446  }
447 
448  // if validating against a file containing
449  // sequence components, load it and make
450  // a mapping of ids to lengths
451  if (args["components"]) {
452  CRef<CBioseq_set> seq_set( new CBioseq_set );
453  args["components"].AsInputFile() >> MSerial_AsnText >> *seq_set;
454  agpConvert.SetComponentsBioseqSet(seq_set);
455  }
456 
457  // if requested, load a file of mappings of
458  // object identifiers to chromosome names
459  if (args["chromosomes"]) {
460  agpConvert.LoadChromosomeMap( args["chromosomes"].AsInputFile() );
461  }
462 
463  // convert AGP file name args to strings
464  vector<string> vecAgpFileNames;
465  for( size_t idx = 1; idx <= args.GetNExtra(); ++idx ) {
466  vecAgpFileNames.push_back( args[idx].AsString() );
467  if( ! CFile(vecAgpFileNames.back()).Exists() ) {
468  throw runtime_error( "AGP file not found: " + vecAgpFileNames.back() );
469  }
470  }
471 
472  if( args["stdout"] ) {
473  CAgpConverter::TOutputBioseqsFlags fOutputBioseqsFlags =
475 
476  if( args["output-type"].AsString() == "Seq-entry" ) {
478  }
479  agpConvert.OutputBioseqs(
480  cout,
481  vecAgpFileNames,
482  fOutputBioseqsFlags );
483  } else {
484  if( ! args["outdir"] ) {
485  throw runtime_error("Please specify -stdout or -outdir");
486  }
487 
488  CAsnvalRunner asnval_runner;
489  agpConvert.OutputOneFileForEach(
490  args["outdir"].AsString(),
491  vecAgpFileNames,
492  ( args["ofs"] ? args["ofs"].AsString() : kEmptyStr ),
493  ( args["no_asnval"] ? NULL : &asnval_runner ) );
494  }
495 
496  return 0;
497 }
498 
499 
500 
501 void
503 {
504  // verify using asnval
505 
506  // command and args
507  const char * pchCommand = "asnval";
508  const char * asnval_argv[] = {
509  pchCommand,
510  "-Q", "2",
511  "-o", "stdout",
512  "-i", file.c_str(),
513  NULL
514  };
515 
516  // print what we're executing to cout
517  string cmd = CExec::QuoteArg(pchCommand);
518  for( size_t idx = 0; asnval_argv[idx]; ++idx ) {
519  cmd += ' ';
520  cmd += CExec::QuoteArg(asnval_argv[idx]);
521  }
522  cout << cmd << endl;
523 
524  // run asnval, and wait for it to finish
525  CExec::SpawnVP(CExec::eWait, pchCommand, asnval_argv);
526 }
527 
528 /////////////////////////////////////////////////////////////////////////////
529 // x_LoadTemplate
530 
532  const string & sTemplateLocation,
533  CRef<CSeq_entry> & out_ent_templ,
534  CRef<CSeq_submit> & out_submit_templ)
535 {
536  const CArgs & args = GetArgs();
537 
538  out_ent_templ.Reset( new CSeq_entry );
539  out_submit_templ.Reset( new CSeq_submit ); // possibly not used
540 
541  // check if the location doesn't exist, and see if we can
542  // consider it some kind of sequence identifier
543  if( ! CDirEntry(sTemplateLocation).IsFile() ) {
544  // see if this is blatantly not a sequence identifier
545  if( ! CRegexpUtil(sTemplateLocation).Exists("^[A-Za-z0-9_|]+(\\.[0-9]+)?$") ) {
546  throw runtime_error("This is not a valid sequence identifier: " + sTemplateLocation);
547  }
548 
549  // try to load from genbank
552  pScope->AddDefaults();
553 
554  CRef<CSeq_id> pTemplateId( new CSeq_id(sTemplateLocation) );
555  CBioseq_Handle bsh = pScope->GetBioseqHandle( *pTemplateId );
556 
557  if ( ! bsh ) {
558  throw runtime_error("Invalid sequence identifier: " + sTemplateLocation);
559  }
560  CSeq_entry_Handle entry_h = bsh.GetParentEntry();
561 
562  out_ent_templ->Assign( *entry_h.GetCompleteSeq_entry() );
563  return;
564  }
565 
566 
567  CNcbiIfstream istrm(sTemplateLocation.c_str());
568 
569  // guess format
570  ESerialDataFormat eSerialDataFormat = eSerial_None;
571  {{
572  CFormatGuess::EFormat eFormat =
573  CFormatGuess::Format(istrm);
574 
575  switch(eFormat) {
577  eSerialDataFormat = eSerial_AsnBinary;
578  break;
580  eSerialDataFormat = eSerial_AsnText;
581  break;
582  case CFormatGuess::eXml:
583  eSerialDataFormat = eSerial_Xml;
584  break;
585  default:
587  "template file seems to be in an unsupported format: "
588  << CFormatGuess::GetFormatName(eFormat) );
589  break;
590  }
591 
592  istrm.seekg(0);
593  }}
594 
595  unique_ptr<CObjectIStream> pObjIstrm(
596  CObjectIStream::Open(eSerialDataFormat, istrm, eNoOwnership) );
597 
598  // guess object type
599  const string sType = pObjIstrm->ReadFileHeader();
600 
601  // do the right thing depending on the input type
602  if( sType == CSeq_entry::GetTypeInfo()->GetName() ) {
603  pObjIstrm->Read(ObjectInfo(*out_ent_templ),
605  } else if( sType == CBioseq::GetTypeInfo()->GetName() ) {
606  CRef<CBioseq> pBioseq( new CBioseq );
607  pObjIstrm->Read(ObjectInfo(*pBioseq),
609  out_ent_templ->SetSeq( *pBioseq );
610  } else if( sType == CSeq_submit::GetTypeInfo()->GetName() ) {
611  pObjIstrm->Read(ObjectInfo(*out_submit_templ),
613  if (!out_submit_templ->GetData().IsEntrys()
614  || out_submit_templ->GetData().GetEntrys().size() != 1)
615  {
616  throw runtime_error("Seq-submit template must contain "
617  "exactly one Seq-entry");
618  }
619  } else if( sType == CSubmit_block::GetTypeInfo()->GetName() ) {
620 
621  // a Submit-block
622  CRef<CSubmit_block> submit_block(new CSubmit_block);
623  pObjIstrm->Read(ObjectInfo(*submit_block),
625 
626  // Build a Seq-submit containing this plus a bogus Seq-entry
627  out_submit_templ->SetSub(*submit_block);
628  CRef<CSeq_entry> ent(new CSeq_entry);
629  CRef<CSeq_id> dummy_id(new CSeq_id("lcl|dummy_id"));
630  ent->SetSeq().SetId().push_back(dummy_id);
631  ent->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_raw);
632  ent->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
633  out_submit_templ->SetData().SetEntrys().push_back(ent);
634  } else {
635  NCBI_USER_THROW_FMT("Template must be Seq-entry, Seq-submit, Bioseq or "
636  "Submit-block. Object seems to be of type: " << sType);
637  }
638 
639  // for submit types, pull out the seq-entry inside and remember it
640  if( out_submit_templ->IsEntrys() ) {
641  out_ent_templ = out_submit_templ->SetData().SetEntrys().front();
642  }
643 
644  // The template may contain a set rather than a seq.
645  // That's OK if it contains only one na entry, which we'll use.
646  if (out_ent_templ->IsSet()) {
647  unsigned int num_nuc_ents = 0;
649  ITERATE (CBioseq_set::TSeq_set, ent_iter,
650  out_ent_templ->GetSet().GetSeq_set()) {
651  if ((*ent_iter)->GetSeq().GetInst().IsNa()) {
652  ++num_nuc_ents;
653  tmp->Assign(**ent_iter);
654  // Copy any descriptors from the set to the sequence
656  out_ent_templ->GetSet().GetDescr().Get()) {
657  CRef<CSeqdesc> desc(new CSeqdesc);
658  desc->Assign(**desc_iter);
659  tmp->SetSeq().SetDescr().Set().push_back(desc);
660  }
661  }
662  }
663  if (num_nuc_ents == 1) {
664  out_ent_templ->Assign(*tmp);
665  } else {
666  throw runtime_error("template contains "
667  + NStr::IntToString(num_nuc_ents)
668  + " nuc. Seq-entrys; should contain 1");
669  }
670  }
671 
672  // incorporate any Seqdesc's that follow in the file
673  while (true) {
674  try {
675  CRef<CSeqdesc> desc(new CSeqdesc);
676  pObjIstrm->Read(ObjectInfo(*desc));
677  out_ent_templ->SetSeq().SetDescr().Set().push_back(desc);
678  } catch (...) {
679  break;
680  }
681  }
682 
683  if ( out_submit_templ->IsEntrys() ) {
684  // Take Seq-submit.sub.cit and put it in the Bioseq
685  CRef<CPub> pub(new CPub);
686  pub->SetSub().Assign(out_submit_templ->GetSub().GetCit());
687  CRef<CSeqdesc> pub_desc(new CSeqdesc);
688  pub_desc->SetPub().SetPub().Set().push_back(pub);
689  out_ent_templ->SetSeq().SetDescr().Set().push_back(pub_desc);
690  }
691 
692  if( ! out_ent_templ->IsSeq() ) {
693  throw runtime_error("The Seq-entry must be a Bioseq not a Bioseq-set.");
694  }
695 
696  if( args["output-type"].AsString() == "Seq-entry" ) {
697  // force Seq-entry by throwing out the Seq-submit
698  out_submit_templ.Reset( new CSeq_submit );
699  }
700 }
701 
702 /////////////////////////////////////////////////////////////////////////////
703 // x_IsAnySimpleSubsourceArgSet
704 
706 {
707  const CArgs & args = GetArgs();
708 
709  ITERATE( TSimpleSubsourceMap, simple_src_it, sc_SimpleSubsourceMap ) {
710  const string & sArgName = simple_src_it->first;
711  if( args[sArgName] ) {
712  return true;
713  }
714  }
715 
716  // none are set
717  return false;
718 }
719 
720 /////////////////////////////////////////////////////////////////////////////
721 // x_HandleTaxArgs
722 
724 {
725  const CArgs & args = GetArgs();
726 
727  // leave if no taxon-related arg is set
728  if ( ! args["on"] && ! args["nt"] ) {
729  return;
730  }
731 
732  CTaxon1 cl;
733  if (!cl.Init()) {
734  throw runtime_error("failure contacting taxonomy server");
735  }
736 
737  CConstRef<CTaxon2_data> on_result;
738  CRef<CTaxon2_data> nt_result;
739  CRef<COrg_ref> inp_orgref( new COrg_ref );
740 
741  if (args["on"]) {
742  const string& inp_taxname = args["on"].AsString();
743  inp_orgref->SetTaxname(inp_taxname);
744 
745  if (args["sn"]) {
747  COrgMod::eSubtype_strain, args["sn"].AsString()) );
748  inp_orgref->SetOrgname().SetMod().push_back(mod);
749  }
750 
751  on_result = cl.LookupMerge(*inp_orgref);
752 
753  if (!on_result) {
754  throw runtime_error("taxonomy server lookup failed");
755  }
756  if (!on_result->GetIs_species_level()) {
757  throw runtime_error("supplied name is not species-level");
758  }
759  if (inp_orgref->GetTaxname() != inp_taxname) {
760  cerr << "** Warning: taxname returned by server ("
761  << on_result->GetOrg().GetTaxname()
762  << ") differs from that supplied with -on ("
763  << inp_taxname << ")" << endl;
764  // an old-name OrgMod will have been added
765  COrgName::TMod& mod = inp_orgref->SetOrgname().SetMod();
766  mod.erase(remove_if(mod.begin(), mod.end(), SIsOldName()),
767  mod.end());
768  if (mod.empty()) {
769  inp_orgref->SetOrgname().ResetMod();
770  }
771  }
772 
773  if (args["sn"]) {
774  const string& inp_strain_name = args["sn"].AsString();
775  vector<string> strain_names;
777  inp_orgref->GetOrgname().GetMod())
778  {
779  if ((*mod)->GetSubtype() == COrgMod::eSubtype_strain) {
780  strain_names.push_back((*mod)->GetSubname());
781  }
782  }
783  if (!(strain_names.size() == 1
784  && strain_names[0] == inp_strain_name))
785  {
786  cerr << "** Warning: strain name " << inp_strain_name
787  << " provided but server lookup yielded ";
788  if (strain_names.empty()) {
789  cerr << "no strain name" << endl;
790  } else {
791  cerr << NStr::Join(strain_names, " and ") << endl;
792  }
793  }
794  }
795  }
796 
797  if (args["nt"]) {
798  TTaxId inp_taxid = TAX_ID_FROM(int, args["nt"].AsInteger());
799  nt_result = cl.GetById(inp_taxid);
800  if (!nt_result->GetIs_species_level()) {
801  throw runtime_error("taxid " + NStr::NumericToString(inp_taxid)
802  + " is not species-level");
803  }
804  nt_result->SetOrg().ResetSyn(); // lose any synonyms
805  TTaxId db_taxid = s_GetTaxid(nt_result->GetOrg());
806  if (db_taxid != inp_taxid) {
807  cerr << "** Warning: taxid returned by server ("
808  << NStr::NumericToString(db_taxid)
809  << ") differs from that supplied with -nt ("
810  << inp_taxid << ")" << endl;
811  }
812  if (args["on"]) {
813  TTaxId on_taxid = s_GetTaxid(on_result->GetOrg());
814  if (on_taxid != db_taxid) {
815  throw runtime_error("taxid from name lookup ("
816  + NStr::NumericToString(on_taxid)
817  + ") differs from that from "
818  + "taxid lookup ("
819  + NStr::NumericToString(db_taxid)
820  + ")");
821  }
822  }
823  }
824 
825  if (args["on"]) {
826  source_desc->SetSource().SetOrg().Assign(*inp_orgref);
827  } else {
828  source_desc->SetSource().SetOrg().Assign(nt_result->GetOrg());
829  }
830 }
831 
832 /////////////////////////////////////////////////////////////////////////////
833 // HandleError
834 
835 void
837  CAgpConverter::EError eError, const string & sMessage ) const
838 {
839  // assert that this function includes all possibilities
842 
843  switch(eError) {
844  // these errors are instantly fatal
856  NCBI_USER_THROW_FMT("ERROR: " << sMessage);
857  break;
858 
859  // these errors are just written and we continue:
863  cerr << sMessage << endl;
864  break;
865  default:
866  _TROUBLE;
867  break;
868  }
869 }
870 
871 /////////////////////////////////////////////////////////////////////////////
872 // Cleanup
873 
874 
876 {
877  SetDiagStream(0);
878 }
879 
880 
881 /////////////////////////////////////////////////////////////////////////////
882 // MAIN
883 
884 
885 int main(int argc, const char* argv[])
886 {
887  // Execute main application function
888  return CAgpconvertApplication().AppMain(argc, argv, 0, eDS_Default, 0);
889 }
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
USING_SCOPE(objects)
static TTaxId s_GetTaxid(const COrg_ref &org_ref)
Definition: agpconvert.cpp:294
int main(int argc, const char *argv[])
Definition: agpconvert.cpp:885
USING_NCBI_SCOPE
Definition: agpconvert.cpp:90
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
Subclass this to override how errors are handled (example: to stop early on some kinds of errors)
This gets called after each file is written, so the caller can do useful things like run asnval on ev...
void OutputBioseqs(CNcbiOstream &ostrm, const std::vector< std::string > &vecAgpFileNames, TOutputBioseqsFlags fFlags=0, size_t uMaxBioseqsToWrite=std::numeric_limits< size_t >::max()) const
Outputs the result from the AGP file names as ASN.1.
@ fOutputBioseqsFlags_WrapInSeqEntry
Bioseqs and Bioseq-sets should always be wrapped in a Seq-entry.
@ fOutputBioseqsFlags_DoNOTUnwrapSingularBioseqSets
Specify this if Bioseq-sets with just one Bioseq in them should _NOT_ be unwrapped into a Bioseq.
EError
The different kinds of errors that could occur while processing.
@ eError_SuggestUsingFastaIdOption
@ eError_WrongNumberOfSourceDescs
@ eError_SubmitBlockIgnoredWhenOneBigBioseqSet
@ eError_EntrySkippedDueToFailedComponentValidation
@ eError_ChromosomeFileBadFormat
@ eError_OutputDirNotFoundOrNotADir
@ eError_ChromosomeIsInconsistent
@ eError_ChromosomeMapIgnoredBecauseChromosomeSubsourceAlreadyInTemplate
@ eError_AGPLengthMismatchWithTemplateLength
void SetComponentsBioseqSet(CConstRef< objects::CBioseq_set > pComponentsBioseqSet)
Give a bioseq-set containing all the components pieces, for verification.
void OutputOneFileForEach(const string &sDirName, const std::vector< std::string > &vecAgpFileNames, const string &sSuffix=kEmptyStr, IFileWrittenCallback *pFileWrittenCallback=nullptr) const
Outputs the results of each Seq-entry (or Seq-submit if Submit-block was given) into its own file in ...
void LoadChromosomeMap(CNcbiIstream &chromosomes_istr)
Input has 2 tab-delimited columns: id, then chromosome name.
@ fOutputFlags_Fuzz100
For gaps of length 100, put an Int-fuzz = unk in the literal.
@ fOutputFlags_FastaId
Parse object ids (col. 1) as fasta-style ids if they contain '|'.
@ fOutputFlags_SetGapInfo
Set Seq-gap (gap type and linkage) in delta sequence.
@ fOutputFlags_AGPLenMustMatchOrig
When set, we give an error on AGP objects that don't have the same length as the original template.
int TOutputFlags
Bitwise-OR of EOutputFlags.
void SetIdTransformer(IIdTransformer *pIdTransformer)
When this reads an id, it will use the supplied transformer (if any) to change the CSeq_id.
virtual void Notify(const string &file)
Definition: agpconvert.cpp:502
virtual void HandleError(CAgpConverter::EError eError, const string &sMessage) const
Default is to print to cerr, but feel free to override in a subclass.
Definition: agpconvert.cpp:836
virtual void Init(void)
Initialize the application.
Definition: agpconvert.cpp:173
void x_HandleTaxArgs(CRef< CSeqdesc > source_desc)
Definition: agpconvert.cpp:723
virtual void Exit(void)
Cleanup on application exit.
Definition: agpconvert.cpp:875
CRef< CCustomErrorHandler > m_pCustomErrorHandler
Definition: agpconvert.cpp:140
bool x_IsAnySimpleSubsourceArgSet(void)
Definition: agpconvert.cpp:705
virtual int Run(void)
Run the application.
Definition: agpconvert.cpp:325
void x_LoadTemplate(const string &sTemplateLocation, CRef< CSeq_entry > &out_ent_templ, CRef< CSeq_submit > &out_submit_templ)
Definition: agpconvert.cpp:531
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CBioseq_Handle –.
Definition: Dbtag.hpp:53
CDirEntry –.
Definition: ncbifile.hpp:262
CFile –.
Definition: ncbifile.hpp:1604
EFormat
The formats are checked in the same order as declared here.
@ eBinaryASN
Binary ASN.1.
@ eTextASN
Text ASN.1.
static const char * GetFormatName(EFormat format)
static EFormat Format(const string &path, EOnError onerror=eDefault)
Guess file format.
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: gbloader.cpp:366
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
Definition: Pub.hpp:56
CRegexpUtil –.
Definition: regexp.hpp:312
CScope –.
Definition: scope.hpp:92
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
bool IsEntrys(void) const
Definition: Seq_submit.cpp:54
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
CSubmit_block –.
CRef< CTaxon2_data > GetById(TTaxId tax_id)
Definition: taxon1.cpp:230
CConstRef< CTaxon2_data > LookupMerge(COrg_ref &inp_orgRef, string *psLog=0, TOrgRefStatus *pStatusOut=0)
Definition: taxon1.cpp:429
bool Init(void)
Definition: taxon1.cpp:101
Operators to edit gaps in sequences.
static CS_COMMAND * cmd
Definition: ct_dynamic.c:26
static char tmp[3200]
Definition: utf8.c:42
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1312
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1195
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
@ fHideLogfile
Hide log file description.
@ fHideConffile
Hide configuration file description.
@ fHideVersion
Hide version description.
@ eNoOwnership
No ownership is assumed.
Definition: ncbi_types.h:135
size_t GetNExtra(void) const
Get the number of unnamed positional (a.k.a. extra) args.
Definition: ncbiargs.hpp:422
@ eRequires
One argument requires another.
Definition: ncbiargs.hpp:956
@ eExcludes
One argument excludes another.
Definition: ncbiargs.hpp:957
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define NULL
Definition: ncbistd.hpp:225
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
Definition: ncbidiag.cpp:8083
@ eDS_Default
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
Definition: ncbidiag.hpp:1790
#define NCBI_USER_THROW_FMT(message)
Throw a "user exception" with message processed as output to ostream.
Definition: ncbiexpt.hpp:724
static string QuoteArg(const string &arg)
Quote argument.
Definition: ncbiexec.cpp:230
static CResult SpawnVP(EMode mode, const char *cmdname, const char *const *argv)
Spawn a new process with variable number of command-line arguments and find file to execute from the ...
Definition: ncbiexec.cpp:656
@ eWait
Suspends calling thread until execution of new process is complete (synchronous operation).
Definition: ncbiexec.hpp:87
virtual bool Exists(void) const
Check existence of file.
Definition: ncbifile.hpp:4038
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
ESerialDataFormat
Data file format.
Definition: serialdef.hpp:71
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
@ eSerial_Xml
XML.
Definition: serialdef.hpp:75
@ eSerial_None
Definition: serialdef.hpp:72
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
pair< TObjectPtr, TTypeInfo > ObjectInfo(C &obj)
Definition: objectinfo.hpp:762
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
#define kEmptyStr
Definition: ncbistr.hpp:123
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
Definition: ncbistr.hpp:2697
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
Definition: SubSource_.hpp:319
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
void SetName(const TName &value)
Assign a value to Name data member.
Definition: SubSource_.hpp:359
TSubtype & SetSubtype(void)
Assign a value to Subtype data member.
Definition: BioSource_.hpp:545
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
vector< CRef< CDbtag > > TDb
Definition: Org_ref_.hpp:101
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
const TDb & GetDb(void) const
Get the Db member data.
Definition: Org_ref_.hpp:491
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
Definition: Org_ref_.hpp:381
list< CRef< COrgMod > > TMod
Definition: OrgName_.hpp:332
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSubtype_strain
Definition: OrgMod_.hpp:85
@ eSubtype_old_name
Definition: OrgMod_.hpp:124
TSub & SetSub(void)
Select the variant.
Definition: Pub_.cpp:195
TGeneral & SetGeneral(void)
Select the variant.
Definition: Seq_id_.cpp:375
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
const TDescr & GetDescr(void) const
Get the Descr member data.
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
list< CRef< CSeq_entry > > TSeq_set
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
void SetPub(TPub &value)
Assign a value to Pub data member.
Definition: Pubdesc_.cpp:72
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
TPub & SetPub(void)
Select the variant.
Definition: Seqdesc_.cpp:362
void ResetAnnot(void)
Reset Annot data member.
Definition: Bioseq_.cpp:91
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
const TCit & GetCit(void) const
Get the Cit member data.
void SetSub(TSub &value)
Assign a value to Sub data member.
const TEntrys & GetEntrys(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
void SetData(TData &value)
Assign a value to Data data member.
const TSub & GetSub(void) const
Get the Sub member data.
bool IsEntrys(void) const
Check if variant Entrys is selected.
TIs_species_level GetIs_species_level(void) const
Get the Is_species_level member data.
void SetOrg(TOrg &value)
Assign a value to Org data member.
const TOrg & GetOrg(void) const
Get the Org member data.
FILE * file
static MDB_envinfo info
Definition: mdb_load.c:37
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
Defines a portable execute class.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
Defines: CTimeFormat - storage class for time format.
Int mod(Int i, Int j)
Definition: njn_integer.hpp:67
@ eError
An error was encountered while trying to send request or to read and to process the reply.
#define DEFINE_STATIC_ARRAY_MAP(Type, Var, Array)
Definition: static_set.hpp:888
bool operator()(CRef< COrgMod > mod)
Definition: agpconvert.cpp:314
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason for introducing this...
Definition: static_set.hpp:60
#define _TROUBLE
#define _ASSERT
C++ wrappers for the Perl-compatible regular expression (PCRE) library.
Modified on Sun Apr 14 05:25:19 2024 by modify_doxy.py rev. 669887