NCBI C++ ToolKit
agp_converter.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: agp_converter.cpp 93579 2021-05-01 20:54:52Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Josh Cherry, Michael Kornbluh
27  *
28  * File Description:
29  * Read an AGP file, build Seq-entry's or Seq-submit's,
30  * and optionally do some validation
31  *
32  */
33 #include <ncbi_pch.hpp>
34 
35 #include <corelib/ncbifile.hpp>
36 
38 
39 #include <serial/objostr.hpp>
40 #include <serial/objostrasn.hpp>
41 
45 #include <objects/seq/Bioseq.hpp>
46 #include <objects/seq/Seq_inst.hpp>
47 #include <objects/seq/Seqdesc.hpp>
49 #include <objects/seq/Seq_ext.hpp>
61 
62 #include <sstream>
63 
64 #include <util/static_map.hpp>
65 
66 using namespace std;
67 
69 
71 
73  CConstRef<CBioseq> pTemplateBioseq,
74  const CSubmit_block * pSubmitBlock,
75  TOutputFlags fOutputFlags,
76  CRef<CErrorHandler> pErrorHandler )
77  : m_pTemplateBioseq(pTemplateBioseq),
78  m_fOutputFlags(fOutputFlags)
79 {
80  if( pSubmitBlock ) {
81  m_pSubmitBlock.Reset(pSubmitBlock);
82  }
83 
84  if( pErrorHandler ) {
85  m_pErrorHandler = pErrorHandler;
86  } else {
87  // use default error handler if none supplied
88  m_pErrorHandler.Reset( new CErrorHandler );
89  }
90 }
91 
92 void
94  CConstRef<CBioseq_set> pComponentsBioseqSet)
95 {
97  ITERATE (CBioseq_set::TSeq_set, ent, pComponentsBioseqSet->GetSeq_set()) {
98  TSeqPos length = (*ent)->GetSeq().GetInst().GetLength();
99  ITERATE (CBioseq::TId, id, (*ent)->GetSeq().GetId()) {
100  m_mapComponentLength[(*id)->AsFastaString()] = length;
101  }
102  }
103 }
104 
105 void
107  const TChromosomeMap & mapChromosomeNames )
108 {
109  // Make sure there's not already a chromosome in the template
110  ITERATE (CSeq_descr::Tdata, desc,
111  m_pTemplateBioseq->GetDescr().Get()) {
112  if ((*desc)->IsSource() && (*desc)->GetSource().IsSetSubtype()) {
113  ITERATE (CBioSource::TSubtype, sub_type,
114  (*desc)->GetSource().GetSubtype()) {
115  if ((*sub_type)->GetSubtype() ==
117  m_pErrorHandler->HandleError(
119  "chromosome info ignored because template "
120  "contains a chromosome SubSource");
121  return;
122  }
123  }
124  }
125  }
126 
127  m_mapChromosomeNames = mapChromosomeNames;
128 }
129 
130 /// Input has 2 tab-delimited columns: id, then chromosome name
/// (columns are actually split on any run of spaces and/or tabs, and
/// empty lines are skipped).
///
/// The parsed map is handed to SetChromosomesInfo() only if every line
/// parses cleanly.  A line without exactly two columns, or an id that is
/// given two different chromosome names, is reported through
/// m_pErrorHandler and the function returns without applying anything.
131 void
133 {
134  TChromosomeMap mapChromosomeNames;
135 
136  string line;
 // Loop on the success of the read itself rather than on eof(): a stream
 // that is in a failed-but-not-EOF state (e.g. a file that could not be
 // opened) never reports eof(), which would make this loop spin forever.
137  while (NcbiGetlineEOL(chromosomes_istr, line)) {
139  if (line.empty()) {
140  continue;
141  }
142  list<string> split_line;
143  NStr::Split(line, " \t", split_line, NStr::fSplit_Tokenize);
144  if (split_line.size() != 2) {
145  m_pErrorHandler->HandleError(
147  "line of chromosome file does not have "
148  "two columns: " + line);
149  return;
150  }
151  string id = split_line.front();
152  string chr = split_line.back();
 // the same id may appear more than once, but only with the same name
153  if (mapChromosomeNames.find(id) != mapChromosomeNames.end()
154  && mapChromosomeNames[id] != chr)
155  {
156  m_pErrorHandler->HandleError(
158  "inconsistent chromosome for " + id +
159  " in chromosome file");
160  return;
161  }
162  mapChromosomeNames[id] = chr;
163  }
164 
165  SetChromosomesInfo(mapChromosomeNames);
166 }
167 
169 {
170  // nothing required yet.
171 }
172 
174  CNcbiOstream & ostrm,
175  const std::vector<std::string> & vecAgpFileNames,
176  TOutputBioseqsFlags fFlags,
177  size_t uMaxBioseqsToWrite ) const
178 {
179  // put some flags into easier-to-read variables
180  const bool bOneObjectPerBioseq = (fFlags & fOutputBioseqsFlags_OneObjectPerBioseq);
181 
182  // we get the first AGP entries to help with
183  // determining whether to use Bioseq-sets
185  if( ! vecAgpFileNames.empty() ) {
186  x_ReadAgpEntries( vecAgpFileNames[0], agp_entries );
187  }
188  const bool bOnlyOneBioseqInAllAGPFiles =
189  ( agp_entries.size() == 1 && vecAgpFileNames.size() == 1 );
190 
191 
192  // Each top-level object we write out is prepended with sObjectOpeningString
193  // and appended with sObjectClosingString.
194  string sObjectOpeningString;
195  string sObjectClosingString;
196  // set up sObjectOpeningString and sObjectClosingString
198  sObjectOpeningString,
199  sObjectClosingString,
200  fFlags,
201  bOnlyOneBioseqInAllAGPFiles );
202 
203  ostrm << sObjectOpeningString << endl;
204 
205  {{
206 
207  CObjectOStreamAsn obj_writer(ostrm);
208 
209  // Iterate over AGP files
210  bool bFirstEntry = true;
211  ITERATE( std::vector<std::string>, file_name_it, vecAgpFileNames ) {
212 
213  // We got entries for the first AGP file earlier in this func
214  if( ! bFirstEntry ) {
215  agp_entries.clear();
216  x_ReadAgpEntries( *file_name_it, agp_entries );
217  }
218 
219  ITERATE (CAgpToSeqEntry::TSeqEntryRefVec, ent, agp_entries) {
220 
221  string id_str;
222 
223  CRef<CBioseq> new_bioseq;
224 
225  // set new_bioseq
226  {{
227  CRef<CSeq_entry> new_entry =
229  (*ent)->GetSeq(),
230  id_str );
231  if( ! new_entry ) {
232  m_pErrorHandler->HandleError(
234  "Entry skipped and reason probably given in a previous error" );
235  continue;
236  }
237  new_bioseq.Reset( &new_entry->SetSeq() );
238  }}
239 
240  if( bFirstEntry ) {
241  bFirstEntry = false;
242  } else {
243  if( bOneObjectPerBioseq ) {
244  // if one object per bioseq, we close the
245  // previous one and open up the new one
246  ostrm << sObjectClosingString << endl;
247  ostrm << sObjectOpeningString << endl;
248  } else if( ! sObjectOpeningString.empty() ) {
249  // all the bioseqs are in one Bioseq-set,
250  // so just comma-separate them
251  ostrm << "," << endl;
252  }
253  }
254 
255  if( sObjectOpeningString.empty() ) {
256  // Bioseq has to stand on its own
257  ostrm << "Bioseq ::= " << endl;
258  } else {
259  // Bioseq is inside some other object
260  ostrm << "seq " << endl;
261  }
262  obj_writer.WriteObject(new_bioseq.GetPointer(), new_bioseq->GetThisTypeInfo());
263  // flush after every write in case the object writer has its own
264  // buffering that can cause corruption when intermixed with direct
265  // stringstream "operator<<" calls.
266  obj_writer.Flush();
267  }
268  }
269  }}
270 
271  ostrm << sObjectClosingString << endl;
272 }
273 
275  const string & sDirName,
276  const std::vector<std::string> & vecAgpFileNames,
277  const string & sSuffix_arg,
278  IFileWrittenCallback * pFileWrittenCallback ) const
279 {
280  CDir outputDir(sDirName);
281  if( ! outputDir.Exists() ||
282  ! outputDir.IsDir() )
283  {
284  m_pErrorHandler->HandleError(
286  "The output directory is not a dir or is not found: " + sDirName );
287  return;
288  }
289 
290  const string & sSuffix = (
291  sSuffix_arg.empty() ?
292  ( m_pSubmitBlock ? "sqn" : "ent" ) :
293  sSuffix_arg );
294 
295  ITERATE( std::vector<std::string>, file_name_it, vecAgpFileNames ) {
296 
298  x_ReadAgpEntries( *file_name_it, agp_entries );
299 
300  ITERATE (CAgpToSeqEntry::TSeqEntryRefVec, ent, agp_entries) {
301 
302  string id_str;
303  CRef<CSeq_entry> new_entry =
305  (*ent)->GetSeq(),
306  id_str );
307  if( ! new_entry ) {
308  m_pErrorHandler->HandleError(
310  "Entry skipped and the reason was "
311  "probably given in a previous error" );
312  continue;
313  }
314 
315  // we're in one of the modes where we print a CSerialObject
316  // to its own file
317  CRef<CSerialObject> pObjectToPrint;
318  if( m_pSubmitBlock ) {
319  // wrap in seq-submit before writing
320  CRef<CSeq_submit> new_submit( new CSeq_submit );
321  new_submit->SetSub( *SerialClone(*m_pSubmitBlock) );
322  new_submit->SetData().SetEntrys().push_back(new_entry);
323  pObjectToPrint = new_submit;
324  } else {
325  // don't need to wrap the seq-entry in a seq-submit
326  pObjectToPrint = new_entry;
327  }
328 
329  string outfpath = CDirEntry::MakePath(
330  outputDir.GetPath(), id_str, sSuffix);
331  {{
332  CNcbiOfstream ostr(outfpath.c_str());
333  ostr << MSerial_AsnText << *pObjectToPrint;
334  }}
335 
336  // allow caller to perform some custom actions, if desired
337  if( pFileWrittenCallback ) {
338  pFileWrittenCallback->Notify(outfpath);
339  }
340  }
341  }
342 }
343 
344 #ifdef STRING_AND_VAR_PAIR
345 # error STRING_AND_VAR_PAIR
346 #endif
347 
348 // This is less error prone because we don't have to
349 // worry about getting the string and name out of sync
350 #define STRING_AND_VAR_PAIR(_value) \
351  { #_value, _value }
352 
354 CAgpConverter::OutputFlagStringToEnum(const string & sEnumAsString)
355 {
356  // check if this func has fallen out of date
358 
360  static const TStrFlagPair kStrFlagPairs[] = {
365  };
367  DEFINE_STATIC_ARRAY_MAP(TStrFlagMap, kStrFlagMap, kStrFlagPairs);
368 
369  TStrFlagMap::const_iterator find_iter =
370  kStrFlagMap.find(NStr::TruncateSpaces(sEnumAsString).c_str());
371  if( find_iter == kStrFlagMap.end() ) {
373  "Bad string given to CAgpConverter::OutputFlagStringToEnum: "
374  << sEnumAsString);
375  } else {
376  return find_iter->second;
377  }
378 }
379 
381 CAgpConverter::ErrorStringToEnum(const string & sEnumAsString)
382 {
383  // check if this func has fallen out of date
385 
387  static const TStrErrorPair kStrErrorPairs[] = {
402  };
404  DEFINE_STATIC_ARRAY_MAP(TStrErrorMap, kStrErrorMap, kStrErrorPairs);
405 
406  TStrErrorMap::const_iterator find_iter =
407  kStrErrorMap.find(NStr::TruncateSpaces(sEnumAsString).c_str());
408  if( find_iter == kStrErrorMap.end() ) {
410  "Bad string given to CAgpConverter::ErrorStringToEnum: "
411  << sEnumAsString);
412  } else {
413  return find_iter->second;
414  }
415 }
416 
417 #undef STRING_AND_VAR_PAIR
418 
420  const string & sAgpFileName,
421  CAgpToSeqEntry::TSeqEntryRefVec & out_agp_entries ) const
422 {
423  // load AGP Seq-entry's into agp_entries
424 
425  // set up the AGP to Seq-entry object
426  const CAgpToSeqEntry::TFlags fAgpReaderFlags =
428  stringstream err_strm;
429  CRef<CAgpErrEx> pErrHandler( new CAgpErrEx(&err_strm) );
430  CAgpToSeqEntry agp_reader( fAgpReaderFlags, eAgpVersion_auto, pErrHandler.GetPointer() );
431  CNcbiIfstream istr( sAgpFileName.c_str() );
432  const int iErrCode = agp_reader.ReadStream(istr);
433 
434  // deal with errors
435  const string sErrors = err_strm.str();
436  if( ! sErrors.empty() ) {
437  m_pErrorHandler->HandleError(
439  "AGP parsing returned error message(s): " + sErrors );
440  }
441  if( iErrCode != 0 ) {
442  m_pErrorHandler->HandleError(
444  "AGP parsing returned error code " +
445  NStr::NumericToString(iErrCode) + " (" + pErrHandler->GetMsg(iErrCode) + ")");
446  return;
447  }
448 
449  // swap is faster than assignment
450  out_agp_entries.swap( agp_reader.GetResult() );
451 }
452 
455  const CBioseq & agp_bioseq,
456  string & out_id_str ) const
457 {
458  string unparsed_id_str;
459  CRef<CSeq_entry> new_entry =
460  x_InitializeCopyOfTemplate(agp_bioseq,
461  unparsed_id_str,
462  out_id_str );
463 
464 
466  // calculate the original template's length
467  const TSeqPos uOrigBioseqLen = ( m_pTemplateBioseq->IsSetLength() ?
468  m_pTemplateBioseq->GetLength() :
469  0 );
470 
471  // calculate the new bioseq's length
472  const TSeqPos uAGPBioseqLen = (
473  agp_bioseq.IsSetLength() ?
474  agp_bioseq.GetLength() :
475  0 );
476 
477  if( uOrigBioseqLen != uAGPBioseqLen ) {
478  m_pErrorHandler->HandleError(
480  "** Entry " + out_id_str + " has mismatch, but will "
481  "be written anyway: "
482  "fOutputFlags_AGPLenMustMatchOrig was set and the entry's "
483  "length is " +
484  NStr::NumericToString(uAGPBioseqLen) +
485  " but the original template's length is " +
486  NStr::NumericToString(uOrigBioseqLen) );
487  }
488  }
489 
490  // if requested, put an Int-fuzz = unk for
491  // all literals of length 100
494  new_entry->SetSeq().SetInst()
495  .SetExt().SetDelta().Set()) {
496  if ((*delta)->IsLiteral() &&
497  (*delta)->GetLiteral().GetLength() == 100) {
498  (*delta)->SetLiteral().SetFuzz().SetLim();
499  }
500  }
501  }
502 
503  // if requested, verify against known sequence components
504  if ( ! m_mapComponentLength.empty() ) {
505  const bool bSuccessfulValidation = x_VerifyComponents(
506  new_entry, out_id_str);
507  if ( ! bSuccessfulValidation ) {
508  // put this error in a better place
509  m_pErrorHandler->HandleError(
511  "** Not writing entry " + out_id_str + " due to failed validation");
512  return CRef<CSeq_entry>();
513  }
514  }
515 
516  // if requested, set chromosome name in source subtype
517  if ( ! m_mapChromosomeNames.empty() ) {
519  new_entry, unparsed_id_str);
520  }
521 
522  // set create and update dates to today
524 
525  return new_entry;
526 }
527 
530  const CBioseq& agp_seq,
531  string & out_unparsed_id_str,
532  string & out_id_str ) const
533 {
534  // insert sequence instance and id into a copy of template
535  CRef<CSeq_id> pSeqId( SerialClone(*agp_seq.GetFirstId()) );
536  {
537  stringstream id_strm;
538  pSeqId->GetLocal().AsString(id_strm);
539  out_unparsed_id_str = id_strm.str();
540  out_id_str = out_unparsed_id_str;
541  }
542 
543  // "ids" will hold all the ids for this piece,
544  // hopefully just one unless we have to fasta_id parse it
545  list<CRef<CSeq_id> > ids;
546  ids.push_back(pSeqId);
547 
548  // if ID contains a pipe, it might be a fasta id
549  if (NStr::Find(out_id_str, "|") != NPOS) {
551  // parse the id as a fasta id
552  ids.clear();
553  CSeq_id::ParseFastaIds(ids, out_id_str);
554  } else {
555  m_pErrorHandler->HandleError(
557  "** ID " + out_id_str +
558  " contains a '|'; consider using the -fasta_id option");
559  }
560  }
561 
562  // perform custom transformations given to us by the caller, if any
563  bool bFirstWasTransformed = false;
564  if( m_pIdTransformer ) {
565  NON_CONST_ITERATE(list<CRef<CSeq_id> >, id_it, ids) {
566  const bool bWasTransformed = m_pIdTransformer->Transform(*id_it);
567  if( bWasTransformed && id_it == ids.begin() ) {
568  bFirstWasTransformed = true;
569  }
570  }
571  }
572 
573  // out_id_str might need to be updated
575  bFirstWasTransformed )
576  {
577  // need version, no db name from id general
578  out_id_str.clear();
580  ids.front()->GetLabel(&out_id_str, CSeq_id::eContent, flags);
581  }
582 
583  CRef<CSeq_entry> new_entry( new CSeq_entry );
584  new_entry->SetSeq( *SerialClone(*m_pTemplateBioseq) );
585  new_entry->SetSeq().SetInst().Assign(agp_seq.GetInst());
586  new_entry->SetSeq().ResetId();
587  ITERATE (list<CRef<CSeq_id> >, an_id, ids) {
588  new_entry->SetSeq().SetId().push_back(*an_id);
589  }
590 
591  return new_entry;
592 }
593 
595  CConstRef<CSeq_entry> new_entry,
596  const string & id_str) const
597 {
598  bool failure = false;
600  new_entry->GetSeq().GetInst()
601  .GetExt().GetDelta().Get()) {
602  if ((*delta)->IsLoc()) {
603  const string comp_id_str =
604  (*delta)->GetLoc().GetInt().GetId().AsFastaString();
606  m_mapComponentLength.find(comp_id_str);
607  if ( find_iter == m_mapComponentLength.end()) {
608  failure = true;
609  m_pErrorHandler->HandleError(
611  "** Component " + comp_id_str +
612  " of entry " + id_str + " not found");
613  } else {
614  const TSeqPos uCompLen = find_iter->second;
615 
616  const TSeqPos to = (*delta)->GetLoc().GetInt().GetTo();
617  if (to >= uCompLen) {
618  failure = true;
619  m_pErrorHandler->HandleError(
621  "** Component " + comp_id_str +
622  " of entry " + id_str + " not long enough.\n"
623  "** Length is " +
624  NStr::NumericToString(uCompLen) +
625  "; requested \"to\" is " + NStr::NumericToString(to) );
626  }
627  }
628  }
629  }
630 
631  return ! failure;
632 }
633 
635  CRef<CSeq_entry> new_entry,
636  const string & unparsed_id_str ) const
637 {
 // Looks up unparsed_id_str in m_mapChromosomeNames and, if present,
 // appends a SubSource carrying that chromosome name to the entry's
 // single source descriptor.  Does nothing if the id is not in the map;
 // reports an error and leaves the entry unchanged if the Bioseq does
 // not have exactly one source descriptor.
638  TChromosomeMap::const_iterator chr_find_iter =
639  m_mapChromosomeNames.find(unparsed_id_str);
640  if( chr_find_iter == m_mapChromosomeNames.end() ) {
641  // not found, so leave
642  return;
643  }
644 
645  CRef<CSubSource> sub_source(new CSubSource);
647  sub_source->SetName(chr_find_iter->second);
 // collect every source descriptor so we can insist on exactly one
648  vector<CRef<CSeqdesc> > source_descs;
649  ITERATE (CSeq_descr::Tdata, desc,
650  new_entry->GetSeq().GetDescr().Get()) {
651  if ((*desc)->IsSource()) {
652  source_descs.push_back(*desc);
653  }
654  }
655  if (source_descs.size() != 1) {
656  m_pErrorHandler->HandleError(
658  "found " +
659  NStr::SizetToString(source_descs.size()) +
 // leading space keeps the count from running into the next word
660  " Source Desc's; expected exactly one");
661  return;
662  }
663  CSeqdesc& source_desc = *source_descs[0];
664  source_desc.SetSource().SetSubtype().push_back(sub_source);
665 }
666 
668  CRef<CSeq_entry> new_entry ) const
669 {
670  CRef<CDate> date(new CDate);
672 
673  CRef<CSeqdesc> update_date(new CSeqdesc);
674  update_date->SetUpdate_date(*date);
675  new_entry->SetSeq().SetDescr().Set().push_back(update_date);
676 
677  CRef<CSeqdesc> create_date(new CSeqdesc);
678  create_date->SetCreate_date(*date);
679  new_entry->SetSeq().SetDescr().Set().push_back(create_date);
680 }
681 
683  string & out_sObjectOpeningString,
684  string & out_sObjectClosingString,
685  TOutputBioseqsFlags fOutputBioseqsFlags,
686  bool bOnlyOneBioseqInAllAGPFiles ) const
687 {
688  out_sObjectOpeningString.clear();
689  out_sObjectClosingString.clear();
690 
691  // See if Bioseqs will be in a Bioseq-set or not:
692  bool bUsingBioseqSets = false; // default so we can unwrap where possible
693  if( fOutputBioseqsFlags & fOutputBioseqsFlags_DoNOTUnwrapSingularBioseqSets ) {
694  // if unwrapping is forbidden, we have no choice but
695  // to use Bioseq-sets
696  bUsingBioseqSets = true;
697  } else if( fOutputBioseqsFlags & fOutputBioseqsFlags_OneObjectPerBioseq ) {
698  // There's only one Bioseq per object, so
699  // there's no reason to use Bioseq-sets in each one
700  // if we don't have to
701  bUsingBioseqSets = false; // redundant assignment, but clarifies
702  } else if( ! bOnlyOneBioseqInAllAGPFiles )
703  {
704  // there's only one big object, so using Bioseq-sets
705  // depends on whether there exists one Bioseq in all the AGP files
706  // (we make the assumption that AGP files will have at least one Bioseq)
707  bUsingBioseqSets = true;
708  }
709 
710  // Each subsequent "if" should append to out_sObjectOpeningString
711  // and prepend to out_sObjectClosingString, because we're going from the outside inward.
712 
713  // At each step, we check if out_sObjectOpeningString is empty
714  // to see whether or not to add a ASN.1 text header (example header: "Seq-submit :: ")
715 
716  // outermost possible level: is a Seq-submit needed?
717  if( m_pSubmitBlock ) {
718  stringstream seq_sub_header_strm;
719  CObjectOStreamAsn submit_block_writer(seq_sub_header_strm);
720 
721  // for consistency we put the header-writing line in an
722  // "if" even though we know the "if" always succeeds
723  if( out_sObjectOpeningString.empty() ) {
724  seq_sub_header_strm << "Seq-submit ::= ";
725  }
726  seq_sub_header_strm << "{" << endl;
727  seq_sub_header_strm << "sub ";
728  submit_block_writer.WriteObject(m_pSubmitBlock.GetPointer(), m_pSubmitBlock->GetThisTypeInfo());
729  submit_block_writer.Flush();
730  seq_sub_header_strm << "," << endl;
731  seq_sub_header_strm << "data entrys {" << endl;
732 
733  out_sObjectOpeningString = seq_sub_header_strm.str();
734  out_sObjectClosingString = "} }" + out_sObjectClosingString;
735  }
736 
737  // next level inward: is a Seq-entry needed?
738  const bool bUsingSeqEntry = (
739  m_pSubmitBlock ||
740  ( fOutputBioseqsFlags & fOutputBioseqsFlags_WrapInSeqEntry ) );
741  if( bUsingSeqEntry ) {
742  if( out_sObjectOpeningString.empty() ) {
743  // add an ASN.1 text header if we're not wrapped in
744  // something else
745  out_sObjectOpeningString += "Seq-entry ::= ";
746  }
747  if( bUsingBioseqSets ) {
748  out_sObjectOpeningString += "set ";
749  }
750  }
751 
752  // next level inward: is a Bioseq-set needed?
753  if( bUsingBioseqSets ) {
754  // add an ASN.1 text header if we're not wrapped in
755  // something else
756  if( out_sObjectOpeningString.empty() ) {
757  out_sObjectOpeningString += "Bioseq-set ::= ";
758  }
759  out_sObjectOpeningString += "{ seq-set { ";
760  out_sObjectClosingString = "} }" + out_sObjectClosingString;
761  }
762 }
763 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
USING_SCOPE(objects)
#define STRING_AND_VAR_PAIR(_value)
@ eAgpVersion_auto
auto-detect using the first gap line
Definition: agp_util.hpp:56
This gets called after each file is written, so the caller can do useful things like run asnval on ev...
virtual void Notify(const string &file)=0
TCompLengthMap m_mapComponentLength
void OutputBioseqs(CNcbiOstream &ostrm, const std::vector< std::string > &vecAgpFileNames, TOutputBioseqsFlags fFlags=0, size_t uMaxBioseqsToWrite=std::numeric_limits< size_t >::max()) const
Outputs the result from the AGP file names as ASN.1.
void x_SetUpObjectOpeningAndClosingStrings(string &out_sObjectOpeningString, string &out_sObjectClosingString, TOutputBioseqsFlags fOutputBioseqsFlags, bool bOnlyOneBioseqInAllAGPFiles) const
Each Bioseq written out will have the out_sObjectOpeningString before it and out_sObjectClosingString...
CRef< IIdTransformer > m_pIdTransformer
bool x_VerifyComponents(CConstRef< objects::CSeq_entry > new_entry, const string &id_str) const
@ fOutputBioseqsFlags_OneObjectPerBioseq
If set, each AGP Bioseq is written as its own object.
@ fOutputBioseqsFlags_WrapInSeqEntry
Bioseqs and Bioseq-sets should always be wrapped in a Seq-entry.
@ fOutputBioseqsFlags_DoNOTUnwrapSingularBioseqSets
Specify this if Bioseq-sets with just one Bioseq in them should _NOT_ be unwrapped into a Bioseq.
TChromosomeMap m_mapChromosomeNames
EError
The different kinds of errors that could occur while processing.
@ eError_SuggestUsingFastaIdOption
@ eError_WrongNumberOfSourceDescs
@ eError_SubmitBlockIgnoredWhenOneBigBioseqSet
@ eError_EntrySkippedDueToFailedComponentValidation
@ eError_ChromosomeFileBadFormat
@ eError_OutputDirNotFoundOrNotADir
@ eError_ChromosomeIsInconsistent
@ eError_ChromosomeMapIgnoredBecauseChromosomeSubsourceAlreadyInTemplate
@ eError_AGPLengthMismatchWithTemplateLength
void SetComponentsBioseqSet(CConstRef< objects::CBioseq_set > pComponentsBioseqSet)
Give a bioseq-set containing all the components pieces, for verification.
CRef< objects::CSeq_entry > x_InitializeCopyOfTemplate(const objects::CBioseq &agp_seq, string &out_unparsed_id_str, string &out_id_str) const
void x_ReadAgpEntries(const string &sAgpFileName, CAgpToSeqEntry::TSeqEntryRefVec &out_agp_entries) const
static TOutputFlags OutputFlagStringToEnum(const string &sEnumAsString)
Convert string to flag.
CAgpConverter(CConstRef< objects::CBioseq > pTemplateBioseq, const objects::CSubmit_block *pSubmitBlock=nullptr, TOutputFlags fOutputFlags=0, CRef< CErrorHandler > pErrorHandler=CRef< CErrorHandler >())
Constructor.
CRef< CErrorHandler > m_pErrorHandler
void OutputOneFileForEach(const string &sDirName, const std::vector< std::string > &vecAgpFileNames, const string &sSuffix=kEmptyStr, IFileWrittenCallback *pFileWrittenCallback=nullptr) const
Outputs the results of each Seq-entry (or Seq-submit if Submit-block was given) into its own file in ...
void x_SetCreateAndUpdateDatesToToday(CRef< objects::CSeq_entry > new_entry) const
void LoadChromosomeMap(CNcbiIstream &chromosomes_istr)
Input has 2 tab-delimited columns: id, then chromosome name.
CConstRef< objects::CBioseq > m_pTemplateBioseq
@ fOutputFlags_Fuzz100
For gaps of length 100, put an Int-fuzz = unk in the literal.
@ fOutputFlags_FastaId
Parse object ids (col. 1) as fasta-style ids if they contain '|'.
@ fOutputFlags_SetGapInfo
Set Seq-gap (gap type and linkage) in delta sequence.
@ fOutputFlags_AGPLenMustMatchOrig
When set, we give an error on AGP objects that don't have the same length as the original template.
int TOutputFlags
Bitwise-OR of EOutputFlags.
static EError ErrorStringToEnum(const string &sEnumAsString)
Convert string to EError enum.
void x_SetChromosomeNameInSourceSubtype(CRef< objects::CSeq_entry > new_entry, const string &unparsed_id_str) const
void SetChromosomesInfo(const TChromosomeMap &mapChromosomeNames)
Give the chromosomes to this object.
TOutputFlags m_fOutputFlags
CRef< objects::CSeq_entry > x_InitializeAndCheckCopyOfTemplate(const objects::CBioseq &agp_bioseq, string &out_id_str) const
CConstRef< objects::CSubmit_block > m_pSubmitBlock
Correctly print multiple errors and warnings on consecutive lines; suppress undesired or highly repet...
Definition: agp_util.hpp:650
virtual int ReadStream(CNcbiIstream &is, EFinalize eFinalize=eFinalize_Yes)
Read an AGP file from the given input stream.
Definition: agp_util.cpp:1084
This class is used to turn an AGP file into a vector of Seq-entry's.
vector< CRef< objects::CSeq_entry > > TSeqEntryRefVec
This is the way the results will be returned Each Seq-entry contains just one Bioseq,...
@ fSetSeqGap
Found gaps will not be given Seq-data such as Type and Linkage.
TSeqEntryRefVec & GetResult(void)
This gets the results found, but don't call before finalizing.
const CSeq_id * GetFirstId() const
Definition: Bioseq.cpp:271
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
bool IsSetLength(void) const
Definition: Bioseq.cpp:355
Definition: Date.hpp:53
void SetToTime(const CTime &time, EPrecision prec=ePrecision_second)
Definition: Date.cpp:57
@ ePrecision_day
Definition: Date.hpp:58
CDir –.
Definition: ncbifile.hpp:1695
CObjectOStreamAsn –.
Definition: objostrasn.hpp:53
ostream & AsString(ostream &s) const
Definition: Object_id.cpp:202
CRef –.
Definition: ncbiobj.hpp:618
Definition: Seq_entry.hpp:56
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
CSubmit_block –.
container_type::const_iterator const_iterator
Definition: map.hpp:53
const_iterator end() const
Definition: map.hpp:152
bool empty() const
Definition: map.hpp:149
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
static uch flags
static int failure
Definition: t0019.c:11
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NCBI_USER_THROW_FMT(message)
Throw a "user exception" with message processed as output to ostream.
Definition: ncbiexpt.hpp:724
static string MakePath(const string &dir=kEmptyStr, const string &base=kEmptyStr, const string &ext=kEmptyStr)
Assemble a path from basic components.
Definition: ncbifile.cpp:413
virtual bool Exists(void) const
Check if directory "dirname" exists.
Definition: ncbifile.hpp:4065
bool IsDir(EFollowLinks follow=eFollowLinks) const
Check whether a directory entry is a directory.
Definition: ncbifile.hpp:3946
const string & GetPath(void) const
Get entry path.
Definition: ncbifile.hpp:3910
virtual const CTypeInfo * GetThisTypeInfo(void) const =0
C * SerialClone(const C &src)
Create on heap a clone of the source object.
Definition: serialbase.hpp:512
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
int TLabelFlags
Definition: Seq_id.hpp:625
static SIZE_TYPE ParseFastaIds(CBioseq::TId &ids, const CTempString &s, bool allow_partial_failure=false)
Parse an entire set of |-delimited FASTA-style IDs, appending the results to IDS.
Definition: Seq_id.cpp:2603
@ fLabel_Version
Show the version.
Definition: Seq_id.hpp:615
@ fLabel_GeneralDbIsContent
For type general, use the database name as the tag and the (text or numeric) key as the content.
Definition: Seq_id.hpp:618
@ eContent
Untagged human-readable accession or the like.
Definition: Seq_id.hpp:605
void WriteObject(const CConstObjectInfo &object)
Definition: objostr.cpp:566
void Flush(void)
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:1684
TObjectType * GetPointer(void) THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3186
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
CTime CurrentTime(CTime::ETimeZone tz=CTime::eLocal, CTime::ETimeZonePrecision tzp=CTime::eTZPrecisionDefault)
Definition: ncbitime.hpp:2185
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
Definition: SubSource_.hpp:319
list< CRef< CSubSource > > TSubtype
Definition: BioSource_.hpp:145
void SetName(const TName &value)
Assign a value to Name data member.
Definition: SubSource_.hpp:359
TSubtype & SetSubtype(void)
Assign a value to Subtype data member.
Definition: BioSource_.hpp:545
const TLocal & GetLocal(void) const
Get the variant data.
Definition: Seq_id_.cpp:193
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
list< CRef< CSeq_entry > > TSeq_set
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
void ResetId(void)
Reset Id data member.
Definition: Bioseq_.cpp:54
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
list< CRef< CDelta_seq > > Tdata
Definition: Delta_ext_.hpp:89
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
TCreate_date & SetCreate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:478
TUpdate_date & SetUpdate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:500
void SetSub(TSub &value)
Assign a value to Sub data member.
void SetData(TData &value)
Assign a value to Data data member.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
Int4 delta(size_t dimension_, const Int4 *score_)
#define DEFINE_STATIC_ARRAY_MAP(Type, Var, Array)
Definition: static_set.hpp:888
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason for introducing this...
Definition: static_set.hpp:60
#define _ASSERT
Modified on Sun Apr 14 05:29:19 2024 by modify_doxy.py rev. 669887