100 struct SSimpleSubsourceInfo {
101 const char * m_pchSynopsis;
102 const char * m_pchUsage;
106 static const TSimpleSubsource sc_SimpleSubsource[] = {
128 virtual void Init(
void);
129 virtual int Run(
void);
130 virtual void Exit(
void);
152 const string & sTemplateLocation,
181 arg_desc->SetUsageContext(
GetArguments().GetProgramBasename(),
182 "AGP file converter program");
186 arg_desc->SetCurrentGroup(
"INPUT");
188 arg_desc->AddKey(
"template",
"LOCATION",
189 "The filename of a template Seq-entry or Seq-submit or Bioseq in "
190 "either ASN.1 text or ASN.1 binary or XML (autodetected). A series of Seqdescs "
191 "may optionally follow the main ASN.1 object."
192 "Alternatively, if the LOCATION looks reasonably like a GenBank identifier and "
193 "doesn't exist as a file, the template is loaded from genbank instead.",
195 arg_desc->AddFlag(
"keeptemplateannots",
196 "Unless this flag is set, the annots from the template are removed");
199 (1, 32766,
"AGP files to process",
202 arg_desc->SetCurrentGroup(
"OUTPUT");
204 arg_desc->AddOptionalKey(
"outdir",
"output_directory",
205 "Directory for output files "
206 "(defaults to current directory)",
208 arg_desc->AddOptionalKey(
"ofs",
"ofs",
209 "Output filename suffix "
210 "(default is \".ent\" for Seq-entry "
211 "or \".sqn\" for Seq-submit",
213 arg_desc->AddFlag(
"stdout",
"Write to stdout rather than files. This does not work for Seq-submits. "
214 "Implies -no_asnval.");
215 arg_desc->AddDefaultKey(
216 "output-type",
"ASN_OBJECT_TYPE",
217 "This lets you force what kind of object is used for output. "
218 " Forcing may cause some data to be thrown out (Example: "
219 "if input is a Seq-submit and you force the output to be a "
220 "Seq-entry, then the Seq-submit's data will be disregarded)",
223 arg_desc->SetConstraint(
"output-type",
225 "AUTO",
"Seq-entry"));
228 arg_desc->SetCurrentGroup(
"VALIDATION");
230 arg_desc->AddOptionalKey(
"components",
"components_file",
231 "Bioseq-set of components, used for "
234 arg_desc->AddFlag(
"no_asnval",
235 "Do not validate using asnval");
237 arg_desc->SetCurrentGroup(
"DESCRIPTORS");
239 arg_desc->AddOptionalKey(
"dl",
"definition_line",
240 "Definition line (title descriptor)",
243 arg_desc->AddOptionalKey(
"nt",
"tax_id",
244 "NCBI Taxonomy Database ID",
247 arg_desc->AddOptionalKey(
"on",
"org_name",
251 arg_desc->AddOptionalKey(
"sn",
"strain_name",
257 ITERATE( TSimpleSubsourceMap, simple_src_it, sc_SimpleSubsourceMap ) {
258 const string & sArgName = simple_src_it->first;
259 const SSimpleSubsourceInfo &
info = simple_src_it->second;
260 arg_desc->AddOptionalKey(sArgName,
info.m_pchSynopsis,
266 arg_desc->SetCurrentGroup(
"SEQ-IDS");
268 arg_desc->AddFlag(
"fasta_id",
"Parse object ids (col. 1) "
269 "as fasta-style ids if they contain '|'");
270 arg_desc->AddDefaultKey(
"general_id",
"general_db",
271 "if set to non-empty string, local ids for object seq-ids will "
272 "become general ids belonging to the given database",
275 arg_desc->SetCurrentGroup(
"OTHER");
277 arg_desc->AddFlag(
"fuzz100",
"For gaps of length 100, "
278 "put an Int-fuzz = unk in the literal");
280 arg_desc->AddOptionalKey(
"chromosomes",
"chromosome_name_file",
281 "Mapping of col. 1 names to chromsome "
282 "names, for use as SubSource",
284 arg_desc->AddFlag(
"gap-info",
285 "Set Seq-gap (gap type and linkage) in delta sequence");
286 arg_desc->AddFlag(
"len-check",
287 "Die if AGP's length does not match the length of the original template.");
298 if ((*db_tag)->GetDb() ==
"taxon") {
305 "expected exactly one");
334 args[
"template"].AsString(),
339 if( ! args[
"keeptemplateannots"] ) {
345 const string& dl = args[
"dl"].AsString();
348 if ((*desc)->IsTitle()) {
349 throw runtime_error(
"-dl given but template contains a title");
356 if (args[
"nt"] || args[
"on"] || args[
"sn"] ||
362 if ((*desc)->IsSource()) {
363 throw runtime_error(
"BioSource specified on command line but "
364 "template contains BioSource");
372 ITERATE( TSimpleSubsourceMap, simple_src_it, sc_SimpleSubsourceMap ) {
373 const string & sArgName = simple_src_it->first;
374 const SSimpleSubsourceInfo &
info = simple_src_it->second;
375 if( args[sArgName] ) {
378 sub_source->
SetName(args[sArgName].AsString());
390 if( args[
"fuzz100"] ) {
393 if( args[
"fasta_id"] ) {
396 if( args[
"gap-info"] ) {
399 if( args[
"len-check"] ) {
405 fAgpConvertOutputFlags,
409 const string & sGeneralIdDb = args[
"general_id"].AsString();
410 if( ! sGeneralIdDb.empty() ) {
411 class CLocalToGeneralIdTransformer :
415 CLocalToGeneralIdTransformer(
const string & sGeneralDb)
416 : m_sGeneralDb(sGeneralDb) { }
420 if( ! pSeqId || ! pSeqId->IsLocal() ) {
426 dbtag.
SetDb(m_sGeneralDb);
427 if( pSeqId->GetLocal().IsId() ) {
428 dbtag.
SetTag().SetId( pSeqId->GetLocal().GetId() );
429 }
else if( pSeqId->GetLocal().IsStr() ) {
430 dbtag.
SetTag().SetStr( pSeqId->GetLocal().GetStr() );
435 pSeqId->Assign( *pNewSeqId );
443 new CLocalToGeneralIdTransformer(sGeneralIdDb) );
451 if (args[
"components"]) {
459 if (args[
"chromosomes"]) {
464 vector<string> vecAgpFileNames;
465 for(
size_t idx = 1; idx <= args.
GetNExtra(); ++idx ) {
466 vecAgpFileNames.push_back( args[idx].AsString() );
468 throw runtime_error(
"AGP file not found: " + vecAgpFileNames.back() );
472 if( args[
"stdout"] ) {
476 if( args[
"output-type"].AsString() ==
"Seq-entry" ) {
482 fOutputBioseqsFlags );
484 if( ! args[
"outdir"] ) {
485 throw runtime_error(
"Please specify -stdout or -outdir");
490 args[
"outdir"].AsString(),
492 ( args[
"ofs"] ? args[
"ofs"].AsString() :
kEmptyStr ),
493 ( args[
"no_asnval"] ?
NULL : &asnval_runner ) );
507 const char * pchCommand =
"asnval";
508 const char * asnval_argv[] = {
518 for(
size_t idx = 0; asnval_argv[idx]; ++idx ) {
532 const string & sTemplateLocation,
543 if( !
CDirEntry(sTemplateLocation).IsFile() ) {
545 if( !
CRegexpUtil(sTemplateLocation).Exists(
"^[A-Za-z0-9_|]+(\\.[0-9]+)?$") ) {
546 throw runtime_error(
"This is not a valid sequence identifier: " + sTemplateLocation);
558 throw runtime_error(
"Invalid sequence identifier: " + sTemplateLocation);
587 "template file seems to be in an unsupported format: "
595 unique_ptr<CObjectIStream> pObjIstrm(
599 const string sType = pObjIstrm->ReadFileHeader();
602 if( sType == CSeq_entry::GetTypeInfo()->GetName() ) {
605 }
else if( sType == CBioseq::GetTypeInfo()->GetName() ) {
609 out_ent_templ->
SetSeq( *pBioseq );
610 }
else if( sType == CSeq_submit::GetTypeInfo()->GetName() ) {
611 pObjIstrm->Read(
ObjectInfo(*out_submit_templ),
616 throw runtime_error(
"Seq-submit template must contain "
617 "exactly one Seq-entry");
619 }
else if( sType == CSubmit_block::GetTypeInfo()->GetName() ) {
627 out_submit_templ->
SetSub(*submit_block);
633 out_submit_templ->
SetData().SetEntrys().push_back(ent);
636 "Submit-block. Object seems to be of type: " << sType);
640 if( out_submit_templ->
IsEntrys() ) {
641 out_ent_templ = out_submit_templ->
SetData().SetEntrys().front();
646 if (out_ent_templ->
IsSet()) {
647 unsigned int num_nuc_ents = 0;
651 if ((*ent_iter)->GetSeq().GetInst().IsNa()) {
653 tmp->Assign(**ent_iter);
658 desc->
Assign(**desc_iter);
659 tmp->SetSeq().SetDescr().Set().push_back(desc);
663 if (num_nuc_ents == 1) {
666 throw runtime_error(
"template contains "
668 +
" nuc. Seq-entrys; should contain 1");
683 if ( out_submit_templ->
IsEntrys() ) {
692 if( ! out_ent_templ->
IsSeq() ) {
693 throw runtime_error(
"The Seq-entry must be a Bioseq not a Bioseq-set.");
696 if( args[
"output-type"].AsString() ==
"Seq-entry" ) {
709 ITERATE( TSimpleSubsourceMap, simple_src_it, sc_SimpleSubsourceMap ) {
710 const string & sArgName = simple_src_it->first;
711 if( args[sArgName] ) {
728 if ( ! args[
"on"] && ! args[
"nt"] ) {
734 throw runtime_error(
"failure contacting taxonomy server");
742 const string& inp_taxname = args[
"on"].AsString();
754 throw runtime_error(
"taxonomy server lookup failed");
756 if (!on_result->GetIs_species_level()) {
757 throw runtime_error(
"supplied name is not species-level");
759 if (inp_orgref->
GetTaxname() != inp_taxname) {
760 cerr <<
"** Warning: taxname returned by server ("
761 << on_result->GetOrg().GetTaxname()
762 <<
") differs from that supplied with -on ("
763 << inp_taxname <<
")" << endl;
774 const string& inp_strain_name = args[
"sn"].AsString();
775 vector<string> strain_names;
780 strain_names.push_back((*mod)->GetSubname());
783 if (!(strain_names.size() == 1
784 && strain_names[0] == inp_strain_name))
786 cerr <<
"** Warning: strain name " << inp_strain_name
787 <<
" provided but server lookup yielded ";
788 if (strain_names.empty()) {
789 cerr <<
"no strain name" << endl;
791 cerr <<
NStr::Join(strain_names,
" and ") << endl;
799 nt_result = cl.
GetById(inp_taxid);
802 +
" is not species-level");
804 nt_result->
SetOrg().ResetSyn();
806 if (db_taxid != inp_taxid) {
807 cerr <<
"** Warning: taxid returned by server ("
809 <<
") differs from that supplied with -nt ("
810 << inp_taxid <<
")" << endl;
814 if (on_taxid != db_taxid) {
815 throw runtime_error(
"taxid from name lookup ("
817 +
") differs from that from "
863 cerr << sMessage << endl;
885 int main(
int argc,
const char* argv[])
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static TTaxId s_GetTaxid(const COrg_ref &org_ref)
int main(int argc, const char *argv[])
void remove_if(Container &c, Predicate *__pred)
Subclass this to override how errors are handled (example: to stop early on some kinds of errors)
This gets called after each file is written, so the caller can do useful things like run asnval on ev...
void OutputBioseqs(CNcbiOstream &ostrm, const std::vector< std::string > &vecAgpFileNames, TOutputBioseqsFlags fFlags=0, size_t uMaxBioseqsToWrite=std::numeric_limits< size_t >::max()) const
Outputs the result from the AGP file names as ASN.1.
@ fOutputBioseqsFlags_WrapInSeqEntry
Bioseqs and Bioseq-sets should always be wrapped in a Seq-entry.
@ fOutputBioseqsFlags_DoNOTUnwrapSingularBioseqSets
Specify this if Bioseq-sets with just one Bioseq in them should _NOT_ be unwrapped into a Bioseq.
EError
The different kinds of errors that could occur while processing.
@ eError_SuggestUsingFastaIdOption
@ eError_WrongNumberOfSourceDescs
@ eError_ComponentTooShort
@ eError_SubmitBlockIgnoredWhenOneBigBioseqSet
@ eError_EntrySkippedDueToFailedComponentValidation
@ eError_ChromosomeFileBadFormat
@ eError_OutputDirNotFoundOrNotADir
@ eError_ChromosomeIsInconsistent
@ eError_ChromosomeMapIgnoredBecauseChromosomeSubsourceAlreadyInTemplate
@ eError_ComponentNotFound
@ eError_AGPLengthMismatchWithTemplateLength
void SetComponentsBioseqSet(CConstRef< objects::CBioseq_set > pComponentsBioseqSet)
Give a bioseq-set containing all the components pieces, for verification.
void OutputOneFileForEach(const string &sDirName, const std::vector< std::string > &vecAgpFileNames, const string &sSuffix=kEmptyStr, IFileWrittenCallback *pFileWrittenCallback=nullptr) const
Outputs the results of each Seq-entry (or Seq-submit if Submit-block was given) into its own file in ...
void LoadChromosomeMap(CNcbiIstream &chromosomes_istr)
Input has 2 tab-delimited columns: id, then chromosome name.
@ fOutputFlags_Fuzz100
For gaps of length 100, put an Int-fuzz = unk in the literal.
@ fOutputFlags_FastaId
Parse object ids (col. 1) as fasta-style ids if they contain '|'.
@ fOutputFlags_SetGapInfo
Set Seq-gap (gap type and linkage) in delta sequence.
@ fOutputFlags_AGPLenMustMatchOrig
When set, we give an error on AGP objects that don't have the same length as the original template.
int TOutputFlags
Bitwise-OR of EOutputFlags.
void SetIdTransformer(IIdTransformer *pIdTransformer)
When this reads an id, it will use the supplied transformer (if any) to change the CSeq_id.
virtual void Notify(const string &file)
virtual void HandleError(CAgpConverter::EError eError, const string &sMessage) const
Default is to print to cerr, but feel free to override in a subclass.
virtual void Init(void)
Initialize the application.
void x_HandleTaxArgs(CRef< CSeqdesc > source_desc)
virtual void Exit(void)
Cleanup on application exit.
CRef< CCustomErrorHandler > m_pCustomErrorHandler
CAgpconvertApplication(void)
bool x_IsAnySimpleSubsourceArgSet(void)
virtual int Run(void)
Run the application.
void x_LoadTemplate(const string &sTemplateLocation, CRef< CSeq_entry > &out_ent_templ, CRef< CSeq_submit > &out_submit_templ)
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
@OrgMod.hpp User-defined methods of the data storage class.
bool IsEntrys(void) const
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
CRef< CTaxon2_data > GetById(TTaxId tax_id)
CConstRef< CTaxon2_data > LookupMerge(COrg_ref &inp_orgRef, string *psLog=0, TOrgRefStatus *pStatusOut=0)
Operators to edit gaps in sequences.
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
SStrictId_Tax::TId TTaxId
Taxon id type.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
#define TAX_ID_FROM(T, value)
@ fHideLogfile
Hide log file description.
@ fHideConffile
Hide configuration file description.
@ fHideVersion
Hide version description.
@ eNoOwnership
No ownership is assumed.
size_t GetNExtra(void) const
Get the number of unnamed positional (a.k.a. extra) args.
@ eRequires
One argument requires another.
@ eExcludes
One argument excludes another.
@ eInputFile
Name of file (must exist and be readable)
@ eString
An arbitrary string.
@ eInteger
Convertible into an integer number (int or Int8)
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
@ eDS_Default
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
#define NCBI_USER_THROW_FMT(message)
Throw a "user exception" with message processed as output to ostream.
static string QuoteArg(const string &arg)
Quote argument.
static CResult SpawnVP(EMode mode, const char *cmdname, const char *const *argv)
Spawn a new process with variable number of command-line arguments and find file to execute from the ...
@ eWait
Suspends calling thread until execution of new process is complete (synchronous operation).
virtual bool Exists(void) const
Check existence of file.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
#define MSerial_AsnText
I/O stream manipulators.
ESerialDataFormat
Data file format.
@ eSerial_AsnText
ASN.1 text.
@ eSerial_AsnBinary
ASN.1 binary.
pair< TObjectPtr, TTypeInfo > ObjectInfo(C &obj)
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
void Reset(void)
Reset reference object.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
void SetOrg(TOrg &value)
Assign a value to Org data member.
void SetName(const TName &value)
Assign a value to Name data member.
TSubtype & SetSubtype(void)
Assign a value to Subtype data member.
void SetTag(TTag &value)
Assign a value to Tag data member.
void SetDb(const TDb &value)
Assign a value to Db data member.
const TMod & GetMod(void) const
Get the Mod member data.
vector< CRef< CDbtag > > TDb
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
const TDb & GetDb(void) const
Get the Db member data.
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
list< CRef< COrgMod > > TMod
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
TSub & SetSub(void)
Select the variant.
TGeneral & SetGeneral(void)
Select the variant.
const TSeq & GetSeq(void) const
Get the variant data.
const TDescr & GetDescr(void) const
Get the Descr member data.
const TSet & GetSet(void) const
Get the variant data.
bool IsSeq(void) const
Check if variant Seq is selected.
bool IsSet(void) const
Check if variant Set is selected.
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
list< CRef< CSeq_entry > > TSeq_set
TSeq & SetSeq(void)
Select the variant.
list< CRef< CSeqdesc > > Tdata
TId & SetId(void)
Assign a value to Id data member.
void SetPub(TPub &value)
Assign a value to Pub data member.
TTitle & SetTitle(void)
Select the variant.
TPub & SetPub(void)
Select the variant.
void ResetAnnot(void)
Reset Annot data member.
const Tdata & Get(void) const
Get the member data.
void SetInst(TInst &value)
Assign a value to Inst data member.
TSource & SetSource(void)
Select the variant.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
const TDescr & GetDescr(void) const
Get the Descr member data.
@ eRepr_raw
continuous sequence
const TCit & GetCit(void) const
Get the Cit member data.
void SetSub(TSub &value)
Assign a value to Sub data member.
const TEntrys & GetEntrys(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
void SetData(TData &value)
Assign a value to Data data member.
const TSub & GetSub(void) const
Get the Sub member data.
bool IsEntrys(void) const
Check if variant Entrys is selected.
TIs_species_level GetIs_species_level(void) const
Get the Is_species_level member data.
void SetOrg(TOrg &value)
Assign a value to Org data member.
const TOrg & GetOrg(void) const
Get the Org member data.
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
Defines a portable execute class.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
Defines: CTimeFormat - storage class for time format.
@ eError
An error was encountered while trying to send request or to read and to process the reply.
#define DEFINE_STATIC_ARRAY_MAP(Type, Var, Array)
bool operator()(CRef< COrgMod > mod)
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
C++ wrappers for the Perl-compatible regular expression (PCRE) library.