// Names of the hit formats. Elsewhere in this file the values of the
// "fmt_in" / "fmt_out" arguments are compared against these constants:
// g_m8 selects whether alignments are parsed (fmt_out != g_m8), g_AsnTxt
// selects the text flavour of ASN output, and ASN output is rejected
// unless the input is also g_AsnTxt or g_AsnBin.
58 const string g_m8(
"m8"), g_AsnTxt(
"asntxt"), g_AsnBin(
"asnbin");
// Values allowed for the "mode" argument (both are registered on its
// constraint). The argument description reads: "Specify whether the hits
// should be resolved in pairs or as a single set."
60 const string kMode_Pairwise (
"pairwise");
61 const string kMode_Multiple (
"multiple");
// Values of the "ut" (uniqueness type) argument, described as
// "uniqueness type (strict, query, or subject)".
// Note that kBoth holds the string "strict", not "both".
// NOTE(review): some call sites compare args["ut"] against the literals
// "query" / "subject" instead of kQuery / kSubj - keep them in sync if
// these values ever change.
66 const string kBoth(
"strict");
67 const string kQuery(
"query");
68 const string kSubj(
"subject");
79 argdescr->AddDefaultKey(
"mode",
"mode",
80 "Specify whether the hits should be resolved in pairs "
81 "or as a single set.",
85 argdescr->AddDefaultKey(
"min_idty",
"min_idty",
86 "Minimal input hit identity",
89 argdescr->AddDefaultKey(
"min_len",
"min_len",
90 "Minimal input hit length",
93 argdescr->AddDefaultKey(
"retain_overlap",
"retain_overlap",
94 "Min overlap to retain in kilobases (0=OFF)",
97 argdescr->AddDefaultKey(
"fmt_in",
"fmt_in",
"Input format",
100 argdescr->AddOptionalKey(
"file_in",
"file_in",
"Input file (stdin otherwise)",
104 argdescr->AddFlag(
"sas",
"Assume seq-align-set as the top-level structure "
105 "for the input ASN hits",
true);
107 argdescr->AddDefaultKey(
"merge",
"merge",
108 "Merge abutting alignments unless the merged "
109 "alignment overlap length ratio is greater "
110 "than this parameter. Any negative value will "
115 argdescr->AddOptionalKey(
"constraints",
"constraints",
116 "Binary ASN file with constraining alignments",
120 argdescr->AddOptionalKey(
"file_out",
"file_out",
"Output file (stdout otherwise)",
124 argdescr->AddOptionalKey(
"m",
"m",
125 "Text description/comment to add to the output",
128 argdescr->AddDefaultKey(
"fmt_out",
"fmt_out",
"Output format",
131 argdescr->AddDefaultKey(
"hits_per_chunk",
"hits_per_chunk",
132 "Input is split into chunks with the number of hits "
133 "per chunk limited by this parameter.",
137 argdescr->AddDefaultKey(
"coord_margin",
"coord_margin",
138 "Larger values of this argument will result in less "
139 "RAM used but longer running times.",
143 argdescr->AddOptionalKey(
"ids",
"ids",
"Table to rename sequence IDs.",
146 argdescr->AddDefaultKey(
"ut",
"uniqueness_type",
147 "uniqueness type (strict, query, or subject)",
152 argdescr->SetConstraint(
"ut", unique_type);
154 argdescr->AddFlag(
"keep_strands",
155 "Keep plus-plus strands"
158 argdescr->AddFlag(
"no_output_constraint",
159 "Do not output constraints"
163 argdescr->SetConstraint(
"fmt_in", constrain_format);
164 argdescr->SetConstraint(
"fmt_out", constrain_format);
167 argdescr->SetConstraint(
"min_len", constrain_minlen);
170 argdescr->SetConstraint(
"min_idty", constrain_minidty);
173 argdescr->SetConstraint(
"merge", constrain_merge);
176 constrain_mode->
Allow(kMode_Pairwise)->
Allow(kMode_Multiple);
177 argdescr->SetConstraint(
"mode", constrain_mode);
188 string ctgid, accver;
190 if(ctgid.size() == 0) {
194 if(accver.size() == 0) {
197 m_IDs[ctgid] = accver;
202 build_ids.
m_id[0] = build_ids.
m_id[1] = ctgid;
217 const string fmt_in = args[
"fmt_in"].AsString();
218 const string fmt_out = args[
"fmt_out"].AsString();
219 const THit::TCoord min_len = args[
"min_len"].AsInteger();
220 const double min_idty = args[
"min_idty"].AsDouble();
222 CNcbiIstream& istr = args[
"file_in"]? args[
"file_in"].AsInputFile(): cin;
228 static string firstline;
231 if(one_pair && firstline.size()) {
234 if(hit->GetIdentity() >= min_idty && hit->GetLength() >= min_len) {
235 phitrefs->push_back(hit);
236 id_query = hit->GetQueryId();
237 id_subj = hit->GetSubjId();
255 id_query = hit->GetQueryId();
256 id_subj = hit->GetSubjId();
258 else if(
false == id_query -> Match(*(hit->GetQueryId()))
259 ||
false == id_subj -> Match(*(hit->GetSubjId())) )
261 if(phitrefs->size()) {
266 id_query = hit->GetQueryId();
267 id_subj = hit->GetSubjId();
272 if(hit->GetIdentity() >= min_idty && hit->GetLength() >= min_len) {
273 phitrefs->push_back(hit);
280 const bool parse_aln = fmt_out != g_m8;
284 unique_ptr<CObjectIStream>
in (in_ptr);
286 const bool assume_sas (args[
"sas"]);
288 while (!
in->EndOfData()) {
298 phitrefs, parse_aln, min_len, min_idty);
311 if(one_pair && phitrefs->size()) {
316 static TStringSet idtags;
318 const string strid_query (phitrefs->front()->GetId(0)->GetSeqIdString(
true));
319 const string strid_subj (phitrefs->front()->GetId(1)->GetSeqIdString(
true));
320 const string tag (strid_subj +
"$_#_&" + strid_query);
321 if(idtags.end() != idtags.find(
tag)) {
323 "In pairwise mode input hits must be collated "
324 "by query and subject.");
337 const double& min_idty)
const
343 if(
r.GetTo() -
r.GetFrom() >= min_len) {
346 if(hit->GetIdentity() >= min_idty) {
347 if(hit->GetQueryStrand() ==
false) {
350 phitrefs->push_back(hit);
359 const string fmt = args[
"fmt_out"].AsString();
361 CNcbiOstream& ostr = args[
"file_out"]? args[
"file_out"].AsOutputFile(): cout;
363 string comment (args[
"m"]? args[
"m"].AsString():
"");
367 if(comment.size() > 0) {
368 ostr <<
"# " << comment << endl;
376 <<
"\tNumGapOpenings"
386 const THit& hit = **ii;
395 const bool fmt_txt (fmt == g_AsnTxt);
397 const THit& h = **ii;
399 bool no_output_constraint = args[
"no_output_constraint"].HasValue();
400 if (no_output_constraint && h.
GetScore() >= kBigDbl) {
432 bool keep_strands = args[
"keep_strands"].HasValue();
437 vector< CRef< CSeq_id > > &ids = ds->
SetIds();
438 for(
Uint1 where = 0; where < 2; ++where) {
441 id->Assign(*h.
GetId(where));
450 score->SetId().SetStr(
"reciprocity");
452 if (h.
GetScore() > kBigDbl || args[
"ut"].AsString() == kBoth)
457 }
else if (args[
"ut"].AsString() ==
kQuery) {
464 cerr <<
"Error adding reciprocity" << endl;
467 seq_align->
SetScore().push_back(score);
470 seq_align->
SetSegs().SetDenseg(*ds);
472 align_list.push_back(seq_align);
475 if(comment.size() > 0) {
488 cerr <<
"Error writing output file" << endl;
497 return lhs->GetScore() > rhs->GetScore();
505 const bool mode_multiple ( args[
"mode"].AsString() == kMode_Multiple );
506 const string fmt_in ( args[
"fmt_in"].AsString() );
507 const string fmt_out ( args[
"fmt_out"].AsString() );
508 const double maxlenfr (args[
"merge"].AsDouble());
510 if((fmt_out == g_AsnTxt || fmt_out == g_AsnBin) &&
511 (fmt_in != g_AsnTxt && fmt_in != g_AsnBin))
515 "For ASN output, input must also be in ASN");
518 if( mode_multiple ==
false && (args[
"ids"] || args[
"constraints"]
519 || fmt_in == g_AsnTxt || fmt_in == g_AsnBin ))
523 "Invalid parameter combination - "
524 "some options are not yet supported in pairwise mode.");
554 const THit::TCoord min_len (args[
"min_len"].AsInteger());
555 const double min_idty (args[
"min_idty"].AsDouble());
556 const size_t margin (args[
"coord_margin"].AsInteger());
557 const THit::TCoord retain_overlap (1024 * args[
"retain_overlap"].AsInteger());
560 if (args[
"ut"].AsString() ==
"query") {
562 }
else if (args[
"ut"].AsString() ==
"subject") {
572 hits.begin(), hits.end(),
581 copy(hits.begin(), hits.end(), back_inserter(
all));
582 copy(hits_new.begin(), hits_new.end(), back_inserter(
all));
587 cerr <<
"Error running x_DoPairwise" << endl;
599 const string fmt_in = args[
"fmt_in"].AsString();
600 const string fmt_out = args[
"fmt_out"].AsString();
601 const THit::TCoord min_len = args[
"min_len"].AsInteger();
602 const double min_idty = args[
"min_idty"].AsDouble();
603 const THit::TCoord retain_overlap = 1024 * args[
"retain_overlap"].AsInteger();
604 const size_t margin (args[
"coord_margin"].AsInteger());
607 if (args[
"ut"].AsString() ==
"query") {
609 }
else if (args[
"ut"].AsString() ==
"subject") {
618 if(args[
"constraints"]) {
623 copy(restraint.begin(), restraint.end(), back_inserter(
all));
627 const size_t M = args[
"hits_per_chunk"].AsInteger();
628 const size_t dim =
all.size();
629 size_t m =
min(dim, M);
631 const THitRefs::iterator ii_beg =
all.begin(), ii_end =
all.end();
632 THitRefs::iterator ii_hi = ii_beg, ii = ii_beg;
637 THitRefs::iterator ii_dst = ii + m;
638 if(ii_dst > ii_end) {
643 copy(ii, ii_dst, ii_hi);
644 ii_hi += ii_dst - ii;
659 THitRefs::iterator ii_hi0 = ii_hi;
661 THitRefs::iterator jj = hits_new.begin(), jje = hits_new.end();
662 for(;jj != jje && ii_hi != ii_hi0; *ii_hi++ = *jj++);
664 LOG_POST(
"Warning: space from eliminated alignments "
665 "not enough for all splits.");
670 cerr <<
"Error in x_DoMultiple" << endl;
673 all.erase(ii_hi, ii_end);
697 for(
Uint1 where = 0; where < 2; ++where) {
717 id->Assign(*(hit->GetId(where)));
720 const string ctgid =
string(
"lcl|") + im->second.m_id[where];
723 hit->SetId(where,
id);
726 if(hit->GetQueryStrand() ==
false) {
730 hit->SetScore(kBigDbl);
734 if(hit->GetLength() > maxlen) {
735 maxlen = hit->GetLength();
738 float score_factor = 0.25 / maxlen;
742 h->SetScore(h->GetScore() * (1 + score_factor * h->GetLength()));
744 if (args[
"no_output_constraint"].
HasValue()) {
762 int main(
int argc,
const char* argv[])
User-defined methods of the data storage class.
void remove_if(Container &c, Predicate *__pred)
bool GetQueryStrand(void) const
TCoord GetQueryStart(void) const
static string s_RunLengthDecode(const string &in)
TCoord GetSubjStart(void) const
bool GetSubjStrand(void) const
const TId & GetId(Uint1 where) const
const TTranscript & GetTranscript(void) const
list< CRef< objects::CSeq_align > > TSeqAlignList
void x_LoadIDs(CNcbiIstream &istr)
void x_LoadConstraints(CNcbiIstream &istr, THitRefs &all)
void x_DoPairwise(THitRefs *pall)
void x_ReadInputHits(THitRefs *phitrefs, bool one_pair=false)
virtual void Exit()
Cleanup on application exit.
vector< THitRef > THitRefs
void x_IterateSeqAlignList(const TSeqAlignList &sa_list, THitRefs *phitrefs, bool parse_aln, const THit::TCoord &min_len, const double &min_idty) const
void x_DumpOutput(const THitRefs &hitrefs)
void x_DoMultiple(THitRefs *pall)
virtual int Run()
Run the application.
virtual void Init()
Initialize the application.
float GetScore(void) const
void FromTranscript(TSeqPos query_start, ENa_strand query_strand, TSeqPos subj_start, ENa_strand subj_strand, const string &transcript)
Initialize from pairwise alignment transcript (a string representation produced by CNWAligner)
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
void AddComment(const string &comment)
container_type::const_iterator const_iterator
container_type::iterator iterator
const_iterator end() const
const_iterator find(const key_type &key) const
static void s_RunGreedy(typename THitRefs::iterator hri_beg, typename THitRefs::iterator hri_end, THitRefs *phits_new, TCoord min_hit_len=100, double min_hit_idty=.9, TCoord margin=1, TCoord retain_overlap=0, EUnique_type unique_type=e_Strict)
EUnique_type
Multiple-sequences greedy alignment uniquification algorithm.
static void s_MergeAbutting(typename THitRefs::iterator hri_beg, typename THitRefs::iterator hri_end, const double &maxlenfr, THitRefs *pout)
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideLogfile
Hide log file description.
@ fHideConffile
Hide configuration file description.
@ fHideVersion
Hide version description.
CArgAllow_Strings * Allow(const string &value)
Add allowed string values.
@ fBinary
Open as binary file; for eInputFile, eOutputFile, eIOFile.
@ eInputFile
Name of file (must exist and be readable)
@ eDouble
Convertible into a floating point number (double)
@ eString
An arbitrary string.
@ eOutputFile
Name of file (must be writable)
@ eInteger
Convertible into an integer number (int or Int8)
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
#define MSerial_AsnBinary
#define MSerial_AsnText
I/O stream manipulators.
@ eSerial_AsnText
ASN.1 text.
@ eSerial_AsnBinary
ASN.1 binary.
const string AsFastaString(void) const
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
TGi GetGiForAccession(const string &acc, CScope &scope, EGetIdType flags=0)
Given an accession string retrieve the GI id.
string GetAccessionForGi(TGi gi, CScope &scope, EAccessionVersion use_version=eWithAccessionVersion, EGetIdType flags=0)
Retrieve the accession for a given GI.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
uint8_t Uint1
1-byte (8-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
TScore & SetScore(void)
Assign a value to Score data member.
const TStarts & GetStarts(void) const
Get the Starts member data.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
void ResetStrands(void)
Reset Strands data member.
void SetType(TType value)
Assign a value to Type data member.
TDim GetDim(void) const
Get the Dim member data.
TNumseg GetNumseg(void) const
Get the Numseg member data.
TIds & SetIds(void)
Assign a value to Ids data member.
const TDisc & GetDisc(void) const
Get the variant data.
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
@ eType_partial
mapping pieces together
ENa_strand
strand of nucleic acid
TGi GetGi(void) const
Get the variant data.
bool IsGi(void) const
Check if variant Gi is selected.
void SetData(TData &value)
Assign a value to Data data member.
list< CRef< CSeq_align > > TAlign
const TAlign & GetAlign(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
bool s_PHitRefScore(const CAppHitFilter::THitRef &lhs, const CAppHitFilter::THitRef &rhs)
int main(int argc, const char *argv[])
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is smart and slim</td> n<td> orig</td> n</tr> n<tr> n<td> last_modified</td> n<td> optional</td> n<td> Integer</td> n<td class=\"description\"> The blob last modification If provided then the exact match will be requested with n the Cassandra storage corresponding field value</td> n<td> Positive integer Not provided means that the most recent match will be selected</td> n<td></td> n</tr> n<tr> n<td> use_cache</td> n<td> optional</td> n<td> String</td> n<td class=\"description\"> The option controls if the Cassandra LMDB cache and or database should be used It n affects the seq id resolution step and the blob properties lookup step The following n options are BIOSEQ_INFO and BLOB_PROP at all
constexpr auto sort(_Init &&init)
#define GetProgramName
Avoid name clash with the NCBI C Toolkit.
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
CRef< objects::CObjectManager > om