NCBI C++ ToolKit
xcompareannotsdemo.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: xcompareannotsdemo.cpp 92270 2020-12-29 19:42:29Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Alex Astashyn
27  *
28  * File Description:
29  * Annotation cross-comparator
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
35 #include <vector>
36 #include <map>
37 #include <list>
38 
39 
40 #include <corelib/ncbistd.hpp>
41 #include <corelib/ncbiapp.hpp>
42 #include <corelib/ncbienv.hpp>
43 #include <corelib/ncbiargs.hpp>
44 #include <corelib/ncbiobj.hpp>
45 #include <corelib/ncbiexpt.hpp>
47 
48 
50 #include <objmgr/scope.hpp>
52 
60 
61 #include <objects/seq/MolInfo.hpp>
64 #include <util/compress/stream.hpp>
65 #include <util/compress/zlib.hpp>
66 
69 
71 
74 
75 //#include <db/bdb/bdb_blobcache.hpp>
76 #include <dbapi/driver/drivers.hpp>
77 
78 #include <limits>
79 
82 
83 ///////////////////////////////////////////////////////////////////////////////
84 
85 
86 //Remap intervals individually and compute
87 //the quality of the remapping (identity)
89 {
90 public:
91 
93  {
94  public:
96  CGappedRange(TRange range = TRange(), unsigned gaps = 0) : m_range(range), m_gaps(gaps) {}
// Combining constructor: builds a gapped range out of two existing ones.
// The resulting gap count is the sum of both inputs' gap counts plus the
// amount by which this object's m_range length exceeds the sum of the two
// input lengths.
// NOTE(review): source line 102 is missing from this listing. Presumably it
// assigns m_range to the range spanning r1 and r2 before m_gaps is computed;
// as shown, m_range is never set here - confirm against the upstream file.
97  CGappedRange(const CGappedRange& r1, const CGappedRange& r2)
98  {
99  TSeqPos r1_len = r1.m_range.GetLength();
100  TSeqPos r2_len = r2.m_range.GetLength();
101 
103  m_gaps = r1.m_gaps + r2.m_gaps + (m_range.GetLength() - (r1_len + r2_len));
104  }
105 
// Renders the range as a tab-separated debug string starting with "GRange{ ":
// the range start, the range end and the gap count are visible below.
// NOTE(review): source lines 110 and 112 are missing from this listing, so
// at least one more field and the end of the expression (closing text and
// the terminating ';') are not visible here - confirm against upstream.
106  string ToString() const
107  {
108  return "GRange{ " + NStr::UInt8ToString(m_range.GetFrom()) + "\t"
109  + NStr::UInt8ToString(m_range.GetTo()) + "\t"
111  + NStr::UInt8ToString(m_gaps) + "\t"
113 
114  }
115 
116  double GetIdentity() const
117  {
118  return m_range.GetLength() == 0 ? 0 : 1.0 - static_cast<double>(m_gaps) / m_range.GetLength();
119  }
120 
122  unsigned m_gaps;
123  };
124 
125 
126  /* this represents the quality of the remapping;
127  */
129  TSeqPos qry_len,
130  TSeqPos tgt_len,
131  TSeqPos aln_len,
132  double qry_bias = 0.5)
133  {
134  double d_aln_len = static_cast<double>(aln_len);
135  return (qry_len == 0 ? 0.0 : d_aln_len/(qry_len) * qry_bias)
136  + (tgt_len == 0 ? 0.0 : d_aln_len/(tgt_len) * (1.0 - qry_bias));
137  }
138 
139  CGappedRange CollapseRanges(list<CGappedRange>& ranges, TSeqPos query_len, double* identity_out = NULL)
140  {
141  CGappedRange r;
142  int pos = 0;
143  int best_ending_pos = ranges.size() - 1;
144 
145  r = CGappedRange();
146  pos = ranges.size() - 1;
147  int best_beginning_pos = 0;
148 
149 
150  r = CGappedRange();
151  pos = 0;
152  ITERATE(list<CGappedRange>, it, ranges) {
153  if(pos >= best_beginning_pos && pos <= best_ending_pos) {
154  r = CGappedRange(r, *it);
155  }
156  pos++;
157  }
158 
159  if(identity_out != NULL) {
160  unsigned aligned_len = r.m_range.GetLength() - r.m_gaps;
161  unsigned d = (query_len + r.m_range.GetLength() - aligned_len);
162  *identity_out = d == 0 ? 0 : static_cast<double>(aligned_len) / d;
163  }
164 
165  return r;
166  }
167 
168 
169  //we might need to remap locations with seq-ids specified as accession-noversion.
170  //We must treat such seq-id as the same version in the mapper's underlying alignment
171  //as opposed to latest-version-in-id. We keep the from_id that contains
172  //the accession-version in the alignment query, so if we encounter
173  //a versionless seq-id location, we'll treat it as that versioned seq-id
175  CSeq_loc_Mapper& mapper,
176  CScope& scope,
177  bool is_spliced = false, //if true, do not collapse remapped ranges
178  const CSeq_id* from_id = NULL,
179  bool strip_versions = false)
180  : m_scope(scope)
181  , m_is_spliced(is_spliced)
182  , m_from_id(new CSeq_id)
184  , m_mapper(mapper)
185  , m_strip_versions(strip_versions)
186  {
187  if(from_id && !from_id->IsLocal()) {
188  m_from_id->Assign(*from_id);
190  }
191  }
192 
193 
// Remaps `loc` interval by interval through m_mapper and returns the result
// as a Seq-loc mix.
//
// loc             - location to remap; a copy is converted to a mix and
//                   iterated one range at a time.
// mapped_identity - optional out-parameter. Set to 1 when m_is_spliced;
//                   otherwise set to
//                   aligned_total / (query_total + collapsed_total - aligned_total),
//                   or 0 when that denominator is 0.
//
// Intervals whose mapping is null or empty are skipped (their query length
// is still counted in query_len_total).
194  CRef<CSeq_loc> Map(const CSeq_loc& loc, double* mapped_identity = NULL)
195  {
196  CRef<CSeq_loc> temp_loc(new CSeq_loc);
197  CRef<CSeq_loc> mapped_loc(new CSeq_loc);
198  mapped_loc->SetMix();
199 
200  temp_loc->Assign(loc);
201  temp_loc->ChangeToMix();
202 
    // Running totals over all intervals; only used for the identity value.
203  TSeqPos query_len_total(0);
204  TSeqPos aligned_len_total(0);
205  TSeqPos collapsed_len_total(0);
206 
207  for(CSeq_loc_CI it(*temp_loc); it; ++it) {
208  CConstRef<CSeq_loc> ci_loc = it.GetRangeAsSeq_loc();
209  TSeqPos query_len = sequence::GetLength(*ci_loc, NULL); //may belong to another scope
210  query_len_total += query_len;
211 
212  CRef<CSeq_loc> mapped_interval = m_mapper.Map(*ci_loc);
213 
214  if(mapped_interval->IsNull() || mapped_interval->IsEmpty()) {
215  continue;
216  }
217 
    // NOTE(review): source line 220 (the middle argument of this call,
    // presumably the merge flags) is missing from this listing.
218  CRef<CSeq_loc> mapped_interval_merged = sequence::Seq_loc_Merge(
219  *mapped_interval,
221  &m_scope);
222  //TSeqPos remapped_len = sequence::GetLength(*mapped_interval_merged, NULL);
223 
224 
225  //in case of spliced alignments we remap an interval to
226  //a gapped loc; In case of non-spliced alignments we
227  //remap each interval individually and collapse it to single range.
228  //In case of a spliced alignment it would collapse the whole thing
229  //into one range, so we have to process them differently
230  if(m_is_spliced) {
231  mapped_loc->Add(*mapped_interval_merged);
232  continue;
233  }
234 
235 
236 
    // Non-spliced path: turn every piece of the merged mapping into a
    // zero-gap CGappedRange and collapse them into one range.
237  list<CGappedRange> mapped_ranges_list;
238  for(CSeq_loc_CI it2(*mapped_interval_merged); it2; ++it2) {
239  mapped_ranges_list.push_back(CGappedRange(it2.GetRange()));
240  }
241 
242 
243 
    // NOTE(review): source line 244 is missing from this listing. It
    // presumably declares `r` and opens the call, i.e.
    // `CGappedRange r = CollapseRanges(` - confirm against upstream.
245  mapped_ranges_list,
246  query_len);
247 
248  TSeqPos collapsed_len = r.m_range.GetLength();
249  TSeqPos aligned_len = collapsed_len - r.m_gaps;
250 
251 
252  aligned_len_total += aligned_len;
253  collapsed_len_total += collapsed_len;
254 
    // NOTE(review): this fires when more bases were aligned than the query
    // interval holds. The message text says "non-redundant", which looks
    // like it was meant to say "redundant" - confirm before changing.
255  if(aligned_len > query_len) {
256  //The identity formula relies on non-redundant remapping
257  string s2 = "";
258  ci_loc->GetLabel(&s2);
259  string s3 = "";
260  mapped_interval->GetLabel(&s3);
261  ERR_POST(Warning << "Detected non-redundant remapping from\n"
262  //<< s0
263  //<< "\nto\n"
264  //<< s1
265  << "\nsegment\n"
266  << s2
267  << "\nremapped segment\n"
268  << s3
269  << "\naligned len: " << aligned_len
270  << "\ncollapsed len: " << collapsed_len
271  );
272  }
273 
    // Emit the collapsed range as a single interval that takes its strand
    // and seq-id from the merged mapping.
274  CRef<CSeq_loc> mapped_interval_collapsed(new CSeq_loc);
275  mapped_interval_collapsed->SetInt().SetFrom(r.m_range.GetFrom());
276  mapped_interval_collapsed->SetInt().SetTo(r.m_range.GetTo());
277  mapped_interval_collapsed->SetInt().SetStrand(sequence::GetStrand(*mapped_interval_merged, &m_scope));
278 
279  CRef<CSeq_id> id(new CSeq_id);
280  id->Assign(sequence::GetId(*mapped_interval_merged, &m_scope));
281  mapped_interval_collapsed->SetInt().SetId(*id);
282 
283  mapped_loc->Add(*mapped_interval_collapsed);
284  }
285 
286 
287  if(mapped_identity != NULL) {
288  if(m_is_spliced) {
289  *mapped_identity = 1;
290  } else {
291  TSeqPos d = query_len_total + collapsed_len_total - aligned_len_total;
292  *mapped_identity = d == 0 ? 0 : static_cast<double>(aligned_len_total) / d;
293  }
294 
295  }
296 
297  return mapped_loc;
298  }
299 
300 
301 private:
308 };
309 
310 ///////////////////////////////////////////////////////////////////////////////
311 ///////////////////////////////////////////////////////////////////////////////
313 {
314 public:
315  CGbScopeLoader(CScope& scope) : m_scope(scope) {};
316  virtual ~CGbScopeLoader() {}
317 
319  {
321  return true;
322  }
323 private:
325 };
326 
327 bool IsManualScope(CScope& scope) {
328  CScope::TTSE_Handles handles;
329  scope.GetAllTSEs(handles, CScope::eManualTSEs);
330  int k = handles.size();
331  return k != 0;
332 }
333 
// Reports whether `scope` knows about sequence `id`, decided by whether a
// feature iterator over the whole-sequence location of `id` yields at least
// one feature.
//
// scope - scope to search.
// id    - sequence id to look for; it is copied, not retained.
// Returns true if CFeat_CI produces at least one feature, false otherwise.
334 bool IsInScope(CScope& scope, const CSeq_id& id)
335 {
336  //if(scope.GetBioseqHandle(id)) {
337  // return true;
338  //}
339 
340  //this used to be implemented by checking if bioseq handle could be retrieved,
341  //(as above)
342  //but depending on what kind of scope we're dealing with and how it is indexed,
343  //this may not be reliable. So instead we check by trying to see if we can
344  //find any features at all
345  SAnnotSelector sa;
346 
347  sa.SetResolveAll();
348  if(IsManualScope(scope)) {
349  sa.SetSearchUnresolved();
350  }
351 
    // One hit is enough to answer the question, so cap the result size at 1.
352  sa.SetMaxSize(1);
353 
    // NOTE(review): source lines 354 and 356-359 are missing from this
    // listing; further selector settings may have been present here.
355 
360 
361  CRef<CSeq_loc> loc(new CSeq_loc);
362 
363  CRef<CSeq_id> seq_id(new CSeq_id);
364  seq_id->Assign(id);
365  loc->SetWhole(*seq_id);
366 
367 
368 
    // The loop body returns on the first feature, so this is an
    // "is the iterator non-empty" test.
369  for(CFeat_CI ci(scope, *loc, sa); ci; ++ci) {
370  return true;
371  }
372 
373  return false;
374 }
375 
// Returns a text label for the product location of `feat`, or an empty
// string when the feature has no product.
//
// When the product's seq-id is a GI, the label is built from a copy of the
// product location whose id has been replaced by the id obtained from
// sequence::GetId(); otherwise the product location is labelled as is.
376 string GetProductLabel(const CSeq_feat& feat, CScope& scope)
377 {
378  string s = "";
379  if(feat.CanGetProduct()) {
380  if(sequence::GetId(feat.GetProduct(), &scope).IsGi()) {
381  CRef<CSeq_loc> loc(new CSeq_loc);
382  loc->Assign(feat.GetProduct());
383  CRef<CSeq_id> id(new CSeq_id);
    // NOTE(review): source line 387 (the last argument of this call and
    // its closing parentheses) is missing from this listing; presumably it
    // selects which kind of id to fetch - confirm against upstream.
384  id->Assign(*sequence::GetId(
385  sequence::GetId(*loc, &scope),
386  scope,
388 
389  loc->SetId(*id);
390  loc->GetLabel(&s);
391  } else {
392  feat.GetProduct().GetLabel(&s);
393  }
394  }
395  return s;
396 }
397 
399 {
400  return
401  str == "asn_text" ? eSerial_AsnText :
402  str == "asn_bin" ? eSerial_AsnBinary :
403  str == "asn_xml" ? eSerial_Xml :
404  str == "asn_json" ? eSerial_Json :
405  eSerial_None;
406 }
407 
409  const CSeq_loc& loc,
410  CScope& scope,
411  bool use_long_label)
412 {
413  string long_label = "";
414  if(use_long_label) {
415  loc.GetLabel(&long_label);
416  long_label += "\t";
417  }
418 
419  //NcbiCout << MSerial_AsnText << loc;
421 
422  string out = "";
423  try {
425  sequence::GetId(*tmp_loc, &scope),
426  scope,
427  sequence::eGetId_ForceAcc).AsString();
428  } catch(...) {
429  out = sequence::GetId(*tmp_loc, &scope).AsFastaString();
430  }
431 
432  if(loc.IsWhole()) {
433  out += "\t\t";
434  } else {
435  out += "\t" + NStr::UInt8ToString(sequence::GetStart(*tmp_loc, &scope) + 1);
436  out += "\t" + NStr::UInt8ToString(sequence::GetStop(*tmp_loc, &scope) + 1);
437  }
438 
439  out += "\t"; out += (sequence::GetStrand(*tmp_loc, &scope) == eNa_strand_minus ? "-" : "+" );
440 
441  return long_label + out;
442 }
443 
444 
446  CScope& scope,
447  const CSeq_id& id,
449  string title = "Sentinel")
450 {
451  CRef<CSeq_annot> annot(new CSeq_annot);
452  //entry->SetSeq().SetAnnot().push_back(annot);
453  CRef<CSeq_id> new_id(new CSeq_id);
454  new_id->Assign(id);
455 
456 
457  CRef<CSeq_feat> feat(new CSeq_feat);
458  annot->SetData().SetFtable().push_back(feat);
459  feat->SetTitle(title);
460 
461  CRef<CSeq_loc> loc(new CSeq_loc);
462  loc->SetWhole(*new_id);
463  feat->SetLocation(*loc);
464  feat->SetData().Select(type);
465 
466 
467 
468  if(type == CSeqFeatData::e_Rna) {
469  CRef<CRNA_ref> rna_ref(new CRNA_ref);
470  rna_ref->SetType(CRNA_ref::eType_other);
471  feat->SetData().SetRna(*rna_ref);
472  } else if(type == CSeqFeatData::e_Gene) {
473  CRef<CGene_ref> gene_ref(new CGene_ref);
474  feat->SetData().SetGene();
475  } else {
476  NCBI_THROW(CException, eUnknown, "Type must be eGene, eRna");
477  }
478 
479  scope.AddSeq_annot(*annot);
480 }
481 
482 
483 /*
484  * Sometimes we compare alignment of some seq to annotation, and the source
485  * seq is an arbitrary fasta without annotation. We'll create a sentinel seq,
486  * add it to scope, and have a spanning RNA feat on it, so then we can
487  * compare it as usual
488  */
// Adds a placeholder ("sentinel") bioseq for `id` to `scope`: a virtual RNA
// sequence of length 1000000 wrapped in a new Seq-entry.
//
// Side effect: scope.ResetHistory() is called before the entry is added.
489 void AddSentinelRNASeq(CScope& scope, const CSeq_id& id)
490 {
491  CRef<CSeq_entry> entry(new CSeq_entry);
492  CRef<CSeq_id> new_id(new CSeq_id);
493  new_id->Assign(id);
494 
495  entry->SetSeq().SetId().push_back(new_id);
496  entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_virtual);
497  entry->SetSeq().SetInst().SetLength(1000000); //otherwise CFeat_CI throws
498  entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna); //mol type required by mapper to compute seq width
499 
500  scope.ResetHistory(); //otherwise adding new entry to non-clean scope produces warnings
501  scope.AddTopLevelSeqEntry(*entry);
502 
    // NOTE(review): source lines 503-504 are missing from this listing.
    // Presumably they add the spanning RNA feature on the new sequence that
    // the comment preceding this function describes - confirm against upstream.
505 }
506 
507 
508 /* If the sequence has mRNA in title but no mrna annotated, add spanning
509  * mrna feature for the purpose of comparisons. Same goes for Gene
510  */
// Adds spanning placeholder features for the sequence of `loc` when the
// expected gene / RNA features are not found.
//
// A gene feature is considered when the defline mentions "gene", "mRNA" or
// "CDS" (case-insensitive) or when the biomol counts as RNA; an RNA feature
// is considered only when the biomol counts as RNA. In both cases the
// feature is added only if a CFeat_CI over `loc` with `sel` finds nothing.
// NOTE(review): several lines are missing from this listing (source lines
// 526, 534, 552, 559, 562, 569, 576, 579): the declaration of `biomol`, one
// biomol comparison, presumably the feature-type settings applied to `sel`
// before each search, and the callee name / feature-type argument of both
// add calls. Confirm against upstream before relying on the details below.
511 void AddDefaultSentinelFeats(CScope& scope, const CSeq_loc& loc)
512 {
513  const CSeq_id& id = sequence::GetId(loc, &scope);
514  CBioseq_Handle h = scope.GetBioseqHandle(id);
515  string title = sequence::CDeflineGenerator().GenerateDefline(h);
516 
517  bool add_gene = false;
518  bool add_rna = false;
519  bool is_gene = false;
520 
521  SAnnotSelector sel;
522  sel.SetResolveAll();
523  sel.SetOverlapTotalRange();
524  sel.SetResolveDepth(0);
525 
    // Best effort: if the MolInfo lookup throws a CException, `biomol`
    // keeps the value given by its (missing) declaration.
527  try {
528  biomol = sequence::GetMolInfo(h)->GetBiomol();
529  } catch (CException&) {};
530 
    // "RNA" here means: not one of the excluded biomol values listed below.
531  bool biomol_rna = !( biomol == CMolInfo::eBiomol_unknown
532  || biomol == CMolInfo::eBiomol_genomic
533  || biomol == CMolInfo::eBiomol_peptide
535  || biomol == CMolInfo::eBiomol_other);
536 
537 
538 
539  if( NStr::Find(title, "gene", NStr::eNocase) != NPOS
540  || NStr::Find(title, "mRNA", NStr::eNocase) != NPOS
541  || NStr::Find(title, "CDS", NStr::eNocase) != NPOS)
542  {
543  is_gene = true;
544  }
545 
546  //we will add a gene feature if missing if:
547  //biomol is rna
548  //or biomol is not rna BUT the title contains gene|mRNA|CDS
549  //(this happens when genes are submitted on genomic sequences and may have gene feature missing)
550 
551  if(is_gene || biomol_rna) {
    // add_gene stays true only if the iterator yields no feature at all.
553  add_gene = true;
554  for(CFeat_CI ci(scope, loc, sel); ci; ++ci) {
555  add_gene = false; break;
556  }
557 
558  if(add_gene) {
560  scope,
561  id,
563  "[Sentinel feat]" + title);
564  }
565 
566  }
567 
568  if(biomol_rna) {
    // Same "nothing found" test as above, this time for the RNA feature.
570  add_rna = true;
571  for(CFeat_CI ci2(scope, loc, sel); ci2; ++ci2) {
572  add_rna = false; break;
573  }
574 
575  if(add_rna) {
577  scope,
578  id,
580  "[Sentinel feat]" + title);
581  }
582  }
583 
584 }
585 
// Opens `path` as a serial object input stream in `serial_format`.
//
// Paths ending in ".gz" are read through a CCompressionIStream layered over
// a binary file stream; every other path is opened directly by
// CObjectIStream::Open().
// Returns the stream wrapped in a unique_ptr.
586 unique_ptr<CObjectIStream> GetIStream(string path, ESerialDataFormat serial_format)
587 {
588  unique_ptr<CObjectIStream> obj_istr;
589  if ( NStr::EndsWith(path, ".gz")) {
    // NOTE(review): source lines 594-595 (the remaining arguments of the
    // CCompressionIStream constructor, presumably the decompressor and its
    // ownership flags) are missing from this listing. The streams created
    // with `new` are handed over by reference; eTakeOwnership below suggests
    // the object stream is responsible for deleting them - confirm.
590  obj_istr.reset(CObjectIStream::Open(
591  serial_format,
592  *(new CCompressionIStream(
593  *(new CNcbiIfstream(path.c_str(), IOS_BASE::in | IOS_BASE::binary)),
596  eTakeOwnership));
597  } else {
598  obj_istr.reset(CObjectIStream::Open(serial_format, path));
599  }
600 
601  return obj_istr;
602 }
603 
604 
605 //Note: we need to return the method because depending on the method
606 //we will decide whether to use scope object for remapping.
608 typedef int TLoadScopeMethod;
// Loads the file at `arg_path`, or every regular file directly inside it if
// it is a directory, into `scope`.
//
// Each file is tried in turn as (1) a stream of Seq-entry objects, (2) a
// stream of Seq-annot objects, (3) a GenBank release file read through
// CGBReleaseFile with a CGbScopeLoader handler. A CException raised by one
// attempt is swallowed and the next format is tried with a freshly opened
// stream.
//
// Returns the method that succeeded for the last file processed, or
// eLoadScope_Failed as soon as one file cannot be opened or cannot be
// loaded by any of the three methods.
// NOTE(review): source line 621, which presumably declares `method` and
// gives it its initial value, is missing from this listing; that value is
// what is returned when no regular file is encountered.
609 TLoadScopeMethod LoadScope(string arg_path, CScope& scope, ESerialDataFormat serial_format)
610 {
611  AutoPtr<CDirEntry> file_or_dir(new CDirEntry(arg_path));
612 
613  CDir::TEntries dir_entries;
614  if(file_or_dir->IsDir()) {
615  CDir dir(*file_or_dir);
616  dir_entries = dir.GetEntries();
617  } else {
618  dir_entries.push_back(file_or_dir);
619  }
620 
622 
623  ITERATE(CDir::TEntries, it, dir_entries) {
624  AutoPtr<CDirEntry> ent = *it;
625  string path = ent->GetPath();
626 
627 
    // Sub-directories and other non-file entries are skipped, not recursed.
628  if(!ent->IsFile()) continue;
629 
630  _TRACE("loading " + path);
    // Attempt 1: a sequence of Seq-entry objects.
631  try {
632  unique_ptr<CObjectIStream> obj_istr = GetIStream(path, serial_format);
633  if (!obj_istr->InGoodState()) {
634  ERR_POST(Error << "Could not open file " << *it);
635  return eLoadScope_Failed;
636  }
637  _TRACE("Trying as Seq-entry");
638 
639  while(!obj_istr->EndOfData()) {
640  CRef<CSeq_entry> seq_entry(new CSeq_entry);
641  *obj_istr >> *seq_entry;
642  _TRACE("adding TSE from " + path);
643  scope.AddTopLevelSeqEntry(*seq_entry);
644  }
645  _TRACE("Loaded as Seq-entry");
646  method = eLoadScope_SeqEntry;
647  continue;
648  } catch(CException&) {};
649 
    // Attempt 2: a sequence of Seq-annot objects.
650  try {
651  unique_ptr<CObjectIStream> obj_istr = GetIStream(path, serial_format);
652  if (!obj_istr->InGoodState()) {
653  ERR_POST(Error << "Could not open file " << *it);
    // NOTE(review): the two sibling branches return eLoadScope_Failed
    // here. `false` converts to 0 because TLoadScopeMethod is int; whether
    // that equals eLoadScope_Failed depends on the enum definition, which
    // is not visible in this listing.
654  return false;
655  }
656  _TRACE("Trying as Seq-annot");
657 
658  while(!obj_istr->EndOfData()) {
659  CRef<CSeq_annot> seq_annot(new CSeq_annot);
660  *obj_istr >> *seq_annot;
661  _TRACE("adding Seq-annot from " + path);
662  scope.AddSeq_annot(*seq_annot);
663  }
664  _TRACE("Loaded as Seq-annot");
665  method = eLoadScope_SeqAnnot;
666  continue;
667  } catch(CException&) {};
668 
    // Attempt 3: a GenBank release file (Bioseq-set).
669  try {
670  unique_ptr<CObjectIStream> obj_istr = GetIStream(path, serial_format);
671  if (!obj_istr->InGoodState()) {
672  ERR_POST(Error << "Could not open file " << *it);
    // NOTE(review): stray second ';' on the next line (harmless).
673  return eLoadScope_Failed;;
674  }
675 
676  _TRACE("Trying as genbank bioseqset");
    // The stream is released from the unique_ptr and passed by reference;
    // presumably CGBReleaseFile takes ownership of it - TODO confirm.
677  CGBReleaseFile rf(*obj_istr.release());
678  rf.RegisterHandler(new CGbScopeLoader(scope));
679  rf.Read();
680  _TRACE("Loaded as genbank bioseqset");
681  method = eLoadScope_GBR;
682  continue;
683  } catch (CException&) {}
684 
685  //ERR_POST(Fatal << "Cannot load " << path);
686  return eLoadScope_Failed; //should have 'continue'd in one of the tries above
687  }
688 
689  return method;
690 }
691 
692 
693 ///////////////////////////////////////////////////////////////////////////////
694 ///////////////////////////////////////////////////////////////////////////////
695 ///////////////////////////////////////////////////////////////////////////////
697 {
698 public:
699  virtual void Init(void);
700  virtual int Run (void);
701 
702 private:
703  void x_ProcessComparison(
704  CCompareSeqRegions& comparator,
705  TSeqPos enclosing_alignment_length);
706 
707  void x_ProcessSeqAlignSetFromFile(string filename);
709  string q_id,
710  TSeqPos q_start,
711  TSeqPos q_stop,
712  string t_id,
713  TSeqPos t_start,
714  TSeqPos t_stop);
715  void x_ProcessMappingRanges();
716 
722 
723  //for x_ProcessMappingRange
725 
728 
731 
732  map<int, map<string, string> > m_id_map; //aln_row (normally only 0 and 1) -> from_id -> to_id
733 
735 };
736 
737 
738 
739 
741 {
742  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
743 
744 
745  arg_desc->AddOptionalKey("q",
746  "path",
747  "Query Scope",
749  arg_desc->AddDefaultKey("q_serial",
750  "serialformat",
751  "Serial Format",
753  "asn_text");
754  arg_desc->SetConstraint("q_serial",
755  (new CArgAllow_Strings())
756  ->Allow("asn_text")
757  ->Allow("asn_bin"));
758 
759  arg_desc->AddOptionalKey("t",
760  "path",
761  "Target scope",
763  arg_desc->AddDefaultKey("t_serial",
764  "serialformat",
765  "Serial Format",
767  "asn_text");
768  arg_desc->SetConstraint("t_serial",
769  (new CArgAllow_Strings())
770  ->Allow("asn_text")
771  ->Allow("asn_bin"));
772 
773 
774  arg_desc->AddKey("i",
775  "input",
776  "File containing one of the following: "
777  "\n - list of paths to asn files of alignments"
778  "\n - pairs of seq-ids (assuming identity alignment) from query and target scopes"
779  "\n - 6-column input of seq-id1\\start1\\stop1\\seq-id2\\start2\\stop2 - compare via mapping ranges"
780  "\n - 3-column input of: seq-id\\tstart\\tstop - compare components to level-0 annots",
782 
783 
784  arg_desc->AddDefaultKey("i_container",
785  "asntype",
786  "Top-level ASN.1 container type",
788  "Seq-align-set");
789  arg_desc->SetConstraint("i_container",
790  (new CArgAllow_Strings())
791  ->Allow("Seq-align-set")
792  ->Allow("Seq-annot")
793  ->Allow("Seq-align"));
794 
795  arg_desc->AddDefaultKey("i_serial",
796  "serialformat",
797  "Serial Format",
799  "asn_text");
800  arg_desc->SetConstraint("i_serial",
801  (new CArgAllow_Strings())
802  ->Allow("asn_text")
803  ->Allow("asn_bin"));
804 
805 
806  arg_desc->AddOptionalKey("id_map",
807  "file",
808  "Convert seq-ids in alignments. (Format: aln_row\\tfrom_id\\tto_id\\n)",
810 
811 
812  arg_desc->AddDefaultKey("depth",
813  "integer",
814  "SAnnotSelector resolve depth",
816  "0");
817 
818  arg_desc->AddFlag("sentinel_seqs",
819  "If the query seq in the alignment is not in query scope (e.g. local id),"
820  " this will create a Seq-entry with spanning RNA feature and add it to scope "
821  " such that the sequence placement can be compared to the annotation");
822  arg_desc->AddFlag("sentinel_feats",
823  "If bioseq title has 'mRNA' in it, add spanning gene and mrna feats if missing;"
824  "if the title contains 'gene', add spanning gene feat if missing");
825  arg_desc->AddFlag("spliced",
826  "If using spliced alignments, this option must be specified such that"
827  "a spliced query location is not collapsed to single range as it would by default");
828  arg_desc->AddFlag("adaptive_depth",
829  "Use adaptive depth in SAnnotSelector (e.g. when dealing with chromosome locations)"
830  "Default is false, because normally we don't want to dig down to find features");
831 
832  arg_desc->AddFlag("allow_ID",
833  "Use ID when explicitly provided scope is lacking necessary info"
834  " (e.g. when remapping contig annots from file with chromosome alignments"
835  " we would need to get chromosome seq-map from gb to iterate feats in chrom coords)");
836 
837 
838  arg_desc->AddOptionalKey("add_qual",
839  "qualkey",
840  "Add additional named qualifier columns q_qualkey and t_qualkey",
843 
844  arg_desc->AddOptionalKey("add_dbxref",
845  "dbxrefkey",
846  "Add additional named dbxref columns q_dbxrefkey and t_dbxrefkey",
849 
850 
851  //TODO: Must fix the code to decide whether to use scopeless_mapper automatically
852 
853  arg_desc->AddFlag("range_overlap", "Use overlap by ranges to allow comparison between features that overlap by ranges but not by intervals.");
854  arg_desc->AddFlag("reverse", "Swap q and t in inputs");
855 // arg_desc->AddFlag("long_loc_label", "Long labels for location instead of id/start/stop/strand");
856  arg_desc->AddFlag("trace", "Turn on tracing");
857  arg_desc->AddFlag("strict_type", "Match features of the same type only");
858 
859  arg_desc->SetUsageContext(
860  GetArguments().GetProgramBasename(),
861  "Cross-compare annots on seqs", false);
862 
863  SetupArgDescriptions(arg_desc.release());
864 
865  #ifdef _DEBUG
867  //SetDiagPostLevel(eDiag_Error);
868  #else
870  //SetDiagPostLevel(eDiag_Error);
871  //SetDiagPostLevel(eDiag_Info);
872  #endif
873 
874 
876 /*
877  DBAPI_RegisterDriver_CTLIB();
878  GenBankReaders_Register_Pubseq2();
879  BDB_Register_Cache();
880 */
881  CPluginManager_DllResolver::EnableGlobally(true); //to allow handling of dlls as specified in conffile
882 }
883 
884 
885 
886 
888 {
889  if(!m_q_id_prev) return;
890 
891  if(m_q_loc.IsNull() || m_t_loc.IsNull() || m_q_loc->IsNull() || m_t_loc->IsNull()) {
892  return;
893  }
894 
896 
897 
899  //when remapping Gnomon ent's loaded from files, must use NULL to remap (or else fails)
900  //using m_scope_qt or m_scope_q or m_scqpe_t instead of NULL causes remapping to fail
901 
902 #if _DEBUG
903  string s ="";
904  qry_rgn_loc->GetLabel(&s);
905  _TRACE("Mapping range q: " + s);
906 
907  s = "";
908  CRef<CSeq_loc> mapped = simple_mapper->Map(*m_q_loc);
909  mapped->GetLabel(&s);
910  _TRACE("Mapping range t:" + s);
911 #endif
912 
914  *simple_mapper
915  , *m_scope_q
916  , m_args["spliced"]
917  , NULL
918  , false));
919 
921  rgn_loc->SetInt().SetStrand(eNa_strand_plus);
922 
923  //if we are working with the same scope and same query and target locs,
924  //we are comparing the seq against itself, so compare features on different
925  //genes only, and report all matches, not only best B|F|R classes, and
926  //report overlaps of same types only
927  bool self_comparison =
931  m_scope_q)
934 
935 
937  CRef<CCompareSeqRegions> region_comparator(new CCompareSeqRegions(
938  *rgn_loc,
939  m_scope_q,
940  m_scope_t,
941  *mapper,
942  m_sel,
943  m_sel,
944  *m_t_id_prev.GetSeqId()));
945  if(self_comparison)
946  {
947  region_comparator->SetOptions() |= CCompareSeqRegions::fDifferentGenesOnly;
948  region_comparator->SetOptions() &= ~CCompareSeqRegions::fSelectBest;
949  region_comparator->SetOptions() |= CCompareSeqRegions::fSameTypeOnly;
950  }
951 
952  if(m_args["strict_type"]) region_comparator->SetOptions() |= CCompareSeqRegions::fSameTypeOnly;
953  x_ProcessComparison(*region_comparator, len);
954 
955  rgn_loc->SetInt().SetStrand(eNa_strand_minus);
956  region_comparator.Reset(new CCompareSeqRegions(
957  *rgn_loc,
958  m_scope_q,
959  m_scope_t,
960  *mapper,
961  m_sel,
962  m_sel,
963  *m_t_id_prev.GetSeqId()));
964  if(self_comparison)
965  {
966  region_comparator->SetOptions() |= CCompareSeqRegions::fDifferentGenesOnly;
967  region_comparator->SetOptions() &= ~CCompareSeqRegions::fSelectBest;
968  region_comparator->SetOptions() |= CCompareSeqRegions::fSameTypeOnly;
969  }
970  if(m_args["strict_type"]) region_comparator->SetOptions() |= CCompareSeqRegions::fSameTypeOnly;
971  x_ProcessComparison(*region_comparator, len);
972 }
973 
974 
976  string str_q_id,
977  TSeqPos q_start,
978  TSeqPos q_stop,
979  string str_t_id,
980  TSeqPos t_start,
981  TSeqPos t_stop)
982 {
983  CRef<CSeq_id> q_seq_id(new CSeq_id(str_q_id));
984  CRef<CSeq_id> t_seq_id(new CSeq_id(str_t_id));
985  CSeq_id_Handle q_id = sequence::GetId(*q_seq_id, *m_scope_q);
986  CSeq_id_Handle t_id = sequence::GetId(*t_seq_id, *m_scope_t);
987 
988  //encountered hit on another q/t pair
989  if(m_t_id_prev && (q_id != m_q_id_prev || t_id != m_t_id_prev))
990  {
991  //process the alignment and reset it
992  this->x_ProcessMappingRanges();
993 
994  //m_mapping_ranges.Reset(new CMappingRanges);
997 
998  }
999 
1000  //add hit to the mapping ranges
1001 
1002 
1003  ENa_strand t_strand = eNa_strand_plus;
1004 
1005  if(t_start > t_stop) {
1006  TSeqPos t = t_start;
1007  t_start = t_stop;
1008  t_stop = t;
1009  t_strand = eNa_strand_minus;
1010  }
1011 
1012  CRef<CSeq_loc> qloc(new CSeq_loc);
1013  CRef<CSeq_loc> tloc(new CSeq_loc);
1014 
1015  qloc->SetInt().SetId(*q_seq_id);
1016  qloc->SetInt().SetFrom(q_start);
1017  qloc->SetInt().SetTo(q_stop);
1018  qloc->SetInt().SetStrand(eNa_strand_plus);
1019 
1020 
1021  tloc->SetInt().SetId(*t_seq_id);
1022  tloc->SetInt().SetFrom(t_start);
1023  tloc->SetInt().SetTo(t_stop);
1024  tloc->SetInt().SetStrand(t_strand);
1025 
1026 
1027  if(m_q_loc.IsNull() || m_q_loc->IsNull()) {
1028  m_q_loc = qloc;
1029  } else {
1030  m_q_loc->Add(*qloc);
1031  }
1032 
1033  if(m_t_loc.IsNull() || m_t_loc->IsNull()) {
1034  m_t_loc = tloc;
1035  } else {
1036  m_t_loc->Add(*tloc);
1037  }
1038 
1039 
1040  m_t_id_prev = t_id;
1041  m_q_id_prev = q_id;
1042 }
1043 
1044 
1045 
1046 
1047 
1049 {
1050  typedef list<CRef<CSeq_align> > TSeqAlignList;
1051  typedef map<string, CRef<CAlnMix> > TAlnMixes;
1052 
1054  istr.reset(CObjectIStream::Open(StringToSerialFormat(m_args["i_serial"].AsString()), filename));
1055 
1056  CRef<CSeq_align_set> aligns_set;
1057  CRef<CSeq_annot> aligns_annot;
1058  TSeqAlignList aligns_list;
1059 
1060 
1061  _TRACE("Processing " + filename);
1062 
1063 
1064  while(!istr->EndOfData()) {
1065  try {
1066  typedef list<CRef<CSeq_align> > TSeqAlignList;
1067  if(m_args["i_container"].AsString() == "Seq-align-set") {
1068  aligns_set.Reset(new CSeq_align_set);
1069  *istr >> *aligns_set;
1070  } else if(m_args["i_container"].AsString() == "Seq-annot") {
1071  aligns_annot.Reset(new CSeq_annot);
1072  *istr >> *aligns_annot;
1073  } else if (m_args["i_container"].AsString() == "Seq-align") {
1074  CRef<CSeq_align> aln(new CSeq_align);
1075  *istr >> *aln;
1076  aligns_list.clear();
1077  aligns_list.push_back(aln);
1078  } else {
1079  ERR_POST(Fatal << "Don't know about this format: " << m_args["i_container"].AsString());
1080  }
1081 
1082 
1083  /*
1084  * We could have stuck all alignments in one CAlnMix, but the resulting
1085  * merged Seq-align becomes too large and too sparse, (and thus too slow) so it is more efficient to
1086  * have a CAlnMix for each target sequence
1087  */
1088 
1089 
1090  TAlnMixes alnMixes;
1091 
1092 
1093 
1094  NON_CONST_ITERATE(TSeqAlignList, it,
1095  aligns_set ? aligns_set->Set() :
1096  aligns_annot ? aligns_annot->SetData().SetAlign() :
1097  aligns_list)
1098  {
1099 
1100 
1101  if(!m_id_map.empty()) {
1102  for(int i = 0; i <= m_id_map.rbegin()->first; i++) {
1103  CRef<CSeq_id>& seq_id = (*it)->SetSegs().SetDenseg().SetIds()[i];
1104  if(seq_id.IsNull()) continue;
1105 
1106  string str_seq_id = "";
1107  seq_id->GetLabel(&str_seq_id);
1108  if(m_id_map[i].find(str_seq_id) != m_id_map[i].end()) {
1109  seq_id->Set(m_id_map[i][str_seq_id]);
1110  }
1111  }
1112  }
1113 
1114 
1115 
1116  if(m_args["reverse"]) {
1117  (*it)->SwapRows(0, 1);
1118  }
1119 
1120 
1121 
1122  const CSeq_id& id_q = (*it)->GetSeq_id(0);
1123  const CSeq_id& id_t = (*it)->GetSeq_id(1);
1124 
1125 
1126 
1127  string str_id_q = "";
1128  id_q.GetLabel(&str_id_q);
1129 
1130  string str_id_t = "";
1131  id_t.GetLabel(&str_id_t);
1132 
1133  string str_aln = "aln:" + str_id_q + "->" + str_id_t;
1134 
1135 
1136  if(m_args["sentinel_seqs"] && !m_scope_q->GetBioseqHandle(id_q)) {
1137  AddSentinelRNASeq(*m_scope_q, id_q); //for remapper
1138  }
1139 
1140  if(!IsInScope(*m_scope_q, id_q)) {
1141  _TRACE(str_aln +" : query seq not in scope_q");
1142  continue;
1143  } else if(!IsInScope(*m_scope_t, id_t)) {
1144  _TRACE(str_aln + " : target seq not in scope_t");
1145  continue;
1146  } else {
1147  LOG_POST("Loading " + str_aln);
1148  if(alnMixes[str_aln].IsNull()) alnMixes[str_aln].Reset(new CAlnMix(/**m_scope_qt, 0*/));
1149  alnMixes[str_aln]->Add(**it, CAlnMix::fPreserveRows /*, CAlnMix::fCalcScore*/); //keep query as row 0
1150  }
1151  }
1152 
1153 
1154  ITERATE(TAlnMixes, it2, alnMixes) {
1155  string str_id = it2->first;
1156 
1157 
1158 
1159  CRef<CAlnMix> aln_mix = it2->second;
1160 
1161  aln_mix->Merge(
1162  // CAlnMix::fTruncateOverlaps
1165  // | CAlnMix::fSortInputByScore
1166 
1167  //| CAlnMix::fFillUnalignedRegions
1168  //| CAlnMix::fAllowTranslocation
1169  );
1170 
1171 
1172 
1173  CConstRef<CSeq_align> merged_aln(&aln_mix->GetSeqAlign());
1174 
1175 
1176  CSeq_align_Base::TDim aln_dim = merged_aln->GetDim();
1177 
1178 
1179  //NcbiCout << MSerial_AsnText << *merged_aln;
1180  //ERR_POST(Fatal << "temp exit");
1181 
1182  //when we have a seq-align of self-to-self, the CAlnMix merges
1183  //query and target together into a single row. This is unusable for remapping;
1184  //In this case revert to the original
1185  _TRACE("Alignment dim: " + NStr::IntToString(merged_aln->GetDim()));
1186  if(aln_dim == 1) {
1187  if(aln_mix->GetInputSeqAligns().size() != 1) {
1188  NCBI_THROW(CException, eUnknown, "Multiple alignments collapsed to CAlnMix of dim 1");
1189  } else {
1190  merged_aln.Reset(aln_mix->GetInputSeqAligns()[0]);
1191  aln_dim = 2; //a dangerous assumption
1192  }
1193  }
1194 
1195  // Construct the source sequence loc
1196  // We use a "source loc" instead of 'whole'
1197  // beacuse we want to limit the scope of comparison
1198  // to the region spanned by the alignment
1199  CRef<CSeq_loc> loc(new CSeq_loc);
1200  loc->SetInt().SetFrom(merged_aln->GetSeqStart(0));
1201  loc->SetInt().SetTo(merged_aln->GetSeqStop(0));
1202  //loc->SetInt().SetStrand(eNa_strand_both);
1203 
1204  CRef<CSeq_id> id(new CSeq_id);
1205  id->Assign(merged_aln->GetSeq_id(0));
1206 
1207 
1208 
1209  TSeqPos source_len = merged_aln->GetSeqRange(0).GetLength();
1210 
1211 
1212  loc->SetInt().SetId(*id);
1213 
1214  for(CSeq_align_Base::TDim row = 1; row < aln_dim; ++row) {
1215 
1216  CRef<ILocMapper> loc_mapper;
1217 
1218  //use m_scope_id instead of m_scope_q to create an aligner
1219  //because Local Data Storage source have seq-ids without versions, while the alignments are
1220  //with versions (i.e. can't remap via versioned alignments from versionless
1221  //location from local storage). Need to think through whether this is a legit thing to do
1222  //A possible failure scenario is an alignment from an old build with
1223  //old contig versions; we process a seq from SplignLDS source
1224  //without a version, so a scope gets the latest one and naturally can't remap it
1225  //if target scope is loaded from local storage, tell the mapper to strip versions from mapped locations
1226 
1227 
1228  CRef<CSeq_loc_Mapper> simple_mapper(new CSeq_loc_Mapper(
1229  *merged_aln,
1230  row,
1231  m_scope_for_mapper)); //m_scope_qt causes problems resolving seq-id from one scope to another scope
1232 
1233 
1234 
1235 
1236 
1237  loc_mapper.Reset(new CLocMapper_Default(
1238  *simple_mapper,
1239  *m_scope_q,
1240  m_args["spliced"],
1241  NULL,
1242  false
1243  ));
1244 
1245 
1246 
1247  #if 0 //this doesn't work when mapping a versionless sequence with versioned alignments
1248  //; the mapper can't map; BUT WHY REMAPPING INDIVIDUAL FEATURES WORKS??
1249 
1250  //calculate the mapped length
1251  CRef<CSeq_loc_Mapper> m2(...);
1252 
1253 
1254  string s = "";
1255  CConstRef<CSeq_loc> mapped = m2->Map(*loc);
1256  mapped->GetLabel(&s);
1257  _TRACE("Enclosing alignment (row " + NStr::IntToString(row) +") :" + s);
1258 
1259  TSeqPos enclosing_alignment_length = sequence::GetLength(*mapped, m_scope_t);
1260  #endif
1261 
1262 
1263 
1264 
1265  if(m_args["sentinel_feats"]) {
1267  loc->SetInt().SetStrand(eNa_strand_both);
1268  }
1269 
1270  //Update: now doing it one strand at a time because it is faster
1271  //(resolving smaller sized overlap groups)
1272  //also, doing it this way we avoid a situation
1273  //where non-overlapping features on the same strand
1274  //may be put in the same overlap group due to
1275  //common overlapping feature on the other strand
1276 
1277  loc->SetInt().SetStrand(eNa_strand_plus);
1278 
1279  CRef<CCompareSeqRegions> region_comparator(new CCompareSeqRegions(
1280  *loc,
1281  m_scope_q,
1282  m_scope_t,
1283  *loc_mapper,
1284  m_sel,
1285  m_sel,
1286  merged_aln->GetSeq_id(1)));
1287  if(m_args["strict_type"]) region_comparator->SetOptions() |= CCompareSeqRegions::fSameTypeOnly;
1288  x_ProcessComparison(*region_comparator, source_len);
1289 
1290  loc->SetInt().SetStrand(eNa_strand_minus);
1291  region_comparator.Reset(new CCompareSeqRegions(
1292  *loc,
1293  m_scope_q,
1294  m_scope_t,
1295  *loc_mapper,
1296  m_sel,
1297  m_sel,
1298  merged_aln->GetSeq_id(1)));
1299  if(m_args["strict_type"]) region_comparator->SetOptions() |= CCompareSeqRegions::fSameTypeOnly;
1300  x_ProcessComparison(*region_comparator, source_len);
1301 
1302 
1303  } //next row in the merged aln
1304  } //next aln_mix
1305  } catch (CException& e) {
1306  NCBI_REPORT_EXCEPTION("Can't process alignment", e);
1307  }
1308 
1309 
1310 
1311  //We want to flush the scope to avoid memory hogging, but not
1312  //for scopes that were manually loaded (or it will wipe out loaded data)
1316  }//next object from stream
1317 }
1318 
1319 
1320 
1321 
1322 
1324  CCompareSeqRegions& comparator,
1325  TSeqPos enclosing_alignment_length)
1326 {
      // Writes one tab-separated row to NcbiCout for every reported comparison that
      // `comparator` yields, group by group.
      //   comparator                  - source of comparison groups (NextComparisonGroup)
      //   enclosing_alignment_length  - echoed verbatim into every row
      // NOTE(review): the signature line itself is not part of this listing.
      //
      // groupNumber is function-static, so group numbering continues across calls
      // instead of restarting at 1 for each comparator.
1327  static unsigned groupNumber = 1;
1328 
1329  string loc_label = "";
1330  comparator.GetQueryLoc().GetLabel(&loc_label);
1331  LOG_POST("Processing location " << loc_label);
1332 
1333  //Process comparisons
1334  vector<CRef<CCompareFeats> > v;
1335 
      // One iteration per comparison group; comparisonNumber restarts at 1 in each group.
1336  for(; comparator.NextComparisonGroup(v); groupNumber++) {
1337  int comparisonNumber = 1;
1338 
1339  _TRACE("Next comparison group");
1340  ITERATE(vector<CRef<CCompareFeats> >, it, v) {
1341  _TRACE("Next feat");
1342  CRef<CCompareFeats> cf = *it;
1343 
      // Skip entries whose mapped identity is not positive, and - when the comparator has
      // fDifferentGenesOnly set - entries that have no target feature.
1344  if(cf->GetMappedIdentity() <= 0) {continue;}
1345  if(comparator.GetOptions() & CCompareSeqRegions::fDifferentGenesOnly && cf->GetFeatT().IsNull()) {continue;}
1346 
1347 
1348 
1349  NcbiCout << groupNumber << "\t" << comparisonNumber << "\t";
1350 
      // Query-side columns: type label, content label, self location, length, mapped location.
      // When there is no query feature, placeholder tabs keep the later columns aligned.
1351  if(!cf->GetFeatQ().IsNull()) {
1352  _ASSERT(!cf->GetSelfLocQ().IsNull());
1353  _ASSERT(!cf->GetMappedLocQ().IsNull());
1354 
1355  string s = "";
1356  feature::GetLabel(*cf->GetFeatQ(), &s, feature::fFGL_Type);
1357  NcbiCout << s << "\t";
1358 
1359  s = "";
1360  feature::GetLabel(*cf->GetFeatQ(), &s, feature::fFGL_Content);
1361  NcbiCout << s << "\t";
1362 
1363 
1364  NcbiCout << GetIdStartStopStrandStr(*cf->GetSelfLocQ(), *m_scope_q, false) << "\t";
1365  NcbiCout << (cf->GetSelfLocQ().IsNull() ? "" : NStr::IntToString(sequence::GetLength(*cf->GetSelfLocQ(), m_scope_q))) << "\t";
1366 
1367 
1368  NcbiCout << GetIdStartStopStrandStr(*cf->GetMappedLocQ(), *m_scope_t, false) << "\t";
1369 
1370  } else {
1371  NcbiCout << "\t\t\t\t\t\t\t\t\t\t\t";
1372  //if(m_args["long_loc_label"]) NcbiCout << "\t\t";
1373  }
1374 
      // Target-side columns: type label, content label, self location, length.
      // NOTE(review): GetSelfLocT() is dereferenced on the line before its IsNull() check;
      // unlike the query side there is no _ASSERT guarding it here.
1375  if(!cf->GetFeatT().IsNull()) {
1376  string s = "";
1377  feature::GetLabel(*cf->GetFeatT(), &s, feature::fFGL_Type);
1378  NcbiCout << s << "\t";
1379 
1380  s = "";
1381  feature::GetLabel(*cf->GetFeatT(), &s, feature::fFGL_Content);
1382  NcbiCout << s << "\t";
1383 
1384  NcbiCout << GetIdStartStopStrandStr(*cf->GetSelfLocT(), *m_scope_t, false) << "\t";
1385  NcbiCout << (cf->GetSelfLocT().IsNull() ? "" : NStr::IntToString(sequence::GetLength(*cf->GetSelfLocT(), m_scope_t))) << "\t";
1386 
1387 
1388  } else {
1389  NcbiCout << "\t\t\t\t\t\t\t";
1390  //if(m_args["long_loc_label"]) NcbiCout << "\t";
1391 
1392  }
1393 
      // Comparison columns. For a match they come from the comparison object; otherwise
      // only the mapped identity and manually counted interval numbers are printed.
1394  if(cf->IsMatch()) {
      // Floating-point values are printed in fixed notation with 6 digits of precision.
1395  NcbiCout.setf(ios::fixed);
1396  NcbiCout.setf(ios::showpoint);
1397  NcbiCout.precision(6);
1398 
1399  string sResult = "";
      // NOTE(review): GetResult(&sResult) is called here and again two lines below with the
      // same string; one of the calls looks redundant - confirm whether GetResult assigns
      // or appends before removing either.
1400  cf->GetComparison()->GetResult(&sResult);
1401  NcbiCout << cf->GetComparison()->GetEvidenceString() << "\t";
1402  cf->GetComparison()->GetResult(&sResult);
1403  NcbiCout << sResult << "\t";
1404  NcbiCout << cf->GetMappedIdentity() << "\t";
1405  NcbiCout << cf->GetComparison()->GetRelativeOverlap() << "\t";
1406  NcbiCout << cf->GetComparison()->GetSymmetricalOverlap() << "\t";
1407 
1408 
1409  float shared_sites_score(0.0f);
1410  int loc1_intervals(0);
1411  int loc2_intervals(0);
1412  cf->GetComparison()->GetSplicingSimilarity(shared_sites_score, &loc1_intervals, &loc2_intervals);
1413  NcbiCout << loc1_intervals << "\t";
1414  NcbiCout << loc2_intervals << "\t";
1415  NcbiCout << shared_sites_score << "\t";
1416 
1417  } else {
1418  //should have moved mappedIdentity column earlier - hence the awkwardness
1419 
1420  //we don't have Comparison object because there's no comparison, but there was a request to
1421  //report exon counts anyway, so here we compute them "manually"
1422 
      // Interval counts are obtained by walking each self location with CSeq_loc_CI.
1423  int loc1_intervals(0);
1424  if(!cf->GetFeatQ().IsNull()) {
1425  for (CSeq_loc_CI ci(*cf->GetSelfLocQ()); ci; ++ci) {
1426  loc1_intervals++;
1427  }
1428  }
1429 
1430 
1431  int loc2_intervals(0);
1432  if(!cf->GetFeatT().IsNull()) {
1433  for (CSeq_loc_CI ci(*cf->GetSelfLocT()); ci; ++ci) {
1434  loc2_intervals++;
1435  }
1436  }
1437 
      // A zero interval count is printed as an empty field rather than "0".
1438  NcbiCout << "\t\t" //no evidence and comment
1439  << cf->GetMappedIdentity() << "\t"
1440  << "\t\t" //no overlaps
1441  << (loc1_intervals ? NStr::IntToString(loc1_intervals) : "") << "\t"
1442  << (loc2_intervals ? NStr::IntToString(loc2_intervals) : "") << "\t"
1443  << "\t" //no shared splices score
1444  ;
1445  }
1446 
1447 
1448 
      // Gene ids: 0 stands for "no feature" here and is printed as an empty field below.
1449  int qry_gene_id = (cf->GetFeatQ().IsNull() ? 0 : CCompareSeqRegions::s_GetGeneId(*cf->GetFeatQ()));
1450  int tgt_gene_id = (cf->GetFeatT().IsNull() ? 0 : CCompareSeqRegions::s_GetGeneId(*cf->GetFeatT()));
1451 
1452  NcbiCout
1453  << enclosing_alignment_length << "\t"
1454  << (qry_gene_id ? NStr::IntToString(qry_gene_id) : "") << "\t"
1455  << (tgt_gene_id ? NStr::IntToString(tgt_gene_id) : "") << "\t";
1456 
1457 
1458  //report products
1459  NcbiCout << (!cf->GetFeatQ().IsNull() ? GetProductLabel(*cf->GetFeatQ(), *m_scope_id) : "") << "\t";
1460  NcbiCout << (!cf->GetFeatT().IsNull() ? GetProductLabel(*cf->GetFeatT(), *m_scope_id) : "") << "\t";
1461 
      // Irrelevance code is rendered as a single letter: 0 -> "B", 1 -> "F", 2 -> "R",
      // anything else -> "O".
1462  int ir = cf->GetIrrelevance();
1463  NcbiCout << (ir == 0 ? "B" : ir == 1 ? "F" : ir == 2 ? "R" : "O");
1464 
1465 
1466 
1467  //add qual and dbxref values
      // Each requested qualifier name adds two columns: the query value, then the target value.
1468  if(m_args["add_qual"]) {
1469  ITERATE(CArgValue::TStringArray, it2, m_args["add_qual"].GetStringList()) {
1470  NcbiCout << "\t" << (cf->GetFeatQ().IsNull() ? "" : cf->GetFeatQ()->GetNamedQual(*it2));
1471  NcbiCout << "\t" << (cf->GetFeatT().IsNull() ? "" : cf->GetFeatT()->GetNamedQual(*it2));
1472  }
1473  }
1474 
      // Each requested dbxref tag also adds two columns: j == 0 is the query feature,
      // j == 1 the target feature. A missing feature or missing value yields an empty field.
1475  if(m_args["add_dbxref"]) {
1476  ITERATE(CArgValue::TStringArray, it2, m_args["add_dbxref"].GetStringList()) {
1477  string str_tag = *it2;
1478  for(int j = 0; j < 2; j++) {
1479  CConstRef<CSeq_feat> feat = j == 0 ? cf->GetFeatQ() : cf->GetFeatT();
1480  //CScope& current_scope = j == 0 ? *m_scope_q : *m_scope_t;
1481  string tmpstr = "";
1482  if (!feat.IsNull()) {
1483  //special values prefixed with @ are special cases to represent
1484  //not dbxrefs per se but some other requested attributes of the feature, e.g.
1485  //feat_id or feat_id of the gene xref
      // Note: "@feat_id" on a feature without a local id falls through to the generic
      // branches below ("@gene_feat_id" test, then a named dbxref lookup).
1486  if(str_tag == "@feat_id" && feat->CanGetId() && feat->GetId().IsLocal()) {
1487  tmpstr = NStr::IntToString(feat->GetId().GetLocal().GetId());
1488  } else if(str_tag == "@gene_feat_id") {
1489  //if feature is a gene - get it from feat-id; otherwise from feat-id of gene xref
1490  if(feat->GetData().IsGene() && feat->CanGetId() && feat->GetId().IsLocal()) {
1491  tmpstr = NStr::IntToString(feat->GetId().GetLocal().GetId());
1492  } else {
      // Only the first gene xref that carries a local id is used.
1493  ITERATE(CSeq_feat::TXref, it, feat->GetXref()) {
1494  const CSeqFeatXref& ref = **it;
1495  if (ref.IsSetData() && ref.GetData().IsGene() && ref.CanGetId() && ref.GetId().IsLocal()) {
1496  tmpstr = NStr::IntToString(ref.GetId().GetLocal().GetId());
1497  break;
1498  }
1499  }
1500  }
1501  } else {
1502  CConstRef<CDbtag> db_tag = feat->GetNamedDbxref(str_tag);
1503  if(!db_tag.IsNull()) {
1504  db_tag->GetLabel(&tmpstr);
1505  }
1506  }
1507 
1508 
1509  }
1510  NcbiCout << "\t" << tmpstr;
1511  }
1512  }
1513  }
1514 
1515 
1516 
1517 
1518 
1519 
      // The row is terminated and the stream flushed after every comparison.
1520  NcbiCout << "\n" << flush;
1521  comparisonNumber++;
1522  }
1523  }
1524 }
1525 
1526 
1527 
1528 ///////////////////////////////////////////////////////////////////////////////
1530 {
      // Main body of the application: copies the parsed arguments, configures the annotation
      // selector, creates and loads the query/target scopes, optionally loads an id map,
      // prints the header row and then processes the input "i" line by line.
      // Returns 0 when the end of the function is reached.
      // NOTE(review): the signature line is not part of this listing (presumably this is the
      // application's Run() - confirm). The embedded source numbering also has gaps
      // (e.g. 1542-1543, 1548, 1554-1557), so some statements of this function are not shown
      // here and several branches appear empty.
1531  m_args.Assign(GetArgs());
1532 
1533  string args_str = "";
1534  m_args.Print(args_str);
1535  CTime time;
1536  time.SetCurrent();
1537 
1538  LOG_POST("Starting on " << time.AsString());
1539  LOG_POST("Args: " << args_str);
1540 
1541  if(m_args["trace"]) {
1544  }
1545 
      // Annotation selector setup shared by the comparisons.
1546  m_sel.SetSearchUnresolved(); //need for manually supplied far-reference annots with no seq-entries for scaffolds
1547  if(m_args["adaptive_depth"]) {
1549  } else {
1550  m_sel.SetExactDepth();
1551  }
1552  m_sel.SetResolveAll(); //not sure
1553 
1558  m_sel.SetResolveDepth(m_args["depth"].AsInteger());
1559 
1560  if(m_args["range_overlap"]) {
1562  } else {
1564  }
1565 
1566 
1568 
1570 
      // Separate scopes for query and target, all created from the same object manager.
1571  m_scope_q.Reset(new CScope(*object_manager));
1572  m_scope_t.Reset(new CScope(*object_manager));
1573 
1574 
1575 
1576 
1577  m_scope_id.Reset(new CScope(*object_manager)); //need this to translate between gi's, accs and accvers
1578  //as different sources and alignments may be using different types of ids
1579 
      // Query scope: loading failure is fatal. Loading from a Seq-annot or Seq-entry
      // switches on the "scopeless mapper" path used further below.
1581  bool use_scopeless_mapper = false;
1582  if(m_args["q"]) {
1583  TLoadScopeMethod res = LoadScope(m_args["q"].AsString(), *m_scope_q, StringToSerialFormat(m_args["q_serial"].AsString()));
1584  if(res == eLoadScope_Failed) {
1585  ERR_POST(Fatal << "Can't load query scope");
1586  }
1587  if(m_args["allow_ID"]) {
1589  }
1590 
1591  if(res == eLoadScope_SeqAnnot || res == eLoadScope_SeqEntry) {
1592  use_scopeless_mapper = true;
1593  }
1594  } else {
1596  }
1597  LOG_POST(Info << "Loaded query scope on " << time.SetCurrent().AsString());
1598 
      // Target scope: when "t" and "q" name the same source the query scope is reused.
1599  if(m_args["t"] && m_args["q"] && m_args["t"].AsString() == m_args["q"].AsString()) {
1600  m_scope_t = m_scope_q;
1601  } else if(m_args["t"]) {
      // NOTE(review): the LoadScope result is stored in a bool here, whereas the query branch
      // keeps it as TLoadScopeMethod. The comparison with eLoadScope_Failed below is only
      // meaningful if that enumerator equals 0 - confirm against the enum definition.
1602  bool res = LoadScope(m_args["t"].AsString(), *m_scope_t, StringToSerialFormat(m_args["t_serial"].AsString()));
1603  if(res == eLoadScope_Failed) {
1604  ERR_POST(Fatal << "Can't load target scope");
1605  }
1606  if(m_args["allow_ID"]) {
1608  }
1609  } else {
1611  }
1612  LOG_POST(Info << "Loaded target scope on " << time.SetCurrent().AsString());
1613 
1614 
1615  /**************************************************************************
1616  *
1617  * Fill seq-id synonyms, if provided
1618  *
1619  *************************************************************************/
      // id_map format: exactly three tab-separated columns per line; the first is parsed as
      // an integer key, the second and third are stored as a string -> string mapping under
      // that key. Empty lines and lines starting with '#' are skipped.
1620  if(m_args["id_map"]) {
1621  LOG_POST("Loading id conversion map");
1622  CNcbiIstream& istr=m_args["id_map"].AsInputFile();
1623  string line;
1624  while(getline(istr, line).good()) {
1625  if(line.size() == 0 || line.compare(0, 1, "#") == 0) continue;
1626  vector<string> tokens;
1627  NStr::Split(line, "\t", tokens);
1628  if(tokens.size() != 3) {
1629  ERR_POST(Fatal << "Unexpected input it id_map. Execting 3 columns" << line);
1630  } else {
1631  m_id_map[NStr::StringToInt(tokens[0])][tokens[1]] = tokens[2];
1632  }
1633  }
1634  }
1635 
1636 
1639 
      // Scope handed to the location mappers; the scopeless branch is not shown in this listing.
1640  if(use_scopeless_mapper) {
1642  } else {
1643 
1644 #if 0
1646 #else
1647  m_scope_for_mapper.Reset(new CScope(*object_manager));
1649  //trying to use priorities because CAlnMix sometimes resolves a seq-id from one scope to
1650  //a synonym from another scope
1653 #endif
1654  }
1655 
1656  LOG_POST(Info << "Finished initalizing scopes on " << time.SetCurrent().AsString());
1657 
1658 
1659 
      // Header row of the tab-separated report; the line starts with '#'.
1660  NcbiCout << "#"
1661  << "Group\t"
1662  << "N_in_group\t"
1663  << "Qry_type\t"
1664  << "Qry_name\t"
1665  << "Qry_id\t"
1666  << "Qry_start\t"
1667  << "Qry_stop\t"
1668  << "Qry_strand\t"
1669  << "Qry_len\t"
1670  << "Qry_mapped_id\t"
1671  << "Qry_mapped_start\t"
1672  << "Qry_mapped_stop\t"
1673  << "Qry_mapped_strand\t"
1674  << "Tgt_type\t"
1675  << "Tgt_name\t"
1676  << "Tgt_id\t"
1677  << "Tgt_start\t"
1678  << "Tgt_stop\t"
1679  << "Tgt_strand\t"
1680  << "Tgt_len\t"
1681  << "Comparison\t"
1682  << "Comment\t"
1683  << "Mapped_identity\t"
1684  << "Relative_overlap\t"
1685  << "Symmetric_overlap\t"
1686  << "Qry_exons\t"
1687  << "Tgt_exons\t"
1688  << "Splicing_similarity\t"
1689  << "Aln_q_length\t"
1690  << "Qry_LocusID\t"
1691  << "Tgt_LocusID\t"
1692  << "Qry_product\t"
1693  << "Tgt_product\t"
1694  << "Preference";
1695 
1696 
1697  //add qual and dbxref headers
      // Two extra header columns (q_<name>, t_<name>) per requested qualifier / dbxref tag.
1698  if(m_args["add_qual"]) {
1699  ITERATE(CArgValue::TStringArray, it, m_args["add_qual"].GetStringList()) {
1700  NcbiCout << "\tq_" << *it;
1701  NcbiCout << "\tt_" << *it;
1702  }
1703  }
1704 
1705  if(m_args["add_dbxref"]) {
1706  ITERATE(CArgValue::TStringArray, it, m_args["add_dbxref"].GetStringList()) {
1707  NcbiCout << "\tq_" << *it;
1708  NcbiCout << "\tt_" << *it;
1709  }
1710  }
1711 
1712  NcbiCout << "\n";
1713 
1714 
1715 
1716 
1717 
1718 
      // Main input loop. Each non-empty, non-'#' line is split on tabs and dispatched on the
      // number of columns:
      //   input file name contains ".asn" and first token contains ":="
      //                  -> the input file itself is processed as an alignment file, then stop
      //   1 column       -> the token is passed to x_ProcessSeqAlignSetFromFile
      //   2 or 6 columns -> pair of seq-ids, or pair of mapping ranges (id, start, stop) x 2
      //   3 columns      -> self-comparison of a sequence (id, start, stop)
      //   anything else  -> fatal error
1719  CNcbiIstream& istr = m_args["i"].AsInputFile();
1720 
1721  string line;
1722  while (getline(istr, line).good()) {
1723  if(line.size() == 0 || line.compare(0, 1, "#") == 0) continue;
1724  vector<string> tokens;
1725  NStr::Split(line, "\t", tokens);
1726 
1727  if(m_args["i"].AsString().find(".asn") != string::npos && tokens[0].find(":=") != string::npos)
1728  {
1729  try {
1730  this->x_ProcessSeqAlignSetFromFile(m_args["i"].AsString());
1731  } catch (CException& e) {
1732  NCBI_REPORT_EXCEPTION("Cannot process alignment file\n" + line, e);
1733  }
1734  break;
1735  } else if(tokens.size() == 1) {
1736  try {
1737  this->x_ProcessSeqAlignSetFromFile(tokens[0]);
1738  } catch (CException& e) {
1739  NCBI_REPORT_EXCEPTION("Cannot process alignment file\n" + line, e);
1740  }
1741  } else if(tokens.size() == 6 || tokens.size() == 2) {
1742  try {
1743  //either pairs of seq_ids, or pair of mapping ranges (seq_id\tstart\tstop)
1744  TSeqPos kMaxPos = std::numeric_limits<TSeqPos>::max() - 100;
1745  //-100 because there are some special reserved values up there, e.g. kInvalidSeqPos
1746 
      // With only two columns the ranges default to [0, kMaxPos] on both sequences.
1747  string q_id = tokens[0];
1748  string t_id = tokens.size() == 2 ? tokens[1] : tokens[3];
1749  TSeqPos q_start = 0;
1750  TSeqPos q_stop = kMaxPos;
1751  TSeqPos t_start = 0;
1752  TSeqPos t_stop = kMaxPos;
1753 
      // Coordinates from the file are decremented by one (presumably 1-based input converted
      // to 0-based positions - confirm).
      // NOTE(review): the subtraction is unsigned, so an input value of 0 would wrap around.
1754  if(tokens.size() == 6) {
1755  q_start = NStr::StringToUInt(tokens[1]) - 1;
1756  q_stop = NStr::StringToUInt(tokens[2]) - 1;
1757  t_start = NStr::StringToUInt(tokens[4]) - 1;
1758  t_stop = NStr::StringToUInt(tokens[5]) - 1;
1759  }
1760 
      // "reverse" exchanges the roles of query and target for this line.
1761  if(m_args["reverse"]) {
1762  std::swap(t_id, q_id);
1763  std::swap(q_start, t_start);
1764  std::swap(q_stop, t_stop);
1765  }
1766 
1767  this->x_ProcessMappingRange(
1768  q_id,
1769  q_start,
1770  q_stop,
1771  t_id,
1772  t_start,
1773  t_stop);
1774  } catch(CException& e) {
1775  NCBI_REPORT_EXCEPTION("Cannot process mapping ranges at line\n" + line, e);
1776 
1779  }
1780 
1781  } else if(tokens.size() == 3) {
1782  try {
      // Self-comparison: an empty start or stop column selects the whole sequence,
      // otherwise the interval is built from the given coordinates minus one.
1783  CRef<CSeq_loc> loc(new CSeq_loc);
1784  CRef<CSeq_id> id(new CSeq_id(tokens[0]));
1785  if(tokens[1] == "" || tokens[2] == "") {
1786  loc->SetWhole(*id);
1787  } else {
1788  loc->SetInt().SetId(*id);
1789  loc->SetInt().SetFrom(NStr::StringToUInt(tokens[1]) - 1);
1790  loc->SetInt().SetTo(NStr::StringToUInt(tokens[2]) - 1);
1791  }
1792 
1793 
      // Local selector used only for this self-comparison (depth 0, exact, non-adaptive).
1794  SAnnotSelector sel;
1795  sel.SetResolveAll();
1796  sel.SetExactDepth(true);
1797  sel.SetAdaptiveDepth(false);
1798  sel.SetResolveDepth(0);
1799 
1804 
      // NOTE(review): the remaining constructor arguments of simple_mapper and the
      // declaration of `mapper` (used below) are not shown in this listing.
1805  CRef<CSeq_loc_Mapper> simple_mapper(new CSeq_loc_Mapper(
1806  m_scope_q->GetBioseqHandle(*id),
1808 
1810  *simple_mapper
1811  , *m_scope_q
1812  , m_args["spliced"]
1813  , NULL
1814  , false));
1815 
1816 #if 1
1817  //need to process seq-map manually instead of just having
1818  //SAnnotSelector iterate on lower depth, because if we are to use
1819  //GetMappedFeature, we lose information about locations on components,
1820  //as the mapped feature is remapped to level 0.
1821  //If we use GetOriginalFeature, some locations on components may be
1822  //in terms of lower-level components, and self-mappers will not be able
1823  //to process those (they will be stripped)
1824 
      // Walk the sequence map of the query bioseq; only sequence-reference segments are
      // compared, each as a both-strand interval on the referenced sequence.
1825  SSeqMapSelector sel_map;
1826  for(CSeqMap_CI ci(m_scope_q->GetBioseqHandle(*id), sel_map); ci; ++ci) {
1827  if(ci.GetType() != CSeqMap::eSeqRef) {
1828  continue;
1829  }
1830 
1831  CRef<CSeq_loc> loc_q(new CSeq_loc);
1832  CRef<CSeq_id> id_q(new CSeq_id);
1833  id_q->Assign(*ci.GetRefSeqid().GetSeqId());
1834  loc_q->SetInt().SetId(*id_q);
1835  loc_q->SetInt().SetFrom(ci.GetRefPosition());
1836  loc_q->SetInt().SetTo(ci.GetEndPosition());
1837  //loc_q->SetInt().SetStrand(ci.GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus);
1838  loc_q->SetInt().SetStrand(eNa_strand_both);
1839 
1840  CRef<CCompareSeqRegions> region_comparator(new CCompareSeqRegions(
1841  *loc_q,
1842  m_scope_q,
1843  m_scope_t,
1844  *mapper,
1845  sel,
1846  sel,
1847  *id));
1848 
      // Self-comparisons report 0 as the enclosing alignment length.
1849  x_ProcessComparison(*region_comparator, 0);
1850 
1851  }
1852 #endif
1853 
1854 
1855 
1856 
1857  } catch(CException& e) {
1858  NCBI_REPORT_EXCEPTION("Cannot process self-comparison at line\n" + line, e);
1859  }
1860  } else {
1861  ERR_POST(Fatal << "Unexpected number of columns, " << tokens.size() << line);
1862  }
1863 
1864  //Resetting history after every processing seems to reduce
1865  //memory gobbling. Must not do that to manually loaded scopes as
1866  //it will unload them
1870  }
1871 
1872  //process alignments accumulated by x_ProcessMappingRange, if any, one last time
1873  this->x_ProcessMappingRanges();
1874 
1875  LOG_POST(Info << "Done on " << time.SetCurrent().AsString());
1876  return 0;
1877 }
1878 
1879 
1880 
1881 /////////////////////////////////////////////////////////////////////////////
1882 // MAIN
1883 
1884 
// Program entry point: constructs a temporary CXcompareAnnotsApplication and returns the
// result of its AppMain(). The literal arguments are envp = 0, diag = eDS_Default and
// conf = 0, following the AppMain parameter order.
1885 int main(int argc, const char* argv[])
1886 {
1887  return CXcompareAnnotsApplication().AppMain(argc, argv, 0, eDS_Default, 0);
1888 }
User-defined methods of the data storage class.
User-defined methods of the data storage class.
AutoPtr –.
Definition: ncbimisc.hpp:401
@ fPreserveRows
Definition: alnmix.hpp:80
const TConstAlns & GetInputSeqAligns(void) const
Definition: alnmix.hpp:181
@ fMinGap
Definition: alnmix.hpp:104
@ fGapJoin
Definition: alnmix.hpp:103
void Merge(TMergeFlags flags=0)
Definition: alnmix.cpp:273
const CSeq_align & GetSeqAlign(void) const
Definition: alnmix.cpp:302
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CBioseq_Handle –.
Compare multiple feature annotations on the specified seq_locs.
static int s_GetGeneId(const CSeq_feat &feat)
const CSeq_loc & GetQueryLoc() const
bool NextComparisonGroup(vector< CRef< CCompareFeats > > &v)
Return the next group of comparisons on the region (return true iff found any) A group is a set of fe...
TComparisonOptions GetOptions() const
void GetLabel(string *label) const
Definition: Dbtag.cpp:187
CDirEntry –.
Definition: ncbifile.hpp:262
CDir –.
Definition: ncbifile.hpp:1695
CFeat_CI –.
Definition: feat_ci.hpp:64
static string GetLoaderNameFromArgs(CReader *reader=0)
Definition: gbloader.cpp:377
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: gbloader.cpp:366
Interface for handling Seq-entry objects.
CGBReleaseFile is a utility class to ease the processing of Genbank release files one Seq-entry at a ...
void RegisterHandler(ISeqEntryHandler *handler)
Register handler.
void Read(void)
Read the release file.
bool HandleSeqEntry(CRef< CSeq_entry > &entry)
user code for handling a Seq-entry goes here.
CGbScopeLoader(CScope &scope)
CGappedRange(const CGappedRange &r1, const CGappedRange &r2)
CGappedRange(TRange range=TRange(), unsigned gaps=0)
CRef< CSeq_loc > Map(const CSeq_loc &loc, double *mapped_identity=NULL)
CRef< CSeq_id > m_from_id
CLocMapper_Default(CSeq_loc_Mapper &mapper, CScope &scope, bool is_spliced=false, const CSeq_id *from_id=NULL, bool strip_versions=false)
CGappedRange CollapseRanges(list< CGappedRange > &ranges, TSeqPos query_len, double *identity_out=NULL)
CSeq_loc_Mapper & m_mapper
static double GetBiasedSymmetricIdentity(TSeqPos qry_len, TSeqPos tgt_len, TSeqPos aln_len, double qry_bias=0.5)
Storage for multiple mapping ranges.
@RNA_ref.hpp User-defined methods of the data storage class.
Definition: RNA_ref.hpp:54
CScope –.
Definition: scope.hpp:92
CSeqFeatXref –.
Definition: SeqFeatXref.hpp:66
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
CRange< TSeqPos > GetSeqRange(TDim row) const
GetSeqRange NB: On a Spliced-seg, in case the product-type is protein, these only return the amin par...
Definition: Seq_align.cpp:153
TSeqPos GetSeqStop(TDim row) const
Definition: Seq_align.cpp:273
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
TSeqPos GetSeqStart(TDim row) const
Definition: Seq_align.cpp:252
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CConstRef< CDbtag > GetNamedDbxref(const CTempString &db) const
Return a specified DB xref.
Definition: Seq_feat.cpp:415
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
CSeq_loc_Mapper –.
CTime –.
Definition: ncbitime.hpp:296
map< int, map< string, string > > m_id_map
virtual void Init(void)
Initialize the application.
void x_ProcessSeqAlignSetFromFile(string filename)
CRef< CMappingRanges > m_mapping_ranges
void x_ProcessMappingRange(string q_id, TSeqPos q_start, TSeqPos q_stop, string t_id, TSeqPos t_start, TSeqPos t_stop)
void x_ProcessComparison(CCompareSeqRegions &comparator, TSeqPos enclosing_alignment_length)
virtual int Run(void)
Run the application.
CZipStreamDecompressor – zlib based decompression stream processor.
Definition: zlib.hpp:817
bool empty() const
Definition: map.hpp:149
Definition: map.hpp:338
Include a standard set of the NCBI C++ Toolkit most basic headers.
API (CDeflineGenerator) for computing sequences' titles ("definitions").
std::ofstream out("events_result.xml")
main entry point for tests
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static const char * str(char *buf, int n)
Definition: stats.c:84
void reset(element_type *p=0, EOwnership ownership=eTakeOwnership)
Reset will delete the old pointer (if owned), set content to the new value, and assume the ownership ...
Definition: ncbimisc.hpp:480
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:819
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1195
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
@ eTakeOwnership
An object can take ownership of another.
Definition: ncbi_types.h:136
string & Print(string &str) const
Print (append) all arguments to the string "str" and return "str".
Definition: ncbiargs.cpp:1876
CArgs & Assign(const CArgs &other)
Copy contents of another object into this one.
Definition: ncbiargs.cpp:1770
vector< string > TStringArray
Some values types can contain several value lists.
Definition: ncbiargs.hpp:293
@ fAllowMultiple
Repeated key arguments are legal (use with AddKey)
Definition: ncbiargs.hpp:635
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define NULL
Definition: ncbistd.hpp:225
@ fCheckFileHeader
Check (and skip) gzip file header on decompression stage.
Definition: zlib.hpp:104
#define _TRACE(message)
Definition: ncbidbg.hpp:122
void SetDiagPostFlag(EDiagPostFlag flag)
Set the specified flag (globally).
Definition: ncbidiag.cpp:6070
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6129
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void SetDiagTrace(EDiagTrace how, EDiagTrace dflt=eDT_Default)
Set the diagnostic trace settings.
Definition: ncbidiag.cpp:6226
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
@ eDPF_All
All flags (except for the "unusual" ones!)
Definition: ncbidiag.hpp:718
@ eDS_Default
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
Definition: ncbidiag.hpp:1790
@ eDT_Enable
Enable messages of severity "eDiag_Trace".
Definition: ncbidiag.hpp:1550
@ eDiag_Info
Informational message.
Definition: ncbidiag.hpp:651
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
void Fatal(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1209
#define NCBI_REPORT_EXCEPTION(title, ex)
Generate a report on the exception.
Definition: ncbiexpt.hpp:755
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
TEntries GetEntries(const string &mask=kEmptyStr, TGetEntriesFlags flags=0) const
Get directory entries based on the specified "mask".
Definition: ncbifile.cpp:3846
list< TEntry > TEntries
Definition: ncbifile.hpp:1750
@ eUnknown
Definition: app_popup.hpp:72
ESerialDataFormat
Data file format.
Definition: serialdef.hpp:71
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
@ eSerial_Xml
XML.
Definition: serialdef.hpp:75
@ eSerial_Json
JSON.
Definition: serialdef.hpp:76
@ eSerial_None
Definition: serialdef.hpp:72
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2040
CConstRef< CSeq_id > GetSeqId(void) const
CSeq_id & Set(const CTempString &the_id, TParseFlags flags=fParse_AnyRaw)
Reassign based on flat specifications; arguments interpreted as with constructors.
Definition: Seq_id.cpp:2457
string GetLabel(const CSeq_id &id)
void SetMix(TMix &v)
Definition: Seq_loc.hpp:987
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
Definition: Seq_loc.cpp:3474
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
int Compare(const CSeq_loc &loc) const
Definition: Seq_loc.cpp:590
void Add(const CSeq_loc &other)
Simple adding of seq-locs.
Definition: Seq_loc.cpp:3875
void GetLabel(string *label) const
Appends a label suitable for display (e.g., error messages) label must point to an existing string ob...
Definition: Seq_loc.cpp:3467
void ChangeToMix(void)
Definition: Seq_loc.cpp:3633
void SetNull(void)
Override all setters to incorporate cache invalidation.
Definition: Seq_loc.hpp:960
@ fMerge_SingleRange
Definition: Seq_loc.hpp:332
@ fMerge_All
Definition: Seq_loc.hpp:331
@ fCompare_Default
Definition: Seq_loc.hpp:245
virtual bool EndOfData(void)
Check if there is still some meaningful data that can be read; in text streams this function will ski...
Definition: objistr.cpp:588
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
@ fFGL_Content
Include its content if there is any.
Definition: feature.hpp:73
@ fFGL_Type
Always include the feature's type.
Definition: feature.hpp:72
TSeqPos GetStop(const CSeq_loc &loc, CScope *scope, ESeqLocExtremes ext=eExtreme_Positional)
If only one CBioseq is represented by CSeq_loc, returns the position at the stop of the location.
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
ENa_strand GetStrand(const CSeq_loc &loc, CScope *scope=0)
Returns eNa_strand_unknown if multiple Bioseqs in loc. Returns eNa_strand_other if multiple strands in...
TSeqPos GetStart(const CSeq_loc &loc, CScope *scope, ESeqLocExtremes ext=eExtreme_Positional)
If only one CBioseq is represented by CSeq_loc, returns the position at the start of the location.
CRef< CSeq_loc > Seq_loc_Merge(const CSeq_loc &loc, CSeq_loc::TOpFlags flags, CScope *scope)
Merge ranges in the seq-loc.
bool IsSameBioseq(const CSeq_id &id1, const CSeq_id &id2, CScope *scope, CScope::EGetBioseqFlag get_flag=CScope::eGetBioseq_All)
Determines if two CSeq_ids represent the same CBioseq.
string GetAccessionForId(const objects::CSeq_id &id, CScope &scope, EAccessionVersion use_version=eWithAccessionVersion, EGetIdType flags=0)
Retrieve the accession string for a Seq-id.
Definition: sequence.cpp:708
const CMolInfo * GetMolInfo(const CBioseq &bioseq)
Retrieve the MolInfo object for a given bioseq handle.
Definition: sequence.cpp:284
@ eWithoutAccessionVersion
accession only, even if version is available
Definition: sequence.hpp:92
@ eGetId_ForceAcc
return only an accession based seq-id
Definition: sequence.hpp:100
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
void ResetHistory(EActionIfLocked action=eKeepIfLocked)
Clean all unused TSEs from the scope's cache and release the memory.
Definition: scope.cpp:325
void AddDataLoader(const string &loader_name, TPriority pri=kPriority_Default)
Add data loader by name.
Definition: scope.cpp:510
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void GetAllTSEs(TTSE_Handles &tses, enum ETSEKind kind=eManualTSEs)
Definition: scope.cpp:295
CSeq_annot_Handle AddSeq_annot(CSeq_annot &annot, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add Seq-annot, return its CSeq_annot_Handle.
Definition: scope.cpp:538
void UpdateAnnotIndex(void)
Definition: scope.cpp:657
void AddScope(CScope &scope, TPriority pri=kPriority_Default)
Add the scope's datasources as a single group with the given priority. All data sources (data loaders ...
Definition: scope.cpp:516
vector< CSeq_entry_Handle > TTSE_Handles
Definition: scope.hpp:645
@ eManualTSEs
Definition: scope.hpp:642
@ eSeqMap_Up
map from segments to the top level bioseq
SAnnotSelector & IncludeFeatSubtype(TFeatSubtype subtype)
Include feature subtype in the search.
SAnnotSelector & SetFeatType(TFeatType type)
Set feature type (also set annotation type to feat)
SAnnotSelector & SetExactDepth(bool value=true)
SetExactDepth() specifies that annotations will be searched on the segment level specified by SetReso...
SAnnotSelector & SetResolveAll(void)
SetResolveAll() is equivalent to SetResolveMethod(eResolve_All).
SAnnotSelector & SetOverlapTotalRange(void)
Check overlapping only of total ranges.
SAnnotSelector & SetAdaptiveDepth(bool value=true)
SetAdaptiveDepth() requests to restrict subsegment resolution depending on annotations found on lower...
SAnnotSelector & SetResolveDepth(int depth)
SetResolveDepth sets the limit of subsegment resolution in searching annotations.
SAnnotSelector & SetSearchUnresolved(void)
SAnnotSelector & IncludeFeatType(TFeatType type)
Include feature type in the search.
SAnnotSelector & SetMaxSize(TMaxSize max_size)
Set maximum number of annotations to find.
SAnnotSelector & SetOverlapIntervals(void)
Check overlapping of individual intervals.
SAnnotSelector & SetSortOrder(ESortOrder sort_order)
Set sort order of annotations.
@ eSortOrder_None
do not sort annotations for faster retrieval
@ eSeqRef
reference to Bioseq
Definition: seq_map.hpp:100
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:1401
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:735
static void EnableGlobally(bool enable=true)
Enable (or disable, if called with enable = false) loading plugins from DLLs in general.
position_type GetLength(void) const
Definition: range.hpp:158
TThisType CombinationWith(const TThisType &r) const
Definition: range.hpp:358
#define NcbiCout
Definition: ncbistre.hpp:543
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5187
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:642
static string UInt8ToString(Uint8 value, TNumToStringFlags flags=0, int base=10)
Convert UInt8 to string.
Definition: ncbistr.hpp:5168
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
Definition: ncbitime.cpp:1511
CTime & SetCurrent(void)
Make the time current in the presently active time zone.
Definition: ncbitime.hpp:2302
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
void SetType(TType value)
Assign a value to Type data member.
Definition: RNA_ref_.hpp:538
Tdata & Set(void)
Assign a value to data member.
TDim GetDim(void) const
Get the Dim member data.
Definition: Seq_align_.hpp:856
const TData & GetData(void) const
Get the Data member data.
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
const TId & GetId(void) const
Get the Id member data.
Definition: Seq_feat_.hpp:904
const TLocal & GetLocal(void) const
Get the variant data.
Definition: Feat_id_.cpp:134
E_Choice
Choice variants.
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Feat_id_.hpp:353
const TId & GetId(void) const
Get the Id member data.
void SetTitle(const TTitle &value)
Assign a value to Title data member.
Definition: Seq_feat_.hpp:1181
bool IsGene(void) const
Check if variant Gene is selected.
bool CanGetId(void) const
Check if it is safe to call GetId method.
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
bool IsSetData(void) const
The specific data. Check if a value has been assigned to Data data member.
bool CanGetId(void) const
Check if it is safe to call GetId method.
Definition: Seq_feat_.hpp:898
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
const TProduct & GetProduct(void) const
Get the Product member data.
Definition: Seq_feat_.hpp:1096
const TXref & GetXref(void) const
Get the Xref member data.
Definition: Seq_feat_.hpp:1308
vector< CRef< CSeqFeatXref > > TXref
Definition: Seq_feat_.hpp:122
bool CanGetProduct(void) const
Check if it is safe to call GetProduct method.
Definition: Seq_feat_.hpp:1090
bool IsEmpty(void) const
Check if variant Empty is selected.
Definition: Seq_loc_.hpp:516
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
bool IsGi(void) const
Check if variant Gi is selected.
Definition: Seq_id_.hpp:883
bool IsWhole(void) const
Check if variant Whole is selected.
Definition: Seq_loc_.hpp:522
bool IsNull(void) const
Check if variant Null is selected.
Definition: Seq_loc_.hpp:504
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_both
in forward orientation
Definition: Na_strand_.hpp:68
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
@ eRepr_virtual
no seq data
Definition: Seq_inst_.hpp:93
@ eBiomol_other_genetic
other genetic material
Definition: MolInfo_.hpp:109
int i
int len
unsigned int id_t
Definition: bmconst.h:38
range(_Ty, _Ty) -> range< _Ty >
EIPRangeType t
Definition: ncbi_localip.c:101
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
Defines NCBI C++ exception handling.
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
T max(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
The Object manager core.
Plugin manager (using class factory paradigm).
static bool GetSeqId(const T &d, set< string > &labels, const string name="", bool detect=false, bool found=false)
#define row(bind, expected)
Definition: string_bind.c:73
SAnnotSelector.
Selector used in CSeqMap methods returning iterators.
Definition: seq_map_ci.hpp:113
Definition: type.c:6
#define _ASSERT
USING_SCOPE(objects)
bool IsInScope(CScope &scope, const CSeq_id &id)
string GetIdStartStopStrandStr(const CSeq_loc &loc, CScope &scope, bool use_long_label)
bool IsManualScope(CScope &scope)
TLoadScopeMethod LoadScope(string arg_path, CScope &scope, ESerialDataFormat serial_format)
int TLoadScopeMethod
void AddSpanningSentinelFeat(CScope &scope, const CSeq_id &id, CSeqFeatData::E_Choice type, string title="Sentinel")
void AddDefaultSentinelFeats(CScope &scope, const CSeq_loc &loc)
ESerialDataFormat StringToSerialFormat(string str)
int main(int argc, const char *argv[])
void AddSentinelRNASeq(CScope &scope, const CSeq_id &id)
unique_ptr< CObjectIStream > GetIStream(string path, ESerialDataFormat serial_format)
USING_NCBI_SCOPE
string GetProductLabel(const CSeq_feat &feat, CScope &scope)
ELoadScopeMethod
@ eLoadScope_SeqEntry
@ eLoadScope_GBR
@ eLoadScope_SeqAnnot
@ eLoadScope_Failed
ZLib Compression API.
Modified on Wed Apr 24 14:11:27 2024 by modify_doxy.py rev. 669887