NCBI C++ ToolKit
read_blast_result_lib.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: read_blast_result_lib.cpp 92163 2020-12-22 17:39:50Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 * major top-level routines, including main
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include "read_blast_result.hpp"
35 
36 
37 // static members definition
38 // algorithm control
48 // verbosity
52 
53 ESerialDataFormat s_GetFormat(const string& name)
54 {
55  if (name == "asn") {
56  return eSerial_AsnText;
57  } else if (name == "asnb") {
58  return eSerial_AsnBinary;
59  } else if (name == "xml") {
60  return eSerial_Xml;
61  } else {
62  // Should be caught by argument processing, but as an illustration...
63  THROW1_TRACE(runtime_error, "Bad serial format name " + name);
64  }
65 }
66 
67 
68 vector<long> CReadBlastApp::getGIs(CBioseq::TAnnot::const_iterator& annot)
69 {
70  vector<long> result;
71  result.clear();
72  ITERATE(CSeq_align::TScore, score, (*(*annot)->GetData().GetAlign().begin())->GetScore())
73  {
74  string name = (*score)->GetId().GetStr();
75  if(name!="use_this_gi") continue;
76  result.push_back((*score)->GetValue().GetInt());
77  }
78 
79  return result;
80 }
81 
82 bool CReadBlastApp::giMatch(const vector<long>& left, const vector<long>& right)
83 {
84 
85  bool result=false;
86  if(PrintDetails()) NcbiCerr << "giMatch starts" << NcbiEndl;
88  ITERATE(vector<long>, gi1, left)
89  {
91  ITERATE(vector<long>, gi2, right)
92  {
93  if(PrintDetails()) NcbiCerr << "try: gi1, gi2: " << *gi1 << " " << *gi2 << NcbiEndl;
94  if (*gi1==*gi2)
95  {
96  if(PrintDetails()) NcbiCerr << "giMatch!!! : gi1, gi2: " << *gi1 << " " << *gi2 << NcbiEndl;
97  return true;
98  }
99  }
101  }
103  if(PrintDetails()) NcbiCerr << "giMatch oops" << NcbiEndl;
104 
105  return result;
106 }
107 
108 void CReadBlastApp::dumpAlignment( const string& alignment, const string& file)
109 {
110  ofstream out(file.c_str());
111  out<< alignment;
112 }
113 
115 {
116  // open out2 here
117  string fn = GetArgs()["in"].AsString();
118  string::size_type ipos=fn.rfind(".");
119  if(ipos!=string::npos) fn.erase(ipos);
120  ipos=fn.rfind("/");
121  fn.erase(0,ipos+1);
122  string fncdd = fn + "_cdd_html.fsa";
123  fn+= "_html.fsa";
124 
126  CScope scope(*objmgr);
127 
128  unique_ptr<CNcbiOfstream> out2 (new CNcbiOfstream (fn.c_str()));
129  CFastaOstream fasta_out(*out2);
130 
131  unique_ptr<CNcbiOfstream> out2_cdd ( new CNcbiOfstream (fncdd.c_str()));
132  CFastaOstream fasta_out_cdd(*out2_cdd);
133 
134  for (CTypeConstIterator<CBioseq> seq = ConstBegin(); seq; ++seq)
135  {
136  if( hasProblems(*seq, diag, eFrameShift )||
137  hasProblems(*seq, diag, ePartial)
138  )
139  {
140  CBioseq_Handle handle = scope.AddBioseq(*seq);
141  if( hasProblems(*seq, diag, eFrameShift )) fasta_out.Write(handle);
142  if( hasProblems(*seq, diag, ePartial)) fasta_out_cdd.Write(handle);
143  }
144  }
145 
146 }
147 
149  (
150  distanceReportStr *report,
151  ostream& out
152  )
153 {
154  out << "q1 | lcl | " << report->q_id_left << " | " << report->q_name_left << NcbiEndl;
155  out << "q2 | lcl | " << report->q_id_right << " | " << report->q_name_right << NcbiEndl;
156  out << report->q_id_left
157  << (report->left_strand == eNa_strand_plus ? "(+)" : "(-)" )
158  << "\t"
159  << report->q_id_right
160  << (report->right_strand == eNa_strand_plus ? "(+)" : "(-)" )
161  << NcbiEndl;
162  string range1 = report->loc1.Empty()
163  ? printed_range(report->q_loc_left_from, report->q_loc_left_to)
164  : printed_ranges(*(report->loc1));
165  string range2 = report->loc2.Empty()
166  ? printed_range(report->q_loc_right_from, report->q_loc_right_to)
167  : printed_ranges(*(report->loc2));
168  out
169  << "[" << range1 << "] " << report->left_frame << "\t\t"
170  << "[" << range2 << "] " << report->right_frame
171  << NcbiEndl;
172  out << NcbiEndl;
173 
174  out << "Overlap: " << report->space << " aa" << NcbiEndl;
175 }
176 
178  (
179  const perfectHitStr& hit,
180  ostream& out
181  )
182 {
183 
184  out << "Perfect hit: " << hit.s_name << NcbiEndl;
185 
186 }
187 
189 {
190  out << "General info extracted from the header of the submission:" << NcbiEndl;
191  out << NcbiEndl;
192  out << "reldate: ";
193  if ( IsSubmit() && m_Submit.GetSub().CanGetReldate() )
194  {
195  string date;
196  m_Submit.GetSub().GetReldate().GetDate(&date);
197  out << date.c_str();
198  }
199  else
200  out << "unspecified";
201  out << NcbiEndl;
202 
203 
204 // simple texts
205  out << "tool: ";
206  if ( IsSubmit() && m_Submit.GetSub().CanGetTool() )
207  out << m_Submit.GetSub().GetTool();
208  else
209  out << "unspecified";
210  out << NcbiEndl;
211 
212  out << "user_tag: ";
213  if ( IsSubmit() && m_Submit.GetSub().CanGetUser_tag() )
215  else
216  out << "unspecified";
217  out << NcbiEndl;
218 
219  out << "comment: ";
220  if ( IsSubmit() && m_Submit.GetSub().CanGetComment() )
221  out << m_Submit.GetSub().GetComment();
222  else
223  out << "unspecified";
224  out << NcbiEndl;
225 
226 
227  out << NcbiEndl;
228 }
229 
231  (
232  distanceReportStr *report,
233  ostream& out
234  )
235 {
236  out << "q1 | lcl | " << report->q_id_left << " | " << report->q_name_left << NcbiEndl;
237  out << "q2 | lcl | " << report->q_id_right << " | " << report->q_name_right << NcbiEndl;
238  out << "s | " << report->s_name << NcbiEndl;
239  out << report->q_id_left
240  << (report->left_strand == eNa_strand_plus ? "(+)" : "(-)" )
241  << "\t"
242  << report->q_id_right
243  << (report->right_strand == eNa_strand_plus ? "(+)" : "(-)" )
244  << NcbiEndl;
245  out
246  << "[" << report->q_loc_left_from << "..." << report->q_loc_left_to << "]\t\t"
247  << "[" << report->q_loc_right_from << "..." << report->q_loc_right_to << "]"
248  << NcbiEndl;
249  out << NcbiEndl;
250  out
251  << "q\t" << report->q_left_left << "\t" << report->q_left_middle << "\t" << report->q_left_right
252  << "\t" << report->space
253  << "\t" << report->q_right_left << "\t" << report->q_right_middle << "\t" << report->q_right_right
254  << NcbiEndl;
255  out
256  << "s\t" << report->s_left_left << "\t" << report->s_left_middle << "\t" << "..."
257  << "\t" << "..."
258  << "\t" << "..." << "\t" << report->s_left_right << "\t" << "..."
259  << NcbiEndl;
260  out
261  << "s\t" << "..." << "\t" << report->s_right_left << "\t" << "..."
262  << "\t" << "..."
263  << "\t" << "..." << "\t" << report->s_right_middle << "\t" << report->s_right_right
264  << NcbiEndl;
265  out << "diff_left, diff_right: " << report->diff_left << ", " << report->diff_right << NcbiEndl;
266  out << "diff_edge_left, diff_edge_right: " << report->diff_edge_left << ", " << report->diff_edge_right << NcbiEndl;
267  if(report->diff_edge_left > 0)
268  {
269  out << "Potential deletetion of a nucleotide sequence equivalent to " << report->diff_edge_left << " occurred." << NcbiEndl;
270  }
271  else if (report->diff_edge_left < 0)
272  {
273  if ( -report->diff_edge_left < report->space )
274  out << "Potential insertion of a nucleotide sequence equivalent to " << -report->diff_edge_left << " occurred." << NcbiEndl;
275  else
276  out << "Subjects are shifted apart by " << -report->diff_edge_left << NcbiEndl;
277  }
278  else
279  {
280  out << "Potential sequencing error or replacement mutation without insertion or deletion of a nucleotide sequence occurred." << NcbiEndl;
281  }
282 
283 
284 }
285 
287  (
288  const LocMap& loc_map,
289  CSeq_annot::C_Data::TFtable& rna_feats,
290  const CSeq_annot::C_Data::TFtable& feats
291  )
292 {
293  if(PrintDetails()) NcbiCerr << "GetRNAfeats() starts" << NcbiEndl;
296  {
297  if(PrintDetails()) NcbiCerr << "GetRNAfeats(): feature start " << NcbiEndl;
298  if( !(*f)->GetData().IsRna() ) continue;
299  if(PrintDetails()) NcbiCerr << "GetRNAfeats(): feature is RNA" << NcbiEndl;
300  string loc_string = GetLocationString( (*f)->GetLocation() );
301  if(PrintDetails()) NcbiCerr << "GetRNAfeats(): ["
302  << loc_string
303  << "] feature location" << NcbiEndl;
304  LocMap::const_iterator aname = loc_map.find(loc_string);
305  if(aname == loc_map.end())
306  {
307  // error
308  NcbiCerr << "CReadBlastApp::GetRNAfeats(): ERROR: cannot find gene for location "
309  << loc_string
310  << NcbiEndl;
311  continue;
312  }
313  rna_feats.push_back(aname->second);
314  if(PrintDetails()) NcbiCerr << "GetRNAfeats(): ["
315  << aname->first
316  << "] feature end" << NcbiEndl;
317  }
319  if(PrintDetails()) NcbiCerr << "GetRNAfeats(): rna_feats size: "
320  << rna_feats.size()
321  << NcbiEndl;
322  if(PrintDetails()) NcbiCerr << "GetRNAfeats() ends" << NcbiEndl;
323 
324  return;
325 }
326 
327 
328 string GetLocusTag(const CSeq_feat& f, const LocMap& loc_map)
329 {
330  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: start" << NcbiEndl;
331  string loc_string = GetLocationString(f.GetLocation());
332  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: I" << NcbiEndl;
333  LocMap::const_iterator aname = loc_map.find(loc_string);
334  if(aname == loc_map.end())
335  {
336  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: not in locmap" << NcbiEndl;
337  return loc_string;
338  }
339  else
340  {
341  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: in locmap" << NcbiEndl;
342  const CGene_ref& gene=aname->second->GetData().GetGene();
343  string qname="";
344  if(gene.IsSetLocus_tag())
345  {
346  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: getting it" << NcbiEndl;
347  return gene.GetLocus_tag();
348  }
349 
350  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: not getting it" << NcbiEndl;
351  return qname;
352  }
353 
354  if(CReadBlastApp::PrintDetails()) NcbiCerr << "GetLocusTag: INTERNAL ERROR: should be never here" << NcbiEndl;
355  return loc_string;
356 }
358 {
359  const CBioseq_set::TSeq_set* seqset = get_parent_seqset(seq);
360  const CSeq_loc& loc = getGenomicLocation(seq);
362  const CSeq_id& nu_id = inter->GetId();
363 
364  ITERATE(CBioseq_set::TSeq_set, seq_nu, *seqset)
365  {
366  if(!(*seq_nu)->IsSeq()) continue;
367  if(!(*seq_nu)->GetSeq().IsNa()) continue;
368  const CBioseq::TId& nu_ids = (*seq_nu)->GetSeq().GetId();
369  ITERATE(CBioseq::TId, cnu_id , nu_ids)
370  {
371  if((*cnu_id)->Compare(nu_id) == CSeq_id::e_YES) return (*seq_nu)->GetSeq();
372  }
373  }
374  NcbiCerr << "CReadBlastApp::get_nucleotide_seq: INTERNAL FATAL ERROR: could not find nucleotide sequence" << NcbiEndl;
375  throw;
376  return seq; // formality
377 }
378 
380 {
381  string name = GetStringDescr(seq);
382  CSeq_entry* parent = seq.GetParentEntry();
383  if(parent==NULL)
384  {
385  NcbiCerr << "get_parent_seqset: WARNING: " << name << ": no parent\n";
386  return NULL;
387  }
388  if(parent->IsSeq())
389  {
390  while(parent!=NULL && !parent->IsSet())
391  parent = parent->GetParentEntry();
392  if(parent==NULL)
393  {
394  NcbiCerr << "get_parent_seqset: WARNING: " << name << ": no set ancestor\n";
395  return NULL;
396  }
397  }
398  else if(!parent->IsSet()) return NULL;
399 
400  if(!parent->GetSet().CanGetSeq_set())
401  {
402  NcbiCerr << "get_parent_seqset: WARNING: " << name << ": (grand)parent set does not have Seq_set\n";
403  return NULL;
404  }
405  return &(parent->SetSet().SetSeq_set());
406 }
407 
408 
409 string let1_2_let3(char let1)
410 {
411  switch(let1)
412  {
413  case 'A': return "Ala"; break;
414  case 'Q': return "Gln"; break;
415  case 'W': return "Trp"; break;
416  case 'E': return "Glu"; break;
417  case 'R': return "Arg"; break;
418  case 'T': return "Thr"; break;
419  case 'Y': return "Tyr"; break;
420  case 'U': return "Sec"; break;
421  case 'I': return "Ile"; break;
422  case 'J': return "Xeu"; break; // Ile or Leu, there is no conventional three-letter code
423  case 'P': return "Pro"; break;
424  case 'S': return "Ser"; break;
425  case 'D': return "Asp"; break;
426  case 'F': return "Phe"; break;
427  case 'G': return "Gly"; break;
428  case 'H': return "His"; break;
429  case 'K': return "Lys"; break;
430  case 'L': return "Leu"; break;
431  case 'Z': return "Glx"; break;
432  case 'X': return "Ukn"; break;
433  case 'C': return "Cys"; break;
434  case 'V': return "Val"; break;
435  case 'B': return "Asx"; break;
436  case 'N': return "Asn"; break;
437  case 'M': return "Met"; break;
438  case 'O': return "Pyl"; break;
439  default: break;
440  }
441  NcbiCerr << "let1_2_let3: ERROR: char " << let1 << "(" << (int)let1 << ") is not handled" << NcbiEndl;
442 
443  return "";
444 }
445 
446 int CReadBlastApp::collectPerfectHits(vector<perfectHitStr>& perfect, const CBioseq& seq)
447 {
448  ITERATE(CBioseq::TAnnot, annot, seq.GetAnnot())
449  {
450  if(!(*annot)->GetData().IsAlign()) continue;
451  check_alignment(annot, seq, perfect);
452  }
453  return perfect.size();
454 }
455 
457  (
458  CBioseq::TAnnot::const_iterator& annot,
459  const CBioseq& seq,
460  vector<perfectHitStr>& results
461  )
462 {
463  int sLen = getLenScore(annot);
464  int qLen = getQueryLen(seq);
465  string q_name = GetProtName(seq);
466 
467  string s_name = getAnnotName(annot);
468 
469  int qFrom, qTo, sFrom, sTo;
470  getBounds(annot, &qFrom, &qTo, &sFrom, &sTo);
471 
472  bool result=false;
473 
474  int qtails = qLen - qTo + qFrom - 1;
475  int stails = sLen - sTo + sFrom - 1;
476  double thr = m_small_tails_threshold; // default should be .1
477  result =
478  (double)qtails/qLen < thr &&
479  (double)stails/sLen < thr &&
480  s_name.find("hypothetical") == string::npos;
481 // hypothetical exhonerating hits do not count
482 
483  if(result)
484  {
485  results.resize(results.size()+1);
486  int ihit = results.size()-1;
487  results[ihit].s_name = s_name;
488  }
489 }
490 
492 {
493  bool result;
494  // if(PrintDetails()) NcbiCerr << "is_prot_entry?" << NcbiEndl;
495  result=seq.GetInst().IsAa();
496  // if(PrintDetails()) NcbiCerr << result << NcbiEndl;
497  return result;
498 }
499 
500 
502 {
503  if(PrintDetails()) NcbiCerr << "SortSeqs: start " << NcbiEndl;
504  PushVerbosity();
505  if(IsSubmit())
506  {
507  if(PrintDetails()) NcbiCerr << "SortSeqs: IsSubmit" << NcbiEndl;
508  if (
510  &&
511  (*m_Submit.GetData().GetEntrys().begin())->IsSet()
512  )
513  {
514  SortSeqs((*m_Submit.SetData().SetEntrys().begin())->SetSet().SetSeq_set());
515  }
516  else
517  {
518  NcbiCerr << "ERROR: submit file does not have proper seqset" << NcbiEndl;
519  }
520  }
521  else
522  {
523  if(PrintDetails()) NcbiCerr << "SortSeqs: IsEntry" << NcbiEndl;
525  }
526 
527  PopVerbosity();
528  if(PrintDetails()) NcbiCerr << "SortSeqs: end" << NcbiEndl;
529  return -1;
530 }
531 
532 
533 
535 {
536 /*
537  NON_CONST_ITERATE( CBioseq_set::TSeq_set, left, where)
538  {
539  if((*left)->IsSet())
540  {
541  CBioseq_set::TSeq_set& seqs_down = (*left)->SetSet().SetSeq_set();
542  SetParents(parent, seqs_down);
543  }
544  else
545  {
546  (*left)->SetParentEntry(parent);
547  }
548  }
549 */
550  return 0;
551 }
552 
553 // TSimpleSeq
554 
556  const TSimpleSeq& second)
557 {
558  return first.key < second.key;
559 } // less_seq
560 
561 bool CReadBlastApp::less_pair(const pair<int,int>& first,
562  const pair<int,int>& second)
563 {
564  if(first.first != second.first) return first.first < second.first;
565  return first.second < second.second;
566 }
567 
568 
569 
571  const CRef<CSeq_entry>& second)
572 {
573  // if(PrintDetails()) NcbiCerr << "less_seq start" << NcbiEndl;
574  if (first->IsSet() || second->IsSet())
575  {
576  return false;
577  }
578  // if(PrintDetails()) NcbiCerr << "less_seq both seqs" << NcbiEndl;
579  if (!hasGenomicLocation(first->GetSeq()))
580  {
581  if(first->GetSeq().IsAa())
582  NcbiCerr << "less_seq first does not have genomic location or is nucleotide seq" << NcbiEndl;
583  return true; // to take care of nucleotide sequence
584  }
585  if (!hasGenomicLocation(second->GetSeq()))
586  {
587  if(second->GetSeq().IsAa())
588  NcbiCerr << "less_seq second does not have genomic location or is nucleotide seq" << NcbiEndl;
589  return false;
590  }
591  // if(PrintDetails()) NcbiCerr << "less_seq both have genomic locations" << NcbiEndl;
592  const CSeq_loc& left_genomic_int = getGenomicLocation(first->GetSeq());
593  const CSeq_loc& right_genomic_int = getGenomicLocation(second->GetSeq());
594  TSeqPos from1, to1, from2, to2;
595 
596  ENa_strand strand1;
597  ENa_strand strand2;
598 
599  getFromTo(left_genomic_int, from1, to1, strand1);
600  getFromTo(right_genomic_int, from2, to2, strand2);
601 /*
602  if(PrintDetails()) NcbiCerr << "less_seq comparing"
603  << ": " << GetStringDescr( first->GetSeq()) << " = " << from1 << "-" << to1
604  << ", " << GetStringDescr(second->GetSeq()) << " = " << from2 << "-" << to2
605  << NcbiEndl;
606 */
607  return from1 < from2;
608 } // less_seq
609 
611 {
612  if(PrintDetails()) NcbiCerr << "SortSeqs(CBioseq_set::TSeq_set& seqs): start"
613  << ": " << seqs.size()
614  << NcbiEndl;
615 
617  {
618  if ((*seq)->IsSet()) SortSeqs((*seq)->SetSet().SetSeq_set());
619  }
620 
621  seqs.sort(&less_seq); // does not do anything
622  int n =1;
623 
624  if(PrintDetails()) NcbiCerr << "SortSeqs(CBioseq_set::TSeq_set& seqs): stop" << NcbiEndl;
625  return n;
626 }
627 
629 {
630  char buffer[1024];
631  in.getline(buffer, 1024);
632  in.seekg(0);
633 
634  string result = buffer;
635  result.erase(result.find_first_of(": "));
636 
637  if(result=="Seq-entry")
638  {
639  result = buffer;
640  if(result.find("set") != string::npos)
641  return eEntry;
642  return eUndefined;
643  }
644  if(result=="Seq-submit") return eSubmit;
645  if(NStr::StartsWith(result, ">Feature")) return eTbl;
646  return eUndefined;
647 }
648 
650 {
651  return m_coreDataType == eSubmit;
652 }
653 
655 {
656  return m_coreDataType == eEntry;
657 }
658 
660 {
661  return m_coreDataType == eTbl;
662 }
663 
664 
666 {
669 
670  return ::Begin(m_Submit); // some platforms might have warnings without it
671 }
672 
674 {
677 
678  return ::ConstBegin(m_Submit); // some platforms might have warnings without it
679 }
680 
681 
682 char *CReadBlastApp::next_w(char *w)
683 {
684  while(*w && !isspace(*w)) ++w;
685  while(isspace(*w)) ++w;
686  return(w);
687 
688 }
690 {
691  while(*w && isspace(*w)) ++w;
692  return(w);
693 
694 }
696 {
697  int nums=0;
698  for(nums=0 ;seq.IsValid(); ++seq, nums++)
699  {
700  const CSeq_inst& inst = seq->GetInst();
701  if(!inst.IsAa()) continue;
702  break;
703  }
704  return nums;
705 }
706 
708 {
709  int nums=0;
710  for(nums=0 ;seq; ++seq, nums++)
711  {
712  const CSeq_inst& inst = seq->GetInst();
713  if(!inst.IsAa()) continue;
714  break;
715  }
716  return nums;
717 }
718 
719 bool CReadBlastApp::skip_toprot(CBioseq_set::TSeq_set::const_iterator& seq,
720  const CBioseq_set::TSeq_set& seqs)
721 {
722 
724  while( seq != seqs.end() &&
725  (
726  (*seq)->IsSet() || // set of sequences, skip
727  !is_prot_entry((*seq)->GetSeq()) // not a protein, skip
728  )
729  )
730  {
731  if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
732  // << CSeq_id::GetStringDescr ((*seq)->GetSeq(), CSeq_id::eFormat_FastA) << NcbiEndl;
733  << GetStringDescr ((*seq)->GetSeq()) << NcbiEndl;
734  ++seq;
735  }
737 
738  return (seq != seqs.end()) ;
739 
740 }
741 
742 
743 bool CReadBlastApp::skip_toprot(CBioseq_set::TSeq_set::iterator& seq,
744  CBioseq_set::TSeq_set& seqs)
745 {
746 
748  while( seq != seqs.end() &&
749  (
750  (*seq)->IsSet() || // set of sequences, skip
751  !is_prot_entry((*seq)->GetSeq()) // not a protein, skip
752  )
753  )
754  {
755  if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
756  << GetStringDescr ((*seq)->GetSeq()) << NcbiEndl;
757  ++seq;
758  }
760 
761  return (seq != seqs.end()) ;
762 
763 }
764 
765 
767  CBioseq_set::TSeq_set::const_iterator& seq,
768  const CBioseq_set::TSeq_set& seqs)
769 {
771  while( seq != seqs.end() &&
772  (
773  (*seq)->IsSet() || // set of sequences, skip
774  !is_prot_entry((*seq)->GetSeq()) || // not a protein, skip
775  !has_blast_hits((*seq)->GetSeq()) // does not have a blast , skip
776  )
777  )
778  {
779  if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
780  << GetStringDescr ((*seq)->GetSeq()) << NcbiEndl;
781  ++seq;
782  }
784 
785  return (seq != seqs.end()) ;
786 }
787 
789  CBioseq_set::TSeq_set::iterator& seq,
790  CBioseq_set::TSeq_set& seqs)
791 {
793  while( seq != seqs.end() &&
794  (
795  (*seq)->IsSet() || // set of sequences, skip
796  !is_prot_entry((*seq)->GetSeq()) || // not a protein, skip
797  !has_blast_hits((*seq)->GetSeq()) // does not have a blast , skip
798  )
799  )
800  {
801  if(PrintDetails()) NcbiCerr << "skip_to_valid_seq_cand: seq++"
802  << GetStringDescr ((*seq)->GetSeq()) << NcbiEndl;
803  ++seq;
804  }
806 
807  return (seq != seqs.end()) ;
808 }
809 
810 // ReadPreviousAcc(args["parentacc"].AsString(), m_previous_genome)
811 bool CReadBlastApp::ReadPreviousAcc(const string& file, list<long>& input_acc)
812 {
813  const unsigned int max_acc_len=0xF;
814  ifstream in(file.c_str());
815  if(!in.is_open()) return false; // the input parameter is not a file
816  list<long> scratch_acc;
817  while(in.good())
818  {
819  char buffer[max_acc_len+2];
820  in.getline(buffer, max_acc_len+1);
821 /*
822  if(buffer[0] < 'A' || buffer[0] > 'Z' ||
823  buffer[1] < 'A' || buffer[1] > 'Z' ||
824  buffer[2] != '_') return false;
825 */
826  string test(buffer);
827 // remove white spaces
828  string::size_type ipos=string::npos;
829  while((ipos=test.find_first_of("\t\n\r ")) != string::npos)
830  test.erase(ipos,1);
831 // too long, may be it is a wrong file
832  if(test.size()>max_acc_len) return false;
833 // has non-digits, go away
834  if(test.find_first_not_of("0123456789") != string::npos) return false;
835 // accept
836  scratch_acc.push_back(atol(test.c_str()));
837  }
838  if(!scratch_acc.size()) return false;
839  input_acc = scratch_acc;
840  return true;
841 }
842 
843 /////////////////////////////////////////////////////////////////////////////
844 // MAIN
845 
846 
847 int main(int argc, const char* argv[])
848 {
849  // Execute main application function
850  return CReadBlastApp().AppMain(argc, argv, 0, eDS_Default, 0);
851 }
852 
853 
Class holding information about root of non-modifiable object hierarchy Do not use it directly.
Definition: iterator.hpp:58
CBioseq_Handle –.
CSeq_entry * GetParentEntry(void) const
Definition: Bioseq.hpp:174
bool IsAa(void) const
Definition: Bioseq.cpp:350
Class holding information about root of non-modifiable object hierarchy Do not use it directly.
Definition: iterator.hpp:86
void GetDate(string *label, bool year_only=false) const
Append a standardized string representation of the date to the label.
Definition: Date.hpp:149
FASTA-format output; see also ReadFasta in <objtools/readers/fasta.hpp>
Definition: sequence.hpp:770
void printGeneralInfo(ostream &out=NcbiCerr)
static bool hasProblems(const CBioseq &seq, diagMap &diag, const EProblem type)
Definition: problems.cpp:141
static int m_verbosity_threshold
static bool PrintDetails(int current_verbosity=m_current_verbosity)
static string GetProtName(const CBioseq &seq)
Definition: shortcuts.cpp:82
static bool less_pair(const pair< int, int > &first, const pair< int, int > &second)
static bool giMatch(const vector< long > &left, const vector< long > &right)
static int getQueryLen(const CBioseq &seq)
Definition: shortcuts.cpp:96
static vector< long > getGIs(CBioseq::TAnnot::const_iterator &annot)
CConstBeginInfo ConstBegin(void)
static stack< int > m_saved_verbosity
static bool has_blast_hits(const CBioseq &seq)
Definition: fit_blast.cpp:369
static void IncreaseVerbosity(void)
static int m_n_best_hit
static char * next_w(char *w)
static int collectPerfectHits(vector< perfectHitStr > &perfect, const CBioseq &seq)
static void printOverlapReport(distanceReportStr *report, ostream &out=NcbiCout)
static string getAnnotName(CBioseq::TAnnot::const_iterator &annot)
Definition: shortcuts.cpp:459
static double m_trnascan_scoreThreshold
static bool hasGenomicLocation(const CBioseq &seq)
Definition: locations.cpp:100
static int m_current_verbosity
bool ReadPreviousAcc(const string &file, list< long > &input_acc)
static int skip_toprot(CTypeIterator< CBioseq > &seq)
static void dumpAlignment(const string &alignment, const string &file)
static int m_cds_overlapThreshold
static double m_entireThreshold
static int m_rna_overlapThreshold
void GetRNAfeats(const LocMap &loc_map, CSeq_annot::C_Data::TFtable &rna_feats, const CSeq_annot::C_Data::TFtable &feats)
int SetParents(CSeq_entry *parent, CBioseq_set::TSeq_set &where)
static void check_alignment(CBioseq::TAnnot::const_iterator &annot, const CBioseq &seq, vector< perfectHitStr > &results)
static ECoreDataType getCoreDataType(istream &in)
static bool is_prot_entry(const CBioseq &seq)
const CBioseq & get_nucleotide_seq(const CBioseq &seq)
static char * skip_space(char *w)
static void DecreaseVerbosity(void)
static bool less_simple_seq(const TSimpleSeq &first, const TSimpleSeq &second)
static void getBounds(CBioseq::TAnnot::const_iterator &annot, int *qFrom, int *qTo, int *sFrom, int *sTo)
Definition: shortcuts.cpp:435
static void printPerfectHit(const perfectHitStr &hit, ostream &out=NcbiCout)
static int m_shortProteinThreshold
static void PopVerbosity(void)
ECoreDataType m_coreDataType
static const CSeq_loc & getGenomicLocation(const CBioseq &seq)
Definition: locations.cpp:120
static void printReport(distanceReportStr *report, ostream &out=NcbiCout)
static int getLenScore(CBioseq::TAnnot::const_iterator &annot)
Definition: shortcuts.cpp:420
static bool less_seq(const CRef< CSeq_entry > &first, const CRef< CSeq_entry > &second)
static void getFromTo(const CSeq_loc &loc, TSeqPos &from, TSeqPos &to, ENa_strand &strand)
Definition: locations.cpp:34
static double m_small_tails_threshold
static double m_partThreshold
static double m_eThreshold
static bool skip_to_valid_seq_cand(CBioseq_set::TSeq_set::const_iterator &seq, const CBioseq_set::TSeq_set &seqs)
CBeginInfo Begin(void)
void dump_fasta_for_pretty_blast(diagMap &diag)
static void PushVerbosity(void)
CSeq_submit m_Submit
CScope –.
Definition: scope.hpp:92
Definition: Seq_entry.hpp:56
CSeq_entry * GetParentEntry(void) const
Definition: Seq_entry.hpp:131
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
static bool IsAa(EMol mol)
Definition: Seq_inst.hpp:99
Template class for iteration on objects of class C (non-medifiable version)
Definition: iterator.hpp:767
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
std::ofstream out("events_result.xml")
main entry point for tests
#define test(a, b, c, d, e)
Definition: numeric.c:170
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static FILE * f
Definition: readconf.c:23
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:832
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
@ eDS_Default
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
Definition: ncbidiag.hpp:1790
#define THROW1_TRACE(exception_class, exception_arg)
Throw trace.
Definition: ncbiexpt.hpp:417
ESerialDataFormat
Data file format.
Definition: serialdef.hpp:71
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
@ eSerial_Xml
XML.
Definition: serialdef.hpp:75
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
@ e_YES
SeqIds compared, but are different.
Definition: Seq_id.hpp:583
bool IsValid(void) const
Definition: iterator.hpp:252
CConstBeginInfo ConstBegin(const C &obj)
Get starting point of non-modifiable object hierarchy.
Definition: iterator.hpp:1012
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
virtual void Write(const CSeq_entry_Handle &handle, const CSeq_loc *location=0)
Unspecified locations designate complete sequences; non-empty custom titles override the usual title ...
Definition: sequence.cpp:2727
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define NcbiEndl
Definition: ncbistre.hpp:548
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
#define NcbiCerr
Definition: ncbistre.hpp:544
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
Definition: Gene_ref_.hpp:781
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
Definition: Gene_ref_.hpp:793
vector< CRef< CScore > > TScore
Definition: Seq_align_.hpp:398
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
bool CanGetSeq_set(void) const
Check if it is safe to call GetSeq_set method.
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
list< CRef< CSeq_entry > > TSeq_set
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
const TTool & GetTool(void) const
Get the Tool member data.
const TReldate & GetReldate(void) const
Get the Reldate member data.
const TComment & GetComment(void) const
Get the Comment member data.
bool CanGetTool(void) const
Check if it is safe to call GetTool method.
bool CanGetReldate(void) const
Check if it is safe to call GetReldate method.
bool CanGetComment(void) const
Check if it is safe to call GetComment method.
const TEntrys & GetEntrys(void) const
Get the variant data.
const TUser_tag & GetUser_tag(void) const
Get the User_tag member data.
const TData & GetData(void) const
Get the Data member data.
void SetData(TData &value)
Assign a value to Data data member.
const TSub & GetSub(void) const
Get the Sub member data.
bool IsEntrys(void) const
Check if variant Entrys is selected.
bool CanGetUser_tag(void) const
Check if it is safe to call GetUser_tag method.
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
FILE * file
yy_size_t n
string GetLocationString(const CSeq_feat &f)
Definition: locations.cpp:228
int isspace(Uchar c)
Definition: ncbictype.hpp:69
std::istream & in(std::istream &in_, double &x_)
static int * results[]
static uint8_t * buffer
Definition: pcre2test.c:1016
@ eFrameShift
string GetStringDescr(const CBioseq &bioseq)
Definition: shortcuts.cpp:309
string printed_ranges(const CSeq_loc &seq_interval)
Definition: shortcuts.cpp:341
string printed_range(const TSeqPos from2, const TSeqPos to2)
Definition: shortcuts.cpp:320
ESerialDataFormat s_GetFormat(const string &name)
string let1_2_let3(char let1)
int main(int argc, const char *argv[])
string GetLocusTag(const CSeq_feat &f, const LocMap &loc_map)
CBioseq_set::TSeq_set * get_parent_seqset(const CBioseq &seq)
CRef< const CSeq_loc > loc1
CRef< const CSeq_loc > loc2
CRef< CTestThread > thr[k_NumThreadsMax]
Definition: test_mt.cpp:267
else result
Definition: token2.c:20
Modified on Fri Sep 20 14:57:49 2024 by modify_doxy.py rev. 669887