NCBI C++ ToolKit
problems.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 #include <ncbi_pch.hpp>
33 #include "read_blast_result.hpp"
34 
35 
36 void CReadBlastApp::reportProblems(const bool report_and_forget, diagMap& diag, ostream& out,
37  const CBioseq::TAnnot& annots, const EProblem type)
38 {
39  ITERATE(CBioseq::TAnnot, gen_feature, annots)
40  {
41  if ( !(*gen_feature)->GetData().IsFtable() ) continue;
42  reportProblems(report_and_forget, diag, out, (*gen_feature)->GetData().GetFtable(), type);
43  }
44 }
45 
46 void CReadBlastApp::reportProblems(const bool report_and_forget, diagMap& diag, ostream& out,
47  const CSeq_annot::C_Data::TFtable& feats, const EProblem type)
48 {
50  {
51  if( !(*f)->GetData().IsGene() ) continue;
52  string qname; (*f)->GetData().GetGene().GetLabel(&qname);
53  if( diag.find(qname) == diag.end() ) continue;
54  reportProblems(qname, diag, out, type);
55  if(report_and_forget) erase_problems(qname, diag, type);
56  }
57 }
58 void CReadBlastApp::reportProblems(const string& qname, diagMap& diag, ostream& out, const EProblem type)
59 {
60  if(!hasProblems(qname, diag, type)) return;
61  if(PrintDetails()) NcbiCerr << "problem type: " << ProblemType(type) << NcbiEndl;
63 
64 // reporting
66  NON_CONST_ITERATE(list<problemStr>, problem, diag[qname].problems)
67  {
68  if(PrintDetails()) NcbiCerr << "current problem: " << ProblemType(problem->type) << NcbiEndl;
69  if( !(problem->type & type) ) continue;
70  if(PrintDetails()) NcbiCerr << "it is that problem" << NcbiEndl;
71  if(!problem->message.size()) continue;
72  if(PrintDetails()) NcbiCerr << "has nonzero message" << NcbiEndl;
73  reportProblemType(problem->type, out);
74  reportProblemMessage(problem->message, out);
75  if(PrintDetails()) NcbiCerr << "current problem: done: " << problem->type << NcbiEndl;
76  }
78 }
79 void CReadBlastApp::erase_problems(const string& qname, diagMap& diag, const EProblem type)
80 {
82  for(list<problemStr>::iterator problem = diag[qname].problems.begin(); problem!=diag[qname].problems.end();)
83  {
84  if(PrintDetails()) NcbiCerr << "erasing? current problem: " << problem->type << NcbiEndl;
85  if( !(problem->type & type) )
86  {
87  problem++;
88  continue;
89  }
90  if(PrintDetails()) NcbiCerr << "it is that problem for erazing" << NcbiEndl;
91  problem=diag[qname].problems.erase(problem);
92  if(PrintDetails()) NcbiCerr << "erased" << NcbiEndl;
93  if(PrintDetails()) NcbiCerr << "next current problem: " << problem->type << NcbiEndl;
94  }
96 }
97 void CReadBlastApp::reportProblems(const bool report_and_forget, diagMap& diag, ostream& out, const EProblem type)
98 {
101  if(PrintDetails()) NcbiCerr << "reportProblems: all problems of type (" << ProblemType(type) << ")" << NcbiEndl;
102  for (CTypeConstIterator<CBioseq> seq = ConstBegin(); seq; ++seq)
103  {
104  if ( !is_prot_entry(*seq) )
105  {
106  reportProblems(report_and_forget, diag, out, seq->GetAnnot(), type);
107  }
108  string qname1 = GetStringDescr (*seq);
109  string qname2 = CSeq_id::GetStringDescr (*seq, CSeq_id::eFormat_FastA);
110  if(PrintDetails()) NcbiCerr << "reportProblems: start: "
111  << qname1 << " or "
112  << qname2
113  << NcbiEndl;
114  string qnames[2];
115  qnames[0]=qname1; qnames[1]=qname2;
116  for(int i=0; i<2; i++)
117  {
118  string& qname = qnames[i];
119  if( diag.find(qname) != diag.end() )
120  {
121  reportProblems(qname, diag, out, type);
122  if(report_and_forget) erase_problems(qname, diag, type);
123  done[qname]=true;
124  if(PrintDetails()) NcbiCerr << "reportProblems: full end: " << qname << NcbiEndl;
125  }
126  }
127  }
128 
129  ITERATE(diagMap, problem, diag)
130  {
131  string qname = problem->first;
132  if(done.find(qname)!=done.end()) continue;
133  reportProblems(qname, diag, out, type);
134  if(report_and_forget) erase_problems(qname, diag, type);
135  if(PrintDetails()) NcbiCerr << "reportProblems: alternative problems: " << qname << NcbiEndl;
136  }
138 }
139 // hasProblemType(seq, diag[qname].problems), problem->type
140 
142 {
143  string qname = GetStringDescr (seq);
144  string qname2 = CSeq_id::GetStringDescr (seq, CSeq_id::eFormat_FastA);
145  return hasProblems(qname, diag, type) || hasProblems(qname2, diag, type);
146 }
147 
148 bool CReadBlastApp::hasProblems(const string& qname, diagMap& diag, const EProblem type)
149 {
150  if(PrintDetails()) NcbiCerr << "hasProblems: start: " << qname << NcbiEndl;
151  if( type != eAllProblems && diag.find(qname) == diag.end() ) return false;
152  if ( type == eAllProblems && diag[qname].problems.size()>0) return true;
153 
155  ITERATE(list<problemStr>, problem, diag[qname].problems)
156  {
157  if(PrintDetails()) NcbiCerr << "hasProblems: checking problem type: " << "\t"
158  << ProblemType(problem->type) << "\t"
159  << qname << "\t"
160  << NcbiEndl;
161  if (problem->type & type)
162  {
163  if(PrintDetails()) NcbiCerr << "hasProblems: end: does have problem: " << qname << NcbiEndl;
164  return true;
165  }
166  }
168  if(PrintDetails()) NcbiCerr << "hasProblems: end: no problem: " << qname << NcbiEndl;
169  return false;
170 }
171 
172 void CReadBlastApp::reportProblemMessage(const string& message, ostream& out)
173 {
174  out << message.c_str() << NcbiEndl;
175 }
176 
178 {
179  CNcbiStrstream strres;
180  strres << "unknown_problem_type=" << type << '\0';
181  string result=strres.str();
182  if(type & eOverlap)
183  result = "Potential overlap found";
184  else if(type & eRnaOverlap)
185  result = "Potential RNA overlap found";
186  else if(type & eCompleteOverlap)
187  result = "Complete overlap found";
188  else if(type & eRemoveOverlap)
189  result = "overlap marked for removal";
190  else if(type & eRelFrameShift)
191  result = "Something relevant to frame shift found";
192  else if(type & eFrameShift)
193  result = "Potential frame shift evidence found";
194  else if(type & eMayBeNotFrameShift)
195  result = "Evidence absolving from the frame shift accusation found";
196  else if(type & ePartial)
197  result = "Potential partial protein annotation found";
198  else if(type & eShortProtein)
199  result = "Short annotation found";
200  else if(type & eTRNAMissing)
201  result = "tRNA is missing in the list of independently annotated tRNAs";
202  else if(type & eTRNAAbsent)
203  result = "RNA is missing in the list of annotated RNAs in the input";
204  else if(type & eTRNABadStrand)
205  result = "RNA is present at the wrong strand";
206  else if(type & eTRNAUndefStrand)
207  result = "RNA is present with undefined strand";
208  else if(type & eTRNAComMismatch)
209  result = "tRNA is a complete mismatch";
210  else if(type & eTRNAMismatch)
211  result = "tRNA has mismatched ends";
212 
213  return result;
214 
215 }
216 
218 {
219  out
220  << "---"
221  << " ";
222  string stype = ProblemType(type);
223  if(!stype.empty())
224  {
225  out << stype;
226  }
227  else
228  {
229  NcbiCerr << "FATAL: internal error: unknown problem type" << type << NcbiEndl;
230  throw;
231  }
232  out
233  << " "
234  << "---"
235  << NcbiEndl;
236 }
237 
238 void CReadBlastApp::reportProblemSequenceName(const string& name, ostream& out)
239 {
240  out
241  << "====="
242  << " ";
243  out << name.c_str() ;
244  out
245  << " "
246  << "====="
247  << NcbiEndl;
248 }
249 
251 {
252  TProblem_locs problem_locs;
253  if(PrintDetails()) NcbiCerr << "FixStrands: start " << NcbiEndl;
254 
255 // relevant problems
256  CollectRNAFeatures(problem_locs);
257 
258 // first, count features matching each range
259  for(CTypeIterator<CSeq_feat> feat=Begin(); feat; ++feat)
260  {
261  if (!feat->GetData().IsRna() && !feat->GetData().IsGene()) continue;
262  const CSeq_loc& loc = feat->GetLocation();
263  ENa_strand strand;
264  TSeqPos from, to;
265  getFromTo(loc, from, to, strand);
266  string range = printed_range(from, to);
267  if(PrintDetails()) NcbiCerr << "FixStrands: rna or gene [" << range << "]" << NcbiEndl;
268 
269  if(problem_locs.find(range)==problem_locs.end()) continue; // not relevant
270  if(PrintDetails()) NcbiCerr << "FixStrands: range: " << range << NcbiEndl;
271  problem_locs[range].count++;
272  if(feat->GetData().IsRna()) problem_locs[range].rnacount++;
273  if(feat->GetData().IsGene()) problem_locs[range].genecount++;
274  }
275 // now fix
276  for(CTypeIterator<CSeq_feat> feat=Begin(); feat; ++feat)
277  {
278  if (!feat->GetData().IsRna() && !feat->GetData().IsGene()) continue;
279  if(PrintDetails()) NcbiCerr << "FixStrands: rna or gene for fixing" << NcbiEndl;
280  CSeq_loc& loc = feat->SetLocation();
281  ENa_strand strand;
282  TSeqPos from, to;
283  getFromTo(loc, from, to, strand);
284  string range = printed_range(from, to);
285  if(problem_locs.find(range)==problem_locs.end()) continue; // not relevant
286  if(PrintDetails()) NcbiCerr << "FixStrands: range for fix: " << range << NcbiEndl;
287  if( problem_locs[range].count!=2
288  || problem_locs[range].rnacount!=1
289  || problem_locs[range].genecount!=1
290  )
291  {
292  // no touching
293  // warning
294  NcbiCerr << "CReadBlastApp::FixStrands: "
295  << "WARNING: "
296  << "location found, but the number of features with that location is confusing, "
297  << "no fixing for "
298  << "[" << problem_locs[range].name << "]"
299  << "(" << range << ")"
300  << NcbiEndl;
301  continue;
302  }
303 // over all intervals
304  int ninter=0;
305  for(CTypeIterator<CSeq_interval> inter = ::Begin(loc); inter; ++inter, ++ninter)
306  {
307  inter->SetStrand(problem_locs[range].strand);
308  NcbiCerr << "CReadBlastApp::FixStrands: "
309  << "[" << problem_locs[range].name << "] "
310  << "fixed"
311  << NcbiEndl;
312  }
313  NcbiCerr << "CReadBlastApp::FixStrands: ninters= " << ninter << NcbiEndl;
314  } // for(CTypeIterator<CSeq_feat>
315  if(PrintDetails()) NcbiCerr << "FixStrands: end" << NcbiEndl;
316  return 1;
317 }
318 
320 {
321  if(PrintDetails()) NcbiCerr << "RemoveProblems: start " << NcbiEndl;
322 
323  PushVerbosity();
324  if(IsSubmit())
325  {
326  if ( m_Submit.GetData().IsEntrys())
327  {
328  for(CSeq_submit::C_Data::TEntrys::iterator entry = m_Submit.SetData().SetEntrys().begin();
329  entry != m_Submit.SetData().SetEntrys().end();)
330  // NON_CONST_ITERATE(CSeq_submit::C_Data::TEntrys, entry, m_Submit.SetData().SetEntrys())
331  {
332  int removeme = RemoveProblems(**entry, problem_names, loc_map);
333  if(PrintDetails())
334  NcbiCerr
335  << "RemoveProblems(void): doing entry: removeme = "
336  << removeme
337  << NcbiEndl;
338  if(removeme)
339  {
340  NcbiCerr << "RemoveProblems(): WARNING: "
341  << "CSeq_entry deleted, loss of annotation might occur"
342  << NcbiEndl;
343  entry=m_Submit.SetData().SetEntrys().erase(entry);
344  }
345  else entry++;
346  }
347  }
348  else
349  {
350  NcbiCerr << "ERROR: submit file does not have proper seqset"<< NcbiEndl;
351  }
352  }
353  else
354  {
355  if(PrintDetails())
356  NcbiCerr
357  << "RemoveProblems(void): case is single entry "
358  << NcbiEndl;
359  RemoveProblems(m_Entry, problem_names, loc_map);
360  }
361 
362  PopVerbosity();
363  if(PrintDetails()) NcbiCerr << "RemoveProblems: end" << NcbiEndl;
364  return 1;
365 }
366 
368 {
369  int removeme=0;
370  if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_entry): start" << NcbiEndl;
371  if(entry.IsSeq())
372  {
373  removeme=RemoveProblems(entry.SetSeq(), problem_seqs, loc_map);
374  if(PrintDetails())
375  NcbiCerr
376  << "RemoveProblems(CSeq_entry)(seq case): removeme = "
377  << removeme
378  << NcbiEndl;
379 
380  }
381  else //several seqs
382  {
383  CBioseq_set& bioset = entry.SetSet();
384  removeme=RemoveProblems(bioset, problem_seqs, loc_map);
386  int size=0;
387  for (CTypeConstIterator<CBioseq> seq = ::ConstBegin(entry); seq; ++seq, size++);
388  if(PrintDetails())
389  NcbiCerr
390  << "RemoveProblems(CSeq_entry)(set case): removeme = "
391  << removeme
392  << ", entries.size = "
393  << entries.size()
394  << ", total seqs = "
395  << size
396  << NcbiEndl;
397  if (size<=1) NormalizeSeqentry(entry);
398  } // entry.IsSet()
399  if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_entry): end" << NcbiEndl;
400  return removeme;
401 }
402 
404 {
405  if(!entry.IsSet()) return;
406  CBioseq_set& bioset = entry.SetSet();
408  if(entries.size()!=1) return;
409 // 1. merge descriptions
410  CBioseq& seq = (*entries.begin())->SetSeq();
411  if(bioset.IsSetDescr())
412  {
413  CSeq_descr::Tdata& descs = bioset.SetDescr().Set();
414  for(CSeq_descr::Tdata::iterator desc = descs.begin(); desc!=descs.end(); )
415  {
416  seq.SetDescr().Set().push_back(*desc);
417  desc=descs.erase(desc);
418  }
419  } // if(entry.SetSet().IsSetDescr())
420 // 2. move CBioseq under the CSeq_entr
421  CRef<CBioseq> pseq (&seq);
422  entry.SetSeq(*pseq);
423  NcbiCerr << "NormalizeSeqentry(CSeq_entry...): "
424  << "WARNING: "
425  << "converted sequence set to sequence"
426  << NcbiEndl;
427  return;
428 }
429 
431 {
432  bool noseqs=false;
433  bool noannot=false;
434  int removeme=0;
435  if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq_set): start" << NcbiEndl;
436  if(setseq.IsSetSeq_set())
437  {
438  int all_entries_removed = RemoveProblems(setseq.SetSeq_set(), problem_seqs, loc_map);
439  if(all_entries_removed > 0) {/* mandatory, no deletion */; noseqs=true;}
440  }
441  if(setseq.IsSetAnnot())
442  {
443  int all_annot_removed = RemoveProblems(setseq.SetAnnot(), problem_seqs, loc_map);
444  if(all_annot_removed > 0) {setseq.ResetAnnot(); noannot=true;}
445  }
446  if(noseqs ) removeme = 1;
447  if(PrintDetails())
448  NcbiCerr
449  << "RemoveProblems(CBioseq_set): noseqs = "
450  << noseqs
451  << ", noannot = "
452  << noannot
453  << ", removeme (return) = "
454  << removeme
455  << NcbiEndl;
456 
457  return removeme;
458 }
459 
461 {
462  int remove=0;
463  if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq): start" << NcbiEndl;
464  if(!seq.IsAa()) // nucleotide sequnce
465  {
466  if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq): nuc" << NcbiEndl;
467  if(seq.IsSetAnnot())
468  {
469  int annotations_removed = RemoveProblems(seq.SetAnnot(), problem_seqs, loc_map);
470  if(annotations_removed) seq.ResetAnnot();
471  }
472  }
473  else // aminoacid sequence
474 // checkif needed to kill it
475  {
476  string thisName = GetStringDescr(seq);
477  string origName = thisName;
478  string::size_type ipos = thisName.rfind('|'); if(ipos!=string::npos) thisName.erase(0, ipos+1);
479  ipos = thisName.rfind('_'); if(ipos!=string::npos) ipos= thisName.rfind('_', ipos-1);
480  if(PrintDetails())
481  NcbiCerr
482  << "RemoveProblems(CBioseq): remove? sequence "
483  << "[" << origName << "]"
484  << " looking for "
485  << "[" << thisName << "]"
486  << NcbiEndl;
487 
488  if(problem_seqs.find(thisName) != problem_seqs.end())
489  {
490  NcbiCerr
491  << "RemoveProblems(CBioseq): sequence "
492  << "[" << origName << "]"
493  << " is marked for removal, because of a match to "
494  << "[" << thisName << "]"
495  << NcbiEndl;
496  remove=1; // whack the sequence
497  }
498  }
499  if(PrintDetails())
500  NcbiCerr
501  << "RemoveProblems(CBioseq): remove = "
502  << remove
503  << NcbiEndl;
504 
505 
506  return remove;
507 }
508 
509 
511 {
513  int remove=0;
514  for(CBioseq_set::TSeq_set::iterator entries_end = entries.end(), entry=entries.begin(); entry != entries_end; )
515  {
516  int removeseq=RemoveProblems(**entry, problem_seqs, loc_map);
517  if(PrintDetails())
518  NcbiCerr
519  << "RemoveProblems(CBioseq_set::TSeq_set): removeseq = "
520  << removeseq
521  << NcbiEndl;
522 
523  if(removeseq) entry=entries.erase(entry);
524  else entry++;
525  } // each seqs
526  if(entries.size()==0) remove=1;
527  if(PrintDetails())
528  NcbiCerr
529  << "RemoveProblems(CBioseq_set::TSeq_set): nentries = "
530  << entries.size()
531  << NcbiEndl;
532 
534  return remove;
535 }
536 
538 {
539  int remove=0;
540  if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq::TAnnot): start" << NcbiEndl;
541  for(CBioseq::TAnnot::iterator annot=annots.begin(); annot!=annots.end(); )
542  {
543  int removeme=0;
544  if( (*annot)->GetData().IsFtable()) removeme=RemoveProblems((*annot)->SetData().SetFtable(), problem_seqs, loc_map);
545  if(removeme)
546  {
547  NcbiCerr << "RemoveProblems(annots, problem_seqs): "
548  << "INFO: "
549  << "annotation has empty feature table and it will be removed"
550  << NcbiEndl;
551  annot=annots.erase(annot);
552  }
553  else annot++;
554  }
555  if(annots.size()==0) remove=1;
556  if(PrintDetails()) NcbiCerr << "RemoveProblems(CBioseq::TAnnot): end" << NcbiEndl;
557 
558  return remove;
559 }
560 
562 // this one needs cleaning
563 {
564  int removeme=0;
565  if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): start" << NcbiEndl;
566  GetLocMap(loc_map, table);
567  for(CSeq_annot::C_Data::TFtable::iterator feat_end = table.end(), feat = table.begin(); feat != feat_end;)
568  {
569  if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat 000" << NcbiEndl;
570  bool gene, cdregion;
571  gene = (*feat)->GetData().IsGene();
572  cdregion = (*feat)->GetData().IsCdregion();
573  bool del_feature=false;
574  if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat I" << NcbiEndl;
575  string real_loc_string = GetLocationString((*feat)->GetLocation());
576  if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat II" << NcbiEndl;
577 
578  string loc_string = GetLocusTag(**feat, loc_map); // more general, returns location string
579  if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): feat: (" << real_loc_string << ")(" << loc_string << ")" << NcbiEndl;
580 //
581 // case *: matching locus tag
582 //
583  if(problem_seqs.find(loc_string) != problem_seqs.end())
584  {
585  if((*feat)->GetData().IsImp() &&
586  (*feat)->GetData().GetImp().CanGetKey())
587  {
588  NcbiCerr << "RemoveProblems: INFO: feature " << loc_string << ": imp, key = " << (*feat)->GetData().GetImp().GetKey() << NcbiEndl;
589  }
590  if((*feat)->GetData().IsImp() &&
591  (*feat)->CanGetComment() )
592  {
593  NcbiCerr << "RemoveProblems: INFO: feature " << loc_string << ": imp, comment = " << (*feat)->GetComment() << NcbiEndl;
594  }
595 /*
596  if((*feat)->GetData().IsImp() &&
597  (*feat)->GetData().GetImp().CanGetKey() &&
598  (*feat)->GetData().GetImp().GetKey() == "misc_feature" &&
599  (*feat)->CanGetComment() &&
600  (*feat)->GetComment().find("potential frameshift") != string::npos
601  ) del_feature = false; // this is a new feature, that we are not supposed to delete
602 */
603  if((*feat)->GetData().IsImp() &&
604  (*feat)->GetData().GetImp().CanGetKey() &&
605  (*feat)->GetData().GetImp().GetKey() == "misc_feature"
606  ) del_feature = false; // this is a new feature, we do not delete them
607  else del_feature=true;
608  }
609 
610  if ( PrintDetails() )
611  {
612  NcbiCerr << "RemoveProblems: feature " << loc_string << ": ";
613  if(del_feature) NcbiCerr << "WILL BE REMOVED";
614  else NcbiCerr << "stays until further analysis for it";
615  NcbiCerr << NcbiEndl;
616  }
617  if(del_feature)
618  {
619  NcbiCerr << "RemoveProblems: WARNING: feature "
620  << "{" << (*feat)->GetData().SelectionName((*feat)->GetData().Which()) << "} "
621  << loc_string << ": ";
622  NcbiCerr << "will be removed because of a problem: ";
623  NcbiCerr << problem_seqs.find(loc_string)->second;
624  NcbiCerr << NcbiEndl;
625  }
626  if(!del_feature && gene && (*feat)->GetData().GetGene().CanGetLocus_tag() )
627 //
628 // case *: gene
629 //
630  {
631  string locus_tag = (*feat)->GetData().GetGene().GetLocus_tag();
632  if(problem_seqs.find(locus_tag) != problem_seqs.end()) del_feature=true;
633  if ( PrintDetails() )
634  {
635  NcbiCerr << "RemoveProblems: gene " << locus_tag << ": ";
636  if(del_feature)
637  NcbiCerr << "WILL BE REMOVED";
638  else
639  NcbiCerr << "stays";
640  NcbiCerr << NcbiEndl;
641  }
642  if(del_feature)
643  {
644  NcbiCerr << "RemoveProblems: WARNING: gene " << locus_tag << ": ";
645  NcbiCerr << "will be removed because of a problem: ";
646  NcbiCerr << problem_seqs.find(locus_tag)->second;
647  NcbiCerr << NcbiEndl;
648  }
649  }
650  if(!del_feature && cdregion && (*feat)->CanGetProduct() )
651  {
652 //
653 // case *: cdregion
654 //
655  string productName;
656  if( (*feat)->CanGetProduct() &&
657  (*feat)->GetProduct().IsWhole() &&
658  (*feat)->GetProduct().GetWhole().IsGeneral() &&
659  (*feat)->GetProduct().GetWhole().GetGeneral().CanGetTag() &&
660  (*feat)->GetProduct().GetWhole().GetGeneral().GetTag().IsStr() )
661  {
662  productName = (*feat)->GetProduct().GetWhole().GetGeneral().GetTag().GetStr();
663  }
664  else if (
665  (*feat)->CanGetProduct() &&
666  (*feat)->GetProduct().IsWhole())
667  {
668  productName = (*feat)->GetProduct().GetWhole().AsFastaString();
669  }
670 // strip leading contig ID if any
671  string::size_type ipos=productName.rfind('_', productName.size());
672  if(ipos != string::npos)
673  {
674  string::size_type ipos2;
675  ipos2=productName.rfind('_', ipos-1);
676  if(ipos2 != string::npos) productName.erase(0, ipos2+1);
677  // "1103032000567_RAZWK3B_00550" -> "RAZWK3B_00550";
678  else
679  {
680  ipos2=productName.rfind('|', ipos-1);
681  if(ipos2 != string::npos) productName.erase(0, ipos2+1);
682  }
683 // lcl|Xoryp_00025 -> Xoryp_00025
684  }
685  if(productName.length() && problem_seqs.find(productName) != problem_seqs.end()) del_feature=true;
686  if ( PrintDetails() )
687  {
688  NcbiCerr << "RemoveProblems: cdregion " << productName << ": ";
689  if(del_feature)
690  NcbiCerr << "WILL BE REMOVED";
691  else
692  NcbiCerr << "stays";
693  NcbiCerr << NcbiEndl;
694  }
695  }
696  if(del_feature)
697  {
698  if(problem_seqs.find(real_loc_string) == problem_seqs.end())
699  {
700  problem_seqs[real_loc_string]=problem_seqs[loc_string]; // saving for later
701  }
702  }
703  if(del_feature) feat=table.erase(feat);
704  else feat++;
705  }
706  if(table.size()==0) removeme=1;
707  if(PrintDetails()) NcbiCerr << "RemoveProblems(CSeq_annot::C_Data::TFtable): end" << NcbiEndl;
708  return removeme;
709 }
710 
711 // RemoveInterim
712 
714 {
715 
716  int nremoved=0;
717 
718  if(PrintDetails()) NcbiCerr << "RemoveInterim: start " << NcbiEndl;
719  PushVerbosity();
720  for(CTypeIterator<CBioseq> seq=Begin(); seq; ++seq)
721  {
722  if(seq->IsSetAnnot() && seq->IsAa()) nremoved+= RemoveInterim(seq->SetAnnot());
723  if(seq->IsSetAnnot() && seq->IsNa()) nremoved+= RemoveInterim2(seq->SetAnnot());
724  }
725 
726  PopVerbosity();
727  if(PrintDetails()) NcbiCerr << "RemoveInterim: end" << NcbiEndl;
728  return nremoved;
729 }
730 // protein sequence annotations
732 {
733  int nremoved=0;
734  if(PrintDetails()) NcbiCerr << "RemoveInterim(annots): start " << NcbiEndl;
735 
736  for(CBioseq::TAnnot::iterator annot=annots.begin(), annot_end = annots.end(); annot != annot_end; )
737  {
738  bool erased = false;
739  if((*annot)->GetData().IsAlign())
740  {
741  nremoved++; erased = true;
742  }
743  if ( (*annot)->GetData().IsFtable())
744  {
745  int dremoved=0;
746  CSeq_annot::C_Data::TFtable& table = (*annot)->SetData().SetFtable();
747  for(CSeq_annot::C_Data::TFtable::iterator feat=table.begin(), feat_end= table.end(); feat != feat_end; )
748  {
749  string test = "Genomic Location:";
750  if ((*feat)->IsSetData() && (*feat)->GetData().IsProt() &&
751  (*feat)->IsSetComment() && (*feat)->GetComment().substr(0, test.size()) == test )
752  {
753  table.erase(feat++); dremoved++;
754  nremoved++;
755  }
756  else
757  {
758  feat++;
759  }
760  }
761 
762 /*
763  this is really crappy way of doing it!
764 */
765 /*
766  while( (*annot)->GetData().GetFtable().size()>1 )
767  {
768  (*annot)->SetData().SetFtable().pop_back();
769  nremoved++;
770  dremoved++;
771  }
772 */
773  if(PrintDetails()) NcbiCerr << "RemoveInterim(CBioseq::TAnnot& annots): dremoved = "
774  << dremoved
775  << ", left=" << (*annot)->GetData().GetFtable().size()
776  << NcbiEndl;
777  if((*annot)->SetData().SetFtable().size() == 0)
778  {
779  nremoved++;
780  erased = true;
781  }
782  }
783  if(erased) annot=annots.erase(annot);
784  else annot++;
785  }
786 
787  if(PrintDetails()) NcbiCerr << "RemoveInterim(annots): end" << NcbiEndl;
788  return nremoved;
789 }
790 
791 // nucleotide sequence annotations
793 {
794  int nremoved=0;
795  if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): start " << NcbiEndl;
796 
797  NON_CONST_ITERATE(CBioseq::TAnnot, gen_feature, annots)
798  {
799  if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature start" << NcbiEndl;
800  if ( !(*gen_feature)->GetData().IsFtable() ) continue;
801  if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature is ftable" << NcbiEndl;
802  map<string,bool> feat_defined;
803  CSeq_annot::C_Data::TFtable& table = (*gen_feature)->SetData().SetFtable();
804  for(CSeq_annot::C_Data::TFtable::iterator feat_end = table.end(), feat = table.begin(); feat != feat_end;)
805  {
806  if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat start" << NcbiEndl;
807  const CSeq_feat& featr = **feat;
808  CNcbiStrstream buff;
809  buff << MSerial_AsnText << featr;
810  buff << '\0';
811  if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): feat ASN:" << NcbiEndl;
812  if(PrintDetails()) NcbiCerr << buff.str() << NcbiEndl;
813  if(feat_defined.find(buff.str()) != feat_defined.end()) // dupe
814  {
815  if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat dupe: erase" << NcbiEndl;
816  feat=table.erase(feat);
817  }
818  else
819  {
820  if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat new: add to feat_defined map" << NcbiEndl;
821  feat_defined[buff.str()]=true;
822  if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat new: add to feat_defined map done" << NcbiEndl;
823  // feat_defined[featr]=true;
824  feat++;
825  }
826  if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature feat end" << NcbiEndl;
827  }
828 
829  if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): gen_feature end" << NcbiEndl;
830  }
831 
832  if(PrintDetails()) NcbiCerr << "RemoveInterim2(annots): end" << NcbiEndl;
833  return nremoved;
834 }
835 
836 
837 
838 
839 string diagName(const string& type, const string& value)
840 {
841  return type + "|" + value;
842 }
843 
844 int addProblems(list<problemStr>& dest, const list<problemStr>& src)
845 {
846  int n=0;
847  ITERATE(list<problemStr>, src_p, src)
848  {
849  dest.push_back(*src_p);
850  n++;
851  }
852  return n;
853 }
854 
856 {
857  ITERATE( diagMap, feat, m_diag)
858  {
859  ITERATE(list<problemStr>, problem, feat->second.problems)
860  {
861  bool added = false;
862  string name = feat->first;
863  string::size_type ipos = name.rfind('|'); if(ipos!=string::npos) name.erase(0, ipos+1);
864  ipos = name.rfind('_'); if(ipos!=string::npos) ipos= name.rfind('_', ipos-1);
865  if(ipos!=string::npos) name.erase(0, ipos+1);
866  string range = printed_range(problem->i1, problem->i2);
867  if(
868  (problem->type & eTRNABadStrand || problem->type & eTRNAUndefStrand) // irrelevant problem
869  && !problem->misc_feat_message.empty()
870  )
871  {
872  problem_locs[range].strand = problem->strand;
873  problem_locs[range].name = name;
874  problem_locs[range].count =
875  problem_locs[range].rnacount =
876  problem_locs[range].genecount = 0;
877  added=true;
878  }
879  if(PrintDetails())
880  NcbiCerr << "CReadBlastApp::CollectRNAFeatures: " << feat->first
881  << "[" << range << "]: "
882  << "(" << name << ")"
883  << (added ? "added" : "skipped") << NcbiEndl;
884  }
885  }
886  return problem_locs.size();
887 
888 }
889 
891 {
892  const CArgs& args = GetArgs();
893  bool keep_frameshifted = args["kfs"].HasValue();
894  ITERATE( diagMap, feat, m_diag)
895  {
896  ITERATE(list<problemStr>, problem, feat->second.problems)
897  {
898  bool added = false;
899  string name = feat->first;
900  string::size_type ipos = name.rfind('|'); if(ipos!=string::npos) name.erase(0, ipos+1);
901  ipos = name.rfind('_'); if(ipos!=string::npos) ipos= name.rfind('_', ipos-1);
902  if(ipos!=string::npos) name.erase(0, ipos+1);
903  if(
904  (problem->type == eFrameShift && !keep_frameshifted)
905  ||
906  problem->type == eRemoveOverlap
907  ||
908  problem->type & eShortProtein
909  )
910  { problem_names[name]=ProblemType(problem->type); added=true; }
911  if(PrintDetails())
912  NcbiCerr << "CollectFrameshiftedSeqs: " << feat->first
913  << ": "
914  << "(" << name << ")"
915  << (added ? "added" : "skipped") << NcbiEndl;
916  }
917  }
918  return problem_names.size();
919 }
920 
921 void CReadBlastApp::append_misc_feature(CBioseq_set::TSeq_set& seqs, const string& name, EProblem problem_type)
922 {
923  if(m_diag.find(name)==m_diag.end())
924  {
925  // should not happen
926  NcbiCerr << "append_misc_feature: FATAL: do not have problems for " << name << NcbiEndl;
927  throw;
928  return;
929  }
930 
932  {
933  if( is_prot_entry((*na)->GetSeq()) ) continue;
934  list<CRef<CSeq_id> >& na_id = (*na)->SetSeq().SetId();
935  NON_CONST_ITERATE(CBioseq::TAnnot, gen_feature, (*na)->SetSeq().SetAnnot())
936  {
937  if ( !(*gen_feature)->GetData().IsFtable() ) continue;
938  typedef map<string, bool> Tmessage_misced;
939  typedef map<EProblem, Tmessage_misced> Tproblem_misced;
940  Tproblem_misced problem_misced; // list of problems fixed
941  ITERATE(list<problemStr>, problem, m_diag[name].problems)
942  {
943  if ( !(problem->type & problem_type) ) continue;
944  int from=-1, to=-1;
945 // string lt1, lt2;
946  ENa_strand strand;
947  string message="";
948  from = problem->i1;
949  if(from<0) continue;
950 // add new feature
951  to = problem->i2;
952 /*
953  lt1 = problem->id1;
954  lt2 = problem->id2;
955 */
956  strand = problem->strand;
957  message = problem->misc_feat_message;
958  if(message.size()==0) continue; // do not print empty misc_feat, they are empty for a reason
959  if(problem_misced.find(problem->type) != problem_misced.end() &&
960  problem_misced[problem->type].find(message) != problem_misced[problem->type].end()
961  ) continue;
962  else problem_misced[problem->type][message] = true;
963  SIZE_TYPE pos;
964  while((pos=message.find_first_of("\n\r"))!=string::npos)
965  {
966  message[pos]=' ';
967  }
968  CRef<CSeq_feat> feat(new CSeq_feat());
969 
970  feat->SetComment(message);
971  feat->SetData().SetImp().SetKey("misc_feature");
972  feat->SetLocation().SetInt().SetFrom(from);
973  feat->SetLocation().SetInt().SetTo(to);
974  feat->SetLocation().SetInt().SetId(**na_id.begin());
975  feat->SetLocation().SetInt().SetStrand(strand);
976  (*gen_feature)->SetData().SetFtable().push_back(feat);
977  }
978 
979  }
980  break;
981  }
982 
983  return;
984 }
985 
986 
987 
988 
CArgs –.
Definition: ncbiargs.hpp:379
bool IsAa(void) const
Definition: Bioseq.cpp:350
int CollectFrameshiftedSeqs(map< string, string > &problem_names)
Definition: problems.cpp:890
int CollectRNAFeatures(TProblem_locs &problem_locs)
Definition: problems.cpp:855
static bool hasProblems(const CBioseq &seq, diagMap &diag, const EProblem type)
Definition: problems.cpp:141
static bool PrintDetails(int current_verbosity=m_current_verbosity)
CConstBeginInfo ConstBegin(void)
void erase_problems(const string &qname, diagMap &diag, const EProblem type)
Definition: problems.cpp:79
void GetLocMap(LocMap &loc_map, const CSeq_annot::C_Data::TFtable &feats)
Definition: locations.cpp:203
int RemoveInterim2(CBioseq::TAnnot &annots)
Definition: problems.cpp:792
static void IncreaseVerbosity(void)
void reportProblemMessage(const string &message, ostream &out=NcbiCout)
Definition: problems.cpp:172
void reportProblemType(const EProblem type, ostream &out=NcbiCout)
Definition: problems.cpp:217
static string ProblemType(const EProblem type)
Definition: problems.cpp:177
int FixStrands(void)
Definition: problems.cpp:250
void reportProblemSequenceName(const string &name, ostream &out=NcbiCout)
Definition: problems.cpp:238
int RemoveProblems(map< string, string > &problem_seqs, LocMap &loc_map)
Definition: problems.cpp:319
static bool is_prot_entry(const CBioseq &seq)
static void DecreaseVerbosity(void)
static void PopVerbosity(void)
static void getFromTo(const CSeq_loc &loc, TSeqPos &from, TSeqPos &to, ENa_strand &strand)
Definition: locations.cpp:34
void reportProblems(const bool report_and_forget, diagMap &diag, ostream &out, const CBioseq::TAnnot &annots, const EProblem type)
Definition: problems.cpp:36
int RemoveInterim(void)
Definition: problems.cpp:713
CBeginInfo Begin(void)
void NormalizeSeqentry(CSeq_entry &entry)
Definition: problems.cpp:403
void append_misc_feature(CBioseq_set::TSeq_set &seqs, const string &name, EProblem problem_type)
Definition: problems.cpp:921
static void PushVerbosity(void)
CSeq_submit m_Submit
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Template class for iteration on objects of class C (non-medifiable version)
Definition: iterator.hpp:767
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
void erase(iterator pos)
Definition: map.hpp:167
size_type size() const
Definition: map.hpp:148
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
std::ofstream out("events_result.xml")
main entry point for tests
#define test(a, b, c, d, e)
Definition: numeric.c:170
static void DLIST_NAME() remove(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:90
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
static string GetStringDescr(const CBioseq &bioseq, EStringFormat fmt)
Definition: Seq_id.cpp:2292
@ eFormat_FastA
Definition: Seq_id.hpp:662
@ eOverlap
CSeq_locs overlap.
#define NcbiEndl
Definition: ncbistre.hpp:548
#define NcbiCerr
Definition: ncbistre.hpp:544
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetComment(const TComment &value)
Assign a value to Comment data member.
Definition: Seq_feat_.hpp:1058
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
void ResetAnnot(void)
Reset Annot data member.
bool IsSetSeq_set(void) const
Check if a value has been assigned to Seq_set data member.
bool IsSetDescr(void) const
Check if a value has been assigned to Descr data member.
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
void SetDescr(TDescr &value)
Assign a value to Descr data member.
list< CRef< CSeq_entry > > TSeq_set
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
void ResetAnnot(void)
Reset Annot data member.
Definition: Bioseq_.cpp:91
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
const TData & GetData(void) const
Get the Data member data.
void SetData(TData &value)
Assign a value to Data data member.
bool IsEntrys(void) const
Check if variant Entrys is selected.
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
int i
yy_size_t n
string GetLocationString(const CSeq_feat &f)
Definition: locations.cpp:228
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::SIZE size
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
int addProblems(list< problemStr > &dest, const list< problemStr > &src)
Definition: problems.cpp:844
string diagName(const string &type, const string &value)
Definition: problems.cpp:839
@ eMayBeNotFrameShift
@ eTRNAComMismatch
@ eTRNABadStrand
@ eShortProtein
@ eTRNAMismatch
@ eAllProblems
@ eTRNAUndefStrand
@ eRemoveOverlap
@ eRnaOverlap
@ eTRNAAbsent
@ eFrameShift
@ eCompleteOverlap
@ eTRNAMissing
@ eRelFrameShift
string GetStringDescr(const CBioseq &bioseq)
Definition: shortcuts.cpp:309
string GetLocusTag(const CSeq_feat &f, const LocMap &loc_map)
string printed_range(const TSeqPos from2, const TSeqPos to2)
Definition: shortcuts.cpp:320
Definition: type.c:6
done
Definition: token1.c:1
else result
Definition: token2.c:20
static wxAcceleratorEntry entries[3]
Modified on Wed Jun 12 11:12:15 2024 by modify_doxy.py rev. 669887