NCBI C++ ToolKit
agp_validate_reader.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef AGP_VALIDATE_READER
2 #define AGP_VALIDATE_READER
3 
4 /* $Id: agp_validate_reader.hpp 100352 2023-07-23 17:37:28Z stakhovv $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors:
30  * Victor Sapojnikov
31  *
32  * File Description:
33  * Global (whole-file) AGP validation and statistics.
34  *
35  */
36 
37 #include <corelib/ncbistd.hpp>
38 #include <iostream>
40 #include <util/range_coll.hpp>
41 
42 #include <set>
43 
45 
46 // Map of string->int populated by sequence ids->lengths from a FASTA file.
49 {
50 public:
51  // may be less than size() because we add some names twice, e.g.: lcl|id1 and id1
52  int m_count;
53 
54  typedef pair<TMapStrInt::iterator, bool> TMapStrIntResult;
55  // returns 0 on success, or a previous length not equal to the new one
56  TAgpLen AddCompLen(const string& acc, TAgpLen len, bool increment_count=true);
58  {
59  m_count=0;
60  }
61 
62 };
63 
64 
67 
68  // Determines accession naming patterns, counts accessions.
69 class CAccPatternCounter;
70 
71 // Count how many times a atring value appears;
72 // report values and counts ordered by count.
73 class NCBI_XOBJREAD_EXPORT CValuesCount : public map<string, int>
74 {
75 public:
76  void add(const string& c);
77 
78  // >pointer to >value_type vector for sorting
79  typedef vector<value_type*> TValPtrVec;
80  void GetSortedValues(TValPtrVec& out);
81 
82 private:
83  // For sorting by value count
84  static int x_byCount( value_type* a, value_type* b );
85 };
86 
88 {
89 public:
90  TAgpPos beg, end;
91  char ori;
92  int file_num, line_num;
93 
94  enum { ORI_plus, ORI_minus, ORI_zero, ORI_na, ORI_count };
95 
96  void init(CAgpRow& row, int line_num_arg)
97  {
98  beg=row.component_beg;
99  end=row.component_end;
100  ori=row.orientation;
101 
102  line_num=line_num_arg;
103  file_num = ((CAgpErrEx*)(row.GetErrorHandler()))->GetFileNum();
104  }
105  TAgpLen getLen() const { return end - beg + 1; }
106 
107  string ToString(CAgpErrEx* agpErrEx) const
108  {
109  string s;
110  s += NStr::IntToString(beg);
111  s += "..";
112  s += NStr::IntToString(end);
113  s += " at ";
114  if(file_num) {
115  s += agpErrEx->GetFile(file_num);
116  s += ":";
117  }
118  else {
119  s += "line ";
120  }
122  return s;
123  }
124 };
125 
126 // To save memory, this is a vector instead of a map.
127 // Multiple spans on one component are uncommon.
128 class NCBI_XOBJREAD_EXPORT CCompSpans : public vector<CCompVal>
129 {
130 public:
131  // Construct a vector with one element
132  CCompSpans(const CCompVal& src)
133  {
134  push_back(src);
135  }
136 
137  // Returns the first overlapping span and CAgpErr::W_SpansOverlap,
138  // or the first span out of order and CAgpErr::W_SpansOrder,
139  // or begin() and CAgpErr::W_DuplicateComp.
140  // The caller can ignore the last 2 warnings for draft seqs.
141  typedef pair<iterator, TAgpPos> TCheckSpan;
142  TCheckSpan CheckSpan(TAgpPos span_beg, TAgpPos span_end, bool isPlus);
143  void AddSpan(const CCompVal& span); // CCompSpans::iterator it,
144 
145 };
146 
148 {
149 public:
150  virtual void SaveRow(const string& s, CRef<CAgpRow> row, TRangeColl* runs_of_Ns)=0;
151  virtual ~IAgpRowOutput() {}
152 };
153 
154 class XPrintTotalsItem;
155 class CAgpValidateReader;
157 {
158 public:
159 
160  // false: called from constructor
161  // true : after finishing with scaf-from-ctg files, before starting with chr-from-scaf
162  void Reset(bool for_chr_from_scaf=false);
163 
164  CAgpValidateReader(CAgpErrEx& agpErr, CMapCompLen& comp2len, TMapStrRangeColl& comp2range_coll); // , bool checkCompNames=false);
165  virtual ~CAgpValidateReader();
166  void PrintTotals(CNcbiOstream& out=cout, bool use_xml=false);
167 
169  bool m_CheckObjLen; // false: check component lengths
170  bool m_unplaced; // check that singleton components are in '+' orientation
171 
172  bool m_is_chr, m_explicit_scaf;
173  // false false - no extra checks
174  // true false - check telomere/centromere/short_arm counts [-cc; not implemented]
175  // false true - no breaking gaps allowed
176  // true true - no within-scaffold gaps allowed
177  // to do: E_UnusedScaf
178 
179  void SetRowOutput(IAgpRowOutput* row_output);
180 
182 protected:
183  void x_PrintTotals(CNcbiOstream& out=cout, bool use_xml=false); // without comment counts or ids not in AGP
184  //void x_PrintIdsNotInAgp(CNcbiOstream& out=cout, bool use_xml=false);
185 
186  // true: a suspicious mix of ids - some look like GenBank accessions, some do not.
187  static bool x_PrintPatterns(CAccPatternCounter& namePatterns, const string& strHeader, int fasta_count, const char* count_label=nullptr, CNcbiOstream& out=cout, bool use_xml=false);
188 
192 
193  // Callbacks from CAgpReader
194  virtual void OnScaffoldEnd();
195  virtual void OnObjectChange();
196  virtual void OnGapOrComponent();
197  virtual bool OnError();
198  virtual void OnComment();
199 
200  // for W_ObjOrderNotNumerical (JIRA: GP-773)
201  string m_obj_id_pattern; // object_id with each run of conseq digits replaced with '#'
202  // >0 number of sorted literally so far - 1 (for the same current m_obj_id_pattern)
203  int m_obj_id_sorted; // 0 not established yet <0 sort order violated
206 
207  CMapCompLen* m_comp2len; // for optional check of component lengths (or maybe object lengths)
208  CMapCompLen m_scaf2len; // for: -scaf Scaf_AGP_file(s) -chr Chr_AGP_file(s)
213 
218  //bool m_prev_orientation_unknown;
219  char m_prev_orientation; // 0 when we need not warn about it (not in singleton, etc)
220  TAgpPos m_prev_component_beg, m_prev_component_end;
221 
222  string m_prev_component_id; // for W_BreakingGapSameCompId: only set when encountering a breaking gap
223 
231  // add m_NoCompObjects?
232 
235  int m_CompOri[4];
236 
237  //int m_GapTypeCnt[CGapVal::GAP_count+CGapVal::GAP_yes_count];
239  // Count component types and N/U gap types.
240  CValuesCount m_TypeCompCnt; // column 5: A, D, F, ..., N, U
241 
242  // keep track of the object ids to detect duplicates.
244  typedef pair<TObjSet::iterator, bool> TObjSetResult;
246 
248 
249  // keep track of the component and object ids used
250  // in the AGP. Used to detect duplicates and
251  // duplicates with seq range intersections.
253  typedef pair<string, CCompSpans> TCompIdSpansPair;
255 
256  typedef pair<TAgpPos,int> TPairIntInt;
258 
259  typedef pair<TMapIntInt::iterator, bool> TMapIntIntResult;
262 
264 
265  // returns: first=plain text string for the end of line, second= attributes for XML tag (cnt, pct, mf_len)
266  // uses m_NgapByType_ln2count[]
267  void x_GetMostFreqGapsText(int gap_type, string& eol_text, string& attrs);
268 
269  void x_PrintGapCountsLine(XPrintTotalsItem& xprint, int gap_type, const string& label=NcbiEmptyString);
270 
271  // an optional callback object
273 
277 
280  bool m_has_partial_comp, m_has_comp_of_unknown_len;
281 
282  // Former CAgpValidateReader::x_PrintIdsNotInAgp(), split into member functions for the purpose of
283  // running CheckIds() earlier, to be able to add this error to the total error/warning counts.
285  {
289  int m_cnt;
290  public:
291  CIdsNotInAgp(CAgpValidateReader& reader) : m_reader(reader)
292  {
293  m_cnt=0;
294  }
295 
296  // returns: an error/warning message, to be later passed to Print() or PrintXml();
297  // "" if no error.
298  string CheckIds();
299 
300  void Print(CNcbiOstream& out, const string& msg);
301  void PrintXml(CNcbiOstream& out, const string& msg);
302  };
303  friend class CIdsNotInAgp;
304 };
305 
307 
308 #endif /* AGP_VALIDATE_READER */
309 
TSeqPos TAgpLen
Definition: agp_util.hpp:65
TSeqPos TAgpPos
Definition: agp_util.hpp:62
map< string, TAgpLen > TMapStrInt
CRangeCollection< TSeqPos > TRangeColl
map< string, TRangeColl > TMapStrRangeColl
Accession naming patterns; find ranges for consecutive digits.
Definition: agp_util.hpp:877
vector< double > TDoubleVec
Definition: agp_util.hpp:879
Correctly print multiple errors and warnings on consecutive lines; suppress undesired or highly repet...
Definition: agp_util.hpp:650
const string & GetFile(int num)
Definition: agp_util.hpp:846
Detects scaffolds, object boundaries, errors that involve 2 consecutive lines, and is intended as a s...
Definition: agp_util.hpp:327
A container for both the original string column values (Get*() methods) and the values converted to i...
Definition: agp_util.hpp:72
@ eGapYes_count
Definition: agp_util.hpp:190
@ eGapCount
Definition: agp_util.hpp:189
CIdsNotInAgp(CAgpValidateReader &reader)
CAccPatternCounter::TDoubleVec * m_prev_id_digits
pair< TObjSet::iterator, bool > TObjSetResult
pair< string, CCompSpans > TCompIdSpansPair
pair< TAgpPos, int > TPairIntInt
IAgpRowOutput * m_row_output
pair< TMapIntInt::iterator, bool > TMapIntIntResult
map< int, int > TMapIntInt
map< string, CCompSpans > TCompId2Spans
CAccPatternCounter m_objNamePatterns
TMapStrRangeColl * m_comp2range_coll
CAccPatternCounter::TDoubleVec * m_obj_id_digits
pair< iterator, TAgpPos > TCheckSpan
CCompSpans(const CCompVal &src)
TAgpLen getLen() const
void init(CAgpRow &row, int line_num_arg)
string ToString(CAgpErrEx *agpErrEx) const
pair< TMapStrInt::iterator, bool > TMapStrIntResult
vector< value_type * > TValPtrVec
virtual void SaveRow(const string &s, CRef< CAgpRow > row, TRangeColl *runs_of_Ns)=0
Output the count as text or as xml.
Definition: map.hpp:338
void Print(const CCompactSAMApplication::AlignInfo &ai)
Include a standard set of the NCBI C++ Toolkit most basic headers.
std::ofstream out("events_result.xml")
main entry point for tests
static unsigned int line_num
Definition: attributes.c:11
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5086
#define NcbiEmptyString
Definition: ncbistr.hpp:122
#define NCBI_XOBJREAD_EXPORT
Definition: ncbi_export.h:1315
static const char label[]
int len
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
unsigned int a
Definition: ncbi_localip.c:102
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
#define row(bind, expected)
Definition: string_bind.c:73
Modified on Wed Sep 04 14:58:48 2024 by modify_doxy.py rev. 669887