NCBI C++ ToolKit
text_output.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: text_output.cpp 100293 2023-07-17 20:59:36Z kans $
2  * =========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * =========================================================================
25  *
26  * Authors: please don't mention my name here
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 #include "discrepancy_core.hpp"
32 
36 
37 static constexpr std::initializer_list<eTestNames> g_ReportOrder0 = {
38  eTestNames::COUNT_NUCLEOTIDES,
39  eTestNames::VERY_LONG_NO_ANNOTATION,
40  eTestNames::LONG_NO_ANNOTATION,
41  eTestNames::NO_ANNOTATION,
42  };
43 
44 static constexpr std::initializer_list<eTestNames> g_ReportOrder1 = {
45  eTestNames::SOURCE_QUALS,
46  eTestNames::DUP_SRC_QUAL,
47  eTestNames::MAP_CHROMOSOME_CONFLICT,
48  eTestNames::BIOMATERIAL_TAXNAME_MISMATCH,
49  eTestNames::SPECVOUCHER_TAXNAME_MISMATCH,
50  eTestNames::STRAIN_CULTURE_COLLECTION_MISMATCH,
51  eTestNames::TRINOMIAL_SHOULD_HAVE_QUALIFIER,
52  eTestNames::REQUIRED_STRAIN,
53  eTestNames::BACTERIA_SHOULD_NOT_HAVE_ISOLATE,
54  eTestNames::METAGENOMIC,
55  eTestNames::METAGENOME_SOURCE,
56  eTestNames::MAG_SHOULD_NOT_HAVE_STRAIN,
57  eTestNames::MAG_MISSING_ISOLATE,
58 
59  eTestNames::TITLE_ENDS_WITH_SEQUENCE,
60  eTestNames::GAPS,
61  eTestNames::N_RUNS,
62  eTestNames::PERCENT_N,
63  eTestNames::TEN_PERCENTN,
64  eTestNames::TERMINAL_NS,
65  eTestNames::ZERO_BASECOUNT,
66  eTestNames::LOW_QUALITY_REGION,
67  eTestNames::UNUSUAL_NT,
68  //eTestNames::SHORT_CONTIG,
69  //eTestNames::SHORT_SEQUENCES,
70  //eTestNames::SEQUENCES_ARE_SHORT,
71  eTestNames::GENOMIC_MRNA,
72 
73  eTestNames::CHECK_AUTH_CAPS,
74  eTestNames::CHECK_AUTH_NAME,
75  eTestNames::TITLE_AUTHOR_CONFLICT,
76  eTestNames::CITSUBAFFIL_CONFLICT,
77  eTestNames::SUBMITBLOCK_CONFLICT,
78  eTestNames::UNPUB_PUB_WITHOUT_TITLE,
79  eTestNames::USA_STATE,
80 
81  eTestNames::FEATURE_COUNT,
82  eTestNames::PROTEIN_NAMES,
83  eTestNames::SUSPECT_PRODUCT_NAMES,
84  eTestNames::SUSPECT_PHRASES,
85  eTestNames::INCONSISTENT_PROTEIN_ID,
86  eTestNames::MISSING_PROTEIN_ID,
87  eTestNames::MRNA_SHOULD_HAVE_PROTEIN_TRANSCRIPT_IDS,
88  eTestNames::BAD_LOCUS_TAG_FORMAT,
89  eTestNames::INCONSISTENT_LOCUS_TAG_PREFIX,
90  eTestNames::DUPLICATE_LOCUS_TAGS,
91  eTestNames::MISSING_LOCUS_TAGS,
92  eTestNames::NON_GENE_LOCUS_TAG,
93  eTestNames::MISSING_GENES,
94  eTestNames::EXTRA_GENES,
95  eTestNames::BAD_BACTERIAL_GENE_NAME,
96  eTestNames::BAD_GENE_NAME,
97  eTestNames::BAD_GENE_STRAND,
98  eTestNames::DUP_GENES_OPPOSITE_STRANDS,
99  eTestNames::GENE_PARTIAL_CONFLICT,
100  eTestNames::GENE_PRODUCT_CONFLICT,
101  eTestNames::SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME,
102  eTestNames::EC_NUMBER_ON_UNKNOWN_PROTEIN,
103  eTestNames::MISC_FEATURE_WITH_PRODUCT_QUAL,
104  eTestNames::PARTIAL_CDS_COMPLETE_SEQUENCE,
105  eTestNames::CONTAINED_CDS,
106  eTestNames::RNA_CDS_OVERLAP,
107  eTestNames::CDS_TRNA_OVERLAP,
108  eTestNames::OVERLAPPING_RRNAS,
109  eTestNames::FIND_OVERLAPPED_GENES,
110  eTestNames::ORDERED_LOCATION,
111  eTestNames::PARTIAL_PROBLEMS,
112  eTestNames::FEATURE_LOCATION_CONFLICT,
113  eTestNames::PSEUDO_MISMATCH,
114  eTestNames::EUKARYOTE_SHOULD_HAVE_MRNA,
115  eTestNames::MULTIPLE_CDS_ON_MRNA,
116  eTestNames::CDS_WITHOUT_MRNA,
117  eTestNames::BACTERIA_SHOULD_NOT_HAVE_MRNA,
118  eTestNames::BACTERIAL_PARTIAL_NONEXTENDABLE_EXCEPTION,
119  eTestNames::BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS,
120  eTestNames::BACTERIAL_JOINED_FEATURES_NO_EXCEPTION,
121  eTestNames::JOINED_FEATURES,
122  eTestNames::RIBOSOMAL_SLIPPAGE,
123  eTestNames::BAD_BGPIPE_QUALS,
124  eTestNames::CDS_HAS_NEW_EXCEPTION,
125  eTestNames::SHOW_TRANSL_EXCEPT,
126  eTestNames::RNA_NO_PRODUCT,
127  eTestNames::RRNA_NAME_CONFLICTS,
128  eTestNames::SUSPECT_RRNA_PRODUCTS,
129  eTestNames::SHORT_RRNA,
130  eTestNames::FIND_BADLEN_TRNAS,
131  eTestNames::UNUSUAL_MISC_RNA,
132  eTestNames::SHORT_LNCRNA,
133  eTestNames::SHORT_INTRON,
134  eTestNames::EXON_INTRON_CONFLICT,
135  eTestNames::EXON_ON_MRNA,
136  eTestNames::SHORT_PROT_SEQUENCES,
137 
138  eTestNames::INCONSISTENT_DBLINK,
139  eTestNames::INCONSISTENT_MOLINFO_TECH,
140  eTestNames::INCONSISTENT_MOLTYPES,
141  eTestNames::INCONSISTENT_STRUCTURED_COMMENTS,
142  eTestNames::QUALITY_SCORES,
143  eTestNames::SEGSETS_PRESENT,
144  };
145 
146 //static_assert(g_ReportOrder0.size()+g_ReportOrder1.size() == TTestNamesSet::capacity(), "Not all of the tests included in the reporting groups");
147 
148 
149 static bool ShowFatal(const CReportItem& item)
150 {
151  if (!item.IsFatal()) {
152  return false;
153  }
154  TReportItemList subs = item.GetSubitems();
155  for (const auto& it : subs) {
156  if (it->IsSummary() && it->IsFatal()) {
157  return false;
158  }
159  }
160  return true;
161 }
162 
163 
/// Strips a single leading underscore from a test title.
///
/// @param s  title to normalise; may be empty
/// @return @p s without its first character when that character is '_',
///         otherwise @p s unchanged. An empty view is returned as-is
///         (indexing s[0] on an empty view would be undefined behaviour).
static inline std::string_view s_RemoveInitialUnderscore(std::string_view s)
{
    if (!s.empty() && s.front() == '_') {
        s.remove_prefix(1);
    }
    return s;
}
168 
169 
170 static void RecursiveText(ostream& out, const TReportItemList& list, unsigned short flags)
171 {
172  bool ext = (flags & CDiscrepancySet::eOutput_Ext) != 0;
174  for (const auto& it : list) {
175  if (it->IsExtended() && !ext) {
176  continue;
177  }
178  if (fatal && ShowFatal(*it)) {
179  out << "FATAL: ";
180  }
181  auto title = s_RemoveInitialUnderscore(it->GetTitle());
182  auto IsDupDefline = (title == string_view("DUP_DEFLINE"));
183  out << title << ": " << it->GetMsg() << '\n';
184  TReportItemList subs = it->GetSubitems();
185  if (!subs.empty() && (ext || !subs[0]->IsExtended())) {
186  RecursiveText(out, subs, flags);
187  }
188  else {
189  TReportObjectList det = it->GetDetails();
190  for (const auto& obj : det) {
192  out << obj->GetPath() << ":";
193  }
194  if (obj->IsFixed()) {
195  out << "[FIXED] ";
196  }
197 
198  auto text = (IsDupDefline && obj->GetType() == CReportObj::eType_sequence) ?
199  obj->GetBioseqLabel() :
200  obj->GetText();
201  out << text << '\n';
202  }
203  }
204  }
205 }
206 
207 
208 static void RecursiveSummary(ostream& out, const TReportItemList& list, unsigned short flags, size_t level = 0)
209 {
211  for (const auto& it : list) {
212  auto title = it->GetTitle();
213  auto msg = it->GetMsg();
214  bool includeInSummary = (level == 0 )
215  || (title == string_view("SOURCE_QUALS") && level == 1);
216  if (includeInSummary) {
217  if (fatal && ShowFatal(*it)) {
218  out << "FATAL: ";
219  }
220  out << s_RemoveInitialUnderscore(title) << ": " << msg << '\n';
221  }
222  else if (it->IsSummary()) {
223  out << string(level, '\t');
224  if (fatal && ShowFatal(*it)) {
225  out << "FATAL: ";
226  }
227  out << msg << '\n';
228  }
229  else {
230  continue;
231  }
232  RecursiveSummary(out, it->GetSubitems(), flags, level + 1);
233  }
234 }
235 
236 
237 static bool RecursiveFatalSummary(ostream& out, const TReportItemList& list, size_t level = 0)
238 {
239  bool found = false;
240  for (const auto& it : list) {
241  if (it->IsFatal() && it->GetTitle() != string_view("SOURCE_QUALS")
242  && it->GetTitle() != string_view("SUSPECT_PRODUCT_NAMES")) {
243  found = true;
244  if (level == 0) {
245  out << "FATAL: ";
246  out << s_RemoveInitialUnderscore(it->GetTitle()) << ": " << it->GetMsg() << '\n';
247  }
248  else if (it->IsSummary()) {
249  out << string(level, '\t');
250  out << "FATAL: ";
251  out << it->GetMsg() << '\n';
252  }
253  else {
254  continue;
255  }
256  RecursiveFatalSummary(out, it->GetSubitems(), level + 1);
257  }
258  }
259  return found;
260 }
261 
262 static TReportItemList x_CollectGroup(const std::initializer_list<eTestNames>& m_List, TDiscrepancyCoreMap& tests, bool all)
263 {
265  for (const auto& it : m_List) {
266  if (tests.find(it) != tests.end()) {
267  TReportItemList tmp = tests[it]->GetReport();
268  for (const auto& tt : tmp) {
269  out.push_back(tt);
270  }
271  tests.erase(it);
272  }
273  }
274  if (all) {
275  for (const auto& it : tests) {
276  TReportItemList list = it.second->GetReport();
277  for (const auto& it2 : list) {
278  out.push_back(it2);
279  }
280  }
281  }
282  #if 0
283  for (const auto& it : m_List) {
284  TReportItemList tmp = it.Collect(tests, false);
285  for (const auto& tt : tmp) {
286  out.push_back(tt);
287  }
288  }
289  if (tests.find(m_Test) != tests.end()) {
290  TReportItemList tmp = tests[m_Test]->GetReport();
291  for (const auto& tt : tmp) {
292  out.push_back(tt);
293  }
294  tests.erase(m_Test);
295  }
296  if (!m_Label.empty()) {
297  TReportObjectList objs;
299  _ASSERT(0);
300  #if 0
301  CRef<CDiscrepancyItem> di(new CDiscrepancyItem(m_Label));
302  di->m_Subs = out;
303  bool empty = true;
304  for (const auto& tt : out) {
305  TReportObjectList details = tt->GetDetails();
306  if (!details.empty() || tt->GetCount() > 0) {
307  empty = false;
308  }
309  for (auto& ob : details) {
310  CReportNode::Add(objs, hash, *ob);
311  }
312  if (tt->CanAutofix()) {
313  di->m_Autofix = true;
314  }
315  if (tt->IsInfo()) {
316  di->m_Severity = CDiscrepancyItem::eSeverity_info;
317  }
318  else if (tt->IsFatal()) {
319  di->m_Severity = CDiscrepancyItem::eSeverity_error;
320  }
321  }
322  di->m_Objs = objs;
323  out.clear();
324  if (!empty) {
325  out.push_back(CRef<CReportItem>(di));
326  }
327  #endif
328  }
329  if (all) {
330  for (const auto& it : tests) {
331  TReportItemList list = it.second->GetReport();
332  for (const auto& it : list) {
333  out.push_back(it);
334  }
335  }
336  }
337  #endif
338 
339  return out;
340 }
341 
342 void CDiscrepancyProductImpl::OutputText(ostream& out, unsigned short flags, char group)
343 {
344  switch (group) {
345  case 'b':
346  out << "Discrepancy Report Results (due to the large size of the file some checks may not have run)\n\n";
347  break;
348  case 'q':
349  out << "Discrepancy Report Results (SMART set of checks)\n\n";
350  break;
351  case 'u':
352  out << "Discrepancy Report Results (submitter set of checks)\n\n";
353  break;
354  default:
355  out << "Discrepancy Report Results\n\n";
356  }
357 
358  out << "Summary\n";
359  TReportItemList m_Group0;
360  TReportItemList m_Group1;
361 
362  m_Group0 = x_CollectGroup(g_ReportOrder0, m_Tests, false);
363  m_Group1 = x_CollectGroup(g_ReportOrder1, m_Tests, true);
364  #ifdef _DEBUG111
365  std::cerr << g_ReportOrder0.size() << ":" << g_ReportOrder1.size() << ":" << TTestNamesSet::capacity() << "\n";
366  std::cerr << m_Group0.size() << ":" << m_Group1.size() << "\n";
367  #endif
368 
369 
370  RecursiveSummary(out, m_Group0, flags);
372  RecursiveFatalSummary(out, m_Group1, flags);
373  }
374  RecursiveSummary(out, m_Group1, flags);
375 
377 
378  out << "\nDetailed Report\n\n";
379  RecursiveText(out, m_Group0, flags);
380  RecursiveText(out, m_Group1, flags);
381 }
382 
/// Emits the leading whitespace of one XML line.
///
/// @param out     destination stream
/// @param indent  nesting depth; each level contributes two spaces
static void Indent(std::ostream& out, size_t indent)
{
    constexpr size_t kSpacesPerLevel = 2;
    const std::string padding(indent * kSpacesPerLevel, ' ');
    out << padding;
}
388 
389 static string SevLevel[CReportItem::eSeverity_error + 1] = { "INFO", "WARNING", "FATAL" };
390 
391 static void RecursiveXML(ostream& out, const TReportItemList& list, unsigned short flags, size_t indent)
392 {
393  bool ext = (flags & CDiscrepancySet::eOutput_Ext) != 0;
394  for (const auto& it : list) {
395  if (it->IsExtended() && !ext) {
396  continue;
397  }
398  Indent(out, indent);
399  out << "<details message=\"" << NStr::XmlEncode(it->GetXml()) << "\"";
400  out << " severity=\"" << SevLevel[it->GetSeverity()] << "\"";
401  if (it->GetCount() > 0) {
402  out << " cardinality=\"" << NStr::Int8ToString(it->GetCount()) << "\"";
403  }
404  if (!it->GetUnit().empty()) {
405  out << " unit=\"" << NStr::XmlEncode(it->GetUnit()) << "\"";
406  }
407  if (it->CanAutofix()) {
408  out << " autofix=\"true\"";
409  }
410  out << ">\n";
411 
412  ++indent;
413  TReportItemList subs = it->GetSubitems();
414  if (!subs.empty() && (ext || !subs[0]->IsExtended())) {
415  RecursiveXML(out, subs, flags, indent);
416  }
417  else {
418  for (const auto& obj : it->GetDetails()) {
419  Indent(out, indent);
420  out << "<object type=";
421  switch (obj->GetType()) {
423  out << "\"feature\"";
424  break;
426  out << "\"descriptor\"";
427  break;
429  out << "\"sequence\"";
430  break;
432  out << "\"set\"";
433  break;
435  out << "\"submit_block\"";
436  break;
438  out << "\"string\"";
439  break;
440  default:
441  out << "\"\"";
442  break;
443  }
445  out << " file=\"" << NStr::XmlEncode(obj->GetPath()) << "\"";
446  }
447  const string sFeatureType = obj->GetFeatureType();
448  if (!sFeatureType.empty()) {
449  out << " feature_type=\"" << NStr::XmlEncode(sFeatureType) << "\"";
450  }
451  const string sProductName = obj->GetProductName();
452  if (!sProductName.empty()) {
453  out << (sFeatureType == "Gene" ? " symbol=\"" : " product=\"") << NStr::XmlEncode(sProductName) << "\"";
454  }
455  const string sLocation = obj->GetLocation();
456  if (!sLocation.empty()) {
457  out << " location=\"" << NStr::XmlEncode(sLocation) << "\"";
458  }
459  const string sLocusTag = obj->GetLocusTag();
460  if (!sLocusTag.empty()) {
461  out << " locus_tag=\"" << NStr::XmlEncode(sLocusTag) << "\"";
462  }
463  const string text = obj->GetText();
464  out << " label=\"" << NStr::XmlEncode(text) << "\" />\n";
465  }
466  }
467  --indent;
468  Indent(out, indent);
469  out << "</details>\n";
470  }
471 }
472 
473 static list<CRef<CDiscrepancyCore>> x_ReorderList(const std::vector<eTestNames>& order, const TDiscrepancyCoreMap& tests)
474 {
475  vector<std::pair<TDiscrepancyCoreMap::key_type, TDiscrepancyCoreMap::mapped_type>> vec;
476  vec.reserve(tests.size());
477  for (const auto& test : tests) {
478  vec.push_back(test);
479  }
480 
481  sort(vec.begin(), vec.end(), [&order](auto& l, auto r)
482  {
483  auto it_l = std::find(order.begin(), order.end(), l.first);
484  auto it_r = std::find(order.begin(), order.end(), r.first);
485  // in case the test is not put into ordering list, compare by test value
486  if (it_l == it_r)
487  return r.first < l.first;
488  return it_l < it_r;
489  });
490 
491  list<CRef<CDiscrepancyCore>> result;
492 
493  for (const auto& test : vec) {
494  result.push_back(test.second);
495  }
496 
497  return result;
498 }
499 
500 void CDiscrepancyProductImpl::OutputXML(ostream& out, unsigned short flags)
501 {
502  out << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
503  out << "<discrepancy_report>\n";
504 
505  auto sorted = x_ReorderList(g_ReportOrder1, m_Tests);
506  for (const auto& test : sorted) {
507  TReportItemList rep = test->GetReport();
508  if (rep.empty()) {
509  continue;
510  }
512  for (const auto& it : rep) {
513  CReportItem::ESeverity s = it->GetSeverity();
514  if (max_sev < s) {
515  max_sev = s;
516  }
517  }
518  Indent(out, 1);
519  out << "<test name=\"" << s_RemoveInitialUnderscore(test->GetSName())
520  << "\" description=\"" << NStr::XmlEncode(test->GetDescription())
521  << "\" severity=\"" << SevLevel[max_sev]
522  << "\" cardinality=\"" << rep.size() << "\">\n";
523  RecursiveXML(out, rep, flags, 2);
524  Indent(out, 1);
525  out << "</test>\n";
526  }
527  out << "</discrepancy_report>\n";
528 }
529 
#define static
static void fatal(const char *msg,...)
Definition: attributes.c:18
void OutputXML(CNcbiOstream &out, unsigned short flags) override
void OutputText(CNcbiOstream &out, unsigned short flags, char group=0) override
TDiscrepancyCoreMap m_Tests
CRef –.
Definition: ncbiobj.hpp:618
virtual vector< CRef< CReportItem > > GetSubitems() const =0
virtual bool IsFatal() const =0
static void Add(TReportObjectList &list, TReportObjectSet &hash, CReportObj &obj, bool unique=true)
@ eType_submit_block
Definition: discrepancy.hpp:82
static uch flags
vector< CRef< CReportItem > > TReportItemList
vector< CRef< CReportObj > > TReportObjectList
std::ofstream out("events_result.xml")
main entry point for tests
string
Definition: cgiapp.hpp:687
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string Int8ToString(Int8 value, TNumToStringFlags flags=0, int base=10)
Convert Int8 to string.
Definition: ncbistr.hpp:5158
static string XmlEncode(const CTempString str, TXmlEncode flags=eXmlEnc_Contents)
Encode a string for XML.
Definition: ncbistr.cpp:4032
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is smart and slim</td> n<td> orig</td> n</tr> n<tr> n<td> last_modified</td> n<td> optional</td> n<td> Integer</td> n<td class=\"description\"> The blob last modification If provided then the exact match will be requested with n the Cassandra storage corresponding field value</td> n<td> Positive integer Not provided means that the most recent match will be selected</td> n<td></td> n</tr> n<tr> n<td> use_cache</td> n<td> optional</td> n<td> String</td> n<td class=\"description\"> The option controls if the Cassandra LMDB cache and or database should be used It n affects the seq id resolution step and the blob properties lookup step The following n options are BIOSEQ_INFO and BLOB_PROP at all
static void text(MDB_val *v)
Definition: mdb_dump.c:62
constexpr auto sort(_Init &&init)
constexpr bool empty(list< Ts... >) noexcept
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static char tmp[2048]
Definition: utf8.c:42
string indent(" ")
Definition: _hash_fun.h:40
int test(int srctype, const void *srcdata, int srclen, int dsttype, int dstlen)
Definition: t0019.c:43
#define _ASSERT
static DbTestEntry * tests
Definition: testodbc.c:388
USING_SCOPE(objects)
static list< CRef< CDiscrepancyCore > > x_ReorderList(const std::vector< eTestNames > &order, const TDiscrepancyCoreMap &tests)
static void RecursiveXML(ostream &out, const TReportItemList &list, unsigned short flags, size_t indent)
static void RecursiveText(ostream &out, const TReportItemList &list, unsigned short flags)
static bool ShowFatal(const CReportItem &item)
static bool RecursiveFatalSummary(ostream &out, const TReportItemList &list, size_t level=0)
static constexpr std::initializer_list< eTestNames > g_ReportOrder0
Definition: text_output.cpp:37
static TReportItemList x_CollectGroup(const std::initializer_list< eTestNames > &m_List, TDiscrepancyCoreMap &tests, bool all)
static constexpr std::initializer_list< eTestNames > g_ReportOrder1
Definition: text_output.cpp:44
static string SevLevel[CReportItem::eSeverity_error+1]
static void RecursiveSummary(ostream &out, const TReportItemList &list, unsigned short flags, size_t level=0)
static string_view s_RemoveInitialUnderscore(string_view s)
static void Indent(ostream &out, size_t indent)
else result
Definition: token2.c:20
Modified on Tue Dec 05 02:19:26 2023 by modify_doxy.py rev. 669887