NCBI C++ ToolKit
struc_cmt_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: struc_cmt_reader.cpp 99087 2023-02-10 16:48:37Z ludwigf $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Sergiy Gotvyanskyy, NCBI
27 *
28 * File Description:
29 * Reader for structured comments for sequences
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 
37 #include <objects/seq/Seqdesc.hpp>
38 #include <objects/seq/Bioseq.hpp>
48 #include <util/line_reader.hpp>
51 
53 #include <objmgr/scope.hpp>
54 #include <objmgr/bioseq_ci.hpp>
55 
56 #include "struc_cmt_reader.hpp"
57 #include "table2asn_context.hpp"
58 #include "visitors.hpp"
59 
60 #include <common/test_assert.h> /* This header must go last */
61 
62 #include <set>
63 
66 
68  ILineErrorListener* pEC,
69  const string& commentID)
70 {
71  if (!pEC) {
72  return;
73  }
74  string message =
75  string("Structured comment for \"") +
76  commentID +
77  "\" could not be matched to any input sequence.";
78 
82  0, 0,
83  commentID,
84  0,
85  message);
86  pEC->PutError(*pErr);
87 }
88 
89 
91  const std::string& filename, ILineErrorListener* logger, bool verbose)
92  : CStructuredCommentsReader(logger), m_verbose(verbose)
93 {
94  CRef<ILineReader> reader{ILineReader::New(filename)};
95  m_vertical = IsVertical(*reader);
96  if (m_vertical) {
97  m_comments.push_back({});
98  LoadCommentsByRow(*reader, m_comments.front());
100  } else {
102  for (CStructComment& comment : m_comments) {
104  }
105  }
106 }
107 
109 {
110 }
111 
113 {
114  set<CRef<CSeq_id>> matchedCommentIds;
115 
116  if (m_vertical) {
117  VisitAllSeqDesc(entry, true, [this](CBioseq* bioseq, CSeq_descr& descr)
118  {
119  if (bioseq && !bioseq->IsNa())
120  return;
121 
122  _AddStructuredComments(descr, m_comments.front());
123  });
124  } else {
125  for (const CStructComment& comment : m_comments) {
126  if (_AddStructuredComments(entry, comment)) {
127  matchedCommentIds.insert(comment.m_id);
128  }
129  }
130  }
131  if (m_logger && m_verbose && !m_vertical) {
132  for (auto& comment: m_comments) {
133  auto id = comment.m_id;
134  if (!id || matchedCommentIds.find(id) == matchedCommentIds.end()) {
135  string commentId("[Unrecognized SeqID]");
136  if (id && id->IsLocal() && id->GetLocal().IsStr()) {
137  commentId = comment.m_id->GetLocal().GetStr();
138  }
140  }
141  }
142  }
143 }
144 
145 void CTable2AsnStructuredCommentsReader::_AddStructuredComments(CSeq_descr& descr, const CStructComment& comments)
146 {
147  for (const auto& new_desc : comments.m_descs)
148  {
149  bool append_desc = true;
150 
151  const string& index = CStructComment::GetPrefix(*new_desc);
152  //if (index.empty()) continue;
153 
154  for (auto& desc : descr.Set()) // push to create setdescr
155  {
156  if (!desc->IsUser()) continue;
157 
158  auto& user = desc->SetUser();
159 
160  const string& other = CStructComment::GetPrefix(*desc);
161  //if (other.empty()) continue;
162 
163  if (NStr::Equal(other, index))
164  {
165  append_desc = false;
166  // Merge
167  for (const auto& field : new_desc->GetUser().GetData())
168  {
169  user.SetFieldRef(field->GetLabel().GetStr())->SetValue(field->GetData().GetStr());
170  }
171  }
172  }
173  if (append_desc)
174  {
175  CRef<CSeqdesc> add_desc(new CSeqdesc);
176  add_desc->Assign(*new_desc);
177  descr.Set().push_back(add_desc);
178  }
179  }
180 }
181 
183  CSeq_entry& entry, const CStructComment& comments)
184 {
185  bool matchFound(false);
186  VisitAllBioseqs(entry, [&matchFound, comments](CBioseq& bioseq)
187  {
188  if (!bioseq.IsNa())
189  return;
190 
191  if (comments.m_id.NotEmpty())
192  {
193  bool matched = false;
194  for (const auto& id : bioseq.GetId())
195  {
196  if (CStructuredCommentsReader::SeqIdMatchesCommentId(*id, *comments.m_id))
197  {
198  matched = true;
199  break;
200  }
201  }
202  if (!matched) {
203  return;
204  }
205  }
206 
207  _AddStructuredComments(bioseq.SetDescr(), comments);
208  matchFound = true;
209  });
210  return matchFound;
211 }
212 
214 {
215  // assumption: all descriptors are structural comments
216  for (auto& desc : comments.m_descs) {
217  string prefix, suffix;
218  const auto& user = desc->GetUser();
219  for (const auto& data : user.GetData()) {
220  if (data->IsSetLabel() && data->GetLabel().IsStr()) {
221  const string& label = data->GetLabel().GetStr();
222  if (label == "StructuredCommentPrefix") {
223  prefix = data->GetData().GetStr();
224  }
225  else if (label == "StructuredCommentSuffix") {
226  suffix = data->GetData().GetStr();
227  }
228  }
229  }
230  if (!prefix.empty() && suffix.empty()) {
231  desc->SetUser().AddField("StructuredCommentSuffix", prefix);
232  }
233  }
234 }
235 
237 {
238  CTempString line;
239  reader.ReadLine();
240  bool vert = false;
241  if (!reader.AtEOF())
242  {
243  line = reader.GetCurrentLine();
244  vector<CTempString> values;
245  NStr::Split(line, "\t", values);
246  vert = values.size()<=2;
247  reader.UngetLine();
248  }
249  return vert;
250 }
251 
User-defined methods of the data storage class.
AutoPtr –.
Definition: ncbimisc.hpp:401
bool IsNa(void) const
Definition: Bioseq.cpp:345
static CLineErrorEx * Create(EProblem eProblem, EDiagSev eSeverity, int code, int subcode, const std::string &strSeqId, unsigned int uLine, const std::string &strErrorMessage=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), const TVecOfLines &vecOfOtherLines=TVecOfLines())
Use this because the constructor is protected.
Definition: line_error.cpp:329
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
static const string & GetPrefix(const objects::CSeqdesc &)
vector< CRef< objects::CSeqdesc > > m_descs
size_t LoadComments(ILineReader &reader, _container &cont, objects::CSeq_id::TParseFlags seqid_flags=objects::CSeq_id::fParse_Default)
objects::ILineErrorListener * m_logger
size_t LoadCommentsByRow(ILineReader &reader, CStructComment &cmt)
static void _CheckStructuredCommentsSuffix(CStructComment &comments)
static bool _AddStructuredComments(objects::CSeq_entry &entry, const CStructComment &comments)
void ProcessComments(objects::CSeq_entry &entry) const
static bool IsVertical(ILineReader &reader)
CTable2AsnStructuredCommentsReader(const std::string &filename, objects::ILineErrorListener *logger, bool verbose)
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
virtual bool PutError(const ILineError &)=0
Store error in the container, and return true if error was stored fine, and return false if the calle...
@ eProblem_GeneralParsingError
Definition: line_error.hpp:105
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:54
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
char data[12]
Definition: iconv.c:80
string
Definition: cgiapp.hpp:687
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
CTempString GetCurrentLine(void) const
static CRef< ILineReader > New(const string &filename)
Return a new ILineReader object corresponding to the given filename, taking "-" (but not "....
Definition: line_reader.cpp:49
virtual void UngetLine(void)=0
Unget current line, which must be valid.
void ReadLine(void)
Definition: line_reader.hpp:88
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
Definition: Seq_id.hpp:90
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5386
static const char label[]
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
if(yy_accept[yy_current_state])
Lightweight interface for getting lines of data with minimal memory copying.
void VisitAllSeqDesc(objects::CSeq_entry &entry, bool skip_nucprot, _M m)
Definition: visitors.hpp:48
void VisitAllBioseqs(objects::CSeq_entry &entry, _M &&m)
Definition: visitors.hpp:14
The Object manager core.
true_type verbose
Definition: processing.cpp:890
USING_SCOPE(objects)
static void sReportUnappliedStructuredComment(ILineErrorListener *pEC, const string &commentID)
Modified on Wed Sep 04 15:05:35 2024 by modify_doxy.py rev. 669887