NCBI C++ ToolKit
hgvs_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: hgvs_reader.cpp 92114 2020-12-22 16:22:47Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig
27  *
28  * File Description:
29  * HGVS file reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbithr.hpp>
37 #include <corelib/ncbiutil.hpp>
38 #include <corelib/ncbiexpt.hpp>
39 #include <corelib/stream_utils.hpp>
40 
41 #include <util/static_map.hpp>
42 #include <util/line_reader.hpp>
43 
44 #include <serial/iterator.hpp>
45 #include <serial/objistrasn.hpp>
46 
47 // Objects includes
52 
57 
61 #include <objects/seq/Seqdesc.hpp>
64 #include <objects/seq/Seq_data.hpp>
65 
71 
73 
79 #include <objtools/error_codes.hpp>
80 
81 #include <objmgr/util/sequence.hpp>
83 #include <objmgr/scope.hpp>
87 
88 #include <algorithm>
89 
90 #define NCBI_USE_ERRCODE_X Objtools_Rd_RepMask
91 
93 BEGIN_objects_SCOPE // namespace ncbi::objects::
94 
// Assembly-aware constructor: forwards `flags` to the CReaderBase base class
// and stores the address of `assembly` in m_Assembly.
// NOTE(review): the signature line (listing line 96) is absent from this
// extract; the declaration index at the end of the page shows it as
// CHgvsReader(const CGC_Assembly &assembly, int=0) -- confirm against the repo.
95 // ----------------------------------------------------------------------------
97  : CReaderBase(flags),
98  m_Assembly(&assembly)
99 // ----------------------------------------------------------------------------
100 {
101 }
102 
103 
// Constructor without an assembly: only forwards `flags` to CReaderBase.
// m_Assembly is not mentioned in this initializer list, so it keeps whatever
// its default construction yields (presumably a null reference, which makes
// ReadSeqAnnot skip the assembly-based resolver -- TODO confirm).
// NOTE(review): the signature line (listing line 105) is absent from this
// extract.
104 // ----------------------------------------------------------------------------
106  : CReaderBase(flags)
107 // ----------------------------------------------------------------------------
108 {
109 }
110 
111 
// Empty body; nothing is released explicitly here.
// NOTE(review): the signature line (listing line 113) is absent from this
// extract; the declaration index at the end of the page lists
// `virtual ~CHgvsReader()`, so this is presumably the destructor -- confirm.
112 // ----------------------------------------------------------------------------
114 // ----------------------------------------------------------------------------
115 {
116 }
117 
// Reads HGVS expressions, one per input line, from `lr` and returns a single
// Seq-annot whose feature table holds the features produced for every line
// that parsed successfully.
//
//   lr   line reader supplying the input; consumed until AtEOF().
//   pEC  error listener handed to ProcessWarning / ProcessError (may be
//        null as far as this function is concerned; it is only forwarded).
//
// Per-line parse problems do not abort the read: they are reported through
// ProcessWarning / ProcessError and the loop moves on to the next line.
//
// NOTE(review): several lines of the original file are absent from this
// extract (listing lines 119-120 signature, 128, 150, 183-185, 187, 194, 196,
// 200, 209). Comments below describe only what the remaining lines show.
118 // ----------------------------------------------------------------------------
121  ILineReader& lr,
122  ILineErrorListener* pEC )
123 // ----------------------------------------------------------------------------
124 {
125  CRef<CSeq_annot> annot(new CSeq_annot);
126 
127  // object manager
 // NOTE(review): the declaration of `objectManager` (listing line 128) is
 // missing from this extract.
129  CGBDataLoader::RegisterInObjectManager( *objectManager );
130  CRef<CScope> scope(new CScope(*objectManager));
131  scope->AddDefaults();
132 
133  // hgvs parser
134  variation::CHgvsParser hgvsParser(*scope);
135  CRef<CSeq_id_Resolver> assmresolver;
 // Only when an assembly was supplied: put a chromosome-name resolver at the
 // front of the parser's resolver list.
136  if (m_Assembly.NotNull()) {
137  assmresolver.Reset(new CSeq_id_Resolver__ChrNamesFromGC(*m_Assembly, *scope));
138  hgvsParser.SetSeq_id_Resolvers().push_front(assmresolver);
139  }
140 
141  // helper to convert to feature
142  variation::CVariationUtil varUtil( *scope );
143 
144  // parse input
145  while (!lr.AtEOF()) {
 // Advance the reader and take a private copy of the current line.
146  string line = *(++lr);
147  m_uLineNumber++;
148 
149  // TODO split multiple hgvs names on one line (sep by whitespace, ";", ",")
 // NOTE(review): listing line 150 is missing here.
 // Remove every carriage return / line feed anywhere in the line.
151  NStr::ReplaceInPlace(line, "\r", kEmptyStr);
152  NStr::ReplaceInPlace(line, "\n", kEmptyStr);
153 
 // Blank lines and lines starting with '#' are skipped (they still count
 // toward m_uLineNumber, which was incremented above).
154  if (NStr::IsBlank(line) || NStr::StartsWith(line, "#")) {
155  continue;
156  }
157 
158  try {
159  CRef<CVariation> var = hgvsParser.AsVariation(line);
160 
 // Gather the exceptions attached to the variation itself and to each
 // of its placements into one list.
161  CVariation::TExceptions exception_list;
162  if( var->IsSetExceptions() ) {
163  exception_list.insert(exception_list.end(), var->GetExceptions().begin(), var->GetExceptions().end());
164  }
165  if( var->IsSetPlacements() ) {
166  ITERATE(CVariation::TPlacements, place_it, var->GetPlacements() ) {
167  const CVariantPlacement& placement = **place_it;
168  if( placement.IsSetExceptions() ) {
169  exception_list.insert(exception_list.end(), placement.GetExceptions().begin(), placement.GetExceptions().end());
170  }
171  }
172  }
173 
 // Report each collected exception that has both a code and a message
 // as a warning; exceptions lacking either one are silently ignored.
174  ITERATE(CVariation::TExceptions, except_it, exception_list ) {
175 
176  const CVariationException& except = **except_it;
177 
178  if( except.IsSetCode() && except.IsSetMessage() ) {
179 
 // Textual name of the exception code, used in the message below.
180  const string& code =
181  CVariationException::GetTypeInfo_enum_ECode()->FindName(except.GetCode(), true);
 // NOTE(review): most arguments of this construction (listing lines
 // 183-185, 187) are missing from this extract.
182  unique_ptr<CObjReaderLineException> err(
186  string("Warning [") + code + "] " + except.GetMessage(),
188  ProcessWarning(*err, pEC);
189  }
190  }
191 
 // Convert the parsed variation to features, appended to the annot's
 // feature table.
192  varUtil.AsVariation_feats(*var, annot->SetData().SetFtable());
193  }
 // NOTE(review): the `catch` clause (listing line 194) is missing from this
 // extract; `e` below is the caught exception. The failure is reported via
 // ProcessError with severity eDiag_Error.
195  unique_ptr<CObjReaderLineException> err(
197  eDiag_Error,
198  0,
199  string("Error [") + e.GetErrCodeString() + "] " + e.GetMsg(),
201  ProcessError(*err, pEC);
202  }
203  }
204 
 // Post-process: rewrite the location id of every produced feature to a GI
 // taken from `idh`.
 // NOTE(review): the declaration of `idh` (listing line 209) is missing from
 // this extract.
 // NOTE(review): CSeq_loc::GetId() is documented as returning NULL when the
 // location has multiple ids or none; the result is dereferenced unchecked
 // here -- confirm that AsVariation_feats only yields single-id locations.
205  NON_CONST_ITERATE (CSeq_annot::C_Data::TFtable, itr, annot->SetData().SetFtable() ) {
206  CRef<CSeq_feat> feat = *itr;
207  CRef<CSeq_id> tempid(new CSeq_id());
208  tempid->Assign(*(feat->GetLocation().GetId()));
210  tempid->SetGi(idh.GetGi());
211  feat->SetLocation().SetId(*tempid);
212  }
213  return annot;
214 }
215 
// Stream overload: wraps `istr` in a CStreamLineReader and delegates to the
// ILineReader overload of ReadSeqAnnots, passing `annots` and
// `pMessageListener` through unchanged.
// NOTE(review): the function-name line (listing line 218) is absent from this
// extract.
216 // ---------------------------------------------------------------------------
217 void
219  vector< CRef<CSeq_annot> >& annots,
220  CNcbiIstream& istr,
221  ILineErrorListener* pMessageListener )
222 // ---------------------------------------------------------------------------
223 {
224  CStreamLineReader lr(istr);
225  ReadSeqAnnots(annots, lr, pMessageListener);
226 }
227 
// Line-reader overload: calls ReadSeqAnnot once and appends its result to
// `annots`. Exactly one element is pushed per call; existing elements of
// `annots` are left in place.
// NOTE(review): the function-name line (listing line 230) is absent from this
// extract.
228 // ---------------------------------------------------------------------------
229 void
231  vector< CRef<CSeq_annot> >& annots,
232  ILineReader& lr,
233  ILineErrorListener* pMessageListener )
234 // ----------------------------------------------------------------------------
235 {
236  annots.push_back(ReadSeqAnnot(lr, pMessageListener));
237 }
238 
// Generic entry point: runs ReadSeqAnnot on `lr` and returns the resulting
// Seq-annot re-wrapped as a CRef<CSerialObject>.
// NOTE(review): the return-type and function-name lines (listing lines
// 240-241) are absent from this extract; the declaration index at the end of
// the page lists
// `virtual CRef< CSerialObject > ReadObject(ILineReader &, ILineErrorListener *=0)`.
239 // ----------------------------------------------------------------------------
242  ILineReader& lr,
243  ILineErrorListener* pMessageListener )
244 // ----------------------------------------------------------------------------
245 {
246  return Ref<CSerialObject>(ReadSeqAnnot(lr, pMessageListener).GetPointer());
247 }
248 
249 END_objects_SCOPE
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: gbloader.cpp:366
virtual CRef< CSeq_annot > ReadSeqAnnot(ILineReader &, ILineErrorListener *=0)
Read an object from a given line reader, render it as a single Seq-annot, if possible.
virtual void ReadSeqAnnots(vector< CRef< CSeq_annot > > &, CNcbiIstream &, ILineErrorListener *=0)
virtual CRef< CSerialObject > ReadObject(ILineReader &, ILineErrorListener *=0)
Read an object from a given line reader, render it as the most appropriate Genbank object.
virtual ~CHgvsReader()
CConstRef< CGC_Assembly > m_Assembly
CHgvsReader(const CGC_Assembly &assembly, int=0)
Definition: hgvs_reader.cpp:96
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
Definition: line_error.cpp:194
Defines and provides stubs for a general interface to a variety of file readers.
Definition: reader_base.hpp:63
unsigned int m_uLineNumber
void ProcessError(CObjReaderLineException &, ILineErrorListener *)
void ProcessWarning(CObjReaderLineException &, ILineErrorListener *)
CScope –.
Definition: scope.hpp:92
Resolve chromosome names based on GC_Assembly.
Simple implementation of ILineReader for i(o)streams.
CVariantPlacement –.
CVariationException –.
@ eProblem_GeneralParsingError
Definition: line_error.hpp:106
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:54
virtual const char * GetErrCodeString(void) const override
Get error code interpreted as text.
CSeq_id_Resolver::TResolvers & SetSeq_id_Resolvers()
In order of decreasing priority.
CRef< CVariation > AsVariation(const string &hgvs_expression, TOpFlags=fOpFlags_Default)
void AsVariation_feats(const CVariation &v, CSeq_annot::TData::TFtable &feats)
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
TGi GetGi(void) const
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
@ eGetId_ForceGi
return only a gi-based seq-id
Definition: sequence.hpp:99
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotNull(void) const THROWS_NONE
Check if pointer is not null – same effect as NotEmpty().
Definition: ncbiobj.hpp:1410
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
#define kEmptyStr
Definition: ncbistr.hpp:123
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3405
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
TGi & SetGi(void)
Select the variant.
Definition: Seq_id_.hpp:896
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
list< CRef< CVariantPlacement > > TPlacements
TCode GetCode(void) const
Get the Code member data.
const TMessage & GetMessage(void) const
Get the Message member data.
const TExceptions & GetExceptions(void) const
Get the Exceptions member data.
bool IsSetCode(void) const
Check if a value has been assigned to Code data member.
bool IsSetMessage(void) const
Check if a value has been assigned to Message data member.
list< CRef< CVariationException > > TExceptions
bool IsSetExceptions(void) const
Check if a value has been assigned to Exceptions data member.
Definition of all error codes used in objtools libraries.
Lightweight interface for getting lines of data with minimal memory copying.
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines NCBI C++ exception handling.
Multi-threading – classes, functions, and features.
Useful/utility classes and methods.
The Object manager core.
Definition: inftrees.h:24
Modified on Sun Jun 16 04:31:11 2024 by modify_doxy.py rev. 669887