NCBI C++ ToolKit
fasta_object_loader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: fasta_object_loader.cpp 44796 2020-03-17 22:37:42Z evgeniev $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Mike DiCuccio, Roman Katargin
27 */
28 
29 
30 #include <ncbi_pch.hpp>
31 
32 #include <util/icanceled.hpp>
33 #include <serial/iterator.hpp>
34 
36 
41 
42 #include <gui/objutils/label.hpp>
44 
46 
47 
50 
/// Constructor tail: initializes m_Params from `params`; the body does nothing else.
/// NOTE(review): the signature line is absent from this listing. The symbol index
/// further down lists `CFastaObjectLoader(const CFastaLoadParams &params)`, which is
/// presumably the declaration this initializer list belongs to - confirm.
/// m_FileNames is not mentioned here, so it is left default-constructed.
52  : m_Params(params)
53 {
54 }
55 
/// Constructs a loader from a set of load parameters and a list of file names.
///
/// @param params     stored in m_Params
/// @param filenames  stored in m_FileNames; this is the list the file-loading loop
///                   in this file iterates over
///
/// Both arguments are taken by const reference and used to initialize the members;
/// the constructor body itself is empty.
56 CFastaObjectLoader::CFastaObjectLoader(const CFastaLoadParams& params, const vector<wxString>& filenames)
57  : m_Params(params), m_FileNames(filenames)
58 {
59 }
60 
/// Returns the member m_Objects, the container the loading loop appends to.
/// NOTE(review): the signature line is absent from this listing; the symbol index
/// lists `virtual TObjects & GetObjects()`, presumably this body - confirm.
62 {
63  return m_Objects;
64 }
65 
/// Returns the fixed text "Loading FASTA Files".
/// NOTE(review): the signature line is absent from this listing; the symbol index
/// lists `virtual string GetDescription() const`, presumably this body - confirm.
67 {
68  return "Loading FASTA Files";
69 }
70 
/// Unconditionally returns true; no state is read or modified.
/// NOTE(review): the signature line is absent from this listing and nothing visible
/// identifies which member function this is - check the original source.
72 {
73  return true;
74 }
75 
/// Loads every file named in m_FileNames and accumulates the results in m_Objects.
///
/// NOTE(review): the signature line is absent from this listing; the symbol index
/// lists `virtual bool Execute(ICanceled &canceled)`, which matches the `canceled`
/// reference used below - confirm.
///
/// @return false as soon as cancellation is observed at the top of an iteration
///         (objects appended by earlier iterations stay in m_Objects);
///         true once the loop over m_FileNames has finished.
77 {
78  Init();
79 
80  ITERATE(vector<wxString>, it, m_FileNames) {
 // Cancellation is polled once per file, before any work on that file starts.
81  if (canceled.IsCanceled())
82  return false;
83 
 // errCont is declared outside the try block so both handlers below can pass it on.
84  CRef<CErrorContainer> errCont;
85  const wxString& fn = *it;
86 
87  try {
 // NOTE(review): the meaning of the constructor argument 100 is not shown here
 // (presumably a cap on collected messages - confirm).
88  errCont.Reset(new CErrorContainer(100));
89 
 // NOTE(review): the declarations of `file` and `objects` (listing lines 90-91)
 // are missing from this listing.
92  LoadFromStream(file.GetIstream(), objects, errCont);
93  x_UpdateHTMLResults(fn, errCont);
94 
 // Only reached when loading did not throw: publish this file's objects.
95  for (auto& o : objects)
96  m_Objects.push_back(o);
97  }
 // Neither handler rethrows: the exception text is handed to x_UpdateHTMLResults
 // and the loop continues with the next file.
 // NOTE(review): if the throw happened before errCont was assigned, errCont is
 // still empty when passed here - confirm x_UpdateHTMLResults tolerates that.
98  catch (const CException& e) {
99  x_UpdateHTMLResults(fn, errCont, e.GetMsg());
100  }
101  catch (const exception& e) {
102  x_UpdateHTMLResults(fn, errCont, e.what());
103  }
104  }
105 
106  return true;
107 }
108 
/// Replaces m_Scope with a freshly created CScope and calls AddDefaults() on it.
/// NOTE(review): the signature line and the declaration of `obj_mgr` (listing
/// line 111) are missing from this listing. The symbol index lists
/// `static CRef< CObjectManager > GetInstance(void)`, so `obj_mgr` presumably
/// comes from there - confirm. The loading loop in this file calls Init() first.
110 {
112  m_Scope.Reset(new CScope(*obj_mgr));
113  m_Scope->AddDefaults();
114 }
115 
/// Translates load parameters into a reader flag set and returns it.
///
/// NOTE(review): the signature line is absent from this listing; the symbol index
/// lists `static CFastaReader::TFlags s_GetFlags(const CFastaLoadParams &params)`,
/// presumably this body - confirm.
///
/// NOTE(review): the declaration of `flags` and EVERY statement that modifies it
/// are missing from this listing (lines 118, 122-123, 126-127, 131, 134, 137, 140,
/// 143, 146, 149-150). Only the conditions are visible: GetSeqType() == 1 / == 2,
/// GetForceLocalIDs(), GetMakeDelta(), GetIgnoreGaps(), GetReadFirst(),
/// GetSkipInvalid() and GetNoSplit(). Which flag each condition selects cannot be
/// determined from here. As printed, each `if` appears to govern the next `if`;
/// that is an artifact of the missing lines, not the real control flow.
117 {
119 
120  /// interpret the sequence type:
121  if (params.GetSeqType() == 1) {
124  }
125  else if (params.GetSeqType() == 2) {
128  }
129 
130  if (params.GetForceLocalIDs())
132 
133  if (params.GetMakeDelta())
135 
136  if (params.GetIgnoreGaps())
138 
139  if (params.GetReadFirst())
141 
142  if (params.GetSkipInvalid())
144 
145  if (params.GetNoSplit())
147 
148 
151 
152  return flags;
153 }
154 
155 
/// Reads FASTA sequences from `istr`, attaches lowercase-mask annotations to the
/// resulting sequences, and appends the results to `objects`.
///
/// NOTE(review): the signature line is absent from this listing; the symbol index
/// lists `void LoadFromStream(CNcbiIstream &istr, TObjects &objects,
/// CErrorContainer *errCont=0, ICanceled *canceled=0)`, which matches the names
/// used below - confirm. The declarations of `flags` (listing lines 158-159) and
/// `lcv` (line 167) are also missing; `lcv` is presumably a CFastaReader::TMasks.
///
/// Appended to `objects`:
///  - one SObject wrapping the whole returned entry, if at least one sequence
///    entry was collected;
///  - one SObject per element of `wellknown_ids`, each holding a fresh copy of
///    the id.
157 {
160 
161  CRef<ILineReader> line_reader(ILineReader::New(istr));
162  CSeqFastaReader rdr(*m_Scope, *line_reader, flags);
163 
 // A canceler is only installed when the caller supplied one.
164  if (canceled)
165  rdr.SetCanceler(canceled);
166 
 // Ask the reader to record per-sequence masks into lcv while it reads.
168  rdr.SaveMasks(&lcv);
169  vector<CConstRef<CSeq_id>> wellknown_ids;
170  CRef<CSeq_entry> entry = rdr.ReadSequences(&wellknown_ids, kMax_Int, errCont);
171 
172  // Add any lowercase masks as features
 // NOTE(review): `entry` is dereferenced here without a null check, whereas the
 // aligned-set loader in this file tests `if (!entry)` first - confirm that
 // ReadSequences never returns an empty reference.
173  vector<CRef<CSeq_entry> > entries;
174  if (entry->IsSeq()) {
175  entries.push_back(entry);
176  }
177  else {
 // NOTE(review): the loop header (listing line 178) is missing; the visible
 // remainder iterates entry->SetSet().SetSeq_set() and collects each member.
179  entry->SetSet().SetSeq_set()) {
180  entries.push_back(*iter);
181  }
182  }
 // Masks and entries are paired purely by position.
 // NOTE(review): this assumes lcv has at least entries.size() elements in the
 // same order. ReadSequences is described in the symbol index as removing the
 // well-known sequences from the entry - confirm the masks stay aligned in
 // that case. ent.GetSeq() below also assumes every collected entry is a
 // sequence rather than a nested set.
183  for (unsigned int i = 0; i < entries.size(); ++i) {
184  const CSeq_loc& loc = *lcv[i];
185  CSeq_entry& ent = *entries[i];
186  // Add unless loc is null (all uppercase) or everything (all lowercase)
187  if (!loc.IsNull() &&
188  sequence::GetLength(loc, 0) < ent.GetSeq().GetInst().GetLength()) {
189 
190  ///
191  /// first, expand the location into a set of sublocs
192  /// ReadFasta will return a single packed seq-int
193  /// and it is much more usable if this can be expressed as a
194  /// set of simple intervals
195  ///
196  vector< CRef<CSeq_loc> > expanded_locs;
197  if (loc.IsPacked_int()) {
198  /// fast path - no extra allocation
 // NOTE(review): the loop header (listing line 199) is missing. Each `*iter`
 // is a CRef<CSeq_interval> that is placed into a new CSeq_loc via SetInt.
200  CRef<CSeq_loc> loc_int(new CSeq_loc());
201  CRef<CSeq_interval> ref = *iter;
202  loc_int->SetInt(*ref);
203  expanded_locs.push_back(loc_int);
204  }
205  }
206  else {
207  /// generic path - create intervals
 // One new interval location per range yielded by CSeq_loc_CI, all assigned
 // the single id obtained from sequence::GetId(loc, NULL).
208  const CSeq_id& id = sequence::GetId(loc, NULL);
209  for (CSeq_loc_CI loc_iter(loc); loc_iter; ++loc_iter) {
210  CRef<CSeq_loc> loc_int(new CSeq_loc());
211  loc_int->SetInt().SetFrom(loc_iter.GetRange().GetFrom());
212  loc_int->SetInt().SetTo(loc_iter.GetRange().GetTo());
213  loc_int->SetId(id);
214  expanded_locs.push_back(loc_int);
215  }
216  }
217 
218 
 // GetLowercaseOption() selects the representation of the masked intervals:
 //   0     -> one feature per interval, data = region "lowercase in FASTA file"
 //   1     -> one feature per interval, data = imp with key "repeat_region"
 //   other -> the intervals are stored directly as the annotation's locs
219  CRef<CSeq_annot> annot(new CSeq_annot);
220  if (m_Params.GetLowercaseOption() == 0 ||
221  m_Params.GetLowercaseOption() == 1) {
222  /// the masked location comes back as a packed seq-int
223  /// we unpack this into a set of intervals and package
224  /// separate features for each
225  NON_CONST_ITERATE(vector< CRef<CSeq_loc> >, iter, expanded_locs) {
226  CRef<CSeq_feat> feat(new CSeq_feat);
227  feat->SetLocation(**iter);
228 
229  if (m_Params.GetLowercaseOption() == 0) {
230  feat->SetData().SetRegion("lowercase in FASTA file");
231  }
232  else {
233  feat->SetData().SetImp().SetKey("repeat_region");
234  feat->SetData().SetImp().SetDescr("lowercase in FASTA file");
235  }
236  annot->SetData().SetFtable().push_back(feat);
237  }
238  }
239  else {
240  NON_CONST_ITERATE(vector< CRef<CSeq_loc> >, iter, expanded_locs) {
241  annot->SetData().SetLocs().push_back(*iter);
242  }
243  }
244 
 // The annotation is attached to the sequence this mask was paired with.
245  ent.SetSeq().SetAnnot().push_back(annot);
246  }
247  }
248 
 // The whole entry (not the individual sequences) is published as one object.
249  if (!entries.empty()) {
250  string label;
 // NOTE(review): the statement governed by this `if` (listing line 252) is
 // missing; presumably it fills `label` using m_Scope - confirm.
251  if (m_Scope)
253 
254  objects.push_back(SObject(*entry, label));
255  }
256 
 // Each well-known id is copied into a new CSeq_id before being published.
257  for (auto &id : wellknown_ids) {
258  string label;
259  if (m_Scope) {
 // NOTE(review): the body of this `if` (listing line 260) is missing;
 // presumably it fills `label` - confirm.
261  }
262 
263  CRef<CSeq_id> id2(new CSeq_id());
264  id2->Assign(*id);
265  objects.push_back(SObject(*id2, label));
266  }
267 }
268 
/// Reads an aligned FASTA set from `istr` and appends one object per alignment
/// found in the result to `objects`.
///
/// NOTE(review): the signature line is absent from this listing; the symbol index
/// lists `void LoadAlignsFromStream(CNcbiIstream &istr, TObjects &objects,
/// CErrorContainer *errCont=0, ICanceled *canceled=0)`, which matches the names
/// used below - confirm. The declarations of `flags` (listing line 271) and
/// `lcv` (line 279) are also missing.
270 {
272 
273  CRef<ILineReader> line_reader(ILineReader::New(istr));
274  CSeqFastaReader rdr(*m_Scope, *line_reader, flags);
275 
 // A canceler is only installed when the caller supplied one.
276  if (canceled)
277  rdr.SetCanceler(canceled);
278 
 // Masks are recorded into lcv, but lcv is not read again in the visible lines.
280  rdr.SaveMasks(&lcv);
 // NOTE(review): the meaning of the first argument (-1) is not shown in this
 // listing - check the ReadAlignedSet documentation.
281  CRef<CSeq_entry> entry = rdr.ReadAlignedSet(-1, errCont);
282 
 // Nothing to publish when the reader produced no entry.
283  if (!entry)
284  return;
285 
 // NOTE(review): `align` is declared but never used in the visible lines.
286  CRef<CSeq_align> align;
 // Every CSeq_align reachable from the entry is wrapped in its own annotation
 // and published as a separate object.
287  for (CTypeIterator<CSeq_align> it(*entry); it; ++it) {
288  CRef<CSeq_annot> annot(new CSeq_annot());
289  annot->SetData().SetAlign().push_back(CRef<CSeq_align>(&*it));
290 
291  string label;
 // NOTE(review): the statement governed by this `if` (listing line 293) is
 // missing; presumably it fills `label` using m_Scope - confirm.
292  if (m_Scope)
294 
295  objects.push_back(SObject(*annot, label));
296  }
297 }
298 
/// Calls x_ShowErrorsDlg with the title "FASTA import errors" and then returns
/// true unconditionally.
/// NOTE(review): the signature line is absent from this listing and nothing visible
/// names this member function - check the original source.
300 {
301  x_ShowErrorsDlg(wxT("FASTA import errors"));
302  return true;
303 }
304 
CErrorContainer.
void LoadFromStream(CNcbiIstream &istr, TObjects &objects, CErrorContainer *errCont=0, ICanceled *canceled=0)
virtual bool Execute(ICanceled &canceled)
CFastaLoadParams m_Params
vector< wxString > m_FileNames
virtual string GetDescription() const
virtual TObjects & GetObjects()
void LoadAlignsFromStream(CNcbiIstream &istr, TObjects &objects, CErrorContainer *errCont=0, ICanceled *canceled=0)
CFastaObjectLoader(const CFastaLoadParams &params)
CRef< objects::CScope > m_Scope
CRef –.
Definition: ncbiobj.hpp:618
void x_ShowErrorsDlg(const wxString &title)
void x_UpdateHTMLResults(const wxString &object, objects::ILineErrorListener *errCont, const string &exception="", const string &error_msg="", const wxString &objectName=wxT("File:"))
CScope –.
Definition: scope.hpp:92
FASTA sequences reader class that does post-processing: * Overrides the local ids; * Removes the well...
CRef< objects::CSeq_entry > ReadSequences(vector< CConstRef< objects::CSeq_id >> *wellknown_ids=nullptr, int max_seqs=kMax_Int, objects::ILineErrorListener *pMessageListener=0)
Read multiple sequences and remove the well-known sequences from the entry.
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
Interface for testing cancellation request in a long lasting operation.
Definition: icanceled.hpp:51
vector< SObject > TObjects
static uch flags
Operators to edit gaps in sequences.
USING_SCOPE(objects)
static CFastaReader::TFlags s_GetFlags(const CFastaLoadParams &params)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
bool GetReadFirst() const
bool GetForceLocalIDs() const
bool GetMakeDelta() const
int GetLowercaseOption() const
bool GetSkipInvalid() const
bool GetNoSplit() const
bool GetIgnoreGaps() const
static void GetLabel(const CObject &obj, string *label, ELabelType type=eDefault)
Definition: label.cpp:140
@ eDefault
Definition: label.hpp:73
vector< TMask > TMasks
Definition: fasta.hpp:160
static CRef< ILineReader > New(const string &filename)
Return a new ILineReader object corresponding to the given filename, taking "-" (but not "....
Definition: line_reader.cpp:49
long TFlags
binary OR of EFlags
Definition: fasta.hpp:117
@ fDLOptional
Don't require a leading defline.
Definition: fasta.hpp:96
@ fHyphensIgnoreAndWarn
When a hyphen is encountered in seq data, ignore it but warn.
Definition: fasta.hpp:112
@ fRequireID
Reject deflines that lack IDs.
Definition: fasta.hpp:95
@ fAddMods
Parse defline mods and add to SeqEntry.
Definition: fasta.hpp:104
@ fForceType
Force specified type regardless of accession.
Definition: fasta.hpp:89
@ fParseRawID
Try to identify raw accessions.
Definition: fasta.hpp:97
@ fNoSplit
Don't split out ambiguous sequence regions.
Definition: fasta.hpp:99
@ fAssumeNuc
Assume nucs unless accns indicate otherwise.
Definition: fasta.hpp:87
@ fParseGaps
Make a delta sequence if gaps found.
Definition: fasta.hpp:91
@ fOneSeq
Just read the first sequence found.
Definition: fasta.hpp:92
@ fAssumeProt
Assume prots unless accns indicate otherwise.
Definition: fasta.hpp:88
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
Definition: Seq_loc.cpp:3474
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define kMax_Int
Definition: ncbi_limits.h:184
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
virtual bool IsCanceled(void) const =0
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static const char label[]
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
list< CRef< CSeq_interval > > Tdata
const Tdata & Get(void) const
Get the member data.
bool IsPacked_int(void) const
Check if variant Packed_int is selected.
Definition: Seq_loc_.hpp:534
bool IsNull(void) const
Check if variant Null is selected.
Definition: Seq_loc_.hpp:504
const TPacked_int & GetPacked_int(void) const
Get the variant data.
Definition: Seq_loc_.cpp:216
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
list< CRef< CSeq_entry > > TSeq_set
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
FILE * file
int i
#define wxT(x)
Definition: muParser.cpp:41
The Object manager core.
static int filenames
Definition: pcre2grep.c:247
static wxAcceleratorEntry entries[3]
Modified on Fri Sep 20 14:57:45 2024 by modify_doxy.py rev. 669887