NCBI C++ ToolKit
seq_writer.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seq_writer.cpp 91306 2020-10-08 11:57:15Z gouriano $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file seq_writer.cpp
31  * Implementation of the CSeqFormatter class
32  */
33 
34 #include <ncbi_pch.hpp>
38 #include <objects/seq/Seqdesc.hpp>
40 #include <numeric> // for std::accumulate
42 #include <objmgr/scope.hpp>
43 
46 
47 CSeqFormatter::CSeqFormatter(const string& format_spec, CSeqDB& blastdb,
49  CSeqFormatterConfig config /* = CSeqFormatterConfig() */)
50  : m_Out(out), m_FmtSpec(format_spec), m_BlastDb(blastdb),
51  m_DataExtractor(blastdb,
52  config.m_SeqRange,
53  config.m_Strand,
54  config.m_FiltAlgoId,
55  config.m_FmtAlgoId,
56  config.m_LineWidth,
57  config.m_TargetOnly,
58  config.m_UseCtrlA)
59 {
60  // Validate the algo id
61  if (config.m_FiltAlgoId >= 0 || config.m_FmtAlgoId >= 0) {
62  vector<int> algo_ids;
63  if (config.m_FiltAlgoId >= 0)
64  algo_ids.push_back(config.m_FiltAlgoId);
65  if (config.m_FmtAlgoId >= 0)
66  algo_ids.push_back(config.m_FmtAlgoId);
67  vector<int> invalid_algo_ids =
69  if ( !invalid_algo_ids.empty()) {
70  NCBI_THROW(CInvalidDataException, eInvalidInput,
71  "Invalid filtering algorithm ID.");
72  }
73  }
74 
75  // Record where the offsets where the replacements must occur
76  for (SIZE_TYPE i = 0; i < m_FmtSpec.size(); i++) {
77  if (m_FmtSpec[i] == '%' && m_FmtSpec[i+1] == '%') {
78  // remove the escape character for '%'
79  m_FmtSpec.erase(i++, 1);
80  continue;
81  }
82 
83  if (m_FmtSpec[i] == '%') {
84  m_ReplOffsets.push_back(i);
85  m_ReplTypes.push_back(m_FmtSpec[i+1]);
86  }
87  }
88 
89  if (m_ReplOffsets.empty() || m_ReplTypes.size() != m_ReplOffsets.size()) {
90  NCBI_THROW(CInvalidDataException, eInvalidInput,
91  "Invalid format specification");
92  }
93 
94  m_Fasta = (m_ReplTypes[0] == 'f');
95 }
96 
98 {
99  bool retval=false;
100 
101  ITERATE(vector<char>, fmt, m_ReplTypes) {
102  switch (*fmt) {
103  case 's':
104  case 'h':
105  case 'm':
106  case 'e':
107  case 'd':
108  case 'b':
109  retval = true;
110  break;
111  }
112  }
113  return retval;
114 }
115 
116 void CSeqFormatter::x_Builder(vector<string>& data2write)
117 {
118  data2write.reserve(m_ReplTypes.size());
119 
120  ITERATE(vector<char>, fmt, m_ReplTypes) {
121  switch (*fmt) {
122 
123  case 's':
124  data2write.push_back(m_DataExtractor.ExtractSeqData());
125  break;
126 
127  case 'a':
128  data2write.push_back(m_DataExtractor.ExtractAccession());
129  break;
130 
131  case 'i':
132  data2write.push_back(m_DataExtractor.ExtractSeqId());
133  break;
134 
135  case 'g':
136  data2write.push_back(m_DataExtractor.ExtractGi());
137  break;
138 
139  case 'o':
140  data2write.push_back(m_DataExtractor.ExtractOid());
141  break;
142 
143  case 't':
144  data2write.push_back(m_DataExtractor.ExtractTitle());
145  break;
146 
147  case 'h':
148  data2write.push_back(m_DataExtractor.ExtractHash());
149  break;
150 
151  case 'l':
152  data2write.push_back(m_DataExtractor.ExtractSeqLen());
153  break;
154 
155  case 'T':
156  data2write.push_back(m_DataExtractor.ExtractTaxId());
157  break;
158 
159  case 'X':
160  data2write.push_back(m_DataExtractor.ExtractLeafTaxIds());
161  break;
162 
163  case 'P':
164  data2write.push_back(m_DataExtractor.ExtractPig());
165  break;
166 
167  case 'L':
168  data2write.push_back(m_DataExtractor.ExtractCommonTaxonomicName());
169  break;
170 
171  case 'C':
172  data2write.push_back(m_DataExtractor.ExtractLeafCommonTaxonomicNames());
173  break;
174 
175  case 'B':
176  data2write.push_back(m_DataExtractor.ExtractBlastName());
177  break;
178 
179  case 'K':
180  data2write.push_back(m_DataExtractor.ExtractSuperKingdom());
181  break;
182 
183  case 'S':
184  data2write.push_back(m_DataExtractor.ExtractScientificName());
185  break;
186 
187  case 'N':
188  data2write.push_back(m_DataExtractor.ExtractLeafScientificNames());
189  break;
190 
191  case 'm':
192  data2write.push_back(m_DataExtractor.ExtractMaskingData());
193  break;
194 
195  case 'e':
196  data2write.push_back(m_DataExtractor.ExtractMembershipInteger());
197  break;
198 
199  case 'n':
200  data2write.push_back(m_DataExtractor.ExtractLinksInteger());
201  break;
202 
203  case 'd':
204  data2write.push_back(m_DataExtractor.ExtractAsn1Defline());
205  break;
206 
207  case 'b':
208  data2write.push_back(m_DataExtractor.ExtractAsn1Bioseq());
209  break;
210 
211  default:
212  CNcbiOstrstream os;
213  os << "Unrecognized format specification: '%" << *fmt << "'";
214  NCBI_THROW(CInvalidDataException, eInvalidInput,
216  }
217  }
218 }
219 
221 {
222  if (m_Fasta) {
224  return;
225  }
226 
227  bool get_data = x_RequireData();
228  m_DataExtractor.SetSeqId(id, get_data);
229  vector<string> data2write;
230  x_Builder(data2write);
231  m_Out << x_Replacer(data2write) << endl;
232 }
233 
234 static string s_GetTitle(CConstRef<CBioseq> bioseq)
235 {
236  ITERATE(CSeq_descr::Tdata, desc, bioseq->GetDescr().Get()) {
237  if ((*desc)->Which() == CSeqdesc::e_Title) {
238  return (*desc)->GetTitle();
239  }
240  }
241  return string();
242 }
243 
245 {
246  static const string kTarget(" >gi|");
247  static const string kCtrlA = string(1, '\001') + string("gi|");
248  NON_CONST_ITERATE(CSeq_descr::Tdata, desc, bioseq->SetDescr().Set()) {
249  if ((*desc)->Which() == CSeqdesc::e_Title) {
250  NStr::ReplaceInPlace((*desc)->SetTitle(), kTarget, kCtrlA);
251  break;
252  }
253  }
254 }
255 
256 string GetBareId(const CSeq_id& id)
257 {
258  string retval;
259 
260  if (id.IsGi() || id.IsPrf() || id.IsPir()) {
261  retval = id.AsFastaString();
262  }
263  else {
264  retval = id.GetSeqIdString(true);
265  }
266 
267  return retval;
268 }
269 
271 {
272  CFastaOstream fasta(m_Out);
273  fasta.SetWidth(config.m_LineWidth);
275 
276  bool long_seqids = false;
278  if (app) {
279  const CNcbiRegistry& registry = app->GetConfig();
280  long_seqids = (registry.Get("BLAST", "LONG_SEQID") == "1");
281  }
282 
283  CRef<CBioseq> bioseq;
284  for (int i=0; blastdb.CheckOrFindOID(i); i++) {
285  bioseq.Reset(blastdb.GetBioseq(i));
286  if (bioseq.Empty()) {
287  continue;
288  }
289  // TODO: remove gnl|BL_ORD_ID
290  CRef<CSeq_id> id(*(bioseq->GetId().begin()));
291  if (id->IsGeneral() &&
292  id->GetGeneral().GetDb() == "BL_ORD_ID") {
293  m_Out << ">" << s_GetTitle(bioseq) << '\n';
295  fasta.WriteSequence(scope.AddBioseq(*bioseq));
296  }
297  else if (id->IsLocal()) {
298  string lcl_tmp = id->AsFastaString();
299  lcl_tmp = lcl_tmp.erase(0,4);
300  m_Out << ">" << lcl_tmp << " " << s_GetTitle(bioseq) << '\n';
302  fasta.WriteSequence(scope.AddBioseq(*bioseq));
303  }
304  else if (long_seqids) {
305 
306  if (config.m_UseCtrlA) {
307  s_ReplaceCtrlAsInTitle(bioseq);
308  }
309  fasta.Write(*bioseq, 0, true);
310  }
311  else {
312 
313  string separator = config.m_UseCtrlA ? "\001" : " >";
314 
315  m_Out << '>';
316  id = FindBestChoice(bioseq->GetId(), CSeq_id::Score);
317  m_Out << GetBareId(*id);
318 
319  string title = s_GetTitle(bioseq);
320 
321  if (!title.empty()) {
322  m_Out << ' ';
323 
324  NStr::ReplaceInPlace(title, " >", "\001");
325 
326  vector<string> tokens;
327  NStr::Split(title, "\001", tokens);
328  auto it = tokens.begin();
329  m_Out << *it;
330  ++it;
331  for (; it != tokens.end(); ++it) {
332  size_t pos = it->find (" ");
333  string str_id(*it, 0, pos != NPOS ? pos : it->length());
334  list< CRef<CSeq_id> > seqids;
335  CSeq_id::ParseFastaIds(seqids, str_id);
336 
337  // no valid sequence ids indicates that '>' was within the
338  // defline text
339  if (seqids.empty()) {
340  m_Out << " >" << *it;
341  continue;
342  }
343  m_Out << separator;
344  id = FindBestChoice(seqids, CSeq_id::Score);
345  m_Out << GetBareId(*id);
346  if (pos != NPOS) {
347  m_Out << it->substr(pos, it->length() - pos);
348  }
349  }
350  }
351  m_Out << endl;
352 
353  CScope scope(*CObjectManager::GetInstance());
354  fasta.WriteSequence(scope.AddBioseq(*bioseq));
355  }
356  }
357 }
358 
359 /// Auxiliary functor to compute the length of a string
360 struct StrLenAdd
361 {
362  SIZE_TYPE operator() (SIZE_TYPE a, const string& b) const {
363  return a + b.size();
364  }
365 };
366 
367 string
368 CSeqFormatter::x_Replacer(const vector<string>& data2write) const
369 {
370  SIZE_TYPE data2write_size = accumulate(data2write.begin(), data2write.end(),
371  0, StrLenAdd());
372 
373  string retval;
374  retval.reserve(m_FmtSpec.size() + data2write_size -
375  (m_ReplTypes.size() * 2));
376 
377  SIZE_TYPE fmt_idx = 0;
378  for (SIZE_TYPE i = 0, kSize = m_ReplOffsets.size(); i < kSize; i++) {
379  retval.append(&m_FmtSpec[fmt_idx], &m_FmtSpec[m_ReplOffsets[i]]);
380  retval.append(data2write[i]);
381  fmt_idx = m_ReplOffsets[i] + 2;
382  }
383  if (fmt_idx <= m_FmtSpec.size()) {
384  retval.append(&m_FmtSpec[fmt_idx], &m_FmtSpec[m_FmtSpec.size()]);
385  }
386 
387  return retval;
388 }
389 
390 void CSeqFormatter::SetConfig(TSeqRange range, objects::ENa_strand strand,
391  int filt_algo_id)
392 {
393  m_DataExtractor.SetConfig(range, strand, filt_algo_id);
394 }
395 
396 
397 END_NCBI_SCOPE
Declares classes which extract data from a BLAST database.
Definition of a customizable sequence writer interface.
string ExtractLeafCommonTaxonomicNames()
string ExtractFasta(const CBlastDBSeqId &seq_id)
void SetSeqId(const CBlastDBSeqId &seq_id, bool get_data=false)
Setting seqid.
Encapsulates identifier to retrieve data from a BLAST database.
FASTA-format output; see also ReadFasta in <objtools/readers/fasta.hpp>
Definition: sequence.hpp:770
Defines invalid user input exceptions.
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:244
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
CNcbiRegistry –.
Definition: ncbireg.hpp:913
CScope –.
Definition: scope.hpp:92
CSeqDB.
Definition: seqdb.hpp:161
vector< int > ValidateMaskAlgorithms(const vector< int > &algorithm_ids)
Validates the algorithm IDs passed to this function, returning a vector of those algorithm IDs not pr...
Definition: seqdb.cpp:1242
CRef< CBioseq > GetBioseq(int oid, TGi target_gi=ZERO_GI, const CSeq_id *target_seq_id=NULL) const
Get a CBioseq for a sequence.
Definition: seqdb.cpp:504
bool CheckOrFindOID(int &next_oid) const
Find an included OID, incrementing next_oid if necessary.
Definition: seqdb.cpp:728
void Write(CBlastDBSeqId &id)
Write the sequence data associated with the requested ID in the format specified in the constructor.
Definition: seq_writer.cpp:220
bool x_RequireData() const
Specifies whether or not data (e.g., Bioseq) is required.
Definition: seq_writer.cpp:97
CSeqFormatter(const string &fmt_spec, CSeqDB &blastdb, CNcbiOstream &out, CSeqFormatterConfig config=CSeqFormatterConfig())
Constructor.
Definition: seq_writer.cpp:47
vector< char > m_ReplTypes
Vector of convertor objects.
Definition: seq_writer.hpp:112
string x_Replacer(const vector< string > &data2write) const
Replace format specifiers for the data contained in data2write.
Definition: seq_writer.cpp:368
CNcbiOstream & m_Out
Stream to write output.
Definition: seq_writer.hpp:102
CBlastDBExtractor m_DataExtractor
Data extractor.
Definition: seq_writer.hpp:110
void DumpAll(CSeqDB &blastdb, CSeqFormatterConfig config=CSeqFormatterConfig())
Full database FASTA dump. This is an optimized version that does not support range and mask retrieval.
Definition: seq_writer.cpp:270
void x_Builder(vector< string > &data2write)
Build data for write.
Definition: seq_writer.cpp:116
string m_FmtSpec
The output format specification.
Definition: seq_writer.hpp:104
vector< SIZE_TYPE > m_ReplOffsets
Vector of offsets where the replacements will take place.
Definition: seq_writer.hpp:108
CSeqDB & m_BlastDb
The BLAST database from which to extract data.
Definition: seq_writer.hpp:106
bool m_Fasta
Fasta output?
Definition: seq_writer.hpp:114
static CMemoryRegistry registry
Definition: cn3d_tools.cpp:81
std::ofstream out("events_result.xml")
main entry point for tests
const CNcbiRegistry & GetConfig(void) const
Get the application's cached configuration parameters (read-only).
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
string
Definition: cgiapp.hpp:687
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual void Write(const CSeq_entry_Handle &handle, const CSeq_loc *location=0)
Unspecified locations designate complete sequences; non-empty custom titles override the usual title ...
Definition: sequence.cpp:2727
void SetWidth(TSeqPos width)
Definition: sequence.cpp:3456
virtual void WriteSequence(const CBioseq_Handle &handle, const CSeq_loc *location=0, CSeq_loc::EOpFlags merge_flags=CSeq_loc::fMerge_AbuttingOnly)
Definition: sequence.cpp:3322
void SetAllFlags(TFlags flags)
Definition: sequence.hpp:858
@ fNoExpensiveOps
don't try too hard to find titles
Definition: sequence.hpp:780
@ fKeepGTSigns
don't convert '>' to '_' in title
Definition: sequence.hpp:777
@ fEnableGI
Use this flag to enable GI output in the defline.
Definition: sequence.hpp:786
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
virtual const string & Get(const string &section, const string &name, TFlags flags=0) const
Get the parameter value.
Definition: ncbireg.cpp:262
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3401
const TDb & GetDb(void) const
Get the Db member data.
Definition: Dbtag_.hpp:220
bool IsGeneral(void) const
Check if variant General is selected.
Definition: Seq_id_.hpp:877
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Seq_id_.hpp:775
const TGeneral & GetGeneral(void) const
Get the variant data.
Definition: Seq_id_.cpp:369
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
@ e_Title
a title for this sequence
Definition: Seqdesc_.hpp:115
int i
The Object manager core.
USING_SCOPE(objects)
static void s_ReplaceCtrlAsInTitle(CRef< CBioseq > bioseq)
Definition: seq_writer.cpp:244
string GetBareId(const CSeq_id &id)
Definition: seq_writer.cpp:256
static string s_GetTitle(CConstRef< CBioseq > bioseq)
Definition: seq_writer.cpp:234
Configuration object for CSeqFormatter.
Definition: seq_writer.hpp:43
Modified on Tue Dec 05 02:18:06 2023 by modify_doxy.py rev. 669887