NCBI C++ ToolKit
seq_fasta_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seq_fasta_reader.cpp 47479 2023-05-02 13:24:02Z ucko $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Victor Joukov, Vladislav Evgeniev
27 *
28 * File Description:
29 *
30 */
31 
32 #include <ncbi_pch.hpp>
33 
35 
36 #include <corelib/ncbiutil.hpp>
37 #include <objmgr/scope.hpp>
39 
42 
// NOTE(review): this is a function body whose signature line is absent from this
// extracted listing (listing line 43 is missing); presumably it belongs to
// CSeqFastaReader::GenerateID() - confirm against the original file.
//
// Visible behavior: re-parses the current defline, discards unset/local ids and,
// if any id remains, registers a synthetic "lcl|..." id via SetIDs() and records
// in m_LocalIds which id it stands for.
44 {
// Text of the line the line reader is currently positioned on.
45  CTempString defline = m_LineReader->GetCurrentLine();
46 
47  TSeqPos range_start = 0, range_end = 0;
48  bool has_range = false;
// Parse settings are taken from this reader's own state; fParseRawID is
// additionally OR-ed into the FASTA flags.
49  SDefLineParseInfo parseInfo;
50  parseInfo.fBaseFlags = m_iFlags;
51  parseInfo.fFastaFlags = static_cast<CFastaDeflineReader::TFastaFlags>(GetFlags()) | fParseRawID;
52  parseInfo.maxIdLength = m_MaxIDLength;
53  parseInfo.lineNumber = LineNumber();
54 
55  list<CRef<CSeq_id> > ids;
// Any std::exception thrown by ParseDefLine is swallowed; processing continues
// with whatever `ids`, `has_range` and the range bounds contain at that point.
56  try {
57  ParseDefLine(defline,
58  parseInfo,
59  m_ignorable,
60  ids,
61  has_range,
62  range_start,
63  range_end,
64  m_CurrentSeqTitles,
65  0);
66  }
67  catch (const exception&) {}
68 
// Remove ids whose variant is unset or local. erase() yields the iterator to the
// next element, so the iterator is only incremented explicitly in the default case.
69  for (auto it = ids.begin(); it != ids.end();) {
70  const CSeq_id& id = **it;
71  switch (id.Which()) {
72  case CSeq_id::e_not_set:
73  case CSeq_id::e_Local:
74  it = ids.erase(it);
75  break;
76  default:
77  ++it;
78  break;
79  }
80  }
81 
82  if (!ids.empty()) {
// NOTE(review): `bestId` is not declared in the visible lines (listing line 83 is
// missing) - presumably it is the best-ranked entry of `ids`; confirm.
// The generated id text has the form  lcl|[rng_]<bestId string>[-<start+1>-<end+1>],
// where the "rng_" prefix and the range suffix appear only when has_range is set.
84  string seq_id_text = "lcl|";
85  if (has_range)
86  seq_id_text += "rng_";
87  seq_id_text += bestId->GetSeqIdString(true);
88  if (has_range)
89  seq_id_text += "-" + NStr::NumericToString(range_start + 1) + "-" + NStr::NumericToString(range_end + 1);
90  CRef<CSeq_id> seq_id(new CSeq_id(seq_id_text));
91  SetIDs().push_back(seq_id);
// Remember which id the newly generated id was derived from.
92  m_LocalIds[seq_id] = bestId;
93  }
94  else {
// NOTE(review): the body of this branch (listing line 95) is missing from the
// extracted listing; as shown, nothing happens when no usable id was parsed.
96  }
97 }
98 
99 void CSeqFastaReader::PostProcessIDs(const CBioseq::TId& defline_ids, const string& defline, bool has_range, TSeqPos range_start, TSeqPos range_end)
100 {
101  CFastaReader::PostProcessIDs(defline_ids, defline, has_range, range_start, range_end);
102 
103  if (defline_ids.empty())
104  return;
105 
106  CRef<CSeq_id> bestId = FindBestChoice(defline_ids, CSeq_id::BestRank);
107  if (!bestId->IsLocal())
108  return;
109 
110  CRef<CSeq_id> non_local_id = fasta_utils::IdentifyLocalId(bestId->GetSeqIdString(true));
111  if (non_local_id.Empty())
112  return;
113 
114  m_LocalIds[bestId] = non_local_id;
115 }
116 
// Reads up to max_seqs sequences through CFastaReader::ReadSet() and returns the
// resulting Seq-entry.
//
// wellknown_ids     not referenced in the visible lines - see the notes below
// max_seqs          forwarded to CFastaReader::ReadSet()
// pMessageListener  forwarded to CFastaReader::ReadSet()
//
// NOTE(review): listing lines 121 and 126 are missing from this extracted listing.
// Presumably they hold the calls that replace well-known sequences (consuming
// wellknown_ids) and update organism information - confirm against the original file.
117 CRef<CSeq_entry> CSeqFastaReader::ReadSequences(vector<CConstRef<CSeq_id>> *wellknown_ids, int max_seqs, ILineErrorListener * pMessageListener)
118 {
119  CRef<CSeq_entry> entries = CFastaReader::ReadSet(max_seqs, pMessageListener);
120 
// NOTE(review): a statement is missing here (listing line 121).
122 
// With no recorded local-id mappings the entry is returned right away.
123  if (m_LocalIds.empty())
124  return entries;
125 
// NOTE(review): a statement is missing here (listing line 126).
127 
128  return entries;
129 }
130 
131 
132 
CRef< objects::CSeq_entry > ReadSequences(vector< CConstRef< objects::CSeq_id >> *wellknown_ids=nullptr, int max_seqs=kMax_Int, objects::ILineErrorListener *pMessageListener=0)
Read multiple sequences and remove the well-known sequences from the entry.
fasta_utils::TSeq_idMap m_LocalIds
objects::CScope & m_Scope
virtual void GenerateID() override
virtual void PostProcessIDs(const objects::CBioseq::TId &defline_ids, const string &defline, bool has_range=false, TSeqPos range_start=kInvalidSeqPos, TSeqPos range_end=kInvalidSeqPos) override
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is known and controlled.
Definition: tempstr.hpp:65
CRef< objects::CSeq_id > IdentifyLocalId(const string &fasta_id)
Tries to identify the specified local id (works for ids prefixed with rng_ or mod_, ...)
CRef< objects::CSeq_entry > ReplaceWellknownSeqs(objects::CSeq_entry &entry, objects::CScope &scope, vector< CConstRef< objects::CSeq_id >> *wellknown_ids=nullptr, TSeq_idMap *local_ids=nullptr)
Removes the well-known sequences from the set and returns their ids (optional)
void UpdateOrgInformation(objects::CSeq_entry &entry, objects::CScope &scope, const TSeq_idMap &local_ids)
Updates the organism information for the Seq-entry object by copying the information from a well-known ...
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
CRef< CSeq_entry > ReadSet(int max_seqs=kMax_Int, ILineErrorListener *pMessageListener=nullptr)
Read multiple sequences (by default, as many as are available.)
Definition: fasta.cpp:442
virtual void PostProcessIDs(const CBioseq::TId &defline_ids, const string &defline, bool has_range=false, TSeqPos range_start=kInvalidSeqPos, TSeqPos range_end=kInvalidSeqPos)
Definition: fasta.cpp:619
virtual void GenerateID(void)
Definition: fasta.cpp:703
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
static int BestRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:774
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:250
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Seq_id_.hpp:775
@ e_not_set
No variant selected.
Definition: Seq_id_.hpp:94
@ e_Local
local use
Definition: Seq_id_.hpp:95
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
Useful/utility classes and methods.
USING_SCOPE(objects)
static wxAcceleratorEntry entries[3]
Modified on Thu Apr 25 08:19:13 2024 by modify_doxy.py rev. 669887