NCBI C++ ToolKit
Blast_def_line_set.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: Blast_def_line_set.cpp 87080 2019-07-23 19:26:44Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: .......
27  *
28  * File Description:
29  * .......
30  *
31  * Remark:
32  * This code was originally generated by application DATATOOL
33  * using the following specifications:
34  * 'blastdb.asn'.
35  */
36 
37 // standard includes
38 #include <ncbi_pch.hpp>
39 
40 #include <corelib/ncbiutil.hpp>
44 
45 // generated includes
47 
48 // generated classes
49 
51 
52 BEGIN_objects_SCOPE // namespace ncbi::objects::
53 
54 // destructor
56 {
57 }
58 
59 /// Compare two deflines for ordering purposes.
60 ///
61 /// Given a CSeq_id, the ranking function produces an integer based on
62 /// the type of id involved. Each defline is searched for the Seq-id
63 /// for which the ranking function produces the lowest number. Then
64 /// the best ranks of each defline are compared. The defline with the
65 /// lower rank is considered to come first in the ordering.
66 ///
67 /// @param d1 A defline.
68 /// @param d2 Another one.
69 /// @param rank_func A ranking function.
70 /// @return true if d1 comes before d2 in the defined ordering.
71 static
73  const CRef<CBlast_def_line>& d2,
74  int (*rank_func)(const CRef<CSeq_id>&))
75 {
76  if (d1.Empty() || d2.Empty()) {
77  return false;
78  }
79 
80  bool rv = false;
81 
82  if (d1->CanGetSeqid() && d2->CanGetSeqid()) {
83 
84  // First, use Seq-id by-type ranking.
85 
86  CRef<CSeq_id> c1 = FindBestChoice(d1->GetSeqid(), rank_func);
87  CRef<CSeq_id> c2 = FindBestChoice(d2->GetSeqid(), rank_func);
88 
89  int diff = rank_func(c1) - rank_func(c2);
90 
91  if (diff < 0) {
92  return true;
93  }
94 
95  if (diff > 0) {
96  return false;
97  }
98 
99  // Make sure the "N*" RefSeq accessions precede the "X*" accessions.
100  if (c1->IsOther() && c2->IsOther()) {
101  const string& acc1 = c1->GetOther().GetAccession();
102  const string& acc2 = c2->GetOther().GetAccession();
103  if (acc1.find("WP") == 0 && acc2.find("NP_") == 0)
104  return true;
105  else if (acc1.find("WP_") == 0 && acc2.find("YP_") == 0)
106  return true;
107  else if (acc1.find("YP_") == 0 && acc2.find("WP_") == 0)
108  return false;
109  else if (acc1.find("NP_") == 0 && acc2.find("WP_") == 0)
110  return false;
111  else if ((acc1.find("WP_") == 0 || acc1.find("NP_") == 0 || acc1.find("YP_") == 0 || acc1.find("AP_") == 0)
112  && acc2.find("XP_") == 0)
113  return true;
114  else if ((acc1.find("WP_") == 0 || acc1.find("NP_") == 0 || acc1.find("YP_") == 0)
115  && acc2.find("AP_") == 0)
116  return true;
117  else if (acc1.find("AP_") == 0 &&
118  (acc2.find("NP_") == 0 || acc2.find("YP_") == 0 || acc2.find("WP_") == 0))
119  return false;
120  else if (acc1.find("YP_") == 0 && acc2.find("AP_") == 0)
121  return true;
122  else if (acc1.find("XP_") == 0 &&
123  (acc2.find("NP_") == 0 || acc2.find("AP_") == 0 || acc2.find("YP_") == 0 || acc2.find("WP") == 0))
124  return false;
125  else if (acc1.find("NM_") == 0 && acc2.find("XM_") ==0)
126  return true;
127  else if (acc1.find("XM_") == 0 && acc2.find("NM_") ==0)
128  return false;
129 
130  }
131 
132  // Second, use GI numerical ranking (least value first). I
133  // avoid the possibility of circular rankings by ranking GI
134  // above non-GI here, although this should not happen unless
135  // GIs are given a rank value equal to that of another type.
136  // Finally, if no ranking is possible here, a string
137  // comparison is done.
138 
139  const CSeq_id & cs1 = *d1->GetSeqid().front();
140  const CSeq_id & cs2 = *d2->GetSeqid().front();
141 
142  if (cs1.IsGi()) {
143  if (cs2.IsGi()) {
144  rv = cs1.GetGi() < cs2.GetGi();
145  } else {
146  rv = true;
147  }
148  } else {
149  if (cs2.IsGi()) {
150  rv = false;
151  } else {
152  // Neither is a GI - this will typically only happen
153  // for databases that have no GI.
154 
155  rv = cs1.AsFastaString() < cs2.AsFastaString();
156  }
157  }
158  }
159 
160  return rv;
161 }
162 
163 /// Ranking function for protein Blast-def-lines
164 inline
166  const CRef<CBlast_def_line>& d2)
167 {
168  return s_DeflineCompare(d1, d2, CSeq_id::FastaAARank);
169 }
170 
171 inline
173  const CRef<CBlast_def_line>& d2)
174 {
175  return s_DeflineCompare(d1, d2, CSeq_id::BlastRank);
176 }
177 
178 
179 /// Ranking function for nucleotide Blast-def-lines
180 inline
182  const CRef<CBlast_def_line>& d2)
183 {
184  return s_DeflineCompare(d1, d2, CSeq_id::FastaNARank);
185 }
186 
187 void
188 CBlast_def_line_set::SortBySeqIdRank(bool is_protein, bool useBlastRank)
189 {
190  if (CanGet()) {
191  if(useBlastRank && is_protein){
192  Set().sort(s_DeflineBlastRankAA);
193  }
194  else {
195  Set().sort(is_protein ? s_DeflineCompareAA : s_DeflineCompareNA);
196  }
197  }
198 }
199 
201 {
202  for (auto& defline: Set()) {
203  defline->SetSeqid().remove_if([] (const CRef<CSeq_id>& id) {
204  if (id && id->IsGi()) return true;
205  return false;
206  });
207  }
208 }
209 
210 
211 void
213 {
214  if (gi <= ZERO_GI) {
215  return;
216  }
217 
218  CRef<CBlast_def_line> first_defline;
219 
220  NON_CONST_ITERATE(Tdata, defline, Set()) {
221  ITERATE(CBlast_def_line::TSeqid, id, (*defline)->GetSeqid()) {
222  if ((*id)->IsGi() && (*id)->GetGi() == gi) {
223  first_defline = *defline;
224  break;
225  }
226  }
227  if (first_defline) {
228  Set().erase(defline); // remove the item from the list...
229  break;
230  }
231  }
232 
233  if (first_defline) {
234  // ... and put it in the front
235  Set().push_front(first_defline);
236  }
237 }
238 
239 void GetLinkoutTypes(vector<TLinkoutTypeString>& rv)
240 {
241  rv.clear();
242  // N.B.: only add those linkout types that are actively supported
243  rv.push_back(make_pair(eFromType, string("eFromType")));
244  rv.push_back(make_pair(eUnigene, string("eUnigene")));
245  rv.push_back(make_pair(eStructure, string("eStructure")));
246  rv.push_back(make_pair(eGeo, string("eGeo")));
247  rv.push_back(make_pair(eGene, string("eGene")));
248  rv.push_back(make_pair(eFromVerifiedMaterial, string("eFromVerifiedMaterial")));
249  rv.push_back(make_pair(eMapviewer, string("eMapviewer")));
250  rv.push_back(make_pair(eGenomicSeq, string("eGenomicSeq")));
251  rv.push_back(make_pair(eBioAssay, string("eBioAssay")));
252  rv.push_back(make_pair(eReprMicrobialGenomes, string("eReprMicrobialGenomes")));
253  rv.push_back(make_pair(eGenomeDataViewer, string("eGenomeDataViewer")));
254  rv.push_back(make_pair(eTranscript, string("eTranscript")));
255 }
256 
257 END_objects_SCOPE // namespace ncbi::objects::
258 
260 
261 /* Original file checksum: lines: 65, chars: 1918, CRC32: 1109a8b4 */
User-defined methods of the data storage class.
void GetLinkoutTypes(vector< TLinkoutTypeString > &rv)
Return the available linkout types in a human readable format.
bool s_DeflineCompareNA(const CRef< CBlast_def_line > &d1, const CRef< CBlast_def_line > &d2)
Ranking function for nucleotide Blast-def-lines.
bool s_DeflineCompareAA(const CRef< CBlast_def_line > &d1, const CRef< CBlast_def_line > &d2)
Ranking function for protein Blast-def-lines.
bool s_DeflineBlastRankAA(const CRef< CBlast_def_line > &d1, const CRef< CBlast_def_line > &d2)
static bool s_DeflineCompare(const CRef< CBlast_def_line > &d1, const CRef< CBlast_def_line > &d2, int(*rank_func)(const CRef< CSeq_id > &))
Compare two deflines for ordering purposes.
User-defined methods of the data storage class.
void PutTargetGiFirst(TGi gi)
Place the CBlast_def_line object which contains the requested gi as the first in the list (if found)
void SortBySeqIdRank(bool is_protein, bool useBlastRank=false)
Sort the deflines according to the toolkit established ranking of Seq-ids.
void RemoveGIs()
Removes all SeqIDs with GIs in this defline set.
CRef –.
Definition: ncbiobj.hpp:618
Blast defline related defines.
@ eReprMicrobialGenomes
Linkout for the representative microbial genomes.
@ eMapviewer
The main blast link goes to entrez but we put on a linkout icon that goes to mapviewer.
@ eGenomeDataViewer
Linkout for Genome Data Viewer (GDV)
@ eUnigene
Add Linkout for UniGene.
@ eGenomicSeq
Is a genomic sequence.
@ eStructure
Add Linkout for structure.
@ eGeo
Add Linkout for Geo.
@ eFromType
Identifies sequences from type (Entrez query: sequence from type[filter])
@ eFromVerifiedMaterial
Identifies sequences from verified material (Entrez query: 177353[BioProject])
@ eBioAssay
Add Linkout for BioAssay (structure group resource)
@ eGene
Add Linkout for Gene.
@ eTranscript
Linkout for transcript sequences only (GDV)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define ZERO_GI
Definition: ncbimisc.hpp:1088
const string AsFastaString(void) const
Definition: Seq_id.cpp:2265
static int BlastRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:750
static int FastaNARank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:748
static int FastaAARank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:746
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:250
list< CRef< CSeq_id > > TSeqid
Tdata & Set(void)
Assign a value to data member.
bool CanGet(void) const
Check if it is safe to call Get method.
list< CRef< CBlast_def_line > > Tdata
bool IsOther(void) const
Check if variant Other is selected.
Definition: Seq_id_.hpp:871
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
const TOther & GetOther(void) const
Get the variant data.
Definition: Seq_id_.cpp:347
bool IsGi(void) const
Check if variant Gi is selected.
Definition: Seq_id_.hpp:883
const TAccession & GetAccession(void) const
Get the Accession member data.
Useful/utility classes and methods.
Modified on Sat Dec 09 04:49:36 2023 by modify_doxy.py rev. 669887