NCBI C++ ToolKit
prj_helper.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: prj_helper.cpp 47479 2023-05-02 13:24:02Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Victor Joukov, Dmitry Rudnev
27  *
28  * File Description: helper for adding items to a GBench project
29  *
30  */
31 
32 
33 #include <ncbi_pch.hpp>
34 #include <gui/core/prj_helper.hpp>
36 #include <gui/objutils/label.hpp>
37 
43 
44 #include <algo/sequence/util.hpp>
47 
49 
50 
51 
52 static void sPrepareSeqAlign(CSeq_align& align, list< CRef<CSeq_align> >& new_aligns)
53 {
54  _ASSERT(align.IsSetScore());
55 
56  ITERATE (CSeq_align::TScore, it, align.GetScore()) {
57  const CScore& score = **it;
58  if (score.GetId().IsStr() && score.GetId().GetStr() == "use_this_gi") {
59  /// we perform a replacment from the gi indicated as the query
60  /// NB: ASSUMPTIONS MADE
61  /// - BLAST encodes the identical gi using this score
62  /// - The gi to be replaced is the second
63  TGi new_gi = GI_FROM(int, score.GetValue().GetInt());
64  CRef<CSeq_align> new_align(new CSeq_align);
65  new_align->Assign(align);
66 
67  switch (new_align->GetSegs().Which()) {
69  CSeq_id& id = *new_align->SetSegs().SetDenseg().SetIds()[1];
70  id.SetGi(new_gi);
71  break;
72  }}
73  default:
74  /// we don't handle the others yet; fail silently as there is
75  /// no real change, but place an assert to catch this when debugging
76  _ASSERT(false);
77  continue;
78  break;
79  }
80 
81  /// erase the use_this_gi scores from this new alignment
82  CSeq_align::TScore::iterator i = new_align->SetScore().begin();
83  for ( ; i != new_align->SetScore().end(); ) {
84  const CScore& s = **i;
85  if (s.GetId().IsStr() && s.GetId().GetStr() == "use_this_gi") {
86  i = new_align->SetScore().erase(i);
87  } else {
88  ++i;
89  }
90  }
91  /// save it in place and iterate
92  new_aligns.push_back(new_align);
93  }
94  }
95 }
96 
97 
98 static const string kBLASTProjectItemTag = "blast_project_item";
99 
100 
101 // create BLAST result project items from a given RID
102 // if the results contain any unresolvable/local sequences,
103 // add the sequences to the project
104 // mostly based on code borrowed from CDLGbprjCreator::CreateProject(rid)
106  CSeq_align_set& results,
107  bool isFindComp,
108  CRef<blast::CRemoteBlast> RemoteBlast,
109  const set<string>& filter,
110  CDataLoadingAppJob& job,
111  SRIDStatInfo& RIDStatInfo)
112 {
113  list<CConstRef<CSeq_id> > unresolved_subjects;
114  const string& rid(RemoteBlast->GetRID());
115  const string& rid_title(RemoteBlast->GetTitle());
116 
117  typedef CSeq_annot::TData::TAlign TAlign;
118 
119  TAlign& an_align = results.Set();
120  string sProjectItemBaseName(rid_title.empty() ? "BLAST Results for: " + rid_title : "BLAST Results, RID: " + rid);
121 
122  bool is_db_search = RemoteBlast->IsDbSearch();
123 
124  bool write_local_seq = false;
125  CRef<CBlast4_queries> qq = RemoteBlast->GetQueries();
126  auto num_queries = qq->GetNumQueries();
127  if (num_queries == 1 && qq->IsSeq_loc_list()) {
128  const list<CRef<CSeq_loc> > seq_loc_list = qq->GetSeq_loc_list();
129  CRef<CSeq_loc> seq_loc = seq_loc_list.front();
130  if (!seq_loc->IsWhole()) {
131  CRange<TSeqPos> rng = seq_loc->GetTotalRange();
132  RIDStatInfo.m_QueryRangeSet = true;
133  RIDStatInfo.m_QueryBeg = rng.GetFrom();
134  RIDStatInfo.m_QueryEnd = rng.GetTo();
135  }
136  }
137 
138  bool report_total = true;
139  typedef map< string, CRef<CSeq_align_set> > TAlignSetMap;
140  TAlignSetMap align_set_map;
141 
142  for ( TAlign::iterator it = an_align.begin(); it != an_align.end(); ++it) {
143  CRef<CSeq_align> align = *it;
144  const CSeq_id& seq_id = align->GetSeq_id(0);
145  const CSeq_id& target_id = align->GetSeq_id(1);
146 
147  // if a filter is defined, use only targets present in the filter
148  string target_id_str = target_id.AsFastaString();
149  if (filter.size() && filter.find(target_id_str) == filter.end()) {
150  continue;
151  }
152  // sort alignments according to query sequence
153  string id_str;
154  seq_id.GetLabel(&id_str);
155  CRef<CSeq_align_set>& curr_set = align_set_map[id_str];
156  if ( !curr_set ) {
157  curr_set.Reset(new CSeq_align_set);
158  }
159  curr_set->Set().push_back(*it);
160  if (align->IsSetScore()) {
161  list< CRef<CSeq_align> > new_aligns;
162 
163  sPrepareSeqAlign(*align, new_aligns);
164 
165  if (new_aligns.size()) {
166  an_align.insert(it, new_aligns.begin(), new_aligns.end());
167  }
168  }
169  if (report_total) {
170  TSeqPos beg = align->GetSeqStart(0);
171  TSeqPos end = align->GetSeqStop(0);
172  if (RIDStatInfo.m_QueryId.empty()) {
173  RIDStatInfo.m_QueryId = seq_id.AsFastaString();
174  RIDStatInfo.m_TotalBeg = beg;
175  RIDStatInfo.m_TotalEnd = end;
176  write_local_seq = write_local_seq || (seq_id.Which() == CSeq_id::e_Local);
177  } else if (RIDStatInfo.m_QueryId != seq_id.AsFastaString()) {
178  // Alignment does not have a single query, can not report
179  RIDStatInfo.m_QueryId = "";
180  // write_local_seq = false;
181  report_total = false;
182  }
183  if (beg < RIDStatInfo.m_TotalBeg) RIDStatInfo.m_TotalBeg = beg;
184  if (end > RIDStatInfo.m_TotalEnd) RIDStatInfo.m_TotalEnd = end;
185  }
186  // If target_id is unresolvable through usual means, that is either
187  // bl2seq or regular Scope can not get its handle, put it away for
188  // later resolution attempt
189  if(!is_db_search || !scope->GetBioseqHandle(target_id)) {
190  unresolved_subjects.push_back(CConstRef<CSeq_id>(&target_id));
191  }
192  // check for source sequences being local
193  write_local_seq = write_local_seq || (seq_id.Which() == CSeq_id::e_Local);
194  }
195 
196 
197  // Compartmentalize alignment for each query here
198  TAlignSetMap cleanup_align_set_map;
199  if (isFindComp) {
200  // Use CAlignCleanup to clean up alignments (truncating and grouping)
201  CAlignCleanup cleanup(*scope);
202  cleanup.SortInputsByScore(true);
203  cleanup.PreserveRows(false);
204  cleanup.FillUnaligned(false);
205 
206  ITERATE(TAlignSetMap, aln_set_iter, align_set_map) {
207  CAlignCleanup::TAligns aligns_out_tmp;
208  cleanup.Cleanup(aln_set_iter->second->Get(), aligns_out_tmp,
210 
211  if ( !aligns_out_tmp.empty() && aligns_out_tmp.size() != aln_set_iter->second->Get().size() ) {
213  aln_set->Set() = aligns_out_tmp;
214  cleanup_align_set_map.insert(
215  TAlignSetMap::value_type(aln_set_iter->first, aln_set));
216  }
217  }
218  }
219 
220  NON_CONST_ITERATE(TAlignSetMap, aln_set_iter, align_set_map) {
221  // cerr << "Entry for " << aln_set_iter->first << ":" << MSerial_AsnText << *(aln_set_iter->second) << endl;
222  string annot_base_name(sProjectItemBaseName + ", Query: " + aln_set_iter->first);
223 
224  /// Check if there is a corresponding cleaned-up alignment set.
225  /// If yes, put it before the original one.
226  TAlignSetMap::iterator cleaned_iter = cleanup_align_set_map.find(aln_set_iter->first);
227  if (cleaned_iter != cleanup_align_set_map.end()) {
228  CRef<CSeq_annot> annot(new CSeq_annot);
229  annot->SetData().SetAlign() = cleaned_iter->second->Set();
230  annot->SetNameDesc("Cleaned Alignments - " + annot_base_name);
233  item->SetObject(*annot);
234  item->SetLabel(annot_base_name);
236  job.AddProjectItem(*item);
237 
238  }
239  CRef<CSeq_annot> annot(new CSeq_annot);
240  annot->SetData().SetAlign() = aln_set_iter->second->Set();
242  annot->SetNameDesc(annot_base_name);
243 
245  item->SetObject(*annot);
246  item->SetLabel(annot_base_name);
248  job.AddProjectItem(*item);
249  }
250 
251  string label;
252  // add local query sequences
253  if (write_local_seq && qq->IsBioseq_set()) {
254  // TODO: do we need to exclude bioseq with non-local id
255  CBioseq_set& bss = qq->SetBioseq_set();
257  const CSeq_entry& entry = **iter;
258  if (entry.IsSeq() && entry.GetSeq().IsSetId()) {
259  auto ids = entry.GetSeq().GetId();
260  bool found_in_scope = false;
261  for (auto seq_id : ids) {
262  if (scope->GetBioseqHandle(*seq_id)) {
263  found_in_scope = true;
264  break;
265  }
266  }
267  if (found_in_scope) continue;
268  }
269  CRef<CProjectItem> new_item(new CProjectItem());
270  (*iter)->GetLabel(&label, CSeq_entry::eBoth);
271  new_item->SetLabel(label);
272  new_item->AddTag(kBLASTProjectItemTag);
273  label.clear();
274  new_item->SetObject(*(*iter));
275  job.AddProjectItem(*new_item);
276  }
277  }
278  // resolve and create additional items for unresolved targets
279  if (unresolved_subjects.size()) {
280  if(is_db_search) {
281  // Use Blast DB scope to get the sequences
282  // so that resulting Genome Workbench project is
283  // self-contained.
285  CRef<objects::CScope> blast_scope(new CScope(*objMgr));
286  CRef<CBlast4_database> bdb(RemoteBlast->GetDatabases());
287  blast::SDataLoaderConfig bdlc(bdb->GetName(), bdb->IsProtein());
288  blast::CBlastScopeSource bss(bdlc, objMgr);
289  bss.AddDataLoaders(blast_scope);
290  ITERATE(list<CConstRef<CSeq_id> >, it, unresolved_subjects) {
291  CBioseq_Handle bsh = blast_scope->GetBioseqHandle(**it);
292  if (bsh) {
294  CRef<CSeq_entry> entry(new CSeq_entry);
295  entry->SetSeq(static_cast<CBioseq&>(*(const_cast<CSerialObject*>(static_cast<const CSerialObject*>(&*bioseq)))));
296  CRef<CProjectItem> new_item(new CProjectItem());
297  entry->GetLabel(&label, CSeq_entry::eBoth);
298  new_item->SetLabel(label);
299  new_item->AddTag(kBLASTProjectItemTag);
300  label.clear();
301  new_item->SetObject(*entry);
302  job.AddProjectItem(*new_item);
303  }
304  }
305  } else {
306  // Add subject sequences from RID to project
307  list<CRef<CBioseq> > subjects = RemoteBlast->GetSubjectSequences();
308  NON_CONST_ITERATE(list<CRef<CBioseq> >, it, subjects) {
309  CRef<CBioseq> bioseq_ref(*it);
310  CRef<CSeq_entry> entry(new CSeq_entry);
311  entry->SetSeq(*bioseq_ref);
312  CRef<CProjectItem> new_item(new CProjectItem());
313  entry->GetLabel(&label, CSeq_entry::eBoth);
314  new_item->SetLabel(label);
315  new_item->AddTag(kBLASTProjectItemTag);
316  label.clear();
317  new_item->SetObject(*entry);
318  job.AddProjectItem(*new_item);
319  }
320  }
321  }
322 }
323 
324 
325 
326 
User-defined methods of the data storage class.
Declares CBlastScopeSource class to create properly configured CScope objects to invoke the BLAST dat...
class CAlignCleanup implements an alignment cleanup utility based on the C++ alignment manager.
list< CRef< CSeq_align > > TAligns
CBioseq_Handle –.
bool IsProtein() const
Returns true if the database is protein.
size_t GetNumQueries() const
Retrieve the number of queries in this object.
CDataLoadingAppJob - a base class for Jobs loading data into projects.
void AddProjectItem(objects::CProjectItem &item)
void AddTag(const string &tag)
Definition: ProjectItem.hpp:96
void SetObject(CSerialObject &object)
wrapper for setting the object pointed to by this item
CScope –.
Definition: scope.hpp:92
Definition: Score.hpp:57
TSeqPos GetSeqStop(TDim row) const
Definition: Seq_align.cpp:273
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
TSeqPos GetSeqStart(TDim row) const
Definition: Seq_align.cpp:252
void SetNameDesc(const string &name)
Definition: Seq_annot.cpp:66
void SetCreateDate(const CTime &dt)
Definition: Seq_annot.cpp:121
Definition: Seq_entry.hpp:56
@ eBoth
Definition: Seq_entry.hpp:94
void GetLabel(string *label, ELabelType type) const
Definition: Seq_entry.cpp:274
Base class for all serializable objects.
Definition: serialbase.hpp:150
CTime –.
Definition: ncbitime.hpp:296
Definition: map.hpp:338
size_type size() const
Definition: set.hpp:132
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
static void cleanup(void)
Definition: ct_dynamic.c:30
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
static void AddProjectItemsFromRID(CRef< objects::CScope > scope, CSeq_align_set &results, bool isFindComp, CRef< blast::CRemoteBlast > RemoteBlast, const set< string > &filter, CDataLoadingAppJob &job, SRIDStatInfo &RIDStatInfo)
Definition: prj_helper.cpp:105
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2040
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
static const char label[]
bool IsSeq_loc_list(void) const
Check if variant Seq_loc_list is selected.
const TName & GetName(void) const
Get the Name member data.
TBioseq_set & SetBioseq_set(void)
Select the variant.
bool IsBioseq_set(void) const
Check if variant Bioseq_set is selected.
const TSeq_loc_list & GetSeq_loc_list(void) const
Get the variant data.
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
void SetLabel(const TLabel &value)
Assign a value to Label data member.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
Tdata & Set(void)
Assign a value to data member.
TScore & SetScore(void)
Assign a value to Score data member.
Definition: Seq_align_.hpp:902
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_align_.hpp:691
vector< CRef< CScore > > TScore
Definition: Seq_align_.hpp:398
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
TInt GetInt(void) const
Get the variant data.
Definition: Score_.hpp:411
const TValue & GetValue(void) const
Get the Value member data.
Definition: Score_.hpp:465
bool IsSetScore(void) const
Score for the whole alignment. Check if a value has been assigned to Score data member.
Definition: Seq_align_.hpp:884
const TScore & GetScore(void) const
Get the Score member data.
Definition: Seq_align_.hpp:896
const TId & GetId(void) const
Get the Id member data.
Definition: Score_.hpp:444
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
bool IsWhole(void) const
Check if variant Whole is selected.
Definition: Seq_loc_.hpp:522
@ e_Local
local use
Definition: Seq_id_.hpp:95
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
list< CRef< CSeq_entry > > TSeq_set
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
list< CRef< CSeq_align > > TAlign
Definition: Seq_annot_.hpp:194
bool IsSetId(void) const
Equivalent identifiers. Check if a value has been assigned to Id data member.
Definition: Bioseq_.hpp:278
int i
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
static void sPrepareSeqAlign(CSeq_align &align, list< CRef< CSeq_align > > &new_aligns)
Definition: prj_helper.cpp:52
static const string kBLASTProjectItemTag
Definition: prj_helper.cpp:98
#define _ASSERT
Modified on Tue Apr 23 07:37:29 2024 by modify_doxy.py rev. 669887