1 /* $Id: prj_helper.cpp 47479 2023-05-02 13:24:02Z ucko $
2  * ===========================================================================
3  *
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Victor Joukov, Dmitry Rudnev
27  *
28  * File Description: helper for adding items to a GBench project
29  *
30  */
33 #include <ncbi_pch.hpp>
34 #include <gui/core/prj_helper.hpp>
36 #include <gui/objutils/label.hpp>
44 #include <algo/sequence/util.hpp>
52 static void sPrepareSeqAlign(CSeq_align& align, list< CRef<CSeq_align> >& new_aligns)
53 {
54  _ASSERT(align.IsSetScore());
56  ITERATE (CSeq_align::TScore, it, align.GetScore()) {
57  const CScore& score = **it;
58  if (score.GetId().IsStr() && score.GetId().GetStr() == "use_this_gi") {
59  /// we perform a replacment from the gi indicated as the query
61  /// - BLAST encodes the identical gi using this score
62  /// - The gi to be replaced is the second
63  TGi new_gi = GI_FROM(int, score.GetValue().GetInt());
64  CRef<CSeq_align> new_align(new CSeq_align);
65  new_align->Assign(align);
67  switch (new_align->GetSegs().Which()) {
69  CSeq_id& id = *new_align->SetSegs().SetDenseg().SetIds()[1];
70  id.SetGi(new_gi);
71  break;
72  }}
73  default:
74  /// we don't handle the others yet; fail silently as there is
75  /// no real change, but place an assert to catch this when debugging
76  _ASSERT(false);
77  continue;
78  break;
79  }
81  /// erase the use_this_gi scores from this new alignment
82  CSeq_align::TScore::iterator i = new_align->SetScore().begin();
83  for ( ; i != new_align->SetScore().end(); ) {
84  const CScore& s = **i;
85  if (s.GetId().IsStr() && s.GetId().GetStr() == "use_this_gi") {
86  i = new_align->SetScore().erase(i);
87  } else {
88  ++i;
89  }
90  }
91  /// save it in place and iterate
92  new_aligns.push_back(new_align);
93  }
94  }
95 }
/// Tag added (via CProjectItem::AddTag) to the sequence project items that
/// this file creates from BLAST results.
static const string kBLASTProjectItemTag("blast_project_item");
101 // create BLAST result project items from a given RID
102 // if the results contain any unresolvable/local sequences,
103 // add the sequences to the project
104 // mostly based on code borrowed from CDLGbprjCreator::CreateProject(rid)
106  CSeq_align_set& results,
107  bool isFindComp,
108  CRef<blast::CRemoteBlast> RemoteBlast,
109  const set<string>& filter,
110  CDataLoadingAppJob& job,
111  SRIDStatInfo& RIDStatInfo)
112 {
113  list<CConstRef<CSeq_id> > unresolved_subjects;
114  const string& rid(RemoteBlast->GetRID());
115  const string& rid_title(RemoteBlast->GetTitle());
117  typedef CSeq_annot::TData::TAlign TAlign;
119  TAlign& an_align = results.Set();
120  string sProjectItemBaseName(rid_title.empty() ? "BLAST Results for: " + rid_title : "BLAST Results, RID: " + rid);
122  bool is_db_search = RemoteBlast->IsDbSearch();
124  bool write_local_seq = false;
125  CRef<CBlast4_queries> qq = RemoteBlast->GetQueries();
126  auto num_queries = qq->GetNumQueries();
127  if (num_queries == 1 && qq->IsSeq_loc_list()) {
128  const list<CRef<CSeq_loc> > seq_loc_list = qq->GetSeq_loc_list();
129  CRef<CSeq_loc> seq_loc = seq_loc_list.front();
130  if (!seq_loc->IsWhole()) {
131  CRange<TSeqPos> rng = seq_loc->GetTotalRange();
132  RIDStatInfo.m_QueryRangeSet = true;
133  RIDStatInfo.m_QueryBeg = rng.GetFrom();
134  RIDStatInfo.m_QueryEnd = rng.GetTo();
135  }
136  }
138  bool report_total = true;
139  typedef map< string, CRef<CSeq_align_set> > TAlignSetMap;
140  TAlignSetMap align_set_map;
142  for ( TAlign::iterator it = an_align.begin(); it != an_align.end(); ++it) {
143  CRef<CSeq_align> align = *it;
144  const CSeq_id& seq_id = align->GetSeq_id(0);
145  const CSeq_id& target_id = align->GetSeq_id(1);
147  // if a filter is defined, use only targets present in the filter
148  string target_id_str = target_id.AsFastaString();
149  if (filter.size() && filter.find(target_id_str) == filter.end()) {
150  continue;
151  }
152  // sort alignments according to query sequence
153  string id_str;
154  seq_id.GetLabel(&id_str);
155  CRef<CSeq_align_set>& curr_set = align_set_map[id_str];
156  if ( !curr_set ) {
157  curr_set.Reset(new CSeq_align_set);
158  }
159  curr_set->Set().push_back(*it);
160  if (align->IsSetScore()) {
161  list< CRef<CSeq_align> > new_aligns;
163  sPrepareSeqAlign(*align, new_aligns);
165  if (new_aligns.size()) {
166  an_align.insert(it, new_aligns.begin(), new_aligns.end());
167  }
168  }
169  if (report_total) {
170  TSeqPos beg = align->GetSeqStart(0);
171  TSeqPos end = align->GetSeqStop(0);
172  if (RIDStatInfo.m_QueryId.empty()) {
173  RIDStatInfo.m_QueryId = seq_id.AsFastaString();
174  RIDStatInfo.m_TotalBeg = beg;
175  RIDStatInfo.m_TotalEnd = end;
176  write_local_seq = write_local_seq || (seq_id.Which() == CSeq_id::e_Local);
177  } else if (RIDStatInfo.m_QueryId != seq_id.AsFastaString()) {
178  // Alignment does not have a single query, can not report
179  RIDStatInfo.m_QueryId = "";
180  // write_local_seq = false;
181  report_total = false;
182  }
183  if (beg < RIDStatInfo.m_TotalBeg) RIDStatInfo.m_TotalBeg = beg;
184  if (end > RIDStatInfo.m_TotalEnd) RIDStatInfo.m_TotalEnd = end;
185  }
186  // If target_id is unresolvable through usual means, that is either
187  // bl2seq or regular Scope can not get its handle, put it away for
188  // later resolution attempt
189  if(!is_db_search || !scope->GetBioseqHandle(target_id)) {
190  unresolved_subjects.push_back(CConstRef<CSeq_id>(&target_id));
191  }
192  // check for source sequences being local
193  write_local_seq = write_local_seq || (seq_id.Which() == CSeq_id::e_Local);
194  }
197  // Compartmentalize alignment for each query here
198  TAlignSetMap cleanup_align_set_map;
199  if (isFindComp) {
200  // Use CAlignCleanup to clean up alignments (truncating and grouping)
201  CAlignCleanup cleanup(*scope);
202  cleanup.SortInputsByScore(true);
203  cleanup.PreserveRows(false);
204  cleanup.FillUnaligned(false);
206  ITERATE(TAlignSetMap, aln_set_iter, align_set_map) {
207  CAlignCleanup::TAligns aligns_out_tmp;
208  cleanup.Cleanup(aln_set_iter->second->Get(), aligns_out_tmp,
211  if ( !aligns_out_tmp.empty() && aligns_out_tmp.size() != aln_set_iter->second->Get().size() ) {
213  aln_set->Set() = aligns_out_tmp;
214  cleanup_align_set_map.insert(
215  TAlignSetMap::value_type(aln_set_iter->first, aln_set));
216  }
217  }
218  }
220  NON_CONST_ITERATE(TAlignSetMap, aln_set_iter, align_set_map) {
221  // cerr << "Entry for " << aln_set_iter->first << ":" << MSerial_AsnText << *(aln_set_iter->second) << endl;
222  string annot_base_name(sProjectItemBaseName + ", Query: " + aln_set_iter->first);
224  /// Check if there is a corresponding cleaned-up alignment set.
225  /// If yes, put it before the original one.
226  TAlignSetMap::iterator cleaned_iter = cleanup_align_set_map.find(aln_set_iter->first);
227  if (cleaned_iter != cleanup_align_set_map.end()) {
228  CRef<CSeq_annot> annot(new CSeq_annot);
229  annot->SetData().SetAlign() = cleaned_iter->second->Set();
230  annot->SetNameDesc("Cleaned Alignments - " + annot_base_name);
233  item->SetObject(*annot);
234  item->SetLabel(annot_base_name);
236  job.AddProjectItem(*item);
238  }
239  CRef<CSeq_annot> annot(new CSeq_annot);
240  annot->SetData().SetAlign() = aln_set_iter->second->Set();
242  annot->SetNameDesc(annot_base_name);
245  item->SetObject(*annot);
246  item->SetLabel(annot_base_name);
248  job.AddProjectItem(*item);
249  }
251  string label;
252  // add local query sequences
253  if (write_local_seq && qq->IsBioseq_set()) {
254  // TODO: do we need to exclude bioseq with non-local id
255  CBioseq_set& bss = qq->SetBioseq_set();
257  const CSeq_entry& entry = **iter;
258  if (entry.IsSeq() && entry.GetSeq().IsSetId()) {
259  auto ids = entry.GetSeq().GetId();
260  bool found_in_scope = false;
261  for (auto seq_id : ids) {
262  if (scope->GetBioseqHandle(*seq_id)) {
263  found_in_scope = true;
264  break;
265  }
266  }
267  if (found_in_scope) continue;
268  }
269  CRef<CProjectItem> new_item(new CProjectItem());
270  (*iter)->GetLabel(&label, CSeq_entry::eBoth);
271  new_item->SetLabel(label);
272  new_item->AddTag(kBLASTProjectItemTag);
273  label.clear();
274  new_item->SetObject(*(*iter));
275  job.AddProjectItem(*new_item);
276  }
277  }
278  // resolve and create additional items for unresolved targets
279  if (unresolved_subjects.size()) {
280  if(is_db_search) {
281  // Use Blast DB scope to get the sequences
282  // so that resulting Genome Workbench project is
283  // self-contained.
285  CRef<objects::CScope> blast_scope(new CScope(*objMgr));
286  CRef<CBlast4_database> bdb(RemoteBlast->GetDatabases());
287  blast::SDataLoaderConfig bdlc(bdb->GetName(), bdb->IsProtein());
288  blast::CBlastScopeSource bss(bdlc, objMgr);
289  bss.AddDataLoaders(blast_scope);
290  ITERATE(list<CConstRef<CSeq_id> >, it, unresolved_subjects) {
291  CBioseq_Handle bsh = blast_scope->GetBioseqHandle(**it);
292  if (bsh) {
294  CRef<CSeq_entry> entry(new CSeq_entry);
295  entry->SetSeq(static_cast<CBioseq&>(*(const_cast<CSerialObject*>(static_cast<const CSerialObject*>(&*bioseq)))));
296  CRef<CProjectItem> new_item(new CProjectItem());
297  entry->GetLabel(&label, CSeq_entry::eBoth);
298  new_item->SetLabel(label);
299  new_item->AddTag(kBLASTProjectItemTag);
300  label.clear();
301  new_item->SetObject(*entry);
302  job.AddProjectItem(*new_item);
303  }
304  }
305  } else {
306  // Add subject sequences from RID to project
307  list<CRef<CBioseq> > subjects = RemoteBlast->GetSubjectSequences();
308  NON_CONST_ITERATE(list<CRef<CBioseq> >, it, subjects) {
309  CRef<CBioseq> bioseq_ref(*it);
310  CRef<CSeq_entry> entry(new CSeq_entry);
311  entry->SetSeq(*bioseq_ref);
312  CRef<CProjectItem> new_item(new CProjectItem());
313  entry->GetLabel(&label, CSeq_entry::eBoth);
314  new_item->SetLabel(label);
315  new_item->AddTag(kBLASTProjectItemTag);
316  label.clear();
317  new_item->SetObject(*entry);
318  job.AddProjectItem(*new_item);
319  }
320  }
321  }
322 }
