1 /* $Id: sequpd.cpp 43960 2019-09-27 14:43:29Z asztalos $
2  * ===========================================================================
3  *
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Andrea Asztalos
27  */
30 #include <ncbi_pch.hpp>
39 #include <serial/iterator.hpp>
42 #include <objmgr/seqdesc_ci.hpp>
43 #include <objmgr/bioseq_ci.hpp>
44 #include <objmgr/impl/synonyms.hpp>
45 #include <objmgr/scope.hpp>
46 #include <objmgr/seq_vector.hpp>
// NOTE(review): the signature line and the declaration of `idh` are not part of
// this listing. Judging by the calls to sequpd::GetGoodSeqIdHandle() elsewhere
// in this file, this is presumably that function's body and `bsh` is its
// CBioseq_Handle argument -- confirm against the full file.
//
// Returns `idh` unchanged unless it is a GI. For a GI, the synonyms of `bsh`
// are scanned and `idh` is swapped for a GenBank synonym if one was found,
// otherwise for a local synonym; when neither exists the GI handle is returned.
63 {
65  if (!idh.IsGi()) {
66  return idh;
67  }
69  CSeq_id_Handle gb_idh, lcl_idh;
70  CConstRef<CSynonymsSet> synonyms = bsh.GetSynonyms();
71  if (synonyms && !synonyms->empty()) {
// the scan never stops early, so the last GenBank / last local synonym wins
72  for (auto& it : *synonyms) {
73  if (it.Which() == CSeq_id::e_Genbank) {
74  gb_idh = it;
75  }
76  else if (it.Which() == CSeq_id::e_Local) {
77  lcl_idh = it;
78  }
79  }
80  }
// preference order: GenBank first, then local
82  if (gb_idh) {
83  idh.Swap(gb_idh);
84  }
85  else if (lcl_idh) {
86  idh.Swap(lcl_idh);
87  }
89  return idh;
90 }
92 bool sequpd::HaveIdenticalResidues(const objects::CBioseq_Handle& bsh1, const objects::CBioseq_Handle& bsh2)
93 {
94  if (!bsh1 && !bsh2) {
95  return true;
96  }
97  else if (!bsh1 || !bsh2) {
98  return false;
99  }
100  else if (bsh1.GetBioseqLength() != bsh2.GetBioseqLength()) {
101  return false;
102  }
103  else if (bsh1.IsNucleotide() && bsh2.IsProtein()) {
104  return false;
105  }
106  else if (bsh1.IsProtein() && bsh2.IsNucleotide()) {
107  return false;
108  }
110  CSeqVector old_seqvec = bsh1.GetSeqVector(CBioseq_Handle::eCoding_Iupac, eNa_strand_plus);
111  CSeqVector upd_seqvec = bsh2.GetSeqVector(CBioseq_Handle::eCoding_Iupac, eNa_strand_plus);
113  string old_seq;
114  old_seqvec.GetSeqData(0, bsh1.GetBioseqLength(), old_seq);
115  string upd_seq;
116  upd_seqvec.GetSeqData(0, bsh2.GetBioseqLength(), upd_seq);
118  return NStr::EqualNocase(old_seq, upd_seq);
119 }
// Rewrites Seq-ids inside annotations: every CSeq_id that compares e_YES with
// one of `upd_ids` is overwritten with the first ID of `bseq`.
//
// bseq     sequence whose first ID is used as the replacement
// annot    annotation list to process
// upd_ids  IDs to be replaced
//
// NOTE(review): the loop header that declares `ait` is not part of this
// listing; presumably it iterates over `annot` -- confirm against the full file.
121 static void s_FixCollidingIDs_Annot(CBioseq& bseq, CSeq_entry::TAnnot& annot, const vector<CRef<CSeq_id>>& upd_ids)
122 {
123  CRef<CSeq_id> new_id = bseq.GetId().front();
// visit every CSeq_id reachable from the annotation that `ait` refers to
125  for (CTypeIterator<CSeq_id> id_iter(**ait); id_iter; ++id_iter) {
126  CSeq_id& id = *id_iter;
127  for (auto& it : upd_ids) {
128  if (id.Compare(it.GetObject()) == CSeq_id::e_YES) {
// overwrite in place; one match is enough for this id
129  id.Assign(*new_id);
130  break;
131  }
132  }
133  }
134  }
135 }
/// Suffix appended to an ID label when a replacement local ID is built for an
/// update sequence. Both the characters and the pointer are constant.
static const char* const kUpdateSuffix = "_update";
// NOTE(review): the signature line is not part of this listing. Judging by the
// call sequpd::FixCollidingIDs_Bioseq(updSeq, old_ids) elsewhere in this file,
// this is presumably that function's body, with `bseq` the update sequence and
// `seq_ids` the IDs of the matching old sequence -- confirm against the full file.
//
// If any ID of `bseq` compares e_YES with an ID in `seq_ids`, all IDs of `bseq`
// are replaced by a single local ID "<label>_update", and the previous IDs are
// rewritten to it in the annotations of `bseq` and of an enclosing nuc-prot set.
// Does nothing when there is no such conflict.
// Throws CSeqUpdateException (eInternal) when no label could be derived.
140 {
141  bool has_conflict = false;
142  for (auto& upd_id : bseq.GetId()) {
143  for (auto& old_id : seq_ids) {
144  if (upd_id->Compare(*old_id) == CSeq_id::e_YES) {
145  has_conflict = true;
146  break;
147  }
148  }
149  if (has_conflict) {
150  break;
151  }
152  }
154  if (!has_conflict) {
155  return;
156  }
158  // save the original IDs of the update sequence
// deep copies are taken because the ID list of bseq is reset further down
159  vector<CRef<CSeq_id>> update_ids;
160  for (auto& it : bseq.GetId()) {
161  CRef<CSeq_id> newid(new CSeq_id);
162  newid->Assign(it.GetObject());
163  update_ids.push_back(newid);
164  }
166  string lclID_label, gbID_label, accID_label;
// a lone ID supplies the highest-priority label regardless of its type
167  if (bseq.GetId().size() == 1) {
168  bseq.GetId().front()->GetLabel(&gbID_label, CSeq_id::eContent);
169  }
170  else {
171  ITERATE(CBioseq::TId, upd_id, bseq.GetId()) {
172  if ((*upd_id)->IsLocal()) {
173  (*upd_id)->GetLabel(&lclID_label, CSeq_id::eContent);
174  }
175  else if ((*upd_id)->IsGenbank()) {
176  (*upd_id)->GetLabel(&gbID_label, CSeq_id::eContent);
177  }
178  else if ((*upd_id)->IsOther()) {
// NOTE(review): `version` is not initialized; this relies on GetLabel()
// always writing it -- confirm
179  int version;
180  (*upd_id)->GetLabel(&accID_label, &version, CSeq_id::eContent);
181  accID_label.push_back('.');
182  accID_label.append(NStr::NumericToString(version));
183  }
184  }
185  }
// label priority: gbID_label, then lclID_label, then accID_label
187  CRef<CSeq_id> newUpdate_Id;
188  if (!gbID_label.empty()) {
189  newUpdate_Id.Reset(new CSeq_id(CSeq_id::e_Local, gbID_label + kUpdateSuffix));
190  }
191  else if (!lclID_label.empty()) {
192  newUpdate_Id.Reset(new CSeq_id(CSeq_id::e_Local, lclID_label + kUpdateSuffix));
193  }
194  else if (!accID_label.empty()) {
195  newUpdate_Id.Reset(new CSeq_id(CSeq_id::e_Local, accID_label + kUpdateSuffix));
196  }
198  if (!newUpdate_Id) {
199  NCBI_THROW(CSeqUpdateException, eInternal, "Update IDs could not be fixed.");
200  }
// the new local ID becomes the only ID of the update sequence
202  bseq.ResetId();
203  bseq.SetId().push_back(newUpdate_Id);
204  s_FixCollidingIDs_Annot(bseq, bseq.SetAnnot(), update_ids);
206  CSeq_entry* parent = bseq.GetParentEntry();
207  if (!parent) {
208  return;
209  }
// the nuc-prot test is applied to the entry one level above `parent`
211  CSeq_entry* parent_set = parent->GetParentEntry();
212  if (parent_set && parent_set->IsSet() && parent_set->GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
213  s_FixCollidingIDs_Annot(bseq, parent_set->SetAnnot(), update_ids);
214  }
215 }
217 // return the matching old sequence for the update sequence if there is one
218 static CBioseq_Handle s_GetMatchingSequence(CSeq_inst::EMol type, CSeq_entry_Handle& oldSeq, const CBioseq& upd_bseq, bool& collide, const sequpd::TSeqIdHMap& matches);
// NOTE(review): the signature line is not part of this listing; `type`,
// `oldSeq`, `updEntry`, `matches` and `unmatched` are presumably its
// parameters -- confirm against the full file.
//
// For every CBioseq found in `updEntry`, looks up the matching old sequence with
// s_GetMatchingSequence(). A match is recorded in `matches` (old id handle ->
// update id handle) after colliding IDs have been fixed; a sequence without a
// match has its first ID appended to `unmatched`.
// Throws CSeqUpdateException (eReading) when an old sequence is matched twice.
221 {
222  for (CTypeIterator<CBioseq> it(updEntry); it; ++it) {
223  CBioseq& updSeq = *it;
224  if (updSeq.GetInst().IsSetMol()) {
// skip update sequences whose molecule class (NA vs. AA) differs from `type`
225  if (!(CSeq_inst::IsNa(updSeq.GetInst().GetMol()) && CSeq_inst::IsNa(type)) &&
226  !(CSeq_inst::IsAa(updSeq.GetInst().GetMol()) && CSeq_inst::IsAa(type))) {
227  continue;
228  }
229  }
// NOTE(review): this branch records the sequence as unmatched but does not
// `continue`, so the lookup below still runs: the same ID can end up in
// `unmatched` twice, or in both `unmatched` and `matches` -- confirm intent.
230  else {
231  // no matching sequence has been found for the update sequence
232  CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(*updSeq.GetId().front());
233  unmatched.push_back(idh);
234  }
236  bool collide = false;
237  CBioseq_Handle match_bsh = s_GetMatchingSequence(type, oldSeq, updSeq, collide, matches);
238  if (match_bsh) {
239  if (collide) {
240  // fix ID conflicts
241  const CBioseq::TId& old_ids = match_bsh.GetCompleteBioseq()->GetId();
242  sequpd::FixCollidingIDs_Bioseq(updSeq, old_ids);
243  }
245  CSeq_id_Handle old_idh = sequpd::GetGoodSeqIdHandle(match_bsh);
246  if (matches.find(old_idh) != matches.end()) {
247  NCBI_THROW(CSeqUpdateException, eReading, "Non-unique sequence IDs in update sequences!");
248  }
249  else {
// taken after the collision fix above, so this is the (possibly renamed) first ID
250  CSeq_id_Handle upd_idh = CSeq_id_Handle::GetHandle(*updSeq.GetId().front());
251  matches.emplace(old_idh, upd_idh);
252  }
253  }
254  else {
255  // no matching sequence has been found for the update sequence
256  CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(*updSeq.GetId().front());
257  unmatched.push_back(idh);
258  }
259  }
260 }
262 static bool s_MatchSeqIds(CScope& scope, const CSeq_id& old_id, const CSeq_id& upd_id, bool& collide);
264 static CBioseq_Handle s_GetMatchingSequence(CSeq_inst::EMol type, CSeq_entry_Handle& oldSeq, const CBioseq& upd_bseq, bool& collide, const sequpd::TSeqIdHMap& matches)
265 {
266  for (CBioseq_CI b_iter(oldSeq, type); b_iter; ++b_iter) {
267  CSeq_id_Handle old_id_handle = sequpd::GetGoodSeqIdHandle(*b_iter);
268  if (matches.find(old_id_handle) != matches.end())
269  continue;
271  CConstRef<CBioseq> oldBseq = b_iter->GetBioseqCore();
272  if (oldBseq->IsSetId() && upd_bseq.IsSetId()) {
273  for (auto& id1 : oldBseq->GetId()) {
274  for (auto& id2 : upd_bseq.GetId()) {
275  if (s_MatchSeqIds(oldSeq.GetScope(), *id1, *id2, collide))
276  return *b_iter;
277  }
278  }
279  }
280  }
281  return CBioseq_Handle();
282 }
286 static bool s_MatchSeqIds(CScope& scope, const CSeq_id& old_id, const CSeq_id& upd_id, bool& collide)
287 {
288  collide = false;
289  CSeq_id::E_SIC cmp_type = old_id.Compare(upd_id);
290  if (cmp_type == CSeq_id::e_YES) {
291  collide = true;
292  return true;
293  }
294  else if (cmp_type == CSeq_id::e_DIFF) { // different types, compare their contents
295  string old_label(kEmptyStr), upd_label(kEmptyStr);
296  old_id.GetLabel(&old_label, CSeq_id::eContent);
297  upd_id.GetLabel(&upd_label, CSeq_id::eContent);
298  SIZE_TYPE old_dot = NStr::Find(old_label, ".");
299  SIZE_TYPE upd_dot = NStr::Find(upd_label, ".");
300  if (old_dot != NPOS) {
301  old_label = old_label.substr(0, old_dot);
302  }
303  if (upd_dot != NPOS) {
304  upd_label = upd_label.substr(0, upd_dot);
305  }
306  if (NStr::EqualCase(old_label, upd_label)) {
307  return true;
308  }
309  }
310  else if (cmp_type == CSeq_id::e_NO && old_id.IsLocal() && upd_id.IsLocal()) {
311  CConstRef<CSeq_id> orig_id = s_GetOriginalId(scope.GetBioseqHandle(old_id));
312  if (orig_id && orig_id->Compare(upd_id) == CSeq_id::e_YES) {
313  return true;
314  }
315  }
317  return false;
318 }
// NOTE(review): the signature line and at least one line opening a nested block
// inside the descriptor loop are not part of this listing. Judging by the call
// s_GetOriginalId(scope.GetBioseqHandle(old_id)) elsewhere in this file, this is
// presumably that function's body and `bsh` is its handle argument -- confirm
// against the full file.
//
// For a protein handle, returns bsh.GetLocalIdOrNull(). Otherwise scans User
// descriptors for a string field labelled "LocalId" (label compared
// case-insensitively) and returns a local Seq-id built from its value; the
// result is null when no such field is found.
321 {
322  if (bsh.IsAa()) {
323  return bsh.GetLocalIdOrNull();
324  }
326  CConstRef<CSeq_id> orig_id;
327  for (CSeqdesc_CI desc_it(bsh, CSeqdesc::e_User, 1); desc_it; ++desc_it) {
328  const CUser_object& usr = desc_it->GetUser();
330  ITERATE(CUser_object::TData, it, usr.GetData()) {
// a field qualifies only if it has a string label "LocalId" and string data
331  if ((*it)->IsSetLabel() && (*it)->GetLabel().IsStr()
332  && NStr::EqualNocase((*it)->GetLabel().GetStr(), "LocalId")
333  && (*it)->IsSetData()
334  && (*it)->GetData().IsStr()) {
336  string id_str = (*it)->GetData().GetStr();
337  orig_id.Reset(new CSeq_id(CSeq_id::e_Local, id_str));
// stops scanning the fields of this user object only
338  break;
339  }
340  }
341  }
342  }
344  return orig_id;
345 }
347 static vector<CConstRef<CSeq_align> > s_RunBlast2NASeq(const CBioseq_Handle& sh, const CBioseq_Handle& qh, bool accept_atleast_one, ICanceled* canceled);
348 static vector<CConstRef<CSeq_align> > s_RunBlast2NWSeq(const CBioseq_Handle& sh, const CBioseq_Handle& qh);
349 static vector<CConstRef<CSeq_align> > s_RunBlast2AASeq(const CBioseq_Handle& sh, const CBioseq_Handle& qh);
351 vector<CConstRef<CSeq_align> > sequpd::RunBlast2Seq(const CBioseq_Handle& sh, const CBioseq_Handle& qh, bool accept_atleast_one, ICanceled* canceled)
352 {
353  if (!sh || !qh)
354  return vector<CConstRef<CSeq_align> >();
356  if (sh.IsNucleotide() && qh.IsNucleotide()) {
357  return s_RunBlast2NASeq(sh, qh, accept_atleast_one, canceled);
358  }
359  else if (sh.IsProtein() && qh.IsProtein()) {
360  return s_RunBlast2NWSeq(sh, qh);
361  }
362  else {
363  NCBI_THROW(CSeqUpdateException, eAlignment, "Mismatch in sequence type, cannot form alignment.");
364  }
366  return vector<CConstRef<CSeq_align> >();
367 }
// File-local alignment scorer registered with the aligner in s_RunBlast2NASeq.
//
// NOTE(review): several lines of this block (loop headers in ScoreAlignments and
// conditions in s_AddStandardAlignmentScores) are not part of this listing; the
// comments below describe only what the visible lines show.
369 namespace
370 {
371  class CGPipeAlignmentScorer : public IAlignmentScorer {
372  public:
// bit flags selecting which scores s_AddStandardAlignmentScores attaches
373  enum EAlignScoreTypes {
374  /// add BLAST-style 'num_ident' score
375  fScore_Identities = 0x001,
377  /// add a 'mismatch' score with a count of mismatches
378  fScore_Mismatches = 0x002,
380  /// add a 'gap_count' score
381  fScore_GapCount = 0x004,
383  /// add scores for ungapped and gapped percent identity
384  fScore_PercentIdentity = 0x008,
386  /// add a score for percent coverage of query (sequence 0)
387  fScore_PercentCoverage = 0x010,
389  /// default flags: everything
390  fScore_Default = 0xffffffff
391  };
// Applies s_AddStandardAlignmentScores with fScore_Default to each alignment
// reached through the nested containers of `results`.
395  void ScoreAlignments(TAlignResultsRef results, CScope& scope)
396  {
398  result_iter, results->Get()) {
400  assm_iter, result_iter->second->Get()) {
402  query_iter, assm_iter->second) {
403  NON_CONST_ITERATE(CSeq_align_set::Tdata, it, query_iter->second->Set()) {
404  CSeq_align& align = **it;
405  s_AddStandardAlignmentScores(scope, align, fScore_Default);
407  /// additionally, add the gaponly version, used in gbDNA
// NOTE(review): the statement that uses `sb` is not visible in this listing
408  CScoreBuilder sb;
410  }
411  }
412  }
413  }
414  }
415  private:
416  static void s_AddStandardAlignmentScores(CScope& scope, CSeq_align& align, int flags);
417  };
// Adds the scores selected by `flags` (EAlignScoreTypes bits) to `align`.
419  void CGPipeAlignmentScorer::s_AddStandardAlignmentScores(CScope& scope, CSeq_align& align, int flags)
420  {
421  CScoreBuilder sb;
425  /// this automatically adds num_ident and num_mismatch
426  sb.AddScore(scope, align,
428  sb.AddScore(scope, align,
430  }
431  else if (flags & fScore_Identities) {
432  sb.AddScore(scope, align, CSeq_align::eScore_IdentityCount);
433  }
434  else if (flags & fScore_Mismatches) {
435  sb.AddScore(scope, align, CSeq_align::eScore_MismatchCount);
436  }
437  }
439  if (flags & fScore_GapCount) {
440  /// FIXME: add eScore_GapCount to CSeq_align, CScoreBuilder
441  //sb.AddScore(scope, align, CScoreBuilder::eScore_GapCount);
// stored as a named score because no dedicated score type is used here
442  int gap_count = sb.GetGapCount(align);
443  align.SetNamedScore("gap_count", gap_count);
444  }
448  }
449  }
450 } // namespace
// Aligns two nucleotide sequences with CNgAligner using one or two BLAST
// aligners, scorers and a list of query filters.
//
// sh                  subject sequence
// qh                  query sequence
// accept_atleast_one  when true, an extra filter "align_length > 2" is added
// canceled            optional; when non-null it is registered as the user
//                     data of the BLAST interrupt callback
//
// Returns the alignments produced by ng_aligner.Align(); empty when the two
// handles are in different scopes or a whole-sequence location is null.
//
// NOTE(review): the declarations of `query` and `subject` are not part of this
// listing -- confirm against the full file.
452 static vector<CConstRef<CSeq_align> > s_RunBlast2NASeq(const CBioseq_Handle& sh, const CBioseq_Handle& qh, bool accept_atleast_one, ICanceled* canceled)
453 {
454  vector<CConstRef<CSeq_align> > align_vector;
456  if (&(sh.GetScope()) != &(qh.GetScope())) {
457  LOG_POST(Error << "Both sequences should be in the same scope");
458  return align_vector;
459  }
461  // both sequences should be in the same scope
462  CNgAligner ng_aligner(sh.GetScope());
464  CRef<CSeq_loc> query_seqloc = qh.GetRangeSeq_loc(0, 0);
465  CRef<CSeq_loc> subject_seqloc = sh.GetRangeSeq_loc(0, 0);
466  if (query_seqloc.IsNull() || subject_seqloc.IsNull()) {
467  return align_vector;
468  }
471  query->SetLocList().push_back(query_seqloc);
472  ng_aligner.SetQuery(query);
475  subject->SetLocList().push_back(subject_seqloc);
476  ng_aligner.SetSubject(subject);
// interrupt callback: reports the state of the ICanceled object passed as
// user data; returns false when no user data is attached
478  auto cb = [](SBlastProgress* prog) -> Boolean
479  {
480  if (!prog || !prog->user_data)
481  return false;
482  return reinterpret_cast<ICanceled*>(prog->user_data)->IsCanceled();
483  };
// subjects longer than 12000 get an additional aligner with a large word size
485  TSeqPos seqLength = sh.GetBioseqLength();
486  bool useHiWordAligner = (seqLength > 12000);
487  if (useHiWordAligner) {
488  CRef<blast::CBlastNucleotideOptionsHandle> opts(new blast::CBlastNucleotideOptionsHandle);
489  opts->SetTraditionalBlastnDefaults();
490  blast::CBlastOptions& options = opts->SetOptions();
492  options.SetWordSize(1200);
493  options.SetEvalueThreshold(1e-6);
494  options.SetBestHitOverhang(0.1); // best_hit_overhang
495  options.SetBestHitScoreEdge(0.1); // best_hit_score_edge
497  CRef<CBlastAligner> blastAligner(new CBlastAligner(*opts, 0));
498  if (canceled)
499  blastAligner->SetInterruptCallback(cb, canceled);
501  ng_aligner.AddAligner(blastAligner);
503  }
// the word-size-12 aligner is always added
505  CRef<blast::CBlastNucleotideOptionsHandle> opts(new blast::CBlastNucleotideOptionsHandle);
506  opts->SetTraditionalBlastnDefaults();
507  blast::CBlastOptions& options = opts->SetOptions();
509  options.SetWordSize(12);
510  options.SetEvalueThreshold(1e-6);
511  options.SetBestHitOverhang(0.1); // best_hit_overhang
512  options.SetBestHitScoreEdge(0.1); // best_hit_score_edge
// second constructor argument is 1 when the large-word aligner was added, else 0
// (presumably an ordering/threshold index -- confirm against CBlastAligner)
514  CRef<CBlastAligner> blastAligner(new CBlastAligner(*opts, useHiWordAligner ? 1 : 0));
516  if (canceled)
517  blastAligner->SetInterruptCallback(cb, canceled);
519  ng_aligner.AddAligner(blastAligner);
521  //ng_aligner.AddAligner(new CInversionMergeAligner(1));
523  // adding scores
525  ng_aligner.AddScorer(new CGPipeAlignmentScorer());
526  ng_aligner.AddScorer(new CCommonComponentScorer());
528  // add filters
// filters are numbered 0..4 with progressively looser identity/coverage limits
529  ng_aligner.AddFilter(new CQueryFilter(0, "pct_identity_gapopen_only >= 99.5 AND pct_coverage >= 99"));
530  ng_aligner.AddFilter(new CQueryFilter(1, "pct_identity_gapopen_only >= 95 AND pct_coverage >= 95"));
531  ng_aligner.AddFilter(new CQueryFilter(2, "pct_identity_gapopen_only >= 95 AND pct_coverage >= 50"));
532  ng_aligner.AddFilter(new CQueryFilter(3, "pct_identity_gapopen_only >= 80 AND pct_coverage >= 25"));
533  if (accept_atleast_one) {
534  ng_aligner.AddFilter(new CQueryFilter(4, "align_length > 2"));
535  }
537  CRef<CSeq_align_set> align = ng_aligner.Align();
// copy the resulting alignments into the returned vector
539  if (align && align->IsSet()) {
540  ITERATE(CSeq_align_set::Tdata, al_it, align->Get()) {
541  align_vector.push_back(CConstRef<CSeq_align>((*al_it)));
542  }
543  }
545  return align_vector;
546 }
548 static vector<CConstRef<CSeq_align> > s_RunBlast2NWSeq(const CBioseq_Handle& sh, const CBioseq_Handle& qh)
549 {
550  vector<CConstRef<CSeq_align> > align_vector;
555  string sSeq, qSeq;
556  sVec.GetSeqData(0, sVec.size(), sSeq);
557  sVec.GetSeqData(0, qVec.size(), qSeq);
558  NStr::ToUpper(sSeq);
559  NStr::ToUpper(qSeq);
561  CRef<CNWAligner> aligner(new CNWAligner(sSeq.c_str(), sSeq.size(), qSeq.c_str(), qSeq.size(), &NCBISM_Blosum62));
562  CNWAligner::TScore score = aligner->Run();
564  CNWFormatter formatter(*aligner);
565  CRef<CSeq_align> align =
566  formatter.AsSeqAlign(0, eNa_strand_plus, 0, eNa_strand_plus);
568  align->SetSegs().SetDenseg().SetIds()[0]->Assign(*sh.GetSeqId());
569  align->SetSegs().SetDenseg().SetIds()[1]->Assign(*qh.GetSeqId());
571  align_vector.push_back(align);
573  return align_vector;
574 }
576 static vector<CConstRef<CSeq_align> > s_RunBlast2AASeq(const CBioseq_Handle& sh, const CBioseq_Handle& qh)
577 {
578  vector<CConstRef<CSeq_align> > align_vector;
580  CRef<CSeq_loc> query_seqloc = qh.GetRangeSeq_loc(0, 0, eNa_strand_plus);
581  CRef<CSeq_loc> subject_seqloc = sh.GetRangeSeq_loc(0, 0, eNa_strand_plus); //is strand correct?
583  blast::SSeqLoc query(query_seqloc.GetPointerOrNull(), &qh.GetScope());
584  blast::SSeqLoc subject(subject_seqloc.GetPointerOrNull(), &sh.GetScope());
586  try {
587  blast::CBlastProteinOptionsHandle prot_opts_handle;
588  prot_opts_handle.SetEvalueThreshold(1e-6);
589  prot_opts_handle.SetWordThreshold(100.0);
590  prot_opts_handle.Validate();
592  blast::CBl2Seq blaster(query, subject, prot_opts_handle);
593  blast::TSeqAlignVector seqaligns = blaster.Run();
595  if (!seqaligns.empty()) {
596  ITERATE(blast::TSeqAlignVector, it, seqaligns) {
597  if ((*it)->IsSet()) {
598  ITERATE(CSeq_align_set::Tdata, al_it, (*it)->Get()) {
599  align_vector.push_back(CConstRef<CSeq_align>((*al_it)));
600  }
601  }
602  }
603  }
604  }
605  catch (const blast::CBlastException& e) {
606  ERR_POST(Error << string(e.what()));
607  NCBI_THROW(CSeqUpdateException, eAlignment, "Options or input parameters were not accepted for Blast");
608  }
610  return align_vector;
611 }
613 bool sequpd::CompareAlignments(const CSeq_align& align_first, const CSeq_align& align_sec)
614 {
615  const auto length_first = align_first.GetAlignLength();
616  const auto length_sec = align_sec.GetAlignLength();
617  return length_first >= length_sec;
618 }
