NCBI C++ ToolKit
sequence_set.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: sequence_set.cpp 94031 2021-06-16 20:14:38Z mozese2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Nathan Bouk
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiexpt.hpp>
34 #include <math.h>
36 
37 #include <objmgr/util/sequence.hpp>
38 #include <objmgr/scope.hpp>
40 #include <objmgr/annot_ci.hpp>
46 #include <objects/seq/Seq_ext.hpp>
51 
57 
63 
65 
68 
71 USING_SCOPE(blast);
72 
73 
74 CBlastDbSet::CBlastDbSet(const string& BlastDb) : m_BlastDb(BlastDb)
75 {
76  ;
77 }
78 
// Appends every GI in GiList, in order, to m_NegativeGiList.
// NOTE(review): the statement that sat between the opening brace and the
// loop (listing line 81) is absent from this extract; presumably it
// (re)creates m_NegativeGiList before the appends -- confirm against the
// repository copy before relying on this block.
79 void CBlastDbSet::SetNegativeGiList(const vector<TGi>& GiList)
80 {
82  ITERATE(vector<TGi>, GiIter, GiList) {
83  m_NegativeGiList->AppendGi(*GiIter);
84  }
85 }
86 
87 
// Appends every GI in GiList, in order, to m_PositiveGiList.
// NOTE(review): the statement that sat between the opening brace and the
// loop (listing line 90) is absent from this extract; presumably it
// (re)creates m_PositiveGiList before the appends -- confirm against the
// repository copy before relying on this block.
88 void CBlastDbSet::SetPositiveGiList(const vector<TGi>& GiList)
89 {
91  ITERATE(vector<TGi>, GiIter, GiList) {
92  m_PositiveGiList->AppendGi(*GiIter);
93  }
94 }
95 
96 
99  const CBlastOptionsHandle& BlastOpts)
100 {
101  NCBI_THROW(CException, eInvalid,
102  "CreateQueryFactory is not supported for type BlastDb");
103  return CRef<IQueryFactory>();
104 }
105 
106 
109  const CBlastOptionsHandle& BlastOpts,
110  const CAlignResultsSet& Alignments, int Threshold)
111 {
112  NCBI_THROW(CException, eInvalid,
113  "CreateQueryFactory is not supported for type BlastDb");
114  return CRef<IQueryFactory>();
115 }
116 
117 
120  const CBlastOptionsHandle& BlastOpts)
121 {
122  if(m_BlastDb.empty()) {
123  NCBI_THROW(CException, eInvalid,
124  "CBLastDb::CreateLocalDbAdapter: BlastDb is empty.");
125  }
126  CRef<CSearchDatabase> SearchDb;
128 
129  if(! m_Filter.empty()) {
131  }
132 
135  }
136 
138  SearchDb->SetGiList(m_PositiveGiList);
139  }
140 
141  CRef<CLocalDbAdapter> Result;
142  Result.Reset(new CLocalDbAdapter(*SearchDb));
143  return Result;
144 }
145 
146 
147 
149 {
150  ;
151 }
152 
153 
154 list<CRef<CSeq_id> >& CSeqIdListSet::SetIdList()
155 {
156  return m_SeqIdList;
157 }
158 
159 
161 {
162  m_SeqMasker = SeqMasker;
163 }
164 
165 
166 void CSeqIdListSet::GetGiList(vector<TGi>& GiList, CScope& Scope,
167  const CAlignResultsSet& Alignments, int Threshold)
168 {
169  ITERATE(list<CRef<CSeq_id> >, IdIter, m_SeqIdList) {
170 
171  if(Alignments.QueryExists(**IdIter)) {
172  CConstRef<CQuerySet> QuerySet = Alignments.GetQuerySet(**IdIter);
173  int BestRank = QuerySet->GetBestRank();
174  if(BestRank != -1 && BestRank <= Threshold) {
175  continue;
176  }
177  }
178 
179  TGi Gi;
180  Gi = sequence::GetGiForId(**IdIter, Scope);
181  if(Gi != ZERO_GI && Gi != INVALID_GI) {
182  GiList.push_back(Gi);
183  }
184  }
185 }
186 
187 
189  CSeqMasker* SeqMasker,
190  CScope& Scope)
191 {
// Builds a mask location for Id: runs *SeqMasker and a CSymDustMasker over
// the sequence vector, pools both interval lists, and returns them as a
// packed-int Seq-loc.  Returns a null CRef when the pooled list is empty.
// NOTE(review): the declarations of Handle and Vector (listing lines
// 193-194) and the start of the statement ending in "&Scope);" (listing
// line 232) are absent from this extract -- check the repository copy.
192  unique_ptr<CSeqMasker::TMaskList> Masks, DustMasks;
195 
196  try {
197  Handle = Scope.GetBioseqHandle(Id);
198  Vector = Handle.GetSeqVector(Handle.eCoding_Iupac, Handle.eStrand_Plus);
199  } catch(CException& e) {
// NOTE(review): the message names CSeqIdListSet::CreateQueryFactory, not
// this function.
200  ERR_POST(Error << "CSeqIdListSet::CreateQueryFactory GetSeqVector error: " << e.ReportAll());
// NOTE(review): `throw e;` throws a new object copied from the static type
// CException, so a more-derived exception loses its dynamic type; a bare
// `throw;` would rethrow the original object unchanged.
201  throw e;
202  }
203 
204  CSymDustMasker DustMasker;
205 
206  try {
// SeqMasker is dereferenced without a null check; the callers visible in
// this file only reach here when m_SeqMasker != NULL.
207  Masks.reset((*SeqMasker)(Vector));
208  DustMasks = DustMasker(Vector);
209  } catch(CException& e) {
// This handler covers both the SeqMasker call and the dust masker call,
// although the message mentions only dust masking.
210  ERR_POST(Error << "CSeqIdListSet::CreateQueryFactory Dust Masking Failure: " << e.ReportAll());
// NOTE(review): same slicing concern as the first handler (`throw e;`).
211  throw e;
212  }
213 
// Pool the dust intervals into Masks (inserted at its end).
214  if(!DustMasks->empty()) {
215  copy(DustMasks->begin(), DustMasks->end(),
216  insert_iterator<CSeqMasker::TMaskList>(*Masks, Masks->end()));
217  }
218 
// Nothing masked at all: signal that with a null reference.
219  if(Masks->empty()) {
220  return CRef<CSeq_loc>();
221  }
222 
// Turn each (first, second) interval into an interval Seq-loc on Id and
// accumulate them in MaskLoc.
223  CRef<CSeq_loc> MaskLoc(new CSeq_loc);
224  ITERATE(CSeqMasker::TMaskList, IntIter, *Masks) {
225  CSeq_loc IntLoc;
226  IntLoc.SetInt().SetId().Assign(Id);
227  IntLoc.SetInt().SetFrom() = IntIter->first;
228  IntLoc.SetInt().SetTo() = IntIter->second;
229  MaskLoc->Add(IntLoc);
230  }
231 
233  &Scope);
234  MaskLoc->ChangeToPackedInt();
235 
236  return MaskLoc;
237 }
238 
239 
241  CScope& Scope)
242 {
244  if(!Handle) {
245  cerr << "s_GetClipLoc: Could not get Handle for " << MSerial_AsnText << Id << endl;
246  cerr << Id.GetSeqIdString(true) << endl;
247  return CRef<CSeq_loc>();
248  }
249 
250  // Extract the Seq-annot.locs, for the clip region.
251  CRef<CSeq_loc> ClipLoc;
252 
254  CAnnot_CI AnnotIter(Handle, Sel);
255 
256  while(AnnotIter) {
257  if (AnnotIter->IsFtable() &&
258  AnnotIter->IsNamed() &&
259  AnnotIter->GetName() == "NCBI_GPIPE") {
260  CConstRef<CSeq_annot> Annot = AnnotIter->GetCompleteSeq_annot();
261 
263  Annot->GetData().GetFtable()) {
264  CConstRef<CSeq_feat> Feat = *FeatIter;
265  if (Feat->CanGetLocation() &&
266  Feat->CanGetData() &&
267  Feat->GetData().IsRegion() &&
268  (Feat->GetData().GetRegion() == "high_quality" ||
269  Feat->GetData().GetRegion() == "hight_quality") ) {
270 
271  ClipLoc.Reset(new CSeq_loc);
272  ClipLoc->Assign(Feat->GetLocation());
273  }
274  }
275  }
276  ++AnnotIter;
277  }
278 
279 
280  if(ClipLoc.IsNull() && Handle.HasAnnots() ) {
281  CConstRef<CBioseq> Bioseq = Handle.GetCompleteBioseq();
282  ITERATE(CBioseq::TAnnot, AnnotIter, Bioseq->GetAnnot()) {
283  if( (*AnnotIter)->GetData().IsLocs() ) {
285  LocIter, (*AnnotIter)->GetData().GetLocs()) {
286  if( (*LocIter)->IsInt() &&
287  (*LocIter)->GetInt().GetId().Equals(Id) ) {
288  ClipLoc.Reset(new CSeq_loc);
289  ClipLoc->Assign(**LocIter);
290  }
291  }
292  }
293  }
294  }
295 
296 
297  return ClipLoc;
298 }
299 
300 
302  CScope& Scope)
303 {
305  if(!Handle) {
306  cerr << "s_GetUngapLoc: Could not get Handle for " << MSerial_AsnText << Id << endl;
307  cerr << Id.GetSeqIdString(true) << endl;
308  return CRef<CSeq_loc>();
309  }
310 
311  // Extract the Seq-annot.locs, for the not-gap region.
312  CRef<CSeq_loc> UngapLoc;
313 
314  if(!Handle.IsSetInst_Ext()) {
315  CRef<CSeq_loc> WholeLoc(new CSeq_loc);
316  WholeLoc->SetWhole().Assign(Id);
317  return WholeLoc;
318  }
319 
320  const CSeq_ext& Ext = Handle.GetInst_Ext();
321 
322  if(!Ext.IsDelta()) {
323  CRef<CSeq_loc> WholeLoc(new CSeq_loc);
324  WholeLoc->SetWhole().Assign(Id);
325  return WholeLoc;
326  }
327 
328  UngapLoc.Reset(new CSeq_loc);
329 
330  TSeqPos Curr = 0;
331  ITERATE(CDelta_ext::Tdata, SeqIter, Ext.GetDelta().Get()) {
332  const CDelta_seq& Seq = **SeqIter;
333 
334  if(Seq.IsLiteral() && Seq.GetLiteral().IsSetLength()) {
335  Curr += Seq.GetLiteral().GetLength();
336  continue;
337  }
338  else if(Seq.IsLoc() && Seq.GetLoc().IsInt()) {
339  TSeqPos Length = Seq.GetLoc().GetInt().GetLength();
340 
341  CSeq_loc IntLoc;
342  IntLoc.SetInt().SetId().Assign(Id);
343  IntLoc.SetInt().SetFrom() = Curr;
344  IntLoc.SetInt().SetTo() = Curr + Length - 1;
345  UngapLoc->Add(IntLoc);
346 
347  Curr += Length;
348  }
349  }
350 
352  &Scope);
353  UngapLoc->ChangeToPackedInt();
354 
355  return UngapLoc;
356 }
357 
358 
361  const CBlastOptionsHandle& BlastOpts)
362 {
363  if(m_SeqIdList.empty()) {
364  NCBI_THROW(CException, eInvalid,
365  "CSeqIdListSet::CreateQueryFactory: Id List is empty.");
366  }
367 
368  TSeqLocVector FastaLocVec;
369  ITERATE(list<CRef<CSeq_id> >, IdIter, m_SeqIdList) {
370 
371  CRef<CSeq_loc> WholeLoc;
372  WholeLoc = s_GetClipLoc(**IdIter, Scope);
373  if(WholeLoc.IsNull()) {
374  //WholeLoc = s_GetUngapLoc(**IdIter, Scope);
375  WholeLoc.Reset(new CSeq_loc);
376  WholeLoc->SetWhole().Assign(**IdIter);
377  }
378  string FilterStr = BlastOpts.GetFilterString();
379  if(m_SeqMasker == NULL || FilterStr.find('m') == string::npos ) {
380  SSeqLoc WholeSLoc(*WholeLoc, Scope);
381  FastaLocVec.push_back(WholeSLoc);
382  } else {
383  CRef<CSeq_loc> MaskLoc;
384  MaskLoc = s_GetMaskLoc(**IdIter, m_SeqMasker, Scope);
385 
386  if(MaskLoc.IsNull() /* || Vec.size() < 100*/ ) {
387  SSeqLoc WholeSLoc(*WholeLoc, Scope);
388  FastaLocVec.push_back(WholeSLoc);
389  } else {
390  SSeqLoc MaskSLoc(*WholeLoc, Scope, *MaskLoc);
391  FastaLocVec.push_back(MaskSLoc);
392  }
393  }
394  }
395 
396  CRef<IQueryFactory> Result;
397  if(!FastaLocVec.empty())
398  Result.Reset(new CObjMgr_QueryFactory(FastaLocVec));
399  return Result;
400 }
401 
402 
405  const CBlastOptionsHandle& BlastOpts,
406  const CAlignResultsSet& Alignments, int Threshold)
407 {
408  if(m_SeqIdList.empty()) {
409  NCBI_THROW(CException, eInvalid,
410  "CSeqIdListSet::CreateQueryFactory: Id List is empty.");
411  }
412 
413  TSeqLocVector FastaLocVec;
414  ITERATE(list<CRef<CSeq_id> >, IdIter, m_SeqIdList) {
415 
416  if(Alignments.QueryExists(**IdIter)) {
417  CConstRef<CQuerySet> QuerySet = Alignments.GetQuerySet(**IdIter);
418  int BestRank = QuerySet->GetBestRank();
419  if(BestRank != -1 && BestRank <= Threshold) {
420  continue;
421  }
422  }
423 
424  _TRACE("Blast Including ID: " << (*IdIter)->GetSeqIdString(true));
425 
426 
427  CRef<CSeq_loc> WholeLoc;
428  WholeLoc = s_GetClipLoc(**IdIter, Scope);
429  if(WholeLoc.IsNull()) {
430  //WholeLoc = s_GetUngapLoc(**IdIter, Scope);
431  WholeLoc.Reset(new CSeq_loc);
432  WholeLoc->SetWhole().Assign(**IdIter);
433  }
434 
435  string FilterStr = BlastOpts.GetFilterString();
436  if(m_SeqMasker == NULL || FilterStr.find('m') == string::npos ) {
437  SSeqLoc WholeSLoc(*WholeLoc, Scope);
438  FastaLocVec.push_back(WholeSLoc);
439  } else {
440  CRef<CSeq_loc> MaskLoc;
441  MaskLoc = s_GetMaskLoc(**IdIter, m_SeqMasker, Scope);
442 
443  if(MaskLoc.IsNull()) {
444  SSeqLoc WholeSLoc(*WholeLoc, Scope);
445  FastaLocVec.push_back(WholeSLoc);
446  } else {
447  SSeqLoc MaskSLoc(*WholeLoc, Scope, *MaskLoc);
448  FastaLocVec.push_back(MaskSLoc);
449  }
450  }
451  }
452 
453  CRef<IQueryFactory> Result;
454  if(!FastaLocVec.empty())
455  Result.Reset(new CObjMgr_QueryFactory(FastaLocVec));
456  return Result;
457 }
458 
459 
460 
463  const CBlastOptionsHandle& BlastOpts)
464 {
465  if(m_SeqIdList.empty()) {
466  NCBI_THROW(CException, eInvalid,
467  "CSeqIdListSet::CreateLocalDbAdapter: Id List is empty.");
468  }
469 
470  CRef<CLocalDbAdapter> Result;
471  CRef<IQueryFactory> QueryFactory = CreateQueryFactory(Scope, BlastOpts);
472  Result.Reset(new CLocalDbAdapter(QueryFactory,
473  CConstRef<CBlastOptionsHandle>(&BlastOpts)));
474  return Result;
475 }
476 
477 
478 
480 {
481  ;
482 }
483 
484 
485 list<CRef<CSeq_loc> >& CSeqLocListSet::SetLocList()
486 {
487  return m_SeqLocList;
488 }
489 
490 
492 {
493  m_SeqMasker = SeqMasker;
494 }
495 
496 
497 void CSeqLocListSet::GetGiList(vector<TGi>& GiList, CScope& Scope,
498  const CAlignResultsSet& Alignments, int Threshold)
499 {
500  ITERATE(list<CRef<CSeq_loc> >, LocIter, m_SeqLocList) {
501 
502  const CSeq_id* Id = (*LocIter)->GetId();
503 
504  if(Id == NULL)
505  continue;
506 
507  if(Alignments.QueryExists(*Id)) {
508  CConstRef<CQuerySet> QuerySet = Alignments.GetQuerySet(*Id);
509  int BestRank = QuerySet->GetBestRank();
510  if(BestRank != -1 && BestRank <= Threshold) {
511  continue;
512  }
513  }
514 
515  TGi Gi = INVALID_GI;
516  try {
517  Gi = sequence::GetGiForId(*Id, Scope);
518  } catch(...) { Gi = INVALID_GI; }
519  if(Gi != ZERO_GI && Gi != INVALID_GI) {
520  GiList.push_back(Gi);
521  }
522  }
523 }
524 
525 
528  const CBlastOptionsHandle& BlastOpts)
529 {
530  if(m_SeqLocList.empty()) {
531  NCBI_THROW(CException, eInvalid,
532  "CSeqLocListSet::CreateQueryFactory: Loc List is empty.");
533  }
534 
535  TSeqLocVector FastaLocVec;
536  ITERATE(list<CRef<CSeq_loc> >, LocIter, m_SeqLocList) {
537  const CSeq_id* Id = (*LocIter)->GetId();
538  if(Id == NULL)
539  continue;
540 
541  ERR_POST(Info << "Blast Including Loc: " << Id->AsFastaString() << " " << (*LocIter)->GetTotalRange() );
542 
543  CRef<CSeq_loc> BaseLoc(new CSeq_loc);
544  BaseLoc->Assign(**LocIter);
545  CRef<CSeq_loc> ClipLoc = s_GetClipLoc(*Id, Scope);
546  if(!ClipLoc.IsNull()) {
547  CRef<CSeq_loc> Inters;
548  Inters = BaseLoc->Intersect(*ClipLoc, CSeq_loc::fSortAndMerge_All, NULL);
549  if(!Inters.IsNull()) {
550  BaseLoc->Assign(*Inters);
551  }
552  }
553 
554  string FilterStr = BlastOpts.GetFilterString();
555  if(m_SeqMasker == NULL || FilterStr.find('m') == string::npos ) {
556  SSeqLoc BaseSLoc(*BaseLoc, Scope);
557  FastaLocVec.push_back(BaseSLoc);
558  } else {
559  CRef<CSeq_loc> MaskLoc;
560  MaskLoc = s_GetMaskLoc(*Id, m_SeqMasker, Scope);
561 
562  if(MaskLoc.IsNull() /* || Vec.size() < 100*/ ) {
563  SSeqLoc BaseSLoc(*BaseLoc, Scope);
564  FastaLocVec.push_back(BaseSLoc);
565  } else {
566  SSeqLoc MaskSLoc(*BaseLoc, Scope, *MaskLoc);
567  FastaLocVec.push_back(MaskSLoc);
568  }
569  }
570  }
571 
572  CRef<IQueryFactory> Result;
573  if(!FastaLocVec.empty())
574  Result.Reset(new CObjMgr_QueryFactory(FastaLocVec));
575  return Result;
576 }
577 
578 
581  const CBlastOptionsHandle& BlastOpts,
582  const CAlignResultsSet& Alignments, int Threshold)
583 {
584  if(m_SeqLocList.empty()) {
585  NCBI_THROW(CException, eInvalid,
586  "CSeqLocListSet::CreateQueryFactory: Loc List is empty.");
587  }
588 
589  TSeqLocVector FastaLocVec;
590  ITERATE(list<CRef<CSeq_loc> >, LocIter, m_SeqLocList) {
591  const CSeq_id* Id = (*LocIter)->GetId();
592  if(Id == NULL)
593  continue;
594 
595  if(Alignments.QueryExists(*Id)) {
596  CConstRef<CQuerySet> QuerySet = Alignments.GetQuerySet(*Id);
597  int BestRank = QuerySet->GetBestRank();
598  if(BestRank != -1 && BestRank <= Threshold) {
599  continue;
600  }
601  }
602 
603  ERR_POST(Info << "Blast Including Loc: " << Id->AsFastaString() << " " << (*LocIter)->GetTotalRange() );
604 
605 
606  CRef<CSeq_loc> BaseLoc(new CSeq_loc), ClipLoc;
607  BaseLoc->Assign(**LocIter);
608  ClipLoc = s_GetClipLoc(*Id, Scope);
609  if(!ClipLoc.IsNull()) {
610  CRef<CSeq_loc> Inters;
611  Inters = BaseLoc->Intersect(*ClipLoc, CSeq_loc::fSortAndMerge_All, NULL);
612  if(!Inters.IsNull()) {
613  BaseLoc->Assign(*Inters);
614  }
615  }
616 
617  string FilterStr = BlastOpts.GetFilterString();
618  if(m_SeqMasker == NULL || FilterStr.find('m') == string::npos ) {
619  SSeqLoc BaseSLoc(*BaseLoc, Scope);
620  FastaLocVec.push_back(BaseSLoc);
621  } else {
622  CRef<CSeq_loc> MaskLoc;
623  MaskLoc = s_GetMaskLoc(*Id, m_SeqMasker, Scope);
624 
625  if(MaskLoc.IsNull()) {
626  SSeqLoc BaseSLoc(*BaseLoc, Scope);
627  FastaLocVec.push_back(BaseSLoc);
628  } else {
629  SSeqLoc MaskSLoc(*BaseLoc, Scope, *MaskLoc);
630  FastaLocVec.push_back(MaskSLoc);
631  }
632  }
633 
634  }
635 
636  CRef<IQueryFactory> Result;
637  if(!FastaLocVec.empty())
638  Result.Reset(new CObjMgr_QueryFactory(FastaLocVec));
639  return Result;
640 }
641 
642 
645  const CBlastOptionsHandle& BlastOpts)
646 {
647  if(m_SeqLocList.empty()) {
648  NCBI_THROW(CException, eInvalid,
649  "CSeqLocListSet::CreateLocalDbAdapter: Loc List is empty.");
650  }
651 
652  CRef<CLocalDbAdapter> Result;
653  CRef<IQueryFactory> QueryFactory = CreateQueryFactory(Scope, BlastOpts);
654  Result.Reset(new CLocalDbAdapter(QueryFactory,
655  CConstRef<CBlastOptionsHandle>(&BlastOpts)));
656  return Result;
657 }
658 
659 
660 
662  : m_FastaStream(FastaStream), m_LowerCaseMasking(true),
663  m_Start(-1), m_Count(-1)
664 {
665  ;
666 }
667 
668 
669 CFastaFileSet::CFastaFileSet(CNcbiIstream* FastaStream, int Start, int Count)
670  : m_FastaStream(FastaStream), m_LowerCaseMasking(true),
671  m_Start(Start), m_Count(Count)
672 {
673  ;
674 }
675 
676 
677 void CFastaFileSet::EnableLowerCaseMasking(bool LowerCaseMasking)
678 {
679  m_LowerCaseMasking = LowerCaseMasking;
680 }
681 
682 
685  const CBlastOptionsHandle& BlastOpts)
686 {
687  if(m_FastaStream == NULL) {
688  NCBI_THROW(CException, eInvalid,
689  "CFastaFileSet::CreateQueryFactory: Fasta Stream is NULL.");
690  }
691 
692  m_FastaStream->clear();
693  m_FastaStream->seekg(0, std::ios::beg);
694  CFastaReader FastaReader(*m_FastaStream);
695  Scope.AddTopLevelSeqEntry(*(FastaReader.ReadSet()));
696 
697  SDataLoaderConfig LoaderConfig(false);
698  CBlastInputSourceConfig InputConfig(LoaderConfig);
699  InputConfig.SetLowercaseMask(m_LowerCaseMasking);
700  InputConfig.SetBelieveDeflines(true);
701 
702  m_FastaStream->clear();
703  m_FastaStream->seekg(0, std::ios::beg);
704  CBlastFastaInputSource FastaSource(*m_FastaStream, InputConfig);
705  const EProgram kProgram = eBlastn;
706  CBlastInput Input(&FastaSource, GetQueryBatchSize(kProgram));
707 
708  TSeqLocVector FastaLocVec = Input.GetAllSeqLocs(Scope);
709  //ITERATE(TSeqLocVector, LocIter, FastaLocVec) {
710  // cerr << *LocIter->seqloc->GetId() << endl;
711  //}
712 
713  m_FastaStream->clear();
714  m_FastaStream->seekg(0, std::ios::beg);
715 
716  CRef<IQueryFactory> Result(new CObjMgr_QueryFactory(FastaLocVec));
717  return Result;
718 }
719 
720 
723  const CBlastOptionsHandle& BlastOpts,
724  const CAlignResultsSet& Alignments, int Threshold)
725 {
// Builds a query factory from the FASTA stream, optionally restricted to a
// window of entries (m_Start / m_Count) and with every entry removed whose
// existing best rank in Alignments is set (not -1) and <= Threshold.
// Returns a null CRef when no entries remain.  Throws CException when the
// stream pointer is NULL.  The stream is rewound before each read and once
// more before returning.
// NOTE(review): the head of the signature (listing lines 721-722) and the
// declaration of Handle (listing line 739) are absent from this extract.
726  if(m_FastaStream == NULL) {
727  NCBI_THROW(CException, eInvalid,
728  "CFastaFileSet::CreateQueryFactory: Fasta Stream is NULL.");
729  }
730 
731  m_FastaStream->clear();
732  m_FastaStream->seekg(0, std::ios::beg);
733  CFastaReader FastaReader(*m_FastaStream);
734  CRef<CSeq_entry> Entry = FastaReader.ReadSet();
// Add the parsed entry to Scope unless the first sequence's first id is
// already known (PreExisting).  Any exception raised in this step is
// logged at Info severity and otherwise ignored.
735  try {
736  bool PreExisting = false;
737  if(Entry->IsSet() && Entry->GetSet().GetSeq_set().front()->GetSeq().GetFirstId() != NULL) {
738  const CSeq_id& Id = *Entry->GetSet().GetSeq_set().front()->GetSeq().GetFirstId();
// Handle presumably comes from a Scope lookup of Id on the missing
// listing line 739 -- confirm.
740  if(Handle)
741  PreExisting = true;
742  }
743  if(!PreExisting)
744  Scope.AddTopLevelSeqEntry(*Entry);
745  } catch(...) {
746  ERR_POST(Info << "Eating the Scope Fasta Dup Insert Exception");
747  }
748  SDataLoaderConfig LoaderConfig(false);
749  CBlastInputSourceConfig InputConfig(LoaderConfig);
750  InputConfig.SetLowercaseMask(m_LowerCaseMasking);
751  InputConfig.SetBelieveDeflines(true);
752 
// Second pass over the same stream, this time through the BLAST input
// reader to obtain the Seq-loc vector.
753  m_FastaStream->clear();
754  m_FastaStream->seekg(0, std::ios::beg);
755  CBlastFastaInputSource FastaSource(*m_FastaStream, InputConfig);
// The batch size is always the one for eBlastn; BlastOpts is not used in
// the lines visible here.
756  const EProgram kProgram = eBlastn;
757  CBlastInput Input(&FastaSource, GetQueryBatchSize(kProgram));
758 
759  TSeqLocVector FastaLocVec = Input.GetAllSeqLocs(Scope);
760 
// Windowing: i is the entry's index in the unfiltered vector.  Entries with
// i < m_Start or i > m_Start + m_Count are erased; the rest are kept.
// NOTE(review): the upper bound is inclusive, so the window holds
// m_Count + 1 entries, and since m_Start then advances by m_Count, the
// next call's first entry repeats this call's last one.  Confirm whether
// `i >= m_Start + m_Count` was intended.
761  if(m_Count > 0) {
762  int i = 0;
763  TSeqLocVector::iterator Curr;
764  for(Curr = FastaLocVec.begin(); Curr != FastaLocVec.end(); ) {
765  //cerr << " * " << Curr->seqloc->GetId()->AsFastaString() << " * " << endl;
766  if( i < m_Start) {
767  Curr = FastaLocVec.erase(Curr);
768  i++;
769  continue;
770  }
771  else if( i > m_Start + m_Count) {
772  Curr = FastaLocVec.erase(Curr);
773  i++;
774  continue;
775  }
776  else {
777  ++Curr;
778  i++;
779  continue;
780  }
781  }
// Member state changes here: each call with m_Count > 0 moves the window
// forward for the next call.
782  m_Start += m_Count;
783  }
784 
785 
// Drop entries that already have a good-enough result in Alignments.
// NOTE(review): GetId() is dereferenced without a NULL check here, whereas
// the CSeqLocListSet methods in this file test the pointer first.
786  TSeqLocVector::iterator Curr;
787  for(Curr = FastaLocVec.begin(); Curr != FastaLocVec.end(); ) {
788  if(Alignments.QueryExists(*Curr->seqloc->GetId())) {
789  CConstRef<CQuerySet> QuerySet = Alignments.GetQuerySet(*Curr->seqloc->GetId());
790  int BestRank = QuerySet->GetBestRank();
791  if(BestRank != -1 && BestRank <= Threshold) {
792  Curr = FastaLocVec.erase(Curr);
793  continue;
794  }
795  }
796  ++Curr;
797  }
798 
799 
800  m_FastaStream->clear();
801  m_FastaStream->seekg(0, std::ios::beg);
802 
803  if(FastaLocVec.empty())
804  return CRef<IQueryFactory>();
805 
806  CRef<IQueryFactory> Result(new CObjMgr_QueryFactory(FastaLocVec));
807  return Result;
808 }
809 
810 
813  const CBlastOptionsHandle& BlastOpts)
814 {
815  CRef<CLocalDbAdapter> Result;
816  CRef<IQueryFactory> QueryFactory = CreateQueryFactory(Scope, BlastOpts);
817  Result.Reset(new CLocalDbAdapter(QueryFactory, CConstRef<CBlastOptionsHandle>(&BlastOpts)));
818  return Result;
819 }
820 
821 
823 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
USING_SCOPE(objects)
CRef< CSeq_loc > s_GetMaskLoc(const CSeq_id &Id, CSeqMasker *SeqMasker, CScope &Scope)
CRef< CSeq_loc > s_GetClipLoc(const CSeq_id &Id, CScope &Scope)
CRef< CSeq_loc > s_GetUngapLoc(const CSeq_id &Id, CScope &Scope)
Data loader implementation that uses the blast databases.
Declares the CBl2Seq (BLAST 2 Sequences) class.
@ eSoftSubjMasking
Definition: blast_def.h:237
Interface for reading SRA sequences into blast input.
Auxiliary classes/functions for BLAST input library.
int GetQueryBatchSize(EProgram program, bool is_ungapped=false, bool remote=false, bool use_default=true, string task="", bool mt_mode=false)
Retrieve the appropriate batch size for the specified task.
Declares the CBlastNucleotideOptionsHandle class.
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
Declares CBlastScopeSource class to create properly configured CScope objects to invoke the BLAST dat...
Definitions of special type used in BLAST.
EProgram
This enumeration is to evolve into a task/program specific list that specifies sets of default parame...
Definition: blast_types.hpp:56
@ eBlastn
Nucl-Nucl (traditional blastn)
Definition: blast_types.hpp:58
CRef< CQuerySet > GetQuerySet(const objects::CSeq_id &Id)
bool QueryExists(const objects::CSeq_id &Id) const
Definition: result_set.cpp:506
CAnnot_CI –.
Definition: annot_ci.hpp:59
CBioseq_Handle –.
void SetPositiveGiList(const vector< TGi > &GiList)
CRef< CInputGiList > m_PositiveGiList
CRef< blast::CLocalDbAdapter > CreateLocalDbAdapter(objects::CScope &Scope, const blast::CBlastOptionsHandle &BlastOpts)
string m_BlastDb
CRef< CInputGiList > m_NegativeGiList
CRef< blast::IQueryFactory > CreateQueryFactory(objects::CScope &Scope, const blast::CBlastOptionsHandle &BlastOpts)
void SetNegativeGiList(const vector< TGi > &GiList)
Class representing a text file containing sequences in fasta format.
Class that centralizes the configuration data for sequences to be converted.
Definition: blast_input.hpp:48
Generalized converter from an abstract source of biological sequence data to collections of blast inp...
Handle to the options to the BLAST algorithm.
CConstRef –.
Definition: ncbiobj.hpp:1266
CDelta_seq –.
Definition: Delta_seq.hpp:66
CFastaFileSet(CNcbiIstream *FastaStream)
void EnableLowerCaseMasking(bool LowerCaseMasking)
CRef< blast::CLocalDbAdapter > CreateLocalDbAdapter(objects::CScope &Scope, const blast::CBlastOptionsHandle &BlastOpts)
CRef< blast::IQueryFactory > CreateQueryFactory(objects::CScope &Scope, const blast::CBlastOptionsHandle &BlastOpts)
CNcbiIstream * m_FastaStream
Base class for reading FASTA sequences.
Definition: fasta.hpp:80
Gi List for database construction.
void AppendGi(TGi gi, int oid=-1)
Append a GI.
Interface to create a BlastSeqSrc suitable for use in CORE BLAST from a variety of BLAST database/s...
NCBI C++ Object Manager dependent implementation of IQueryFactory.
CScope –.
Definition: scope.hpp:92
Blast Search Subject.
bool NotEmpty() const
Return true if there are elements present.
list< CRef< objects::CSeq_id > > & SetIdList()
CRef< blast::IQueryFactory > CreateQueryFactory(objects::CScope &Scope, const blast::CBlastOptionsHandle &BlastOpts)
CRef< blast::CLocalDbAdapter > CreateLocalDbAdapter(objects::CScope &Scope, const blast::CBlastOptionsHandle &BlastOpts)
list< CRef< objects::CSeq_id > > m_SeqIdList
void GetGiList(vector< TGi > &GiList, objects::CScope &Scope, const CAlignResultsSet &Alignments, int Threshold)
CSeqMasker * m_SeqMasker
void SetSeqMasker(CSeqMasker *SeqMasker)
CRef< blast::IQueryFactory > CreateQueryFactory(objects::CScope &Scope, const blast::CBlastOptionsHandle &BlastOpts)
list< CRef< objects::CSeq_loc > > m_SeqLocList
CSeqMasker * m_SeqMasker
void SetSeqMasker(CSeqMasker *SeqMasker)
list< CRef< objects::CSeq_loc > > & SetLocList()
CRef< blast::CLocalDbAdapter > CreateLocalDbAdapter(objects::CScope &Scope, const blast::CBlastOptionsHandle &BlastOpts)
void GetGiList(vector< TGi > &GiList, objects::CScope &Scope, const CAlignResultsSet &Alignments, int Threshold)
Main interface to window based masker functionality.
Definition: seq_masker.hpp:53
vector< TMaskedInterval > TMaskList
A type representing the total of masking information about a sequence.
Definition: seq_masker.hpp:74
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_ext –.
Definition: Seq_ext.hpp:66
TSeqPos GetLength(void) const
Looks for low complexity parts of sequences according to the symmetric version of DUST algorithm.
Definition: symdust.hpp:61
Declares the CDiscNucleotideOptionsHandle class.
Operators to edit gaps in sequences.
#define true
Definition: bool.h:35
void SetNegativeGiList(CSeqDBGiList *gilist)
Mutator for the negative gi list.
void SetFilteringAlgorithm(int filt_algorithm_id)
Temporary fix for backwards compatibility with other 6.0 SCs.
char * GetFilterString() const
Returns FilterString.
void SetGiList(CSeqDBGiList *gilist)
Mutator for the gi list.
@ eBlastDbIsNucleotide
nucleotide
#define INVALID_GI
Definition: ncbimisc.hpp:1089
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define NULL
Definition: ncbistd.hpp:225
#define Handle
Definition: ncbistd.hpp:119
#define _TRACE(message)
Definition: ncbidbg.hpp:122
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
string ReportAll(TDiagPostFlags flags=eDPF_Exception) const
Report all exceptions.
Definition: ncbiexpt.cpp:370
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
void ChangeToPackedInt(void)
Works only if location is currently an interval, point, packed-int (handled trivially),...
Definition: Seq_loc.cpp:3670
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
void Add(const CSeq_loc &other)
Simple adding of seq-locs.
Definition: Seq_loc.cpp:3875
CRef< CSeq_loc > Intersect(const CSeq_loc &other, TOpFlags flags, ISynonymMapper *syn_mapper) const
Find the intersection with the seq-loc, merge/sort resulting ranges depending on flags.
Definition: Seq_loc.cpp:5183
@ fSortAndMerge_All
Definition: Seq_loc.hpp:334
CRef< CSeq_loc > Seq_loc_Merge(const CSeq_loc &loc, CSeq_loc::TOpFlags flags, CScope *scope)
Merge ranges in the seq-loc.
TGi GetGiForId(const objects::CSeq_id &id, CScope &scope, EGetIdType flags=0)
Given a Seq-id retrieve the corresponding GI.
Definition: sequence.cpp:668
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders Add object to the score with p...
Definition: scope.cpp:522
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
bool IsNamed(void) const
bool IsFtable(void) const
CConstRef< CSeq_annot > GetCompleteSeq_annot(void) const
Complete and return const reference to the current seq-annot.
const string & GetName(void) const
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:735
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
const TRegion & GetRegion(void) const
Get the variant data.
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
bool CanGetData(void) const
Check if it is safe to call GetData method.
Definition: Seq_feat_.hpp:919
bool CanGetLocation(void) const
Check if it is safe to call GetLocation method.
Definition: Seq_feat_.hpp:1111
bool IsRegion(void) const
Check if variant Region is selected.
@ e_Region
named region (globin locus)
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
const TLiteral & GetLiteral(void) const
Get the variant data.
Definition: Delta_seq_.cpp:124
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
bool IsLoc(void) const
Check if variant Loc is selected.
Definition: Delta_seq_.hpp:257
TLength GetLength(void) const
Get the Length member data.
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
bool IsSetLength(void) const
must give a length in residues Check if a value has been assigned to Length data member.
list< CRef< CSeq_loc > > TLocs
Definition: Seq_annot_.hpp:197
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
const TLoc & GetLoc(void) const
Get the variant data.
Definition: Delta_seq_.cpp:102
const TFtable & GetFtable(void) const
Get the variant data.
Definition: Seq_annot_.hpp:621
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_annot_.hpp:873
bool IsLiteral(void) const
Check if variant Literal is selected.
Definition: Delta_seq_.hpp:263
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
list< CRef< CDelta_seq > > Tdata
Definition: Delta_ext_.hpp:89
int i
Magic spell ;-) needed for some weird compilers... very empiric.
Defines NCBI C++ exception handling.
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
The Object manager core.
Defines exception class and several constants for SeqDB.
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
Definition: entry.h:57
SAnnotSelector –.
Configuration structure for the CBlastScopeSource.
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
#define const
Definition: zconf.h:232
Modified on Fri Sep 20 14:57:35 2024 by modify_doxy.py rev. 669887