NCBI C++ ToolKit
igblast.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* ===========================================================================
2  *
3  * PUBLIC DOMAIN NOTICE
4  * National Center for Biotechnology Information
5  *
6  * This software/database is a "United States Government Work" under the
7  * terms of the United States Copyright Act. It was written as part of
8  * the author's official duties as a United States Government employee and
9  * thus cannot be copyrighted. This software/database is freely available
10  * to the public for use. The National Library of Medicine and the U.S.
11  * Government have not placed any restriction on its use or reproduction.
12  *
13  * Although all reasonable efforts have been taken to ensure the accuracy
14  * and reliability of the software and data, the NLM and the U.S.
15  * Government do not and cannot warrant the performance or results that
16  * may be obtained by using this software or data. The NLM and the U.S.
17  * Government disclaim all warranties, express or implied, including
18  * warranties of performance, merchantability or fitness for any particular
19  * purpose.
20  *
21  * Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  * Author: Ning Ma
26  *
27  */
28 
29 /** @file igblast.cpp
30  * Implementation of CIgBlast.
31  */
32 
33 #include <ncbi_pch.hpp>
43 
44 
45 /** @addtogroup AlgoBlast
46  *
47  * @{
48  */
49 
52 BEGIN_SCOPE(blast)
53 
64 
65 static void s_ReadLinesFromFile(const string& fn, vector<string>& lines)
66 {
67  CNcbiIfstream fs(fn.c_str(), IOS_BASE::in);
68  lines.clear();
69 
70  if (CFile(fn).Exists() && ! fs.fail()) {
71  char line[256];
72  while(true) {
73  fs.getline(line, 256);
74  if (fs.eof()) break;
75  if (line[0] == '#') continue;
76  string l(line);
77  lines.push_back(l);
78  }
79  }
80  fs.close();
81 };
82 
84 {
85  vector<string> lines;
86 
87  // read domain info from pdm or ndm file
88  const string suffix = (ig_opt->m_IsProtein) ? ".pdm." : ".ndm.";
89  string fn(SeqDB_ResolveDbPath(ig_opt->m_IgDataPath + "/" + ig_opt->m_Origin + "/"
90  + ig_opt->m_Origin + suffix + ig_opt->m_DomainSystem));
91 
92  if (ig_opt->m_CustomInternalData != NcbiEmptyString) {
93  //use custom data
94  fn = ig_opt->m_CustomInternalData;
95  }
96  if (fn == "") {
97  NCBI_THROW(CBlastException, eInvalidArgument,
98  "Domain annotation data file could not be found in [internal_data] directory");
99  }
100  s_ReadLinesFromFile(fn, lines);
101  int index = 0;
102  ITERATE(vector<string>, l, lines) {
103  vector<string> tokens;
104  NStr::Split(*l, " \t\n\r", tokens, NStr::fSplit_Tokenize);
105  if (!tokens.empty()) {
106  m_DomainIndex[tokens[0]] = index;
107  for (int i=1; i<11; ++i) {
108  m_DomainData.push_back(NStr::StringToInt(tokens[i]));
109  }
110  index += 10;
111  m_DomainChainType[tokens[0]] = tokens[11];
112  int frame = NStr::StringToInt(tokens[12]);
113  if (frame != -1) {
114  m_FrameOffset[tokens[0]] = frame;
115  }
116  }
117  }
118 
119 
120  // read J frame info from aux files
121  if (ig_opt->m_IsProtein) return;
122  fn = ig_opt->m_AuxFilename;
123  s_ReadLinesFromFile(fn, lines);
124  if (lines.size() == 0) {
125  ERR_POST(Warning << "Auxilary data file could not be found");
126  }
127  ITERATE(vector<string>, l, lines) {
128  vector<string> tokens;
129  NStr::Split(*l, " \t\n\r", tokens, NStr::fSplit_Tokenize);
130  if (!tokens.empty()) {
131  int frame = NStr::StringToInt(tokens[1]);
132  if (frame != -1) {
133  m_FrameOffset[tokens[0]] = frame;
134  }
135  if (tokens.size() == 3) { //just backward compatible as there was no such field
136  m_DJChainType[tokens[0]] = tokens[2];
137  } else if (tokens.size() == 4) { //just backward compatible as there was no such field
138  m_DJChainType[tokens[0]] = tokens[2];
139  m_JDomainInfo[tokens[0]] = NStr::StringToInt(tokens[3]);
140  } else if (tokens.size() == 5) { //just backward compatible as there was no such field
141  m_DJChainType[tokens[0]] = tokens[2];
142  m_JDomainInfo[tokens[0]] = NStr::StringToInt(tokens[3]);
143  m_Fwr4EndOffset[tokens[0]] = NStr::StringToInt(tokens[4]);
144  }
145 
146  }
147  }
148 
149  //read D frame definition
150  lines.clear();
151  fn = NcbiEmptyString;
152  if (ig_opt->m_DFrameFileName != NcbiEmptyString) {
153  fn = ig_opt->m_DFrameFileName;
154  s_ReadLinesFromFile(fn, lines);
155  if (lines.size() == 0) {
156  ERR_POST(Warning << "D gene frame definition file could not be found");
157  }
158  ITERATE(vector<string>, l, lines) {
159  vector<string> tokens;
160  NStr::Split(*l, " \t\n\r", tokens, NStr::fSplit_Tokenize);
161  if (!tokens.empty()) {
162  int frame = NStr::StringToInt(tokens[1]);
163  if (frame != -1) {
164  m_FrameOffset[tokens[0]] = frame;
165  }
166 
167  }
168  }
169 
170  }
171 };
172 
175  if ((*result)->HasAlignments()) {
176  CSeq_align_set::Tdata & align_list = (*result)->SetSeqAlign()->Set();
177  CSeq_align_set::Tdata::iterator it = align_list.begin();
178  while (it != align_list.end()) {
179  if((int)((*it)->GetAlignLength()) - (int)((*it)->GetTotalGapCount(0)) < length){
180  it = align_list.erase(it);
181  } else {
182  ++it;
183  }
184  }
185  }
186  }
187 }
188 
189 
191 
193 
194  if ((*result)->HasAlignments()) {
195  CSeq_align_set::Tdata & align_list = (*result)->SetSeqAlign()->Set();
196  int desired_len = 0;
197  int actual_len = 0;
198  int top_hit_actual_len = 0;
199  int count = 0;
200  ENa_strand extend_strand = eNa_strand_plus;
201  int highest_score = 0;
202 
203  NON_CONST_ITERATE(CSeq_align_set::Tdata, align, align_list) {
204 
205  // cerr << "before=" << MSerial_AsnText << **align << endl;
206 
207  //extend germline match up to some positions at 5' end. Extend length is
208  //set by comparing to top hit or top hit equivalents
209 
210  int score = 0;
211  (*align)->GetNamedScore(CSeq_align::eScore_Score, score);
212 
213  if (score >= highest_score) { //top hits
214  highest_score = score;
215  extend_strand = (*align)->GetSeqStrand(0);
216  desired_len = min(extend_length5end,
217  (*align)->GetSegs().GetDenseg().GetStarts()[1]);
218 
219  if ((*align)->GetSeqStrand(0) == eNa_strand_minus) {
220  int query_len = m_Scope->GetBioseqHandle((*align)->GetSeq_id(0)).GetBioseqLength();
221  int allowed_len = min ((*align)->GetSegs().GetDenseg().GetStarts()[1],
222  query_len - ((*align)->GetSegs().GetDenseg().GetStarts()[0] +
223  (int)(*align)->GetSegs().GetDenseg().GetLens()[0]));
224  top_hit_actual_len = min(desired_len, allowed_len);
225 
226 
227  } else {
228 
229  top_hit_actual_len = min(desired_len,
230  min((*align)->GetSegs().GetDenseg().GetStarts()[0],
231  (*align)->GetSegs().GetDenseg().GetStarts()[1]));
232 
233  }
234  }
235 
236  if ((*align)->GetSeqStrand(0) == eNa_strand_minus) {
237  int query_len = m_Scope->GetBioseqHandle((*align)->GetSeq_id(0)).GetBioseqLength();
238  int allowed_len = min ((*align)->GetSegs().GetDenseg().GetStarts()[1],
239  query_len - ((*align)->GetSegs().GetDenseg().GetStarts()[0] +
240  (int)(*align)->GetSegs().GetDenseg().GetLens()[0]));
241  actual_len = min(top_hit_actual_len, min(desired_len, allowed_len));
242 
243 
244  } else {
245 
246  actual_len = min(top_hit_actual_len, min(desired_len,
247  min((*align)->GetSegs().GetDenseg().GetStarts()[0],
248  (*align)->GetSegs().GetDenseg().GetStarts()[1])));
249 
250  }
251 
252  count ++;
253  //only extend if it has the same strand as the top hit
254  if (actual_len > 0 && (*align)->GetSeqStrand(0) == extend_strand) {
255  if (extend_strand == eNa_strand_minus) {
256 
257  (*align)->SetSegs().SetDenseg().SetStarts()[1] -= actual_len;
258  (*align)->SetSegs().SetDenseg().SetLens()[0] += actual_len;
259 
260  } else {
261 
262  (*align)->SetSegs().SetDenseg().SetStarts()[0] -= actual_len;
263  (*align)->SetSegs().SetDenseg().SetStarts()[1] -= actual_len;
264  (*align)->SetSegs().SetDenseg().SetLens()[0] += actual_len;
265 
266  }
267  }
268  // cerr << "after=" << MSerial_AsnText << **align << endl;
269  }
270 
271  }
272  }
273 }
274 
276 
278 
279  if ((*result)->HasAlignments()) {
280  CSeq_align_set::Tdata & align_list = (*result)->SetSeqAlign()->Set();
281  int desired_len = 0;
282  int actual_len = 0;
283  int top_hit_actual_len = 0;
284  ENa_strand extend_strand = eNa_strand_plus;
285  int highest_score = 0;
286 
287  NON_CONST_ITERATE(CSeq_align_set::Tdata, align, align_list) {
288 
289  // cerr << "before=" << MSerial_AsnText << **align << endl;
290 
291  //extend germline match up to some positions at 5' end. Extend length is
292  //set by comparing to top hit or top hit equivalents
293 
294  int score = 0;
295  (*align)->GetNamedScore(CSeq_align::eScore_Score, score);
296 
297  if (score >= highest_score) { //top hits
298  highest_score = score;
299  extend_strand = (*align)->GetSeqStrand(0);
300 
301  int j_stop = m_Scope->GetBioseqHandle((*align)->GetSeq_id(1)).GetBioseqLength() - 1;
302  int j_align_stop = (*align)->GetSegs().GetDenseg().GetSeqStop(1);
303  desired_len = min(extend_length3end,
304  j_stop - j_align_stop);
305 
306  if ((*align)->GetSeqStrand(0) == eNa_strand_minus) {
307 
308  int query_align_start = (*align)->GetSegs().GetDenseg().GetSeqStart(0);
309  int allowed_query_length = query_align_start;
310 
311  top_hit_actual_len = min(desired_len, allowed_query_length);
312  } else {
313  int query_stop = m_Scope->GetBioseqHandle((*align)->GetSeq_id(0)).GetBioseqLength() - 1;
314  int allowed_query_length = query_stop - (*align)->GetSegs().GetDenseg().GetSeqStop(0);
315  top_hit_actual_len = min(desired_len, allowed_query_length);
316 
317  }
318  }
319 
320  if ((*align)->GetSeqStrand(0) == eNa_strand_minus) {
321 
322  int query_align_start = (*align)->GetSegs().GetDenseg().GetSeqStart(0);
323  int allowed_query_length = query_align_start;
324  actual_len = min(allowed_query_length, top_hit_actual_len);
325 
326  } else {
327  int query_stop = m_Scope->GetBioseqHandle((*align)->GetSeq_id(0)).GetBioseqLength() - 1;
328  int allowed_query_length = query_stop - (*align)->GetSegs().GetDenseg().GetSeqStop(0);
329  actual_len = min(top_hit_actual_len, allowed_query_length);
330 
331  }
332 
333  //only extend if it has the same strand as the top hit
334  if (actual_len > 0 && (*align)->GetSeqStrand(0) == extend_strand) {
335  if (extend_strand == eNa_strand_minus) {
336 
337  int num_seg = (*align)->GetSegs().GetDenseg().GetNumseg();
338  int num_dim = (*align)->GetSegs().GetDenseg().GetDim();
339  (*align)->SetSegs().SetDenseg().SetStarts()[num_seg*num_dim - 2] -= actual_len;
340  (*align)->SetSegs().SetDenseg().SetLens()[num_seg-1] += actual_len;
341 
342  } else {
343  int num_seg = (*align)->GetSegs().GetDenseg().GetNumseg();
344  (*align)->SetSegs().SetDenseg().SetLens()[num_seg-1] += actual_len;
345 
346  }
347  }
348  // cerr << "after=" << MSerial_AsnText << **align << endl;
349  }
350 
351  }
352  }
353 }
354 
357 {
358  vector<CRef <CIgAnnotation> > annots;
359  CRef<CSearchResultSet> final_results;
362  ::Create((m_IgOptions->m_IsProtein)? eBlastp: eBlastn));
364 
365  /*** search V germline */
366  {
367  x_SetupVSearch(qf, opts_hndl);
368  CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[0]);
370  results[0] = blast.Run();
373  }
377  x_AnnotateV(results[0], annots);
378  }
379 
380  /*** search internal V for domain annotation */
382  //restore default settings for internal db search
383  if (m_IgOptions->m_IsProtein) {
385  } else {
386  opts_hndl->SetOptions().SetMismatchPenalty(-1);
387  opts_hndl->SetOptions().SetWordSize(9);
388  opts_hndl->SetOptions().SetGapOpeningCost(4);
389  opts_hndl->SetOptions().SetGapExtensionCost(1);
390  }
391  opts_hndl->SetEvalueThreshold(20);
392  opts_hndl->SetHitlistSize(20); // use a larger number to ensure annotation
393  CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[3]);
395  results[3] = blast.Run();
398  }
401  x_AnnotateDomain(results[0], results[3], annots);
402  } else {
403  x_AnnotateDomain(results[0], results[0], annots);
404  }
405 
406  opts_hndl.Reset(CBlastOptionsFactory
407  ::Create((m_IgOptions->m_IsProtein)? eBlastp: eBlastn));
408 
409 
410  /*** search DJ germline */
411  int num_genes = (m_IgOptions->m_IsProtein) ? 1 : 3;
412  if (num_genes > 1) {
413 
414  for (int gene = 1; gene < num_genes; ++gene) {
415  x_SetupDJSearch(annots, qf, opts_hndl, gene);
416  CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[gene]);
417  try {
419  results[gene] = blast.Run();
420  if (gene == 2){
422  x_ExtendAlign3end(results[gene]);
423  }
425  }
427  } catch(...) {
428  num_genes = 1;
429  break;
430  }
431  }
432  x_ProcessDJResult(results[0], results[1], results[2], annots);
433 
435  x_AnnotateDJ(results[1], results[2], annots);
436  } else {
437  x_AnnotateJ(results[2], annots);
438  //redo d gene search and not allow dj overlap
439  x_SetupNoOverlapDSearch(annots, results[1], qf, opts_hndl, 1);
440  CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[1]);
441  try {
443  results[1] = blast.Run();
444 
446  } catch(...) {
447  cerr << "blast failed" << endl;
448  }
449  x_ProcessDGeneResult(results[0], results[1], results[2],annots);
450  x_AnnotateD(results[1], annots);
451  }
452  }
453 
454  /*** collect germline search results */
455  for (int gene = 0; gene < num_genes; ++gene) {
456  s_AppendResults(results[gene], m_IgOptions->m_NumAlign[gene], gene, final_results);
457  }
458 
459 
460  //C gene blast
461  if(m_IgOptions->m_NumAlign[3] > 0 && m_IgOptions->m_Db[4] && (results[2] && !(results[2]->empty()))) {
462  x_SetupCRegionSearch(annots, qf, opts_hndl);
463  CLocalBlast blast(qf, opts_hndl, m_IgOptions->m_Db[4]);
464  try {
466  result = blast.Run();
467 
468  }catch(...) {
469  cerr << "blast failed" << endl;
470  }
471  if (result) {
474  x_ProcessCResult(result, annots);
475  x_AnnotateC(result, results[2], annots);
476  s_AppendResults(result, m_IgOptions->m_NumAlign[3], 3, final_results);
477 
478  }
479  }
480 
481 
482 
483  /*** search user specified db */
484  bool skipped = false;
485  if (m_IsLocal) {
486  if (&(*m_LocalDb) != &(*(m_IgOptions->m_Db[0]))) {
487  x_SetupDbSearch(annots, qf);
488  CLocalBlast blast(qf, m_Options, m_LocalDb);
490  result = blast.Run();
491  } else {
492  skipped = true;
493  }
494  } else {
495  x_SetupDbSearch(annots, qf);
496 
497  CRef<CRemoteBlast> blast;
498  if (m_RemoteDb.NotEmpty()) {
500  blast.Reset(new CRemoteBlast(qf, m_Options, *m_RemoteDb));
502  blast->SetEntrezQuery(m_EntrezQuery.c_str());
503  }
504  } else {
505  blast.Reset(new CRemoteBlast(qf, m_Options, m_Subject));
506  }
507  blast->Submit();
508  m_RID=blast->GetRID();
509  GetDiagContext().Extra().Print("RID", m_RID);
510  result = blast->GetResultSet();
511  }
512 
513  if (! skipped) {
516  s_AppendResults(result, -1, -1, final_results);
517  }
518 
519  /*** set chain type info */
520  x_SetChainType(final_results, annots);
521 
522  /*** attach annotation info back to the results */
523  x_SetAnnotation(annots, final_results);
524 
525  return final_results;
526 };
527 
528 // Compare two seqaligns according to their evalue and coverage and name
529 //compare name since blast does not guarantee order of same score hits
531 {
532  int sx, sy;
535  if (sx != sy) return (sx > sy);
536 
537  sx = x->GetAlignLength();
538  sy = y->GetAlignLength();
539  if (sx != sy) {
540  return (sx >= sy);
541  }
542 
543  string x_id = NcbiEmptyString;
544  string y_id = NcbiEmptyString;
545  x->GetSeq_id(1).GetLabel(&x_id, CSeq_id::eContent);
546  y->GetSeq_id(1).GetLabel(&y_id, CSeq_id::eContent);
547  return (x_id < y_id);
548 
549 };
550 
551 
552 
554  CRef<CBlastOptionsHandle> &opts_hndl)
555 {
556  CBlastOptions & opts = opts_hndl->SetOptions();
557  if (m_IgOptions->m_IsProtein) {
559  } else {
560  int penalty = m_IgOptions->m_V_penalty;
561  opts.SetMatchReward(1);
562  opts.SetMismatchPenalty(penalty);
564  if (penalty == -1) {
565  opts.SetGapOpeningCost(4);
566  opts.SetGapExtensionCost(1);
567  }
568  }
570  opts_hndl->SetFilterString("F");
571  opts_hndl->SetHitlistSize(15+ m_IgOptions->m_NumAlign[0]);
573 
574 };
577  CRef<CBlastOptionsHandle> &opts_hndl){
578 
579  CBlastOptions & opts = opts_hndl->SetOptions();
580  opts.SetMatchReward(1);
581  opts.SetWordSize(7);
582  opts.SetMismatchPenalty(-3);
583  opts.SetGapOpeningCost(5);
584  opts.SetGapExtensionCost(2);
585  opts_hndl->SetEvalueThreshold(1e-5);
586  opts_hndl->SetFilterString("F");
587  opts_hndl->SetHitlistSize(5+m_IgOptions->m_NumAlign[3]);
588 
589  // Mask V through J
590  int iq = 0;
591  ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
593  CSeq_id *q_id = const_cast<CSeq_id *>(&*query->GetQueryId());
594  int len = query->GetLength();
595  TMaskedQueryRegions mask_list;
596  if ((*annot)->m_GeneInfo[0] == -1 || (*annot)->m_GeneInfo[4] == -1 || (*annot)->m_GeneInfo[5] == -1) {
597  // This is not a ig sequence or there is no v gene or J gene per previous search. Mask it out
598  CRef<CSeqLocInfo> mask(new CSeqLocInfo(new CSeq_interval(*q_id, 0, len-1), 0));
599  mask_list.push_back(mask);
600  } else {
601  // Excluding the V gene through J gene
602  bool ms = (*annot)->m_MinusStrand;
603  if (ms) {
604  //-2 due to typically extra 1 bp beyond J coding region
605  CRef<CSeqLocInfo> mask(new CSeqLocInfo(new CSeq_interval(*q_id, (*annot)->m_GeneInfo[4] + 2, len-1), 0));
606  mask_list.push_back(mask);
607  } else {
608  //-2 due to typically extra 1 bp beyond J coding region
609  CRef<CSeqLocInfo> mask(new CSeqLocInfo(new CSeq_interval(*q_id, 0, (*annot)->m_GeneInfo[5] - 2), 0));
610  mask_list.push_back(mask);
611  }
612 
613  }
614  m_Query->SetMaskedRegions(iq, mask_list);
615  ++iq;
616  }
617 
618  // Generate query factory
620 
621 }
624  CRef<CBlastOptionsHandle> &opts_hndl,
625  int db_type)
626 {
627  // Only igblastn will search DJ
628  CBlastOptions & opts = opts_hndl->SetOptions();
629  opts.SetMatchReward(1);
630  if (db_type == 2){ //J genes are longer so if can afford more reliable identification
631  opts.SetWordSize(j_wordsize);
633  } else {
636  }
637 
638  opts.SetGapOpeningCost(5);
639  opts.SetGapExtensionCost(2);
640  opts_hndl->SetEvalueThreshold((db_type == 2) ? 1000.0 : 100000.0);
641  opts_hndl->SetFilterString("F");
642  opts_hndl->SetHitlistSize(max(max(50,
643  m_IgOptions->m_NumAlign[1]),
644  m_IgOptions->m_NumAlign[2]));
645 
646  // Mask query for D, J search
647  int iq = 0;
648  ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
650  CSeq_id *q_id = const_cast<CSeq_id *>(&*query->GetQueryId());
651  int len = query->GetLength();
652  if ((*annot)->m_GeneInfo[0] == -1) {
653  // This is not a germline sequence. Mask it out
654  TMaskedQueryRegions mask_list;
656  new CSeqLocInfo(new CSeq_interval(*q_id, 0, len-1), 0));
657  mask_list.push_back(mask);
658  m_Query->SetMaskedRegions(iq, mask_list);
659  } else {
660  // Excluding the V gene except the last max_v_j_overlap bp for D and J gene search;
661  // also limit the J match to max_allowed_V_end_to_J_end beyond V gene.
662  int v_overlap;
664  v_overlap = max_v_j_overlap;
665  } else {
666  v_overlap = 0;
667  }
668  bool ms = (*annot)->m_MinusStrand;
669  int begin = (ms)?
670  (*annot)->m_GeneInfo[0] - max_allowed_V_end_to_J_end: (*annot)->m_GeneInfo[1] - 1 - v_overlap;
671  int end = (ms)?
672  (*annot)->m_GeneInfo[0] + v_overlap: (*annot)->m_GeneInfo[1] + max_allowed_V_end_to_J_end;
673  if (begin > 0 && begin <= len-1) {
675  new CSeqLocInfo(new CSeq_interval(*q_id, 0, begin), 0));
676  m_Query->AddMask(iq, mask);
677  }
678  if (end < len -1 && end >= 0) {
680  new CSeqLocInfo(new CSeq_interval(*q_id, end, len-1), 0));
681  m_Query->AddMask(iq, mask);
682  }
683  }
684  ++iq;
685  }
686 
687  // Generate query factory
689 };
690 
691 
693  CRef<CSearchResultSet> &previous_d_results,
695  CRef<CBlastOptionsHandle> &opts_hndl,
696  int db_type)
697 {
698  // Only igblastn will search DJ
699  CBlastOptions & opts = opts_hndl->SetOptions();
700  opts.SetMatchReward(1);
703  opts.SetGapOpeningCost(5);
704  opts.SetGapExtensionCost(2);
705  opts_hndl->SetEvalueThreshold(100000.0);
706  opts_hndl->SetFilterString("F");
707  opts_hndl->SetHitlistSize(max(max(50,
708  m_IgOptions->m_NumAlign[1]),
709  m_IgOptions->m_NumAlign[2]));
710 
711  // Mask query for D
712  int iq = 0;
713  ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
715  CSeq_id *q_id = const_cast<CSeq_id *>(&*query->GetQueryId());
716  int len = query->GetLength();
717  CRef<CSeq_align_set> align_d(0);
718  if ((*previous_d_results)[iq].HasAlignments()){
719  align_d = (*previous_d_results)[iq].SetSeqAlign();
720  }
721 
722  if ((*annot)->m_GeneInfo[0] == -1 || !align_d || align_d.Empty() || align_d->IsEmpty()) {
723  // This is not a ig sequence or there is no d gene per previous search. Mask it out
724  TMaskedQueryRegions mask_list;
726  new CSeqLocInfo(new CSeq_interval(*q_id, 0, len-1), 0));
727  mask_list.push_back(mask);
728  m_Query->SetMaskedRegions(iq, mask_list);
729  } else {
730  // Excluding the V gene and J gene
731  bool ms = (*annot)->m_MinusStrand;
732  int v_end_or_j_begin = (ms)?
733  max((*annot)->m_GeneInfo[0] - max_allowed_V_end_to_J_end, (*annot)->m_GeneInfo[5] - 1): (*annot)->m_GeneInfo[1] -1;
734  int j_begin_or_v_end = (ms)?
735  (*annot)->m_GeneInfo[0]: min((*annot)->m_GeneInfo[4], (*annot)->m_GeneInfo[1] + max_allowed_V_end_to_J_end);
736  if (v_end_or_j_begin > 0) {
738  new CSeqLocInfo(new CSeq_interval(*q_id, 0, v_end_or_j_begin), 0));
739  m_Query->AddMask(iq, mask);
740  }
741  if (j_begin_or_v_end < len-1 && j_begin_or_v_end > 0) {
743  new CSeqLocInfo(new CSeq_interval(*q_id, j_begin_or_v_end, len-1), 0));
744  m_Query->AddMask(iq, mask);
745  }
746  }
747  ++iq;
748  }
749 
750  // Generate query factory
752 };
753 
756 {
757  // Options already passed in as m_Options. Only set up (mask) the query
758  int iq = 0;
759  ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
761  CSeq_id *q_id = const_cast<CSeq_id *>(&*query->GetQueryId());
762  int len = query->GetLength();
763  TMaskedQueryRegions mask_list;
764  if ((*annot)->m_GeneInfo[0] ==-1) {
765  // This is not a germline sequence. Mask it out
767  new CSeqLocInfo(new CSeq_interval(*q_id, 0, len-1), 0));
768  mask_list.push_back(mask);
769  } else if (m_IgOptions->m_FocusV) {
770  // Restrict to V gene
771  int begin = (*annot)->m_GeneInfo[0];
772  int end = (*annot)->m_GeneInfo[1];
773  if (begin > 0) {
775  new CSeqLocInfo(new CSeq_interval(*q_id, 0, begin-1), 0));
776  mask_list.push_back(mask);
777  }
778  if (end < len) {
780  new CSeqLocInfo(new CSeq_interval(*q_id, end, len-1), 0));
781  mask_list.push_back(mask);
782  }
783  }
784  m_Query->SetMaskedRegions(iq, mask_list);
785  ++iq;
786  }
788 };
789 
790 // Compare the second seqalign to see if it is as good as the first one
// Returns true only when every check visible below passes:
//   1. sx and sy agree to within a relative factor of 0.999999,
//   2. ix is not greater than iy,
//   3. the alignment length of x does not exceed that of y.
// NOTE(review): the statements that assign sx/sy and ix/iy are not visible in
// this listing (lines 795-796 and 799-800 are absent), so which scores these
// variables hold cannot be stated from here -- confirm against the full file.
791 static bool s_IsSeqAlignAsGood(const CRef<CSeq_align> &x,
792  const CRef<CSeq_align> &y)
793 {
794  double sx, sy;
// Reject when the two floating-point values differ by more than the tolerance
// in either direction.
797  if (sx < 0.999999 * sy || sy < 0.999999 * sx) return false;
798  int ix, iy;
// Reject when x's integer value is strictly larger than y's.
801  if (ix > iy) return false;
802  int dx, dy;
803  dx = x->GetAlignLength();
804  dy = y->GetAlignLength();
// Final tie-break: y must be at least as long as x.
805  return (dx <= dy);
806 }
807 
/// Strip a leading "lcl|" tag from a Seq-id label.
///
/// @param sid  Label to examine.
/// @return     sid without its first four characters when it begins with
///             "lcl|" (case-sensitive); otherwise an unchanged copy of sid.
static string s_RemoveLocalPrefix(const string & sid)
{
    static const char kLocalTag[] = "lcl|";
    const string::size_type tag_len = sizeof(kLocalTag) - 1;

    const bool has_tag = (sid.compare(0, tag_len, kLocalTag) == 0);
    return has_tag ? sid.substr(tag_len) : sid;
}
814 
815 static string s_MakeTopHitsId(const CSeq_align_set::Tdata& align_list, int num_align) {
816  string ids = NcbiEmptyString;
817  CRef<CSeq_align> align = align_list.front();
818  int count = 0;
819  ITERATE(CSeq_align_set::Tdata, it, align_list) {
820 
821  if (count < num_align && s_IsSeqAlignAsGood(align, (*it))) {
822  string this_id = s_RemoveLocalPrefix((*it)->GetSeq_id(1).AsFastaString());
823  if (ids.find(this_id) == string::npos) {
824 
825  //no redundant id
826  if (ids != NcbiEmptyString) {
827  ids += ",";
828  }
829  ids += this_id;
830  count ++;
831  }
832  } else {
833  break;
834  }
835  }
836  return ids;
837 }
838 
840  vector<CRef <CIgAnnotation> > &annots)
841 {
843 
844  CIgAnnotation *annot = new CIgAnnotation();
845  annots.push_back(CRef<CIgAnnotation>(annot));
846 
847  if ((*result)->HasAlignments()) {
848  const CSeq_align_set::Tdata & align_list = (*result)->GetSeqAlign()->Get();
849  CRef<CSeq_align> align = align_list.front();
850  annot->m_GeneInfo[0] = align->GetSeqStart(0);
851  annot->m_GeneInfo[1] = align->GetSeqStop(0)+1;
852  annot->m_TopGeneIds[0] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[0]);
853  }
854  }
855 };
856 
857 // Test if the alignment is already in the align_list
858 static bool s_SeqAlignInSet(CSeq_align_set::Tdata & align_list, CRef<CSeq_align> &align)
859 {
860  ITERATE(CSeq_align_set::Tdata, it, align_list) {
861  if ((*it)->GetSeq_id(1).Match(align->GetSeq_id(1)) &&
862  (*it)->GetSeqStart(1) == align->GetSeqStart(1) &&
863  (*it)->GetSeqStop(1) == align->GetSeqStop(1)) return true;
864  }
865  return false;
866 };
867 
868 // Compare two seqaligns according to their evalue and coverage
870  const CRef<CSeq_align> &y)
871 {
872  double sx, sy;
875  if (sx < 0.999999 * sy) return true;
876  if (sy < 0.999999 * sx) return false;
877  int ix, iy;
880  if (ix != iy) return (ix > iy);
881 
882  int dx, dy;
883  dx = x->GetAlignLength();
884  dy = y->GetAlignLength();
885  if (dx != dy) {
886  return (dx >= dy);
887  }
888  string x_id = NcbiEmptyString;
889  string y_id = NcbiEmptyString;
890  x->GetSeq_id(1).GetLabel(&x_id, CSeq_id::eContent);
891  y->GetSeq_id(1).GetLabel(&y_id, CSeq_id::eContent);
892  return (x_id < y_id);
893 };
894 
895 // Compare two seqaligns according to their evalue and coverage
897 {
898  int sx, sy;
901  if (sx != sy) return (sx > sy);
902  sx = x->GetAlignLength();
903  sy = y->GetAlignLength();
904  return (sx >= sy);
905 
906 };
907 
908 
909 
910 // Test if D and J annotation not compatible
911 static bool s_DJNotCompatible(const CSeq_align &d, const CSeq_align &j, bool ms, int margin)
912 {
913  int ds = d.GetSeqStart(0);
914  int de = d.GetSeqStop(0);
915  int js = j.GetSeqStart(0);
916  int je = j.GetSeqStop(0);
917 
918  //D gene needs to have minimal match in addition to overlap with J gene
919  //D gene needs to end before J gene ends
920  if (ms) {
921  if (ds < js || de < je + margin) return true;
922  } else {
923  if (ds > js - margin || de > je) return true;
924  }
925  return false;
926 };
927 
928 /*
929 static bool s_IsTopMatchJD(CSearchResults& res_J, CIgAnnotationInfo& annotation_info){
930  bool result = true; //default
931  CRef<CSeq_align_set> align_J;
932  if (res_J.HasAlignments()) {
933  align_J.Reset(const_cast<CSeq_align_set *>
934  (&*(res_J.GetSeqAlign())));
935  CSeq_align_set::Tdata & align_list = align_J->Set();
936  CSeq_align_set::Tdata::iterator it = align_list.begin();
937  int prev_score = 0;
938  result = false;
939  while (it != align_list.end()) {
940  int current_score;
941  (*it)->GetNamedScore(CSeq_align::eScore_Score, current_score);
942  if(current_score >= prev_score){
943  string j_id;
944  (*it)->GetSeq_id(1).GetLabel(&j_id, CSeq_id::eContent);
945  string j_chain_type = annotation_info.GetDJChainType(j_id);
946  if (j_chain_type == "N/A"){
947  //assume J gene id style
948 
949  string sid = NStr::ToUpper(j_id);
950  if (sid.substr(0, 2) == "TR" && sid[3] == 'J') {
951  j_chain_type = "J" + sid.substr(2,1);
952  } else if (sid[0] == 'J') {
953  j_chain_type = sid.substr(0,2);
954  }
955  }
956  if (j_chain_type == "JD"){
957  result = true;
958  break;
959  }
960 
961  } else {
962  break;
963  }
964  prev_score = current_score;
965  ++it;
966  }
967 
968  }
969  return result;
970 };
971 */
973  CRef<CSeq_align_set>& align_J,
974  string q_ct,
975  bool q_ms,
976  ENa_strand q_st,
977  int q_ve,
978  int iq,
979  bool va_or_vd_as_heavy_chain) {
980 
981  int allowed_VJ_distance = max_allowed_VJ_distance_with_D;
982  /* preprocess D */
983  if (align_D && !align_D->Get().empty()) {
984  CSeq_align_set::Tdata & align_list = align_D->Set();
985  CSeq_align_set::Tdata::iterator it = align_list.begin();
986  /* chain type test */
987  if (q_ct!="VH" && q_ct!="VD" && q_ct!="VA" && q_ct!="VB" ) {
988  while (it != align_list.end()) {
989  it = align_list.erase(it);
990  }
991  allowed_VJ_distance = max_allowed_VJ_distance_without_D;
992  } else if (q_ct =="VA" || q_ct =="VD") {
993  if (va_or_vd_as_heavy_chain) {
994  //VA could behave like VD and is allowed to rearrange to JA or DD/JD
995  q_ct = "VD";
996  //annot->m_ChainType[0] = "VD";
997  } else {
998  q_ct = "VA";
999  while (it != align_list.end()) {
1000  it = align_list.erase(it);
1001  }
1002  allowed_VJ_distance = max_allowed_VJ_distance_without_D;
1003  }
1004  }
1005  //test compatibility between V and D
1006  it = align_list.begin();
1007  while (it != align_list.end()) {
1008  bool keep = true;
1009  /* chain type test */
1010  if (q_ct!="N/A") {
1011  char s_ct = q_ct[1];
1012  string d_id;
1013  (*it)->GetSeq_id(1).GetLabel(&d_id, CSeq_id::eContent);
1014  string d_chain_type = m_AnnotationInfo.GetDJChainType(d_id);
1015  if (d_chain_type != "N/A"){
1016  if (d_chain_type[1] != q_ct[1]) keep = false;
1017  } else { //assume D gene id style
1018  string sid = (*it)->GetSeq_id(1).AsFastaString();
1019  sid = NStr::ToUpper(sid);
1020  if (sid.substr(0, 4) == "LCL|") sid = sid.substr(4, sid.length());
1021  if ((sid.substr(0, 2) == "IG" || sid.substr(0, 2) == "TR")
1022  && sid[3] == 'D') {
1023  s_ct = sid[2];
1024  }
1025  if (s_ct!='B' && s_ct!='D') s_ct = q_ct[1];
1026  if (s_ct != q_ct[1]) keep = false;
1027  }
1028  }
1029 
1030  /* remove failed seq_align */
1031  if (!keep) it = align_list.erase(it);
1032  else ++it;
1033  }
1034 
1035 
1036  /* strand test */
1037  bool strand_found = false;
1038  ITERATE(CSeq_align_set::Tdata, it, align_list) {
1039  if ((*it)->GetSeqStrand(0) == q_st) {
1040  strand_found = true;
1041  break;
1042  }
1043  }
1044  if (strand_found) {
1045  it = align_list.begin();
1046  while (it != align_list.end()) {
1047  if ((*it)->GetSeqStrand(0) != q_st) {
1048  it = align_list.erase(it);
1049  } else ++it;
1050  }
1051  }
1052  /* v end test */
1053  it = align_list.begin();
1054  while (it != align_list.end()) {
1055  bool keep = false;
1056  int q_ds = (*it)->GetSeqStart(0);
1057  int q_de = (*it)->GetSeqStop(0);
1058  if (q_ms) keep = (q_de >= q_ve - max_allowed_VD_distance && q_ds <= q_ve - m_IgOptions->m_Min_D_match);
1059  else keep = (q_ds <= q_ve + max_allowed_VD_distance && q_de >= q_ve + m_IgOptions->m_Min_D_match);
1060  if (!keep) it = align_list.erase(it);
1061  else ++it;
1062  }
1063  /* sort according to score */
1064  align_list.sort(s_CompareSeqAlignByScoreAndName);
1065  }
1066 
1067  /* preprocess J */
1068  if (align_J && !align_J->Get().empty()) {
1069  CSeq_align_set::Tdata & align_list = align_J->Set();
1070  CSeq_align_set::Tdata::iterator it = align_list.begin();
1071  while (it != align_list.end()) {
1072  bool keep = true;
1073  /* chain type test */
1074  if (q_ct!="N/A") {
1075  char s_ct = q_ct[1];
1076  string j_id;
1077  (*it)->GetSeq_id(1).GetLabel(&j_id, CSeq_id::eContent);
1078  string j_chain_type = m_AnnotationInfo.GetDJChainType(j_id);
1079  if (j_chain_type != "N/A"){
1080  if (j_chain_type[1] != q_ct[1]) keep = false;
1081  } else { //assume J gene id style
1082  string sid = (*it)->GetSeq_id(1).AsFastaString();
1083  sid = NStr::ToUpper(sid);
1084  if (sid.substr(0, 4) == "LCL|") sid = sid.substr(4, sid.length());
1085  if ((sid.substr(0, 2) == "IG" || sid.substr(0, 2) == "TR")
1086  && sid[3] == 'J') {
1087  s_ct = sid[2];
1088  } else if (sid[0] == 'J') {
1089  s_ct = sid[1];
1090  }
1091  if (s_ct!='H' && s_ct!='L' && s_ct!='K' &&
1092  s_ct!='A' && s_ct!='B' && s_ct!='D' && s_ct!='G') s_ct = q_ct[1];
1093  if (s_ct != q_ct[1]) keep = false;
1094  }
1095  } else {
1096  keep = false;
1097  }
1098  /* strand test */
1099  if ((*it)->GetSeqStrand(0) != q_st) keep = false;
1100  /* subject start test */
1101  if ((int)(*it)->GetSeqStart(1) > max_allowed_j_deletion) keep = false;
1102  /* v end test */
1103  int q_js = (*it)->GetSeqStart(0);
1104  int q_je = (*it)->GetSeqStop(0);
1105  if (q_ms) {
1106  if (q_je < q_ve - allowed_VJ_distance || q_js > q_ve - j_wordsize) keep = false;
1107  } else {
1108  if (q_js > q_ve + allowed_VJ_distance || q_je < q_ve + j_wordsize) keep = false;
1109  }
1110  /* remove failed seq_align */
1111  if (!keep) it = align_list.erase(it);
1112  else ++it;
1113  }
1114  /* sort according to score */
1115  align_list.sort(ScorePositionSort(m_Scope));
1116  }
1117 
1118  /* which one to keep, D or J? */
1119  if (align_D.NotEmpty() && !align_D->IsEmpty() &&
1120  align_J.NotEmpty() && !align_J->IsEmpty()) {
1121  CSeq_align_set::Tdata & al_D = align_D->Set();
1122  CSeq_align_set::Tdata & al_J = align_J->Set();
1123  CSeq_align_set::Tdata::iterator it;
1124  bool keep_J = s_CompareSeqAlignByScore(*(al_J.begin()), *(al_D.begin()));
1125  if (keep_J) {
1126  it = al_D.begin();
1127  while (it != al_D.end()) {
1128  if (s_DJNotCompatible(**it, **(al_J.begin()), q_ms, m_IgOptions->m_Min_D_match)) {
1129  it = al_D.erase(it);
1130  } else ++it;
1131  }
1132 
1134  m_IgOptions->m_D_penalty == -4) {
1135  //deleting j only for overlap case otherwise it's handeled later
1136  if (align_D.NotEmpty() && !align_D->IsEmpty()){
1137  it = al_J.begin();
1138  while (it != al_J.end()) {
1139  if (s_DJNotCompatible(**(al_D.begin()), **it, q_ms, m_IgOptions->m_Min_D_match)) {
1140  it = al_J.erase(it);
1141  } else ++it;
1142  }
1143  }
1144  }
1145  } else {
1146  it = al_J.begin();
1147 
1148  while (it != al_J.end()) {
1149  if (s_DJNotCompatible(**(al_D.begin()), **it, q_ms, m_IgOptions->m_Min_D_match)) {
1150  it = al_J.erase(it);
1151  } else ++it;
1152  }
1153  if (align_J.NotEmpty() && !align_J->IsEmpty()) {
1154  it = al_D.begin();
1155  while (it != al_D.end()) {
1156  if (s_DJNotCompatible(**it, **(al_J.begin()), q_ms, m_IgOptions->m_Min_D_match)) {
1157  it = al_D.erase(it);
1158  } else ++it;
1159  }
1160 
1161  }
1162  }
1163 
1164  }
1165 
1166 }
1167 
1168 
1170  CRef<CSearchResultSet>& results_J,
1171  CRef <CIgAnnotation>& annot,
1172  CRef<CSeq_align_set>& align_D,
1173  CRef<CSeq_align_set>& align_J,
1174  string q_ct,
1175  bool q_ms,
1176  ENa_strand q_st,
1177  int q_ve,
1178  int iq) {
      // Selects the D and J alignments for query number iq and, when the query
      // chain type is "VA" or "VD", decides between the two by comparing scores.
      //
      //   results_D, results_J  per-query result sets, indexed with iq
      //                         (results_D is used below; its parameter line is
      //                         not visible in this listing)
      //   annot                 m_ChainType[0] is rewritten to "VA" or "VD" in
      //                         the VA/VD case
      //   align_D, align_J      out: re-pointed (via const_cast) at the alignment
      //                         sets stored inside the results, so every edit made
      //                         through them changes the results themselves
      //   q_ct, q_ms, q_st, q_ve  forwarded unchanged to x_FindDJAln
1179 
      // Scratch copies of the unfiltered alignments, used for the second pass.
1180  CRef<CSeq_align_set> original_align_D(new CSeq_align_set);
1181  CRef<CSeq_align_set> original_align_J(new CSeq_align_set);
1182 
1183  /* preprocess D */
1184  CSearchResults& res_D = (*results_D)[iq];
1185  if (res_D.HasAlignments()) {
1186 
1187  align_D.Reset(const_cast<CSeq_align_set *>
1188  (&*(res_D.GetSeqAlign())));
1189  original_align_D->Assign(*align_D);
1190 
1191  }
1192 
1193  /* preprocess J */
1194  CSearchResults& res_J = (*results_J)[iq];
1195  if (res_J.HasAlignments()) {
1196  align_J.Reset(const_cast<CSeq_align_set *>
1197  (&*(res_J.GetSeqAlign())));
1198  original_align_J->Assign(*align_J);
1199 
1200  }
      // First pass filters the stored result alignments in place (last argument false).
1201  //try as VA
1202  x_FindDJAln(align_D, align_J, q_ct, q_ms, q_st, q_ve, iq, false);
      // The second pass only runs when there were D hits before filtering and
      // the query chain type is VA or VD.
1203  if ((original_align_D.NotEmpty() && !original_align_D->Get().empty()) && (q_ct =="VA" || q_ct =="VD")) {
1204 
      // Default to VA; overridden below if the VD interpretation scores higher.
1205  annot->m_ChainType[0] = "VA";
      // Second pass works on the untouched copies (last argument true).
1206  //try as VD
1207  x_FindDJAln(original_align_D, original_align_J, q_ct, q_ms, q_st, q_ve, iq, true);
      // Each score stays 0 when the corresponding alignment set is empty.
1208  int as_heavy_chain_score = 0;
1209  int as_light_chain_score = 0;
1210  int d_score = 0;
1211  if(original_align_J.NotEmpty() && !original_align_J->Get().empty()){
1212  original_align_J->Get().front()->GetNamedScore(CSeq_align::eScore_Score, as_heavy_chain_score);
1213  }
1214 
1215  if(original_align_D.NotEmpty() && !original_align_D->Get().empty()){
1216  original_align_D->Get().front()->GetNamedScore(CSeq_align::eScore_Score, d_score);
1217  }
1218  if (align_J.NotEmpty() && !align_J->Get().empty()){
1219  align_J->Get().front()->GetNamedScore(CSeq_align::eScore_Score, as_light_chain_score);
1220  }
1221 
1222 
      // VD wins only when top J + top D from the second pass strictly exceeds
      // the top J score from the first pass; ties keep VA.
1223  if (as_heavy_chain_score + d_score> as_light_chain_score){
      // Overwrite the stored result alignments with the second-pass sets.
      // Nothing is copied for a set whose align_* was never pointed at a result.
1224  if (align_D.NotEmpty() && original_align_D.NotEmpty()){
1225  align_D->Assign(*original_align_D);
1226  }
1227  if (align_J.NotEmpty() && original_align_J.NotEmpty()){
1228  align_J->Assign(*original_align_J);
1229  }
1230 
1231  annot->m_ChainType[0] = "VD";
1232  }
1233 
1234  }
1235 
1236 }
1237 
// Derives J-domain coordinates on the query from one J alignment (`align`) and
// stores them in `annot`:
//   m_JDomain[1]  query position computed from the J gene's CDR3 end
//   m_JDomain[3]  query position computed from the J gene's FWR4 end, or -1
//                 when it coincides with m_JDomain[1]
//   m_JDomain[4]  the FWR4 end offset as returned by m_AnnotationInfo
// Row 0 of the alignment is the query and row 1 the J subject. For minus-strand
// query alignments each position is mirrored with (query length - pos - 1).
// m_JDomain[1] is left untouched when no CDR3 end is known (j_cdr3end <= 0) or
// the alignment starts more than 4 bases past it.
1239  string sid = s_RemoveLocalPrefix(align->GetSeq_id(1).AsFastaString());
1240  int j_cdr3end = m_AnnotationInfo.GetJDomain(sid);
1241  int subject_start = align->GetSeqStart(1);
1242  int subject_end = align->GetSeqStop(1);
1243  //don't try if j starts after cdr3 ends as we don't know for sure where the boundary is
      // Case 1: the alignment starts at or before j_cdr3end + 1.
1244  if (j_cdr3end > 0 && subject_start - j_cdr3end <= 1) {
1245  CAlnMap j_map(align->GetSegs().GetDenseg());
1246 
1247  //+1 actually is in fwr4 already...need to do this so that a insertion right in front
1248  // of fwr4 can be handled.
      // The subject position is clamped into [subject_start, subject_end] before mapping.
1249  annot->m_JDomain[1] = j_map.GetSeqPosFromSeqPos(0, 1,
1250  max(subject_start, min(j_cdr3end + 1,
1251  subject_end)),
1253 
1254  if (align->GetSeqStrand(0) == eNa_strand_minus) {
1255  annot->m_JDomain[1] = m_Scope->GetBioseqHandle(align->GetSeq_id(0)).GetBioseqLength() - annot->m_JDomain[1] - 1;
1256  }
1257 
1258  //deduct one back to in CDR3
1259  if (subject_end > j_cdr3end) {
1260  annot->m_JDomain[1] --;
1261  }
1262  //allow missed alignment to the first bp and deduce the cdr3 by gapless extension backwards
      // Case 2: the alignment starts exactly 2 bases past j_cdr3end.
1263  } else if (j_cdr3end > 0 && subject_start - j_cdr3end <= 2) {
1264  CAlnMap j_map(align->GetSegs().GetDenseg());
1265 
1266  //+1 actually is in fwr4 already...need to do this so that a insertion right in front
1267  // of fwr4 can be handled.
      // NOTE(review): the "+1" remark above looks carried over from case 1; this
      // branch maps subject_start itself.
1268  annot->m_JDomain[1] = j_map.GetSeqPosFromSeqPos(0, 1, subject_start, IAlnExplorer::eRight);
1269 
1270  if (align->GetSeqStrand(0) == eNa_strand_minus) {
1271  annot->m_JDomain[1] = m_Scope->GetBioseqHandle(align->GetSeq_id(0)).GetBioseqLength() - annot->m_JDomain[1] - 1;
1272  }
1273 
1274  //deduct diff back to be in CDR3
1275  if (subject_end > j_cdr3end) {
1276  annot->m_JDomain[1] = annot->m_JDomain[1] - (subject_start - j_cdr3end);
1277  }
      // Case 3: the alignment starts 3 or 4 bases past j_cdr3end; same
      // computation as case 2.
1278  } else if (j_cdr3end > 0 && subject_start - j_cdr3end <= 4) {
1279  //allow up to 3 missing fwr4 starting nucleotides. Too many might introduce inaccuracy
1280  //this code should be integrated with the above one
1281  CAlnMap j_map(align->GetSegs().GetDenseg());
1282 
1283  annot->m_JDomain[1] = j_map.GetSeqPosFromSeqPos(0, 1, subject_start, IAlnExplorer::eRight);
1284 
1285  if (align->GetSeqStrand(0) == eNa_strand_minus) {
1286  annot->m_JDomain[1] = m_Scope->GetBioseqHandle(align->GetSeq_id(0)).GetBioseqLength() - annot->m_JDomain[1] - 1;
1287  }
1288 
1289  //deduct diff back to be in CDR3
1290  if (subject_end > j_cdr3end) {
1291  annot->m_JDomain[1] = annot->m_JDomain[1] - (subject_start - j_cdr3end);
1292  }
1293  }
1294 
      // FWR4 is only attempted when m_JDomain[1] is positive.
1295  //fwr4
1296  if (annot->m_JDomain[1] > 0) {
1297  int j_fwr4end_offset = m_AnnotationInfo.GetFwr4EndOffset(sid);
1298  annot->m_JDomain[4] = j_fwr4end_offset;
1299  if (j_fwr4end_offset >= 0) {
      // The offset counts back from the last base of the J subject sequence.
1300  int j_fwr4end = m_Scope->GetBioseqHandle(align->GetSeq_id(1)).GetBioseqLength() - j_fwr4end_offset - 1;
1301  CAlnMap j_map(align->GetSegs().GetDenseg());
1302 
      // Never map past the aligned part of the subject.
1303  annot->m_JDomain[3] = j_map.GetSeqPosFromSeqPos(0, 1, min(j_fwr4end, subject_end), IAlnExplorer::eRight);
1304 
1305 
1306  if (align->GetSeqStrand(0) == eNa_strand_minus) {
1307  annot->m_JDomain[3] = m_Scope->GetBioseqHandle(align->GetSeq_id(0)).GetBioseqLength() - annot->m_JDomain[3] - 1;
1308  }
1309  //cdr3 domain at the end of alignment
1310  if (annot->m_JDomain[1] == annot->m_JDomain[3]) {
1311  annot->m_JDomain[3] = -1;
1312  }
1313  }
1314  }
1315 
1316 
1317 }
1318 
1320  CRef<CSearchResultSet>& results_D,
1321  CRef<CSearchResultSet>& results_J,
1322  vector<CRef <CIgAnnotation> > &annots) {
      // For every query (one annotation per query, same order as the result
      // sets) filters the D alignments in place, then drops J alignments that
      // are incompatible with the best surviving D alignment.
      // D filtering steps, in order:
      //   1. chain-type test against the query chain type (skipped for "N/A")
      //   2. strand test, applied only if at least one D hit is on the V strand
      //   3. distance-from-V-end test
      //   4. sort with s_CompareSeqAlignByScoreAndName
      // The J step is nested inside the D block, so J results are untouched for
      // queries that have no D alignments.
1323 
1324  int iq = 0;
1325  NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1326  string q_ct = (*annot)->m_ChainType[0];
1327  bool q_ms = (*annot)->m_MinusStrand;
1328  ENa_strand q_st = (q_ms) ? eNa_strand_minus : eNa_strand_plus;
      // Query coordinate of the V end: m_GeneInfo[0] on the minus strand,
      // m_GeneInfo[1] - 1 on the plus strand.
1329  int q_ve = (q_ms) ? (*annot)->m_GeneInfo[0] : (*annot)->m_GeneInfo[1] - 1;
1330 
1331  CRef<CSeq_align_set> align_D (0);
1332  CSearchResults& res_D = (*results_D)[iq];
1333  if (res_D.HasAlignments()) {
1334  align_D = res_D.SetSeqAlign();
1335  }
1336 
1337  // preprocess D
1338  if (align_D && !align_D.Empty() && !align_D->IsEmpty()) {
1339  CSeq_align_set::Tdata & align_list = align_D->Set();
1340  CSeq_align_set::Tdata::iterator it = align_list.begin();
1341 
1342  //test compatibility between V and D
1343  it = align_list.begin();
1344  while (it != align_list.end()) {
1345  bool keep = true;
1346  // chain type test
1347  if (q_ct!="N/A") {
      // s_ct defaults to the query's chain letter, so an id with no
      // recognisable chain letter passes the test.
1348  char s_ct = q_ct[1];
1349  string d_id;
1350  (*it)->GetSeq_id(1).GetLabel(&d_id, CSeq_id::eContent);
1351  string d_chain_type = m_AnnotationInfo.GetDJChainType(d_id);
1352  if (d_chain_type != "N/A"){
      // An annotated chain type takes precedence over parsing the id.
1353  if (d_chain_type[1] != q_ct[1]) keep = false;
1354  } else { //assume D gene id style
      // Expect ids shaped like IG?D... / TR?D... (optionally prefixed
      // with "lcl|"); the third character is taken as the chain letter.
1355  string sid = (*it)->GetSeq_id(1).AsFastaString();
1356  sid = NStr::ToUpper(sid);
1357  if (sid.substr(0, 4) == "LCL|") sid = sid.substr(4, sid.length());
1358  if ((sid.substr(0, 2) == "IG" || sid.substr(0, 2) == "TR")
1359  && sid[3] == 'D') {
1360  s_ct = sid[2];
1361  }
      // Only 'B' and 'D' are accepted as parsed chain letters here; anything
      // else falls back to the query's letter.
1362  if (s_ct!='B' && s_ct!='D') s_ct = q_ct[1];
1363  if (s_ct != q_ct[1]) keep = false;
1364  }
1365  }
1366 
1367  //remove failed seq_align
1368  if (!keep) it = align_list.erase(it);
1369  else ++it;
1370  }
1371 
1372 
      // If no D hit lies on the V strand, hits on the other strand are all kept.
1373  //strand test
1374  bool strand_found = false;
1375  ITERATE(CSeq_align_set::Tdata, it, align_list) {
1376  if ((*it)->GetSeqStrand(0) == q_st) {
1377  strand_found = true;
1378  break;
1379  }
1380  }
1381  if (strand_found) {
1382  it = align_list.begin();
1383  while (it != align_list.end()) {
1384  if ((*it)->GetSeqStrand(0) != q_st) {
1385  it = align_list.erase(it);
1386  } else ++it;
1387  }
1388  }
      // Keep a D hit only if it lies within max_allowed_VD_distance of the V
      // end and extends at least m_Min_D_match beyond it. The comparisons are
      // mirrored for the minus strand. (max_allowed_VD_distance is defined
      // outside this block.)
1389  //v end test
1390  it = align_list.begin();
1391  while (it != align_list.end()) {
1392  bool keep = false;
1393  int q_ds = (*it)->GetSeqStart(0);
1394  int q_de = (*it)->GetSeqStop(0);
1395  if (q_ms) keep = (q_de >= q_ve - max_allowed_VD_distance && q_ds <= q_ve - m_IgOptions->m_Min_D_match);
1396  else keep = (q_ds <= q_ve + max_allowed_VD_distance && q_de >= q_ve + m_IgOptions->m_Min_D_match);
1397  if (!keep) it = align_list.erase(it);
1398  else ++it;
1399  }
1400  // sort according to score
1401  align_list.sort(s_CompareSeqAlignByScoreAndName);
1402 
1403  /* process J */
1404  CRef<CSeq_align_set> align_J (0);
1405  CSearchResults& res_J = (*results_J)[iq];
1406  if (res_J.HasAlignments()) {
1407  align_J = res_J.SetSeqAlign();
1408  }
      // Requires at least one surviving D hit; its front element (the
      // first after sorting) is the reference for the compatibility test.
1409  if (align_J && align_J.NotEmpty() && !align_J->IsEmpty() && !align_list.empty()) {
1410 
1411  CSeq_align_set::Tdata & al_J = align_J->Set();
1412  CSeq_align_set::Tdata::iterator it = al_J.begin();
1413  while (it != al_J.end()) {
1414  if (s_DJNotCompatible(*(align_list.front()), **it, q_ms, m_IgOptions->m_Min_D_match)) {
1415  it = al_J.erase(it);
1416  } else ++it;
1417  }
1418  }
1419  }
1420 
1421  iq ++;
1422  }
1423 }
1424 
1426  vector<CRef <CIgAnnotation> > &annots) {
1427  int iq = 0;
1428  CRef<CSeq_align_set> align_C(0);
1429  NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1430  bool q_ms = (*annot)->m_MinusStrand;
1431  ENa_strand q_st = (q_ms) ? eNa_strand_minus : eNa_strand_plus;
1432  CSearchResults& res_C = (*results_C)[iq];
1433  if (res_C.HasAlignments()) {
1434  align_C.Reset(const_cast<CSeq_align_set *>(&*(res_C.GetSeqAlign())));
1435  if (align_C && !align_C->Get().empty()) {
1436  CSeq_align_set::Tdata & align_list = align_C->Set();
1437  CSeq_align_set::Tdata::iterator it = align_list.begin();
1438  while (it != align_list.end()) {
1439  bool keep = true;
1440 
1441  /* strand test */
1442  if ((*it)->GetSeqStrand(0) != q_st) keep = false;
1443 
1444  /* remove failed seq_align */
1445  if (!keep) it = align_list.erase(it);
1446  else ++it;
1447  }
1448  iq ++;
1449  }
1450  }
1451  }
1452 }
1453 
1455  CRef<CSearchResultSet>& results_D,
1456  CRef<CSearchResultSet>& results_J,
1457  vector<CRef <CIgAnnotation> > &annots) {
      // Runs x_FindDJ once per query, passing the query's chain type, strand
      // and V-end coordinate taken from its annotation. annots[i] is paired
      // with result index i of results_D / results_J.
1458 
1459  int iq = 0;
1460  NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1461  string q_ct = (*annot)->m_ChainType[0];
1462  bool q_ms = (*annot)->m_MinusStrand;
1463  ENa_strand q_st = (q_ms) ? eNa_strand_minus : eNa_strand_plus;
      // Query coordinate of the V end: m_GeneInfo[0] on the minus strand,
      // m_GeneInfo[1] - 1 on the plus strand.
1464  int q_ve = (q_ms) ? (*annot)->m_GeneInfo[0] : (*annot)->m_GeneInfo[1] - 1;
1465 
      // Filled by x_FindDJ but not read afterwards in this function.
1466  CRef<CSeq_align_set> align_D, align_J;
1467 
1468  x_FindDJ( results_D, results_J, *annot, align_D, align_J, q_ct, q_ms, q_st, q_ve, iq);
1469  iq ++;
1470  }
1471 }
1472 
1474  vector<CRef <CIgAnnotation> > &annots)
1475 {
      // Copies D-gene information from the first D alignment of each query
      // into its annotation:
      //   m_GeneInfo[2], m_GeneInfo[3]  query start and (stop + 1) of the D hit
      //   m_TopGeneIds[1]               id string built from the first
      //                                 m_NumAlign[1] hits
      //   m_DframeStart                 frame offset looked up for the D subject id
      // Queries without D alignments are left unchanged. results_D is indexed
      // in step with annots.
1476 
1477  int iq = 0;
1478  NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1479 
      // NOTE(review): q_ct is not used anywhere in this function.
1480  string q_ct = (*annot)->m_ChainType[0];
1481  CSearchResults& res_d = (*results_D)[iq];
1482  CConstRef<CSeq_align_set> align_D = res_d.GetSeqAlign();
1483  if (align_D && !align_D.Empty() && !align_D->IsEmpty()) {
1484  const CSeq_align_set::Tdata& align_list = align_D->Get();
1485  CRef<CSeq_align> align = align_list.front();
1486  (*annot)->m_GeneInfo[2] = align->GetSeqStart(0);
1487  (*annot)->m_GeneInfo[3] = align->GetSeqStop(0)+1;
1488  (*annot)->m_TopGeneIds[1] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[1]);
1489  string sid = s_RemoveLocalPrefix(align->GetSeq_id(1).AsFastaString());
1490  (*annot)->m_DframeStart = m_AnnotationInfo.GetFrameOffset(sid);
1491 
1492  }
1493 
1494 
1495  /* next set of results */
1496  ++iq;
1497  }
1498 };
1499 
1501  CRef<CSearchResultSet> &results_j,
1502  vector<CRef <CIgAnnotation> > &annots)
1503 {
      // Copies C-gene information from the first C alignment of each query
      // into its annotation. It acts only when the query has both C and J
      // alignments.
      //   m_TopGeneIds[3]               id string built from the first
      //                                 m_NumAlign[3] hits
      //   m_GeneInfo[6], m_GeneInfo[7]  query start and (stop + 1) of the C hit
      //   m_CDomain[0], m_CDomain[1]    query positions mapped from the subject
      //                                 start / end of the C hit (only when both
      //                                 m_JDomain[1] and m_JDomain[3] are positive)
      //   m_JDomain[3]                  may be extended towards the C start
      // (results_c is used below; its parameter line is not visible in this
      // listing.)
1504  int iq = 0;
1505 
1506  NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1507 
1508  const CSearchResults& res_c = (*results_c)[iq];
1509  CConstRef<CSeq_align_set> align_c = res_c.GetSeqAlign();
1510  const CSearchResults& res_j = (*results_j)[iq];
1511  CConstRef<CSeq_align_set> align_j = res_j.GetSeqAlign();
1512 
1513  if (align_c.NotEmpty() && !align_c->IsEmpty() && align_j.NotEmpty() && !align_j->IsEmpty()) {
1514  const CSeq_align_set::Tdata & align_list = align_c->Get();
1515  CRef<CSeq_align> align = align_list.front();
1516  (*annot)->m_TopGeneIds[3] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[3]);
1517  (*annot)->m_GeneInfo[6] = align->GetSeqStart(0);
1518  (*annot)->m_GeneInfo[7] = align->GetSeqStop(0)+1;
1519  if ((*annot)->m_JDomain[3] > 0 && (*annot)->m_JDomain[1] > 0) {
1520  int subject_start = align->GetSeqStart(1);
1521  int subject_end = align->GetSeqStop(1);
1522  int seq_end = m_Scope->GetBioseqHandle(align->GetSeq_id(1)).GetBioseqLength() - 1;
1523  CAlnMap c_map(align->GetSegs().GetDenseg());
1524 
      // C end: subject end (capped at the last subject base) mapped onto
      // the query (row 0).
1525  (*annot)->m_CDomain[1] = c_map.GetSeqPosFromSeqPos(0, 1, min(subject_end, seq_end), IAlnExplorer::eRight);
1526 
      // Mirror the coordinate for minus-strand query alignments.
1527  if (align->GetSeqStrand(0) == eNa_strand_minus) {
1528  (*annot)->m_CDomain[1] = m_Scope->GetBioseqHandle(align->GetSeq_id(0)).GetBioseqLength() -
1529  (*annot)->m_CDomain[1] - 1;
1530  }
1531  //c start
1532  int query_start = c_map.GetSeqPosFromSeqPos(0, 1, subject_start, IAlnExplorer::eRight);
1533  if (align->GetSeqStrand(0) == eNa_strand_minus) {
1534  (*annot)->m_CDomain[0] = m_Scope->GetBioseqHandle(align->GetSeq_id(0)).GetBioseqLength() -
1535  query_start - 1;
1536  } else {
1537  (*annot)->m_CDomain[0] = query_start;
1538  }
1539 
1540  //if there are one or more gaps between fwr4 end and c start, then extend fwr4 end
      // diff = number of query bases strictly between the FWR4 end and the C start.
1541  int diff = max(0, (*annot)->m_CDomain[0] - (*annot)->m_JDomain[3] - 1);
1542  //j stop and j length
      // j_end = last usable base of the top J subject: its last base,
      // moved back by m_JDomain[4] when that value is positive.
1543  int j_end = m_Scope->GetBioseqHandle(align_j->Get().front()->GetSeq_id(1)).GetBioseqLength() - 1;
1544  if ((*annot)->m_JDomain[4] > 0) {
1545  j_end -= (*annot)->m_JDomain[4];
1546  }
1547  int j_stop = align_j->Get().front()->GetSeqStop(1);
      // Extend by at most the unaligned remainder of the J subject, and
      // never further than the gap to the C start.
1548  int j_extend_max = max(0, j_end - j_stop);
1549  int extend_len = min(diff, j_extend_max);
1550  if (extend_len > 0) {
1551  (*annot)->m_JDomain[3] += extend_len;
1552  }
1553  }
1554  }
1555  /* next set of results */
1556  ++iq;
1557  }
1558 }
1559 
1560 
1562  vector<CRef <CIgAnnotation> > &annots)
1563 {
      // Copies J-gene information from the first J alignment of each query
      // into its annotation:
      //   m_JDomain[...]                filled by x_FillJDomain
      //   m_GeneInfo[4], m_GeneInfo[5]  query start and (stop + 1) of the J hit
      //   m_FrameInfo[2]                query position adjusted by the J frame
      //                                 offset (only when the offset is >= 0)
      //   m_TopGeneIds[2]               id string built from the first
      //                                 m_NumAlign[2] hits
      // Queries without J alignments are left unchanged. results_J is indexed
      // in step with annots.
1564  int iq = 0;
1565  NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1566 
1567  bool q_ms = (*annot)->m_MinusStrand;
1568 
1569  const CSearchResults& res_j = (*results_J)[iq];
1570  CConstRef<CSeq_align_set> align_J = res_j.GetSeqAlign();
1571 
1572  /* annotate J */
1573  if (align_J.NotEmpty() && !align_J->IsEmpty()) {
1574  const CSeq_align_set::Tdata & align_list = align_J->Get();
1575  CRef<CSeq_align> align = align_list.front();
1576  x_FillJDomain(align, *annot);
1577  (*annot)->m_GeneInfo[4] = align->GetSeqStart(0);
1578  (*annot)->m_GeneInfo[5] = align->GetSeqStop(0)+1;
1579  string sid = s_RemoveLocalPrefix(align->GetSeq_id(1).AsFastaString());
1580  int frame_offset = m_AnnotationInfo.GetFrameOffset(sid);
1581  if (frame_offset >= 0) {
      // frame_adj = (subject start - frame_offset) mod 3; the +3 keeps
      // the dividend non-negative when frame_offset is at most 3 larger
      // than the subject start.
1582  int frame_adj = (align->GetSeqStart(1) + 3 - frame_offset) % 3;
      // On the minus strand the adjustment is added to the query stop,
      // otherwise it is subtracted from the query start.
1583  (*annot)->m_FrameInfo[2] = (q_ms) ?
1584  align->GetSeqStop(0) + frame_adj
1585  : align->GetSeqStart(0) - frame_adj;
1586  }
1587  (*annot)->m_TopGeneIds[2] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[2]);
1588  }
1589 
1590  /* next set of results */
1591  ++iq;
1592  }
1593 }
1594 
1596  CRef<CSearchResultSet> &results_J,
1597  vector<CRef <CIgAnnotation> > &annots)
1598 {
      // Annotates D and J for every query in one pass, using the first
      // alignment of each result:
      //   D: m_GeneInfo[2..3], m_TopGeneIds[1], m_DframeStart
      //   J: m_JDomain (via x_FillJDomain), m_GeneInfo[4..5], m_FrameInfo[2],
      //      m_TopGeneIds[2]
      // D and J are handled independently; a query lacking one of them still
      // gets the other annotated. (results_D is used below; its parameter line
      // is not visible in this listing.)
1599  int iq = 0;
1600  NON_CONST_ITERATE(vector<CRef <CIgAnnotation> >, annot, annots) {
1601 
      // NOTE(review): q_ct is not used anywhere in this function.
1602  string q_ct = (*annot)->m_ChainType[0];
1603  bool q_ms = (*annot)->m_MinusStrand;
1604 
1605  const CSearchResults& res_j = (*results_J)[iq];
1606  const CSearchResults& res_d = (*results_D)[iq];
1607  CConstRef<CSeq_align_set> align_D = res_d.GetSeqAlign();
1608  CConstRef<CSeq_align_set> align_J = res_j.GetSeqAlign();
1609  /* annotate D */
1610  if (align_D.NotEmpty() && !align_D->IsEmpty()) {
1611  const CSeq_align_set::Tdata & align_list = align_D->Get();
1612  CRef<CSeq_align> align = align_list.front();
1613  (*annot)->m_GeneInfo[2] = align->GetSeqStart(0);
1614  (*annot)->m_GeneInfo[3] = align->GetSeqStop(0)+1;
1615  (*annot)->m_TopGeneIds[1] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[1]);
1616  string sid = s_RemoveLocalPrefix(align->GetSeq_id(1).AsFastaString());
1617  (*annot)->m_DframeStart = m_AnnotationInfo.GetFrameOffset(sid);
1618 
1619  }
1620 
1621  /* annotate J */
1622  if (align_J.NotEmpty() && !align_J->IsEmpty()) {
1623  const CSeq_align_set::Tdata & align_list = align_J->Get();
1624  CRef<CSeq_align> align = align_list.front();
1625  x_FillJDomain(align, *annot);
1626  (*annot)->m_GeneInfo[4] = align->GetSeqStart(0);
1627  (*annot)->m_GeneInfo[5] = align->GetSeqStop(0)+1;
1628  string sid = s_RemoveLocalPrefix(align->GetSeq_id(1).AsFastaString());
1629  int frame_offset = m_AnnotationInfo.GetFrameOffset(sid);
1630  if (frame_offset >= 0) {
      // frame_adj = (subject start - frame_offset) mod 3. It is added to
      // the query stop on the minus strand, subtracted from the query
      // start otherwise.
1631  int frame_adj = (align->GetSeqStart(1) + 3 - frame_offset) % 3;
1632  (*annot)->m_FrameInfo[2] = (q_ms) ?
1633  align->GetSeqStop(0) + frame_adj
1634  : align->GetSeqStart(0) - frame_adj;
1635  }
1636  (*annot)->m_TopGeneIds[2] = s_MakeTopHitsId(align_list, m_IgOptions->m_NumAlign[2]);
1637  }
1638 
1639  /* next set of results */
1640  ++iq;
1641  }
1642 };
1643 
1644 // query chain type and domain is annotated by germline alignment
1646  CRef<CSearchResultSet> &dm_results,
1647  vector<CRef <CIgAnnotation> > &annots)
1648 {
      // For each query, walks the domain-database hits (dm_results) and,
      // using the top germline V alignment (gl_results) as the master, fills:
      //   m_ChainType / m_ChainTypeToShow  ("N/A" unless a hit provides a type)
      //   m_MinusStrand                    from the master alignment strand
      //   m_DomainInfo[0..11]              domain boundaries on the query
      //   m_DomainInfo_S[0..9]             domain boundaries on the germline V
      //   m_FrameInfo[0..1]                query frame positions
      // Both result sets are indexed in step with annots.
      // NOTE(review): several lines of this function (its signature line and
      // a few declarations/conditions) are absent from this listing; comments
      // here describe only what the visible lines establish.
1650  CScope scope_q(*mgr), scope_s(*mgr);
1651  CRef<CSeqDB> db_V, db_domain;
      // Subject (germline V) annotation via a pairwise BLAST is only possible
      // when the V database is a BLAST database.
1652  bool annotate_subject = false;
1653  if (m_IgOptions->m_Db[0]->IsBlastDb()) {
1654  string db_name_V = m_IgOptions->m_Db[0]->GetDatabaseName();
1655  string db_name_domain = m_IgOptions->m_Db[3]->GetDatabaseName();
1658  db_V.Reset(new CSeqDB(db_name_V, db_type));
      // Share one CSeqDB handle when both roles use the same database.
1659  if (db_name_V == db_name_domain) {
1660  db_domain.Reset(&(*db_V));
1661  } else {
1662  db_domain.Reset(new CSeqDB(db_name_domain, db_type));
1663  }
1664  annotate_subject = true;
1665  }
1666 
1667  int iq = 0;
1668  ITERATE(CSearchResultSet, result, *dm_results) {
1669 
1670  CIgAnnotation *annot = &*(annots[iq]);
1671  annot->m_ChainType.push_back("N/A"); // Assuming non-ig sequence first
1672  annot->m_ChainTypeToShow = "N/A";
1673  if ((*result)->HasAlignments() && (*gl_results)[iq].HasAlignments()) {
1674 
1675 
      // The top germline hit defines strand and query extent.
1676  CConstRef<CSeq_align> master_align =
1677  (*gl_results)[iq].GetSeqAlign()->Get().front();
1678  CAlnMap q_map(master_align->GetSegs().GetDenseg());
1679 
1680  if (master_align->GetSeqStrand(0) == eNa_strand_minus) {
1681  annot->m_MinusStrand = true;
1682  }
1683 
      // q_ends[0] / q_ends[1] are the query positions where the master
      // alignment begins / ends in reading direction; q_dir is +1 or -1.
      // Multiplying a difference by q_dir lets the comparisons below work
      // for both strands.
1684  int q_ends[2], q_dir;
1685 
1686  if (annot->m_MinusStrand) {
1687  q_ends[1] = master_align->GetSeqStart(0);
1688  q_ends[0] = master_align->GetSeqStop(0);
1689  q_dir = -1;
1690 
1691  } else {
1692  q_ends[0] = master_align->GetSeqStart(0);
1693  q_ends[1] = master_align->GetSeqStop(0);
1694  q_dir = 1;
1695  }
1696 
1697  const CSeq_align_set::Tdata & align_list = (*result)->GetSeqAlign()->Get();
1698 
      // Both exits from this loop body are `break`s, so at most the first
      // domain hit reaching one of them is used.
1699  ITERATE(CSeq_align_set::Tdata, it, align_list) {
1700 
1701  string sid = s_RemoveLocalPrefix((*it)->GetSeq_id(1).AsFastaString());
1703  annot->m_ChainTypeToShow = annot->m_ChainType[0];
      // Five (start, stop) pairs; 1 is subtracted from each value before
      // use (presumably 1-based coordinates on the domain subject -- confirm
      // against GetDomainInfo).
1704  int domain_info[10];
1705 
1706  if (m_AnnotationInfo.GetDomainInfo(sid, domain_info)) {
1707 
1708 
      // s_map: query (row 0) vs domain subject (row 1).
1709  CAlnMap s_map((*it)->GetSegs().GetDenseg());
1710  int s_start = (*it)->GetSeqStart(1);
1711  int s_stop = (*it)->GetSeqStop(1);
1712 
      // d_map: domain subject (row 0) vs germline V (row 1), built only
      // when the pairwise BLAST below finds an alignment.
1713  CRef<CAlnMap> d_map;
1714  int d_start = -1;
1715  int d_stop = -1;
1716 
1717  int start, stop;
1718 
1719  if (m_IgOptions->m_CustomInternalData == NcbiEmptyString && annotate_subject) {
1720  //blast2 between top germline V and internal top V
1721  CRef<CBioseq> seq_q = db_domain->SeqidToBioseq((*it)->GetSeq_id(1));
1722  CBioseq_Handle hdl_q = scope_q.AddBioseq(*seq_q);
1723  CRef<CBioseq> seq_s = db_V->SeqidToBioseq(master_align->GetSeq_id(1));
1724  CBioseq_Handle hdl_s = scope_s.AddBioseq(*seq_s);
1726  query.SetWhole();
1727  query.SetId((*it)->GetSeq_id(1));
1728  subject.SetWhole();
1729  subject.SetId(master_align->GetSeq_id(1));
1730  SSeqLoc q_loc(&query, &scope_q);
1731  SSeqLoc s_loc(&subject, &scope_s);
1732  CBl2Seq bl2seq(q_loc, s_loc, (m_IgOptions->m_IsProtein)? eBlastp: eBlastn);
1733  const CSearchResults& result = (*(bl2seq.RunEx()))[0];
1734  if (result.HasAlignments()) {
1735  CConstRef<CSeq_align> subject_align = result.GetSeqAlign()->Get().front();
1736  d_map.Reset(new CAlnMap(subject_align->GetSegs().GetDenseg()));
1737  d_start = subject_align->GetSeqStart(0);
1738  d_stop = subject_align->GetSeqStop(0);
1739  }
      // The bioseqs were added only for this pairwise search.
1740  scope_q.RemoveBioseq(hdl_q);
1741  scope_s.RemoveBioseq(hdl_s);
1742  }
1743 
      // One iteration per domain: i indexes the start, i+1 the stop.
1744  for (int i =0; i<10; i+=2) {
1745 
1746  start = domain_info[i] - 1;
1747  stop = domain_info[i+1] - 1;
1749  //use custom data for top germline V regions
1750  annot->m_DomainInfo_S[i] = domain_info[i] - 1;
1751  annot->m_DomainInfo_S[i+1] = domain_info[i+1] - 1;
1752  } else {
1753  //use blast2 to annotate to germline V
      // When no pairwise alignment was found d_start/d_stop stay -1,
      // this overlap test fails and d_map is never dereferenced.
1754  if (start <= d_stop && stop >= d_start) {
1755  int start_copy = start;
1756  int stop_copy = stop;
1757  if (start_copy < d_start) start_copy = d_start;
1758  if (stop_copy > d_stop) stop_copy = d_stop;
1759  if (start_copy <= stop_copy) {
      // Keep consecutive domains contiguous on the germline:
      // start right after the previous domain's stop if known.
1760  if (i>0 && annot->m_DomainInfo_S[i-1]>=0) {
1761  annot->m_DomainInfo_S[i] = annot->m_DomainInfo_S[i-1] + 1;
1762  } else {
1763  annot->m_DomainInfo_S[i] =
1764  d_map->GetSeqPosFromSeqPos(1, 0, start_copy, IAlnExplorer::eForward);
1765  }
1766  annot->m_DomainInfo_S[i+1] =
1767  d_map->GetSeqPosFromSeqPos(1, 0, stop_copy, IAlnExplorer::eBackwards);
1768  }
1769  }
1770  }
1771 
      // Query-side annotation: clip the domain to the aligned part of
      // the domain subject, map it to the query, then clip it to the
      // master alignment's query extent.
1772  if (start > s_stop || stop < s_start) continue;
1773 
1774  if (start < s_start) start = s_start;
1775 
1776  if (stop > s_stop) stop = s_stop;
1777 
1778  if (start > stop) continue;
1779 
1780  start = s_map.GetSeqPosFromSeqPos(0, 1, start, IAlnExplorer::eForward);
1781  stop = s_map.GetSeqPosFromSeqPos(0, 1, stop, IAlnExplorer::eBackwards);
1782 
1783  if ((start - q_ends[1])*q_dir > 0 || (stop - q_ends[0])*q_dir < 0) continue;
1784 
1785  if ((start - q_ends[0])*q_dir < 0) start = q_ends[0];
1786 
1787  if ((stop - q_ends[1])*q_dir > 0) stop = q_ends[1];
1788 
1789  if ((start - stop)*q_dir > 0) continue;
1790 
      // If the boundary falls in a segment where the germline row is
      // present, round-trip it query -> germline -> query through the
      // master alignment.
1791  int aln_start = q_map.GetAlnPosFromSeqPos (0, start);
1792  CAlnMap::TNumseg seg = q_map.GetSeg(aln_start);
1793  int pos = q_map.GetStart(1, seg);
1794  if (pos >=0) {//no mapping for gap
1795  start = q_map.GetSeqPosFromSeqPos(1, 0, start, IAlnExplorer::eForward);
1796  start = q_map.GetSeqPosFromSeqPos(0, 1, start);
1797  }
1798 
1799  int aln_stop = q_map.GetAlnPosFromSeqPos (0, stop);
1800  seg = q_map.GetSeg(aln_stop);
1801  pos = q_map.GetStart(1, seg);
1802  if (pos >=0) {
1803  stop = q_map.GetSeqPosFromSeqPos(1, 0, stop, IAlnExplorer::eBackwards);
1804  stop = q_map.GetSeqPosFromSeqPos(0, 1, stop);
1805  }
1806 
1807  if ((start - stop)*q_dir > 0) continue;
1808  //annotate query
1809  annot->m_DomainInfo[i] = start;
1810  annot->m_DomainInfo[i+1] = stop;
1811  }
1812 
1813 
1814 
1815  // extension of the first and last annotated domain (if any)
1816  int i = 0;
1817  int extension = 0;
      // Find the first domain whose query start was annotated.
1818  while (i<10 && annot->m_DomainInfo[i] < 0) i+=2;
1819  if (i < 10 && domain_info[i] > 0) {
      // Shift the start by the distance between the nominal domain
      // start and the subject position the annotated start maps to.
1820  extension = (domain_info[i] - 1 -
1821  s_map.GetSeqPosFromSeqPos(1, 0, annot->m_DomainInfo[i],
1822  IAlnExplorer::eBackwards))*q_dir;
1823  annot->m_DomainInfo[i] += extension;
1824  //this does not get reversed like m_DomainInfo
1825  annot->m_DomainInfo_S[i] -= abs(extension);
1826 
1827  if (annot->m_DomainInfo[i] < 0) annot->m_DomainInfo[i] = 0;
1828  if (annot->m_DomainInfo_S[i] < 0) annot->m_DomainInfo_S[i] = 0;
1829 
      // Make each following annotated domain start right after the
      // previous domain's stop.
1830  i+=2;
1831  while (i<10 && annot->m_DomainInfo[i] >=0) {
1832  annot->m_DomainInfo[i] = annot->m_DomainInfo[i-1] + q_dir;
1833  i+=2;
1834  }
      // Find the last annotated stop (odd indices 9, 7, ... 1) and
      // extend it the same way; i ends at -1 if none was annotated.
1835  i = 9;
1836  while (i>0 && annot->m_DomainInfo[i] < 0) i-=2;
1837  if (i >= 0) {
1838  annot->m_DomainInfo[i] += (domain_info[i] - 1 -
1839  s_map.GetSeqPosFromSeqPos(1, 0, annot->m_DomainInfo[i],
1840  IAlnExplorer::eForward))*q_dir;
1841  if (annot->m_DomainInfo[i] < 0) annot->m_DomainInfo[i] = 0;
1842  }
1843  }
1844 
1845  // any extra alignments after FWR3 are attributed to CDR3
1846  start = annot->m_DomainInfo[9];
1847 
      // Only when the FWR3 stop lies strictly before the end of the
      // master alignment on the query.
1848  if (start >= 0 && (start - q_ends[1])*q_dir < 0) {
1849  start = q_map.GetSeqPosFromSeqPos(1, 0, start+q_dir, IAlnExplorer::eForward);
1850  start = q_map.GetSeqPosFromSeqPos(0, 1, start);
1851 
1852  if ((start - q_ends[1])*q_dir <= 0) {
1853  annot->m_DomainInfo[10] = start;
1854  annot->m_DomainInfo[11] = q_ends[1];
1855  }
1856  }
1857  // annotate the query frame offset
1858  int frame_offset = m_AnnotationInfo.GetFrameOffset(sid);
1859 
1860  if (frame_offset >= 0) {
1861  int q_start = (*it)->GetSeqStart(0);
1862  int q_stop = (*it)->GetSeqStop(0);
      // (q_mid - q_dir*q_dif)/2 evaluates to q_start for q_dir = +1 and
      // q_stop for q_dir = -1; (q_mid + q_dir*q_dif)/2 gives the
      // opposite end. This selects the strand-appropriate endpoint
      // without branching.
1863  int q_mid = q_start + q_stop;
1864  int q_dif = q_stop - q_start;
1865  int frame_adj = (3 - ((*it)->GetSeqStart(1) + 3 - frame_offset) % 3) %3;
1866  annot->m_FrameInfo[0] = (q_mid - q_dir *q_dif)/2 + q_dir * frame_adj;
1867 
1868  //counting frame from fwr3 end, not the V end since we need to ignore a few bases
1869  //in the CDR3 to allow any insertion or deletion at V gene end
1870  if (annot->m_DomainInfo[9] > 0) {
1871  int fwr3_stop = annot->m_DomainInfo[9];
1872 
1873  if (annot->m_MinusStrand) {
1874 
1875  q_start = max(q_start, fwr3_stop);
1876  q_mid = q_start + q_stop;
1877  q_dif = q_stop - q_start;
1878  frame_adj = (s_map.GetSeqPosFromSeqPos(1, 0, q_start, IAlnExplorer::eBackwards) + 3 - frame_offset) % 3;
1879  } else {
1880  q_stop = min(q_stop, fwr3_stop);
1881  q_mid = q_start + q_stop;
1882  q_dif = q_stop - q_start;
1883  frame_adj = (s_map.GetSeqPosFromSeqPos(1, 0, q_stop, IAlnExplorer::eBackwards) + 3 - frame_offset) % 3;
1884  }
1885  } else {
      // No FWR3 stop annotated: count the frame from the subject stop
      // of the domain hit instead.
1886  frame_adj = ((*it)->GetSeqStop(1) + 3 - frame_offset) % 3;
1887  }
1888 
1889  annot->m_FrameInfo[1] = (q_mid + q_dir *q_dif)/2 - q_dir * frame_adj;
1890  }
      // Domain info found for this hit: stop scanning further hits.
1891  break;
1892 
1893  }
1895  //only use top hit custom annotation. This is not done via alignment mapping.
1896  break;
1897  }
1898  }
1899  }
1900  ++iq;
1901  }
1902 };
1903 
1905  vector<CRef <CIgAnnotation> > &annots)
1906 {
      // Appends one label per alignment to each query's m_ChainType: "V", "D",
      // "J", "C" for the counts recorded in the CIgBlastResults
      // (m_NumActualV/D/J/C), then "N/A" for whatever alignments remain.
      // NOTE(review): the loop header over the result set is absent from this
      // listing; `result` below is its iteration variable.
1907  int iq = 0;
1909 
1910  CIgAnnotation *annot = &*(annots[iq++]);
1911 
1912  if ((*result)->HasAlignments()) {
      // num_aligns is counted down as labels are assigned; what is left after
      // the four typed groups gets "N/A".
1913  int num_aligns = (*result)->GetSeqAlign()->Size();
      // NOTE(review): the dynamic_cast result is used without a null check;
      // this relies on every element being a CIgBlastResults.
1914  CIgBlastResults *ig_result = dynamic_cast<CIgBlastResults *>
1915  (const_cast<CSearchResults *>(&**result));
1916  for (int i=0; i<ig_result->m_NumActualV; ++i, --num_aligns) {
1917  annot->m_ChainType.push_back("V");
1918  }
1919  for (int i=0; i<ig_result->m_NumActualD; ++i, --num_aligns) {
1920  annot->m_ChainType.push_back("D");
1921  }
1922  for (int i=0; i<ig_result->m_NumActualJ; ++i, --num_aligns) {
1923  annot->m_ChainType.push_back("J");
1924  }
1925  for (int i=0; i<ig_result->m_NumActualC; ++i, --num_aligns) {
1926  annot->m_ChainType.push_back("C");
1927  }
1928  for (int i=0; i<num_aligns; ++i) {
1929  annot->m_ChainType.push_back("N/A");
1930  }
1931  }
1932  }
1933 };
1934 
1936 {
      // Sorts the alignment list of every result that has alignments, in
      // place, using s_CompareSeqAlignByEvalue as the ordering.
      // NOTE(review): the signature and the loop header are absent from this
      // listing; `result` below is the loop's iteration variable.
1938  if ((*result)->HasAlignments()) {
      // GetSeqAlign() hands out a const set; the cast allows in-place sorting.
1939  CRef<CSeq_align_set> align(const_cast<CSeq_align_set *>
1940  (&*((*result)->GetSeqAlign())));
1941  CSeq_align_set::Tdata & align_list = align->Set();
1942  align_list.sort(s_CompareSeqAlignByEvalue);
1943  }
1944  }
1945 };
1946 
1947 // convert sequencecomparison to database mode
1949 {
      // Rebuilds `result` so that it holds one CSearchResults per query:
      //   - consecutive input results with the same query id are merged into
      //     the first one's alignment list;
      //   - a query that has no result at the current position gets a
      //     placeholder result built from `empty`.
      // Does nothing unless the result type is eSequenceComparison.
      // NOTE(review): the signature and the declarations of `empty` and `add`
      // are absent from this listing.
1950  if (result->GetResultType() != eSequenceComparison) {
1951  return;
1952  }
1953 
1954  int num_queries = m_Query->Size();
1955  int num_results = result->GetNumResults();
1956  int ir = 0;
      // Raw pointer until handed to result.Reset() at the end.
1957  CSearchResultSet *retv = new CSearchResultSet();
1958 
      // iq walks the queries, ir walks the input results.
1959  for (int iq = 0; iq< num_queries && ir< num_results; ++iq) {
1960 
1961  CSearchResults &res = (*result)[ir++];
1962  CRef<CBlastAncillaryData> ancillary = res.GetAncillaryData();
1963  TQueryMessages errmsg = res.GetErrors();
1964  CConstRef<CSeq_id> rid = res.GetSeqId();
1965  CRef<CSeq_align_set> align(const_cast<CSeq_align_set *>
1966  (&*(res.GetSeqAlign())));
1967  CSeq_align_set::Tdata & align_list = align->Set();
1968 
1969  CConstRef<CSeq_id> qid = m_Query->GetBlastSearchQuery(iq)->GetQueryId();
      // Emit placeholders for queries preceding the one this result belongs
      // to. The placeholders reuse this result's messages and ancillary data.
      // NOTE(review): ++iq is not checked against num_queries here; if `rid`
      // matches none of the remaining queries the index runs past the end --
      // confirm the inputs guarantee a match.
1970  while(!qid->Match(*rid)) {
1972  CRef<CSearchResults> r(new CSearchResults(qid, empty, errmsg, ancillary));
1973  retv->push_back(r);
1974  qid = m_Query->GetBlastSearchQuery(++iq)->GetQueryId();
1975  }
1976 
      // Fold every directly following result for the same query into align_list.
1977  while(ir < num_results && (*result)[ir].GetSeqId()->Match(*qid)) {
1978  CSearchResults &add_res = (*result)[ir++];
1980  add.Reset(const_cast<CSeq_align_set *>
1981  (&*(add_res.GetSeqAlign())));
1982  CSeq_align_set::Tdata & add_list = add->Set();
1983  align_list.insert(align_list.end(), add_list.begin(), add_list.end());
1984  }
1985  CRef<CSearchResults> r(new CSearchResults(qid, align, errmsg, ancillary));
1986  retv->push_back(r);
1987  }
1988 
1989  result.Reset(retv);
1990 };
1991 
1993  int num_aligns,
1994  int gene,
1995  CRef<CSearchResultSet> &final_results)
1996 {
      // Merges one set of per-query results into final_results.
      //   num_aligns     keep at most this many alignments per query; a
      //                  negative value disables truncation
      //   gene           0/1/2/3 records the kept count as m_NumActualV/D/J/C;
      //                  a negative value also enables duplicate removal;
      //                  other values record nothing
      //   final_results  created here when empty (each query then gets a new
      //                  CIgBlastResults); otherwise alignments are appended
      //                  to the existing entry with the same query id
      // The incoming alignment lists are edited in place (truncation and
      // duplicate removal).
      // NOTE(review): the signature line and the loop header over the incoming
      // results are absent from this listing; `result` is the loop variable.
1997  bool new_result = (final_results.Empty());
1998  if (new_result) {
1999  final_results.Reset(new CSearchResultSet());
2000  }
2001 
2002  int iq = 0;
2004 
2005  CRef<CSeq_align_set> align;
      // Stays 0 when there are no alignments or when num_aligns is negative.
2006  int actual_align = 0;
2007 
2008  if ((*result)->HasAlignments()) {
2009  align.Reset(const_cast<CSeq_align_set *>
2010  (&*((*result)->GetSeqAlign())));
2011 
2012  // keep only the first num_alignments
2013  if (num_aligns >= 0) {
2014  CSeq_align_set::Tdata & align_list = align->Set();
2015  if (align_list.size() > (CSeq_align_set::Tdata::size_type)num_aligns) {
      // Step to element num_aligns and erase from there to the end.
2016  CSeq_align_set::Tdata::iterator it = align_list.begin();
2017  for (int i=0; i<num_aligns; ++i) ++it;
2018  align_list.erase(it, align_list.end());
2019  actual_align = num_aligns;
2020  } else {
2021  actual_align = align_list.size();
2022  }
2023  }
2024  }
2025 
2026  TQueryMessages errmsg = (*result)->GetErrors();
2027  CConstRef<CSeq_id> query = (*result)->GetSeqId();
2028 
2029  CIgBlastResults *ig_result;
2030  if (new_result) {
2031  // TODO maybe we need the db ancillary instead?
2032  CRef<CBlastAncillaryData> ancillary = (*result)->GetAncillaryData();
2033  ig_result = new CIgBlastResults(query, align, errmsg, ancillary);
2034  CRef<CSearchResults> r(ig_result);
2035  final_results->push_back(r);
2036  } else {
      // iq is not reset between incoming results, so the scan resumes from the
      // previous match.
      // NOTE(review): the scan has no upper bound and the dynamic_cast result
      // is not checked; both rely on final_results holding a CIgBlastResults
      // for every incoming query id -- confirm with callers.
2037  while( !(*final_results)[iq].GetSeqId()->Match(*query)) ++iq;
2038  ig_result = dynamic_cast<CIgBlastResults *> (&(*final_results)[iq]);
2039  if (!align.Empty()) {
2040  CSeq_align_set::Tdata & ig_list = ig_result->SetSeqAlign()->Set();
2041  CSeq_align_set::Tdata & align_list = align->Set();
2042 
2043  if (gene < 0) {
2044  // Remove duplicate seq_aligns
2045  CSeq_align_set::Tdata::iterator it = align_list.begin();
2046  while (it != align_list.end()) {
2047  if (s_SeqAlignInSet(ig_list, *it)) it = align_list.erase(it);
2048  else ++it;
2049  }
2050  }
2051 
      // Error messages are merged only when alignments were actually added.
2052  if (!align_list.empty()) {
2053  ig_list.insert(ig_list.end(), align_list.begin(), align_list.end());
2054  ig_result->GetErrors().Combine(errmsg);
2055  }
2056  }
2057  }
2058 
2059  switch(gene) {
2060  case 0: ig_result->m_NumActualV = actual_align; break;
2061  case 1: ig_result->m_NumActualD = actual_align; break;
2062  case 2: ig_result->m_NumActualJ = actual_align; break;
2063  case 3: ig_result->m_NumActualC = actual_align; break;
2064  default: break;
2065  }
2066  }
2067 };
2068 
2070  CRef<CSearchResultSet> &final_results)
2071 {
      // Attaches annots[i] to the i-th entry of final_results. When a minimum
      // J length is configured (m_MinJLength > 0) and the annotation has no J
      // hit (m_GeneInfo[4] < 0), all alignments of that result are discarded.
      // (annots is used below; its parameter line is not visible in this
      // listing.)
2072  int iq = 0;
2073  NON_CONST_ITERATE(CSearchResultSet, result, *final_results) {
      // NOTE(review): the dynamic_cast result is used without a null check;
      // this relies on every element being a CIgBlastResults.
2074  CIgBlastResults *ig_result = dynamic_cast<CIgBlastResults *>
2075  (const_cast<CSearchResults *>(&**result));
2076  CIgAnnotation *annot = &*(annots[iq++]);
2077  ig_result->SetIgAnnotation().Reset(annot);
2078  if (annot->m_GeneInfo[4] < 0 && m_IgOptions->m_MinJLength > 0) { //no J
2079  if ((*result)->HasAlignments()){
2080  (*result)->SetSeqAlign()->Set().clear();
2081  }
2082  }
2083  }
2084 };
2085 
2086 END_SCOPE(blast)
2088 
2089 /* @} */
#define static
Declares the CBl2Seq (BLAST 2 Sequences) class.
@ eSequenceComparison
Seq-aligns in the BLAST 2 Sequence style (one alignment per query-subject pair)
@ eBlastn
Nucl-Nucl (traditional blastn)
Definition: blast_types.hpp:58
@ eBlastp
Protein-Protein.
Definition: blast_types.hpp:59
ncbi::TMaskedQueryRegions mask
TSignedSeqPos GetStart(TNumrow row, TNumseg seg, int offset=0) const
Definition: alnmap.hpp:614
TSignedSeqPos GetAlnPosFromSeqPos(TNumrow row, TSeqPos seq_pos, ESearchDirection dir=eNone, bool try_reverse_dir=true) const
Definition: alnmap.cpp:527
TNumseg GetSeg(TSeqPos aln_pos) const
Definition: alnmap.cpp:373
CDense_seg::TNumseg TNumseg
Definition: alnmap.hpp:72
TSignedSeqPos GetSeqPosFromSeqPos(TNumrow for_row, TNumrow row, TSeqPos seq_pos, ESearchDirection dir=eNone, bool try_reverse_dir=true) const
Definition: alnmap.cpp:688
CBioseq_Handle –.
Runs the BLAST algorithm between 2 sequences.
Definition: bl2seq.hpp:58
Defines BLAST error codes (user errors included)
Creates BlastOptionsHandle objects with default values for the programs/tasks requested.
Encapsulates ALL the BLAST algorithm's options.
size_type Size() const
Returns the number of queries found in this query vector.
Definition: sseqloc.hpp:305
void SetMaskedRegions(size_type i, TMaskedQueryRegions mqr)
Assign a list of masked regions to one query.
Definition: sseqloc.hpp:350
void AddMask(size_type i, CRef< CSeqLocInfo > sli)
Add a masked region to the set for a query.
Definition: sseqloc.hpp:359
CRef< CBlastSearchQuery > GetBlastSearchQuery(size_type i) const
Get the CBlastSearchQuery object at index i.
Definition: sseqloc.hpp:367
CFile –.
Definition: ncbifile.hpp:1605
Class to perform a BLAST search on local BLAST databases Note that PHI-BLAST can be run using this cl...
Definition: local_blast.hpp:62
NCBI C++ Object Manager dependent implementation of IQueryFactory.
API for Remote Blast Requests.
CScope –.
Definition: scope.hpp:92
Search Results for All Queries.
Search Results for One Query.
CSeqDB.
Definition: seqdb.hpp:161
ESeqType
Sequence types (eUnknown tries protein, then nucleotide).
Definition: seqdb.hpp:173
@ eNucleotide
Definition: seqdb.hpp:175
@ eProtein
Definition: seqdb.hpp:174
CRef< CBioseq > SeqidToBioseq(const CSeq_id &seqid) const
Get a CBioseq for a given Seq-id.
Definition: seqdb.cpp:1021
structure for seqloc info
Definition: seqlocinfo.hpp:48
bool IsEmpty() const
TSeqPos GetSeqStop(TDim row) const
Definition: Seq_align.cpp:273
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
bool GetNamedScore(const string &id, int &score) const
Get score.
Definition: Seq_align.cpp:563
TSeqPos GetSeqStart(TDim row) const
Definition: Seq_align.cpp:252
ENa_strand GetSeqStrand(TDim row) const
Get strand (the first one if segments have different strands).
Definition: Seq_align.cpp:294
TSeqPos GetAlignLength(bool include_gaps=true) const
Get the length of this alignment.
Definition: Seq_align.cpp:1993
@ eRight
Towards higher aln coord (always to the right)
@ eBackwards
Towards lower seq coord (to the left if plus strand, right if minus)
@ eForward
Towards higher seq coord (to the right if plus strand, left if minus)
Collection of masked regions for a single query sequence.
Definition: seqlocinfo.hpp:113
Class for the messages for an individual query sequence.
Constants used in compositional score matrix adjustment.
@ eNoCompositionBasedStats
Don't use composition based statistics.
bool m_ExtendAlign5end
Definition: igblast.hpp:85
void x_AnnotateDomain(CRef< CSearchResultSet > &gl_results, CRef< CSearchResultSet > &dm_results, vector< CRef< CIgAnnotation > > &annot)
Annotate the query chaintype and domains based on blast results.
Definition: igblast.cpp:1645
static bool s_DJNotCompatible(const CSeq_align &d, const CSeq_align &j, bool ms, int margin)
Definition: igblast.cpp:911
bool m_IsProtein
Definition: igblast.hpp:66
string GetDatabaseName() const
Returns the database name if appropriate, else kEmptyStr for subject sequences.
void SetCompositionBasedStats(ECompoAdjustModes mode)
void SetEvalueThreshold(double eval)
Sets EvalueThreshold.
CRef< IQueryFactory > m_Subject
Definition: igblast.hpp:301
virtual void SetNumberOfThreads(size_t nthreads)
Mutator for the number of threads.
double GetEvalueThreshold() const
int m_DomainInfo_S[10]
Definition: igblast.hpp:108
CRef< CSearchResultSet > Run()
Run the Ig-BLAST engine.
Definition: igblast.cpp:356
static bool s_CompareSeqAlignByScore(const CRef< CSeq_align > &x, const CRef< CSeq_align > &y)
Definition: igblast.cpp:896
string m_ChainTypeToShow
Definition: igblast.hpp:99
CRef< CSeq_align_set > & SetSeqAlign()
Definition: igblast.hpp:214
CIgAnnotationInfo(CConstRef< CIgBlastOptions > &ig_options)
Definition: igblast.cpp:83
int GetJDomain(const string &sid)
Definition: igblast.hpp:163
void x_AnnotateC(CRef< CSearchResultSet > &results_c, CRef< CSearchResultSet > &results_j, vector< CRef< CIgAnnotation > > &annot)
Definition: igblast.cpp:1500
static int max_allowed_VD_distance
Definition: igblast.cpp:56
CRef< CLocalDbAdapter > m_LocalDb
Definition: igblast.hpp:302
bool GetDomainInfo(const string sid, int *domain_info)
Definition: igblast.hpp:137
string m_EntrezQuery
Definition: igblast.hpp:307
void SetGapOpeningCost(int g)
static int max_J_length
Definition: igblast.cpp:60
static void s_ReadLinesFromFile(const string &fn, vector< string > &lines)
Definition: igblast.cpp:65
CConstRef< objects::CSeq_align_set > GetSeqAlign() const
Accessor for the Seq-align results.
vector< string > m_ChainType
Definition: igblast.hpp:98
static void s_SortResultsByEvalue(CRef< CSearchResultSet > &results)
Sort blast results according to evalue.
Definition: igblast.cpp:1935
const string & GetRID(void)
Gets the request id (RID) associated with the search.
static int extend_length3end
Definition: igblast.cpp:59
void x_AnnotateDJ(CRef< CSearchResultSet > &results_D, CRef< CSearchResultSet > &results_J, vector< CRef< CIgAnnotation > > &annot)
Annotate the D and J genes based on blast results.
Definition: igblast.cpp:1595
static bool s_SeqAlignInSet(CSeq_align_set::Tdata &align_list, CRef< CSeq_align > &align)
Definition: igblast.cpp:858
CRef< CLocalDbAdapter > m_Db[5]
Definition: igblast.hpp:78
static int max_allowed_VJ_distance_with_D
Definition: igblast.cpp:54
int m_FrameInfo[3]
Definition: igblast.hpp:103
CRef< CSearchResultSet > Run()
Executes the search.
void x_FindDJAln(CRef< CSeq_align_set > &align_D, CRef< CSeq_align_set > &align_J, string q_ct, bool q_ms, ENa_strand q_st, int q_ve, int iq, bool va_or_vd_as_heavy_chain)
Definition: igblast.cpp:972
void x_ProcessCResult(CRef< CSearchResultSet > &results_C, vector< CRef< CIgAnnotation > > &annots)
Definition: igblast.cpp:1425
void x_SetAnnotation(vector< CRef< CIgAnnotation > > &annot, CRef< CSearchResultSet > &final_results)
Append annotation info to the final results.
Definition: igblast.cpp:2069
void x_SetupNoOverlapDSearch(const vector< CRef< CIgAnnotation > > &annots, CRef< CSearchResultSet > &results, CRef< IQueryFactory > &qf, CRef< CBlastOptionsHandle > &opts_hndl, int db_type)
Definition: igblast.cpp:692
int m_GeneInfo[8]
Definition: igblast.hpp:102
CBlastOptions & SetOptions()
Returns a reference to the internal options class which this object is a handle for.
map< string, int > m_Fwr4EndOffset
Definition: igblast.hpp:193
void SetGapExtensionCost(int e)
map< string, int > m_DomainIndex
Definition: igblast.hpp:187
bool IsBlastDb() const
Returns true if this object represents a BLAST database.
static int max_allowed_V_end_to_J_end
Definition: igblast.cpp:61
int m_DomainInfo[12]
Definition: igblast.hpp:105
CRef< CIgAnnotation > & SetIgAnnotation()
Definition: igblast.hpp:210
void x_ScreenByAlignLength(CRef< CSearchResultSet > &results, int length)
Definition: igblast.cpp:173
int GetFrameOffset(const string sid)
Definition: igblast.hpp:155
int m_NumAlign[4]
Definition: igblast.hpp:82
void x_SetupDJSearch(const vector< CRef< CIgAnnotation > > &annots, CRef< IQueryFactory > &qf, CRef< CBlastOptionsHandle > &opts_hndl, int db_type)
Prepare blast option handle and query for D, J germline database search.
Definition: igblast.cpp:622
void SetMismatchPenalty(int p)
vector< string > m_TopGeneIds
Definition: igblast.hpp:97
bool m_DetectOverlap
Definition: igblast.hpp:89
CRef< CBlastAncillaryData > GetAncillaryData() const
Accessor for the query's search ancillary.
static string s_RemoveLocalPrefix(const string &sid)
Definition: igblast.cpp:809
bool m_IsLocal
Definition: igblast.hpp:298
void SetHitlistSize(int s)
Sets HitlistSize.
CRef< objects::CSeq_align_set > SetSeqAlign()
Only intended to be used if you need to edit the seqlign.
CConstRef< CIgBlastOptions > m_IgOptions
Definition: igblast.hpp:305
void x_SetupCRegionSearch(const vector< CRef< CIgAnnotation > > &annots, CRef< IQueryFactory > &qf, CRef< CBlastOptionsHandle > &opts_hndl)
Definition: igblast.cpp:575
CRef< CBlastOptionsHandle > m_Options
Definition: igblast.hpp:304
string m_RID
Definition: igblast.hpp:309
vector< int > m_DomainData
Definition: igblast.hpp:188
static void s_AppendResults(CRef< CSearchResultSet > &results, int num_aligns, int gene, CRef< CSearchResultSet > &final_results)
Append blast results to the final results.
Definition: igblast.cpp:1992
const string GetDomainChainType(const string sid)
Definition: igblast.hpp:148
void SetWordSize(int ws)
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
bool Submit(void)
This submits the search (if necessary) and returns immediately.
void x_SetupDbSearch(vector< CRef< CIgAnnotation > > &annot, CRef< IQueryFactory > &qf)
Prepare blast option handle and query for specified database search.
Definition: igblast.cpp:754
CRef< CSearchResultSet > RunEx()
Performs the same functionality as Run(), but it returns a different data type.
Definition: bl2seq.cpp:196
string m_Origin
Definition: igblast.hpp:67
map< string, int > m_FrameOffset
Definition: igblast.hpp:190
map< string, string > m_DJChainType
Definition: igblast.hpp:191
size_t m_NumThreads
Definition: igblast.hpp:299
void x_AnnotateD(CRef< CSearchResultSet > &results_D, vector< CRef< CIgAnnotation > > &annot)
Definition: igblast.cpp:1473
void x_FillJDomain(CRef< CSeq_align > &align, CRef< CIgAnnotation > &annot)
Definition: igblast.cpp:1238
static bool s_CompareSeqAlignByScoreAndName(const CRef< CSeq_align > &x, const CRef< CSeq_align > &y)
Definition: igblast.cpp:530
static int max_v_j_overlap
Definition: igblast.cpp:62
void x_ProcessDJResult(CRef< CSearchResultSet > &results_V, CRef< CSearchResultSet > &results_D, CRef< CSearchResultSet > &results_J, vector< CRef< CIgAnnotation > > &annots)
Definition: igblast.cpp:1454
const string GetDJChainType(const string sid)
Definition: igblast.hpp:179
string m_AuxFilename
Definition: igblast.hpp:74
void x_ExtendAlign3end(CRef< CSearchResultSet > &results)
Definition: igblast.cpp:275
static int j_wordsize
Definition: igblast.cpp:63
int m_JDomain[5]
Definition: igblast.hpp:111
CIgAnnotationInfo m_AnnotationInfo
Definition: igblast.hpp:306
static int extend_length5end
Definition: igblast.cpp:58
int GetFwr4EndOffset(const string &sid)
Definition: igblast.hpp:171
static int max_allowed_j_deletion
Definition: igblast.cpp:57
CRef< CScope > m_Scope
Definition: igblast.hpp:308
string m_IgDataPath
Definition: igblast.hpp:77
string m_DFrameFileName
Definition: igblast.hpp:75
void push_back(value_type &element)
Add a value to the back of this container.
CRef< CSearchResultSet > GetResultSet()
Submit the search (if necessary) and return the results.
void SetFilterString(const char *f, bool clear=true)
Sets FilterString.
map< string, int > m_JDomainInfo
Definition: igblast.hpp:192
TQueryMessages GetErrors(int min_severity=eBlastSevError) const
Accessor for the error/warning messages for this query.
void x_FindDJ(CRef< CSearchResultSet > &results_D, CRef< CSearchResultSet > &results_J, CRef< CIgAnnotation > &annot, CRef< CSeq_align_set > &align_D, CRef< CSeq_align_set > &align_J, string q_ct, bool q_ms, ENa_strand q_st, int q_ve, int iq)
Definition: igblast.cpp:1169
CConstRef< objects::CSeq_id > GetSeqId() const
Accessor for the query's sequence identifier.
void Combine(const TQueryMessages &other)
Combine other messages with these.
Definition: blast_aux.cpp:978
void x_AnnotateJ(CRef< CSearchResultSet > &results_J, vector< CRef< CIgAnnotation > > &annot)
Definition: igblast.cpp:1561
CRef< CBlastQueryVector > m_Query
Definition: igblast.hpp:300
static int max_allowed_VJ_distance_without_D
Definition: igblast.cpp:55
bool m_MinusStrand
Definition: igblast.hpp:96
bool m_ExtendAlign3end
Definition: igblast.hpp:86
void x_ExtendAlign5end(CRef< CSearchResultSet > &results)
Definition: igblast.cpp:190
void x_AnnotateV(CRef< CSearchResultSet > &results, vector< CRef< CIgAnnotation > > &annot)
Annotate the V gene based on blast results.
Definition: igblast.cpp:839
void x_ProcessDGeneResult(CRef< CSearchResultSet > &results_V, CRef< CSearchResultSet > &results_D, CRef< CSearchResultSet > &results_J, vector< CRef< CIgAnnotation > > &annots)
Definition: igblast.cpp:1319
static bool s_CompareSeqAlignByEvalue(const CRef< CSeq_align > &x, const CRef< CSeq_align > &y)
Definition: igblast.cpp:869
void x_SetChainType(CRef< CSearchResultSet > &results, vector< CRef< CIgAnnotation > > &annot)
Set the subject chain type and frame info.
Definition: igblast.cpp:1904
static string s_MakeTopHitsId(const CSeq_align_set::Tdata &align_list, int num_align)
Definition: igblast.cpp:815
string m_CustomInternalData
Definition: igblast.hpp:76
map< string, string > m_DomainChainType
Definition: igblast.hpp:189
CRef< CSearchDatabase > m_RemoteDb
Definition: igblast.hpp:303
void x_SetupVSearch(CRef< IQueryFactory > &qf, CRef< CBlastOptionsHandle > &opts_hndl)
Prepare blast option handle and query for V germline database search.
Definition: igblast.cpp:553
static bool s_IsSeqAlignAsGood(const CRef< CSeq_align > &x, const CRef< CSeq_align > &y)
Definition: igblast.cpp:791
string m_DomainSystem
Definition: igblast.hpp:68
int GetWordSize() const
void x_ConvertResultType(CRef< CSearchResultSet > &results)
Convert bl2seq result to database search mode.
Definition: igblast.cpp:1948
void SetEntrezQuery(const char *x)
Restrict search to sequences matching this Entrez query.
void SetMatchReward(int r)
bool HasAlignments() const
Return true if there are any alignments for this query.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
CDiagContext_Extra & Print(const string &name, const string &value)
The method does not print the argument, but adds it to the string.
Definition: ncbidiag.cpp:2622
CDiagContext & GetDiagContext(void)
Get diag context instance.
Definition: logging.cpp:818
CDiagContext_Extra Extra(void) const
Create a temporary CDiagContext_Extra object.
Definition: ncbidiag.hpp:2095
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2040
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1065
@ eContent
Untagged human-readable accession or the like.
Definition: Seq_id.hpp:605
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
void RemoveBioseq(const CBioseq_Handle &seq)
Revoke Bioseq previously added using AddBioseq().
Definition: scope.cpp:382
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TSeqPos GetBioseqLength(void) const
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
Definition: ncbiobj.hpp:1385
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define USING_SCOPE(ns)
Use the specified namespace.
Definition: ncbistl.hpp:78
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
#define NcbiEmptyString
Definition: ncbistr.hpp:122
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2510
const TDenseg & GetDenseg(void) const
Get the variant data.
Definition: Seq_align_.cpp:153
Tdata & Set(void)
Assign a value to data member.
list< CRef< CSeq_align > > Tdata
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
Declares CIgBlast, the C++ API for the IG-BLAST engine.
int i
int len
Main class to perform a BLAST search on the local machine.
constexpr bool empty(list< Ts... >) noexcept
#define abs(a)
Definition: ncbi_heapmgr.c:130
T max(T x_, T y_)
T min(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
The Object manager core.
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
#define count
static int * results[]
Declares the CRemoteBlast class.
static bool GetSeqId(const T &d, set< string > &labels, const string name="", bool detect=false, bool found=false)
string SeqDB_ResolveDbPath(const string &filename)
Resolve a file path using SeqDB's path algorithms.
static SLJIT_INLINE sljit_ins ms(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
static string subject
static string query
#define _ASSERT
else result
Definition: token2.c:20
#define const
Definition: zconf.h:232
Modified on Wed Sep 04 15:02:01 2024 by modify_doxy.py rev. 669887