NCBI C++ ToolKit
cn3d_threader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cn3d_threader.cpp 97253 2022-06-29 17:35:29Z dzhang $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Paul Thiessen
27 *
28 * File Description:
29 * class to isolate and run the threader
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbistd.hpp> // must come first to avoid NCBI type clashes
36 #include <corelib/ncbistl.hpp>
37 
38 #include <memory>
39 
41 
44 
46 
48 #include "cn3d_threader.hpp"
49 #include "sequence_set.hpp"
50 #include "molecule.hpp"
51 #include "structure_set.hpp"
52 #include "residue.hpp"
53 #include "coord_set.hpp"
54 #include "atom_set.hpp"
55 #include "cn3d_tools.hpp"
56 #include "molecule_identifier.hpp"
57 #include "sequence_viewer.hpp"
58 #include "cn3d_pssm.hpp"
59 
62 
63 
64 BEGIN_SCOPE(Cn3D)
65 
66 // define to include debugging output (threader structures to files)
67 //#define DEBUG_THREADER
68 
69 // always do debugging in Debug build mode
70 #if (!defined(DEBUG_THREADER) && defined(_DEBUG))
71 #define DEBUG_THREADER
72 #endif
73 
74 // default threading options
76  mergeAfterEachSequence(true),
77  freezeIsolatedBlocks(true),
78  weightPSSM(0.5),
79  loopLengthMultiplier(1.5),
80  nRandomStarts(1),
81  nResultAlignments(1),
82  terminalResidueCutoff(-1)
83 {
84 }
85 
87 
88 // gives threader residue number for a character (-1 if non-standard aa)
90 {
91  typedef map < char, int > Char2Int;
92  static Char2Int charMap;
93 
94  if (charMap.size() == 0) {
95  for (unsigned int i=0; i<Threader::ThreaderResidues.size(); ++i)
96  charMap[Threader::ThreaderResidues[i]] = i;
97  }
98 
99  Char2Int::const_iterator c = charMap.find((unsigned char) toupper(r));
100  return ((c != charMap.end()) ? c->second : -1);
101 }
102 
103 const unsigned int Threader::SCALING_FACTOR = 1000000;
104 
105 const string Threader::ThreaderResidues = "ARNDCQEGHILKMFPSTWYV";
106 
107 // gives NCBIStdaa residue number for a threader residue number (or # for 'X' if char == -1)
109 {
110  r = toupper((unsigned char) r);
112  ((int)r >= 0 && r < (int)Threader::ThreaderResidues.size()) ? Threader::ThreaderResidues[r] : 'X');
113 }
114 
116  alignmentManager(parentAlnMgr)
117 {
118 }
119 
121 {
122  ContactMap::iterator c, ce = contacts.end();
123  for (c=contacts.begin(); c!=ce; ++c) FreeFldMtf(c->second);
124 }
125 
126 Seq_Mtf * Threader::CreateSeqMtf(const BlockMultipleAlignment *multiple, double weightPSSM)
127 {
128  // special case for "PSSM" of single-row "alignment" - just use BLOSUM62 score
129  if (multiple->NRows() == 1) {
130  Seq_Mtf *seqMtf = NewSeqMtf(multiple->GetMaster()->Length(), ThreaderResidues.size());
131  for (unsigned int res=0; res<multiple->GetMaster()->Length(); ++res)
132  for (unsigned int aa=0; aa<ThreaderResidues.size(); ++aa)
133  seqMtf->ww[res][aa] = ThrdRound(
134  weightPSSM * SCALING_FACTOR *
136  TRACEMSG("Created Seq_Mtf (PSSM) from BLOSUM62 scores");
137  return seqMtf;
138  }
139 
140  // can't calculate PSSM with no blocks
141  if (multiple->HasNoAlignedBlocks()) {
142  ERRORMSG("Can't create Seq_Mtf with no aligned blocks");
143  return NULL;
144  }
145 
146  // convert matrix
147  TRACEMSG("converting PSSM scores to Seq_Mtf");
148  Seq_Mtf *seqMtf = NewSeqMtf(multiple->GetMaster()->Length(), ThreaderResidues.size());
149  for (unsigned int c=0; c<multiple->GetMaster()->Length(); ++c)
150  for (unsigned int r=0; r<ThreaderResidues.size(); ++r)
151  seqMtf->ww[c][r] = ThrdRound(weightPSSM * SCALING_FACTOR *
152  multiple->GetPSSM().GetPSSMScore(LookupNCBIStdaaNumberFromCharacter(ThreaderResidues[r]), c));
153 
154  return seqMtf;
155 }
156 
157 Cor_Def * Threader::CreateCorDef(const BlockMultipleAlignment *multiple, double loopLengthMultiplier)
158 {
159  static const unsigned int MIN_LOOP_MAX = 2;
160  static const unsigned int EXTENSION_MAX = 10;
161 
163  multiple->GetUngappedAlignedBlocks(&alignedBlocks);
164  Cor_Def *corDef = NewCorDef(alignedBlocks.size());
165 
166  // zero loop constraints for tails
167  corDef->lll.llmn[0] = corDef->lll.llmn[alignedBlocks.size()] =
168  corDef->lll.llmx[0] = corDef->lll.llmx[alignedBlocks.size()] =
169  corDef->lll.lrfs[0] = corDef->lll.lrfs[alignedBlocks.size()] = 0;
170 
171  // loop constraints for unaligned regions between aligned blocks
172  BlockMultipleAlignment::UngappedAlignedBlockList::const_iterator
173  b = alignedBlocks.begin(), be = alignedBlocks.end();
174  unsigned int n, max;
175  for (n=1, ++b; b!=be; ++n, ++b) {
176  const UnalignedBlock *uaBlock = multiple->GetUnalignedBlockBefore(*b);
177  if (uaBlock) {
178  max = (unsigned int) (loopLengthMultiplier * uaBlock->width);
179  if (max < MIN_LOOP_MAX) max = MIN_LOOP_MAX;
180  } else
181  max = MIN_LOOP_MAX;
182  corDef->lll.llmn[n] = 0;
183  corDef->lll.llmx[n] = max;
184  corDef->lll.lrfs[n] = max;
185  }
186 
187  // minimum block sizes (in coordinates of master)
188  const Block::Range *range;
189  int mid;
190  for (n=0, b=alignedBlocks.begin(); b!=be; ++b, ++n) {
191  range = (*b)->GetRangeOfRow(0);
192  mid = (range->from + range->to) / 2;
193  corDef->sll.rfpt[n] = mid;
194  corDef->sll.nomn[n] = mid - range->from;
195  corDef->sll.comn[n] = range->to - mid;
196  }
197 
198  // left extension - trim to available residues
199  corDef->sll.nomx[0] = corDef->sll.nomn[0] + EXTENSION_MAX;
200  if (corDef->sll.rfpt[0] - corDef->sll.nomx[0] < 0)
201  corDef->sll.nomx[0] = corDef->sll.rfpt[0];
202 
203  // right extension - trim to available residues
204  corDef->sll.comx[alignedBlocks.size() - 1] = corDef->sll.comn[alignedBlocks.size() - 1] + EXTENSION_MAX;
205  if (corDef->sll.rfpt[alignedBlocks.size() - 1] + corDef->sll.comx[alignedBlocks.size() - 1] >=
206  (int)multiple->GetMaster()->Length())
207  corDef->sll.comx[alignedBlocks.size() - 1] =
208  multiple->GetMaster()->Length() - corDef->sll.rfpt[alignedBlocks.size() - 1] - 1;
209 
210  // extensions into unaligned areas between blocks
211  const Block::Range *prevRange = NULL;
212  unsigned int nUnaligned, extN;
213  for (n=0, b=alignedBlocks.begin(); b!=be; ++b, ++n) {
214  range = (*b)->GetRangeOfRow(0);
215  if (n > 0) {
216  nUnaligned = range->from - prevRange->to - 1;
217  extN = nUnaligned / 2; // N extension of right block gets smaller portion if nUnaligned is odd
218  corDef->sll.nomx[n] = corDef->sll.nomn[n] + extN;
219  corDef->sll.comx[n - 1] = corDef->sll.comn[n - 1] + nUnaligned - extN;
220  }
221  prevRange = range;
222  }
223 
224  // no fixed segments
225  corDef->fll.n = 0;
226 
227  return corDef;
228 }
229 
231  const BlockMultipleAlignment *pairwise, int terminalCutoff)
232 {
233  const Sequence *dependentSeq = pairwise->GetSequenceOfRow(1);
234  BlockMultipleAlignment::UngappedAlignedBlockList multipleABlocks, pairwiseABlocks;
235  multiple->GetUngappedAlignedBlocks(&multipleABlocks);
236  pairwise->GetUngappedAlignedBlocks(&pairwiseABlocks);
237 
238  // query has # constraints = # blocks in multiple alignment
239  Qry_Seq *qrySeq = NewQrySeq(dependentSeq->Length(), multipleABlocks.size());
240 
241  // fill in residue numbers
242  unsigned int i;
243  for (i=0; i<dependentSeq->Length(); ++i)
245 
246  // if a block in the multiple is contained in the pairwise (looking at master coords),
247  // then add a constraint to keep it there
248  BlockMultipleAlignment::UngappedAlignedBlockList::const_iterator
249  m, me = multipleABlocks.end(), p, pe = pairwiseABlocks.end();
250  const Block::Range *multipleRange, *pairwiseRange;
251  for (i=0, m=multipleABlocks.begin(); m!=me; ++i, ++m) {
252  multipleRange = (*m)->GetRangeOfRow(0);
253  for (p=pairwiseABlocks.begin(); p!=pe; ++p) {
254  pairwiseRange = (*p)->GetRangeOfRow(0);
255  if (pairwiseRange->from <= multipleRange->from && pairwiseRange->to >= multipleRange->to) {
256  int masterCenter = (multipleRange->from + multipleRange->to) / 2;
257  // offset of master residue at center of multiple block
258  int offset = masterCenter - pairwiseRange->from;
259  pairwiseRange = (*p)->GetRangeOfRow(1);
260  // dependent residue in pairwise aligned to master residue at center of multiple block
261  qrySeq->sac.mn[i] = qrySeq->sac.mx[i] = pairwiseRange->from + offset;
262  break;
263  }
264  }
265  }
266 
267  // if a terminal block is unconstrained (mn,mx == -1), set limits for how far the new
268  // (religned) block is allowed to be from the edge of the next block or from the
269  // aligned region set upon demotion
270  if (terminalCutoff >= 0) {
271  if (qrySeq->sac.mn[0] == -1) {
272  if (pairwise->alignDependentFrom >= 0) {
273  qrySeq->sac.mn[0] = pairwise->alignDependentFrom - terminalCutoff;
274  } else if (pairwiseABlocks.size() > 0) {
275  const Block::Range *nextQryBlock = pairwiseABlocks.front()->GetRangeOfRow(1);
276  qrySeq->sac.mn[0] = nextQryBlock->from - 1 - terminalCutoff;
277  }
278  if (qrySeq->sac.mn[0] < 0) qrySeq->sac.mn[0] = 0;
279  INFOMSG("new N-terminal block constrained to query loc >= " << qrySeq->sac.mn[0] + 1);
280  }
281  if (qrySeq->sac.mx[multipleABlocks.size() - 1] == -1) {
282  if (pairwise->alignDependentTo >= 0) {
283  qrySeq->sac.mx[multipleABlocks.size() - 1] = pairwise->alignDependentTo + terminalCutoff;
284  } else if (pairwiseABlocks.size() > 0) {
285  const Block::Range *prevQryBlock = pairwiseABlocks.back()->GetRangeOfRow(1);
286  qrySeq->sac.mx[multipleABlocks.size() - 1] = prevQryBlock->to + 1 + terminalCutoff;
287  }
288  if (qrySeq->sac.mx[multipleABlocks.size() - 1] >= qrySeq->n ||
289  qrySeq->sac.mx[multipleABlocks.size() - 1] < 0)
290  qrySeq->sac.mx[multipleABlocks.size() - 1] = qrySeq->n - 1;
291  INFOMSG("new C-terminal block constrained to query loc <= "
292  << qrySeq->sac.mx[multipleABlocks.size() - 1] + 1);
293  }
294  }
295 
296  return qrySeq;
297 }
298 
299 /*----------------------------------------------------------------------------
300  * stuff to read in the contact potential. (code swiped from DDV)
301  *---------------------------------------------------------------------------*/
// Returns the number of words in the NUL-terminated string `chs`, where a word
// is a maximal run of characters other than the space character (' ').
// Only ' ' acts as a separator; tabs and other whitespace count as word characters.
static unsigned int CountWords(char* chs) {
    unsigned int nWords = 0;
    bool prevWasSeparator = true;   // start of string behaves as if a separator preceded it
    for (const char *p = chs; *p != '\0'; ++p) {
        const bool isSeparator = (*p == ' ');
        // a word begins at every separator -> non-separator transition
        if (prevWasSeparator && !isSeparator)
            ++nWords;
        prevWasSeparator = isSeparator;
    }
    return nWords;
}
317 static void ReadToRowOfEnergies(ifstream& InFile, unsigned int NumResTypes) {
318  char Str[1024];
319  while (!InFile.eof()) {
320  InFile.getline(Str, sizeof(Str));
321  if (CountWords(Str) == NumResTypes) {
322  break;
323  }
324  }
325 }
326 
327 static const unsigned int NUM_RES_TYPES = 21;
328 
329 Rcx_Ptl * Threader::CreateRcxPtl(double weightContacts)
330 {
331  Rcx_Ptl* pmf;
332  const char *FileName = "ContactPotential";
333  char ResName[32];
334  char Path[512];
335  unsigned int i, j, k;
336  double temp;
337 
338  static const unsigned int kNumDistances = 6;
339  static const unsigned int kPeptideIndex = 20;
340 
341  /* open the contact potential for reading */
342  unique_ptr<CNcbiIfstream> InFile(new CNcbiIfstream((GetDataDir()+FileName).c_str(), IOS_BASE::in));
343  if (!(*InFile)) {
344  ERRORMSG("Threader::CreateRcxPtl() - can't open " << Path << " for reading");
345  return NULL;
346  }
347 
348  pmf = NewRcxPtl(NUM_RES_TYPES, kNumDistances, kPeptideIndex);
349 
350  /* read in the contact potential */
351  for (i=0; i<kNumDistances; ++i) {
353  if (InFile->eof()) goto error;
354  for (j=0; j<NUM_RES_TYPES; ++j) {
355  InFile->getline(ResName, sizeof(ResName), ' '); /* skip residue name */
356  if (InFile->eof()) goto error;
357  for (k=0; k<NUM_RES_TYPES; ++k) {
358  *InFile >> temp;
359  if (InFile->eof()) goto error;
360  pmf->rre[i][j][k] = ThrdRound(temp*SCALING_FACTOR*weightContacts);
361  }
362  }
363  }
364 
365  /* read in the hydrophobic energies */
366  ReadToRowOfEnergies(*InFile, kNumDistances);
367  for (i=0; i<NUM_RES_TYPES; ++i) {
368  InFile->getline(ResName, sizeof(ResName), ' '); /* skip residue name */
369  if (InFile->eof()) goto error;
370  for (j=0; j<kNumDistances; ++j) {
371  *InFile >> temp;
372  if (InFile->eof()) goto error;
373  pmf->re[j][i] = ThrdRound(temp*SCALING_FACTOR*weightContacts);
374  }
375  }
376 
377  /* calculate sum of pair energies plus hydrophobic energies */
378  for(i=0; i<kNumDistances; ++i) {
379  for(j=0; j<NUM_RES_TYPES; ++j) {
380  for(k=0; k<NUM_RES_TYPES; ++k) {
381  pmf->rrt[i][j][k] = pmf->rre[i][j][k] + pmf->re[i][j] + pmf->re[i][k];
382  }
383  }
384  }
385 
386  return(pmf);
387 
388 error:
389  ERRORMSG("Threader::CreateRcxPtl() - error parsing " << FileName);
390  FreeRcxPtl(pmf);
391  return NULL;
392 }
393 
394 /*----------------------------------------------------------------------------
395  * set up the annealing parameters. (more code swiped from DDV)
396  * hard-coded for now. later we can move these parameters to a file.
397  *---------------------------------------------------------------------------*/
398 Gib_Scd * Threader::CreateGibScd(bool fast, unsigned int nRandomStarts)
399 {
400  Gib_Scd* gsp;
401  unsigned int NumTrajectoryPoints;
402 
403  static const unsigned int kNumTempSteps = 3;
404 
405  gsp = NewGibScd(kNumTempSteps);
406 
407  gsp->nrs = nRandomStarts; /* Number of random starts */
408  gsp->nts = kNumTempSteps; /* Number of temperature steps */
409  gsp->crs = 50; /* Number of starts before convergence test */
410  gsp->cfm = 20; /* Top thread frequency convergence criterion */
411  gsp->csm = 5; /* Top thread start convergence criterion */
412  gsp->cet = 5 * SCALING_FACTOR;/* Temperature for convergence test ensemble */
413  gsp->cef = 50; /* Percent of ensemble defining top threads */
414  gsp->isl = 1; /* Code for choice of starting locations */
415  gsp->iso = 0; /* Code for choice of segment sample order */
416  gsp->ito = 0; /* Code for choice of terminus sample order */
417  gsp->rsd = -1; /* Seed for random number generator -- neg for Rand01() */
418  gsp->als = 0; /* Code for choice of alignment record style */
419  gsp->trg = 0; /* Code for choice of trajectory record */
420 
421  if (fast) {
422  // gsp->nti[0] = 5; /* Number of iterations per tempeature step */
423  // gsp->nti[1] = 10;
424  // gsp->nti[2] = 25;
425  gsp->nti[0] = 10; /* Number of iterations per tempeature step */
426  gsp->nti[1] = 20;
427  gsp->nti[2] = 40;
428  } else {
429  gsp->nti[0] = 10; /* Number of iterations per tempeature step */
430  gsp->nti[1] = 20;
431  gsp->nti[2] = 40;
432  }
433 
434  gsp->nac[0] = 4; /* Number of alignment cycles per iteration */
435  gsp->nac[1] = 4;
436  gsp->nac[2] = 4;
437 
438  gsp->nlc[0] = 2; /* Number of location cycles per iteration */
439  gsp->nlc[1] = 2;
440  gsp->nlc[2] = 2;
441 
442  if (fast) {
443  // gsp->tma[0] = 5 * SCALING_FACTOR; /* Temperature steps for alignment sampling */
444  // gsp->tma[1] = 5 * SCALING_FACTOR;
445  // gsp->tma[2] = 5 * SCALING_FACTOR;
446  gsp->tma[0] = 20 * SCALING_FACTOR; /* Temperature steps for alignment sampling */
447  gsp->tma[1] = 10 * SCALING_FACTOR;
448  gsp->tma[2] = 5 * SCALING_FACTOR;
449  } else {
450  gsp->tma[0] = 20 * SCALING_FACTOR; /* Temperature steps for alignment sampling */
451  gsp->tma[1] = 10 * SCALING_FACTOR;
452  gsp->tma[2] = 5 * SCALING_FACTOR;
453  }
454 
455  gsp->tml[0] = 5 * SCALING_FACTOR; /* Temperature steps for location sampling */
456  gsp->tml[1] = 5 * SCALING_FACTOR;
457  gsp->tml[2] = 5 * SCALING_FACTOR;
458 
459  gsp->lms[0] = 0; /* Iterations before local minimum test */
460  gsp->lms[1] = 10;
461  gsp->lms[2] = 20;
462 
463  gsp->lmw[0] = 0; /* Iterations in local min test interval */
464  gsp->lmw[1] = 10;
465  gsp->lmw[2] = 10;
466 
467  gsp->lmf[0] = 0; /* Percent of top score indicating local min */
468  gsp->lmf[1] = 80;
469  gsp->lmf[2] = 95;
470 
471  if (gsp->als == 0) {
472  NumTrajectoryPoints = 1;
473  } else {
474  NumTrajectoryPoints = 0;
475  for (unsigned int i=0; i<kNumTempSteps; ++i) {
476  NumTrajectoryPoints += gsp->nti[i] * (gsp->nac[i] + gsp->nlc[i]);
477  }
478  NumTrajectoryPoints *= gsp->nrs;
479  NumTrajectoryPoints += gsp->nrs;
480  }
481  gsp->ntp = NumTrajectoryPoints;
482 
483  return(gsp);
484 }
485 
486 #define NO_VIRTUAL_COORDINATE(coord) \
487  do { coord->type = Threader::MISSING_COORDINATE; return; } while (0)
488 
489 static void GetVirtualResidue(const AtomSet *atomSet, const Molecule *mol,
490  const Residue *res, Threader::VirtualCoordinate *coord)
491 {
492  // find coordinates of key atoms
493  const AtomCoord *C = NULL, *CA = NULL, *CB = NULL, *N = NULL;
494  Residue::AtomInfoMap::const_iterator a, ae = res->GetAtomInfos().end();
495  for (a=res->GetAtomInfos().begin(); a!=ae; ++a) {
496  AtomPntr ap(mol->id, res->id, a->first);
497  if (a->second->atomicNumber == 6) {
498  if (a->second->code == " C ")
499  C = atomSet->GetAtom(ap, true, true);
500  else if (a->second->code == " CA ")
501  CA = atomSet->GetAtom(ap, true, true);
502  else if (a->second->code == " CB ")
503  CB = atomSet->GetAtom(ap, true, true);
504  } else if (a->second->atomicNumber == 7 && a->second->code == " N ")
505  N = atomSet->GetAtom(ap, true, true);
506  if (C && CA && CB && N) break;
507  }
508  if (!C || !CA || !N) NO_VIRTUAL_COORDINATE(coord);
509 
510  // find direction of real or idealized C-beta
511  Vector toCB;
512 
513  // if C-beta present, vector is in its direction
514  if (CB) {
515  toCB = CB->site - CA->site;
516  }
517 
518  // ... else need to calculate a C-beta direction (not C-beta position!)
519  else {
520  Vector CaN, CaC, cross, bisect;
521  CaN = N->site - CA->site;
522  CaC = C->site - CA->site;
523  // for a true bisector, these vectors should be normalized! but they aren't in other
524  // versions of the threader (Cn3D/C and S), so the average is used instead...
525 // CaN.normalize();
526 // CaC.normalize();
527  bisect = CaN + CaC;
528  bisect.normalize();
529  cross = vector_cross(CaN, CaC);
530  cross.normalize();
531  toCB = 0.816497 * cross - 0.57735 * bisect;
532  }
533 
534  // virtual C-beta location is 2.4 A away from C-alpha in the C-beta direction
535  toCB.normalize();
536  coord->coord = CA->site + 2.4 * toCB;
538 
539  // is this disulfide-bound?
540  Molecule::DisulfideMap::const_iterator ds = mol->disulfideMap.find(res->id);
541  coord->disulfideWith =
542  (ds == mol->disulfideMap.end()) ? -1 :
543  (ds->second - 1) * 2; // calculate virtualCoordinate index from other residueID
544 }
545 
546 static void GetVirtualPeptide(const AtomSet *atomSet, const Molecule *mol,
547  const Residue *res1, const Residue *res2, Threader::VirtualCoordinate *coord)
548 {
549  if (res1->alphaID == Residue::NO_ALPHA_ID || res2->alphaID == Residue::NO_ALPHA_ID)
550  NO_VIRTUAL_COORDINATE(coord);
551 
552  AtomPntr ap1(mol->id, res1->id, res1->alphaID), ap2(mol->id, res2->id, res2->alphaID);
553  const AtomCoord
554  *atom1 = atomSet->GetAtom(ap1, true, true), // 'true' means just use first alt coord
555  *atom2 = atomSet->GetAtom(ap2, true, true);
556  if (!atom1 || !atom2) NO_VIRTUAL_COORDINATE(coord);
557 
558  coord->coord = (atom1->site + atom2->site) / 2;
560  coord->disulfideWith = -1;
561 }
562 
563 static void GetVirtualCoordinates(const Molecule *mol, const AtomSet *atomSet,
564  Threader::VirtualCoordinateList *virtualCoordinates)
565 {
566  virtualCoordinates->resize(2 * mol->residues.size() - 1);
567  Molecule::ResidueMap::const_iterator r, re = mol->residues.end();
568  const Residue *prevResidue = NULL;
569  unsigned int i = 0;
570  for (r=mol->residues.begin(); r!=re; ++r) {
571  if (prevResidue)
572  GetVirtualPeptide(atomSet, mol,
573  prevResidue, r->second, &((*virtualCoordinates)[i++]));
574  prevResidue = r->second;
575  GetVirtualResidue(atomSet, mol,
576  r->second, &((*virtualCoordinates)[i++]));
577  }
578 }
579 
580 static const unsigned int MAX_DISTANCE_BIN = 5;
581 static unsigned int BinDistance(const Vector& p1, const Vector& p2)
582 {
583  double dist = (p2 - p1).length();
584  unsigned int bin;
585 
586  if (dist > 10.0)
587  bin = MAX_DISTANCE_BIN + 1;
588  else if (dist > 9.0)
589  bin = 5;
590  else if (dist > 8.0)
591  bin = 4;
592  else if (dist > 7.0)
593  bin = 3;
594  else if (dist > 6.0)
595  bin = 2;
596  else if (dist > 5.0)
597  bin = 1;
598  else
599  bin = 0;
600 
601  return bin;
602 }
603 
605  Threader::ContactList *resResContacts, Threader::ContactList *resPepContacts)
606 {
607  unsigned int i, j, bin;
608 
609  // loop i through whole chain, just to report all missing coords
610  for (i=0; i<coords.size(); ++i) {
611  if (coords[i].type == Threader::MISSING_COORDINATE) {
612  WARNINGMSG("Threader::CreateFldMtf() - unable to determine virtual coordinate for "
613  << ((i%2 == 0) ? "sidechain " : "peptide ") << (i/2));
614  continue;
615  }
616 
617  for (j=i+10; j<coords.size(); ++j) { // must be at least 10 virtual bonds away
618 
619  if (coords[j].type == Threader::MISSING_COORDINATE ||
620  // not interested in peptide-peptide contacts
621  (coords[i].type == Threader::VIRTUAL_PEPTIDE &&
622  coords[j].type == Threader::VIRTUAL_PEPTIDE) ||
623  // don't include disulfide-bonded cysteine pairs
624  (coords[i].disulfideWith == (int)j || coords[j].disulfideWith == (int)i)
625  ) continue;
626 
627  bin = BinDistance(coords[i].coord, coords[j].coord);
628  if (bin <= MAX_DISTANCE_BIN) {
629  // add residue-residue contact - res1 is lower-numbered residue
630  if (coords[i].type == Threader::VIRTUAL_RESIDUE &&
631  coords[j].type == Threader::VIRTUAL_RESIDUE) {
632  resResContacts->resize(resResContacts->size() + 1);
633  resResContacts->back().vc1 = i;
634  resResContacts->back().vc2 = j;
635  resResContacts->back().distanceBin = bin;
636  }
637  // add residue-peptide contact
638  else {
639  resPepContacts->resize(resPepContacts->size() + 1);
640  resPepContacts->back().distanceBin = bin;
641  // peptide must go in vc2
642  if (coords[i].type == Threader::VIRTUAL_RESIDUE) {
643  resPepContacts->back().vc1 = i;
644  resPepContacts->back().vc2 = j;
645  } else {
646  resPepContacts->back().vc2 = i;
647  resPepContacts->back().vc1 = j;
648  }
649  }
650  }
651  }
652  }
653 }
654 
655 static void TranslateContacts(const Threader::ContactList& resResContacts,
656  const Threader::ContactList& resPepContacts, Fld_Mtf *fldMtf)
657 {
658  unsigned int i;
659  Threader::ContactList::const_iterator c;
660  for (i=0, c=resResContacts.begin(); i<resResContacts.size(); ++i, ++c) {
661  fldMtf->rrc.r1[i] = c->vc1 / 2; // threader coord points to (res,pep) pair
662  fldMtf->rrc.r2[i] = c->vc2 / 2;
663  fldMtf->rrc.d[i] = c->distanceBin;
664  }
665  for (i=0, c=resPepContacts.begin(); i<resPepContacts.size(); ++i, ++c) {
666  fldMtf->rpc.r1[i] = c->vc1 / 2;
667  fldMtf->rpc.p2[i] = c->vc2 / 2;
668  fldMtf->rpc.d[i] = c->distanceBin;
669  }
670 }
671 
672 // for sorting contacts
673 inline bool operator < (const Threader::Contact& c1, const Threader::Contact& c2)
674 {
675  return (c1.vc1 < c2.vc1 || (c1.vc1 == c2.vc1 && c1.vc2 < c2.vc2));
676 }
677 
678 static void GetMinimumLoopLengths(const Molecule *mol, const AtomSet *atomSet, Fld_Mtf *fldMtf)
679 {
680  unsigned int i, j;
681  const AtomCoord *a1, *a2;
682  Molecule::ResidueMap::const_iterator r1, r2, re = mol->residues.end();
683  for (r1=mol->residues.begin(), i=0; r1!=re; ++r1, ++i) {
684 
685  if (r1->second->alphaID == Residue::NO_ALPHA_ID)
686  a1 = NULL;
687  else {
688  AtomPntr ap1(mol->id, r1->second->id, r1->second->alphaID);
689  a1 = atomSet->GetAtom(ap1, true, true); // 'true' means just use first alt coord
690  }
691 
692  for (r2=r1, j=i; r2!=re; ++r2, ++j) {
693 
694  if (i == j) {
695  fldMtf->mll[i][j] = 0;
696  } else {
697  if (r2->second->alphaID == Residue::NO_ALPHA_ID)
698  a2 = NULL;
699  else {
700  AtomPntr ap2(mol->id, r2->second->id, r2->second->alphaID);
701  a2 = atomSet->GetAtom(ap2, true, true);
702  }
703  fldMtf->mll[i][j] = fldMtf->mll[j][i] =
704  (!a1 || !a2) ? 0 : (int) (((a2->site - a1->site).length() - 2.7) / 3.4);
705  }
706  }
707  }
708 }
709 
710 Fld_Mtf * Threader::CreateFldMtf(const Sequence *masterSequence)
711 {
712  if (!masterSequence) return NULL;
713 
714  const Molecule *mol = masterSequence->molecule;
715 
716  // return cached copy if we've already constructed a Fld_Mtf for this master
717  ContactMap::iterator c = mol ? contacts.find(mol) : contacts.find(masterSequence);
718  if (c != contacts.end()) return c->second;
719 
720  // work-around to allow PSSM-only threading when master has no structure (or only C-alphas)
721  Fld_Mtf *fldMtf;
722  if (!mol || mol->parentSet->isAlphaOnly) {
723  fldMtf = NewFldMtf(masterSequence->Length(), 0, 0);
724  contacts[masterSequence] = fldMtf;
725  return fldMtf;
726  }
727 
728  // for convenience so subroutines don't have to keep looking this up... Use first
729  // CoordSet if multiple model (e.g., NMR)
730  const StructureObject *object;
731  if (!mol->GetParentOfType(&object)) return NULL;
732  const AtomSet *atomSet = object->coordSets.front()->atomSet;
733 
734  // get virtual coordinates for this chain
735  VirtualCoordinateList virtualCoordinates;
736  GetVirtualCoordinates(mol, atomSet, &virtualCoordinates);
737 
738  // check for contacts of virtual coords separated by >= 10 virtual bonds
739  ContactList resResContacts, resPepContacts;
740  GetContacts(virtualCoordinates, &resResContacts, &resPepContacts);
741 
742  // create Fld_Mtf, and store contacts in it
743  fldMtf = NewFldMtf(mol->residues.size(), resResContacts.size(), resPepContacts.size());
744  resPepContacts.sort(); // not really necessary, but makes same order as Cn3D for comparison/testing
745  TranslateContacts(resResContacts, resPepContacts, fldMtf);
746 
747  // fill out min. loop lengths
748  GetMinimumLoopLengths(mol, atomSet, fldMtf);
749 
750  TRACEMSG("created Fld_Mtf for " << mol->identifier->pdbID << " chain '" << mol->identifier->pdbChain << "'");
751 
752  contacts[mol] = fldMtf;
753  return fldMtf;
754 }
755 
756 static BlockMultipleAlignment * CreateAlignmentFromThdTbl(const Thd_Tbl *thdTbl, unsigned int nResult,
757  const Cor_Def *corDef, BlockMultipleAlignment::SequenceList *sequences, AlignmentManager *alignmentManager)
758 {
759  if (corDef->sll.n != thdTbl->nsc || (int)nResult >= thdTbl->n) {
760  ERRORMSG("CreateAlignmentFromThdTbl() - inconsistent Thd_Tbl");
761  return NULL;
762  }
763 
764  BlockMultipleAlignment *newAlignment = new BlockMultipleAlignment(sequences, alignmentManager);
765 
766  // add blocks from threader result
767  for (int block=0; block<corDef->sll.n; ++block) {
768  UngappedAlignedBlock *aBlock = new UngappedAlignedBlock(newAlignment);
769  aBlock->SetRangeOfRow(0,
770  corDef->sll.rfpt[block] - thdTbl->no[block][nResult],
771  corDef->sll.rfpt[block] + thdTbl->co[block][nResult]);
772  aBlock->SetRangeOfRow(1,
773  thdTbl->al[block][nResult] - thdTbl->no[block][nResult],
774  thdTbl->al[block][nResult] + thdTbl->co[block][nResult]);
775  aBlock->width = thdTbl->no[block][nResult] + 1 + thdTbl->co[block][nResult];
776  if (!newAlignment->AddAlignedBlockAtEnd(aBlock)) {
777  ERRORMSG("CreateAlignmentFromThdTbl() - error adding block");
778  delete newAlignment;
779  return NULL;
780  }
781  }
782 
783  // finish alignment
784  if (!newAlignment->AddUnalignedBlocks() || !newAlignment->UpdateBlockMapAndColors()) {
785  ERRORMSG("CreateAlignmentFromThdTbl() - error finishing alignment");
786  delete newAlignment;
787  return NULL;
788  }
789 
790  return newAlignment;
791 }
792 
793 static bool FreezeIsolatedBlocks(Cor_Def *corDef, const Cor_Def *masterCorDef, const Qry_Seq *qrySeq)
794 {
795  if (!corDef || !masterCorDef || !qrySeq ||
796  corDef->sll.n != masterCorDef->sll.n || corDef->sll.n != qrySeq->sac.n) {
797  ERRORMSG("FreezeIsolatedBlocks() - bad parameters");
798  return false;
799  }
800 
801  TRACEMSG("freezing blocks...");
802  for (int i=0; i<corDef->sll.n; ++i) {
803 
804  // default: blocks allowed to grow
805  corDef->sll.nomx[i] = masterCorDef->sll.nomx[i];
806  corDef->sll.comx[i] = masterCorDef->sll.comx[i];
807 
808  // new blocks always allowed to grow
809  if (qrySeq->sac.mn[i] < 0 || qrySeq->sac.mx[i] < 0) continue;
810 
811  // if an existing block is adjacent to any new (to-be-realigned) block, then allow block's
812  // boundaries to grow on that side; otherwise, freeze (isolated) existing block boundaries
813  bool adjacentLeft = (i > 0 && (qrySeq->sac.mn[i - 1] < 0 || qrySeq->sac.mx[i - 1] < 0));
814  bool adjacentRight = (i < corDef->sll.n - 1 &&
815  (qrySeq->sac.mn[i + 1] < 0 || qrySeq->sac.mx[i + 1] < 0));
816 
817  if (!adjacentLeft) {
818  corDef->sll.nomx[i] = corDef->sll.nomn[i];
819 // TESTMSG("block " << i << " fixed N-terminus");
820  }
821  if (!adjacentRight) {
822  corDef->sll.comx[i] = corDef->sll.comn[i];
823 // TESTMSG("block " << i << " fixed C-terminus");
824  }
825  }
826 
827  return true;
828 }
829 
830 bool Threader::Realign(const ThreaderOptions& options, BlockMultipleAlignment *masterMultiple,
831  const AlignmentList *originalAlignments, AlignmentList *newAlignments,
832  unsigned int *nRowsAddedToMultiple, SequenceViewer *sequenceViewer)
833 {
834  *nRowsAddedToMultiple = 0;
835  if (!masterMultiple || !originalAlignments || !newAlignments || originalAlignments->size() == 0)
836  return false;
837 
838  // either calculate no z-scores (0), or calculate z-score for best result (1)
839  static const unsigned int zscs = 0;
840 
841  Seq_Mtf *seqMtf = NULL;
842  Cor_Def *corDef = NULL, *masterCorDef = NULL;
843  Rcx_Ptl *rcxPtl = NULL;
844  Gib_Scd *gibScd = NULL;
845  Fld_Mtf *fldMtf = NULL;
846  float *trajectory = NULL;
847  bool retval = false;
848 
849  AlignmentList::const_iterator p, pe = originalAlignments->end();
850 
851 #ifdef DEBUG_THREADER
852  FILE *pFile;
853 #endif
854 
855  // create contact lists
856  if (options.weightPSSM < 1.0 && (!masterMultiple->GetMaster()->molecule ||
857  masterMultiple->GetMaster()->molecule->parentSet->isAlphaOnly)) {
858  ERRORMSG("Can't use contact potential on non-structured master, or alpha-only (virtual bond) models!");
859  goto cleanup;
860  }
861  if (!(fldMtf = CreateFldMtf(masterMultiple->GetMaster()))) goto cleanup;
862 
863  // create potential and Gibbs schedule
864  if (!(rcxPtl = CreateRcxPtl(1.0 - options.weightPSSM))) goto cleanup;
865  if (!(gibScd = CreateGibScd(true, options.nRandomStarts))) goto cleanup;
866  trajectory = new float[gibScd->ntp];
867 
868  // create initial PSSM
869  if (!(seqMtf = CreateSeqMtf(masterMultiple, options.weightPSSM))) goto cleanup;
870 #ifdef DEBUG_THREADER
871  pFile = fopen("Seq_Mtf.debug.txt", "w");
872  PrintSeqMtf(seqMtf, pFile);
873  fclose(pFile);
874 #endif
875 
876  // create core definition
877  if (!(corDef = CreateCorDef(masterMultiple, options.loopLengthMultiplier))) goto cleanup;
878  if (options.freezeIsolatedBlocks) // make a copy to used as an original "master"
879  if (!(masterCorDef = CreateCorDef(masterMultiple, options.loopLengthMultiplier))) goto cleanup;
880 
881 #ifdef DEBUG_THREADER
882  pFile = fopen("Fld_Mtf.debug.txt", "w");
883  PrintFldMtf(fldMtf, pFile);
884  fclose(pFile);
885 #endif
886 
887  for (p=originalAlignments->begin(); p!=pe; ) {
888 
889  if ((*p)->NRows() != 2 || (*p)->GetMaster() != masterMultiple->GetMaster()) {
890  ERRORMSG("Threader::Realign() - bad pairwise alignment");
891  continue;
892  }
893 
894  Qry_Seq *qrySeq = NULL;
895  Thd_Tbl *thdTbl = NULL;
896  unsigned int success = 0;
897 
898  // create query sequence
899  if (!(qrySeq = CreateQrySeq(masterMultiple, *p, options.terminalResidueCutoff))) goto cleanup2;
900 #ifdef DEBUG_THREADER
901  pFile = fopen("Qry_Seq.debug.txt", "w");
902  PrintQrySeq(qrySeq, pFile);
903  fclose(pFile);
904 #endif
905 
906  // freeze block sizes if opted (changes corDef but not masterCorDef or qrySeq)
907  if (options.freezeIsolatedBlocks)
908  FreezeIsolatedBlocks(corDef, masterCorDef, qrySeq);
909 #ifdef DEBUG_THREADER
910  pFile = fopen("Cor_Def.debug.txt", "w");
911  PrintCorDef(corDef, pFile);
912  fclose(pFile);
913 #endif
914 
915  // create results storage structure
916  thdTbl = NewThdTbl(options.nResultAlignments, corDef->sll.n);
917 
918  // actually run the threader (finally!)
919  INFOMSG("threading " << (*p)->GetSequenceOfRow(1)->identifier->ToString());
920  success = atd(fldMtf, corDef, qrySeq, rcxPtl, gibScd, thdTbl, seqMtf,
921  trajectory, zscs, SCALING_FACTOR, (float) options.weightPSSM);
922 
923  BlockMultipleAlignment *newAlignment;
924  if (success) {
925  TRACEMSG("threading succeeded");
926 #ifdef DEBUG_THREADER
927  pFile = fopen("Thd_Tbl.debug.txt", "w");
928  PrintThdTbl(thdTbl, pFile);
929  fclose(pFile);
930 #endif
931  // create new alignment(s) from threading result; merge or add to list as appropriate
932  for (int i=0; i<thdTbl->n; ++i) {
933 
934  // skip if this entry is not a real result
935  if (thdTbl->tf[i] <= 0) continue;
936 
938  sequences->front() = (*p)->GetMaster();
939  sequences->back() = (*p)->GetSequenceOfRow(1);
940  newAlignment = CreateAlignmentFromThdTbl(thdTbl, i, corDef, sequences, alignmentManager);
941  if (!newAlignment) continue;
942 
943  // set scores to show in alignment
944  newAlignment->SetRowDouble(0, thdTbl->tg[i]);
945  newAlignment->SetRowDouble(1, thdTbl->tg[i]);
946  string status = string("Threading successful; alignment score before merge: ") + NStr::DoubleToString(thdTbl->tg[i]);
947  newAlignment->SetRowStatusLine(0, status);
948  newAlignment->SetRowStatusLine(1, status);
949 
950  if (options.mergeAfterEachSequence) {
951  if (!sequenceViewer->EditorIsOn())
952  sequenceViewer->TurnOnEditor();
953  if (masterMultiple->MergeAlignment(newAlignment)) {
954  delete newAlignment; // if merge is successful, we can delete this alignment;
955  newAlignment = NULL;
956  ++(*nRowsAddedToMultiple);
957  }
958  }
959 
960  // no merge or merge failed - add new alignment to list, let calling function deal with it
961  if (newAlignment)
962  newAlignments->push_back(newAlignment);
963  }
964  }
965 
966  // threading failed - add old alignment to list so it doesn't get lost
967  else {
968  TRACEMSG("threading failed!");
969  newAlignment = (*p)->Clone();
970  newAlignment->SetRowDouble(0, -1.0);
971  newAlignment->SetRowDouble(1, -1.0);
972  newAlignment->SetRowStatusLine(0, "Threading failed!");
973  newAlignment->SetRowStatusLine(1, "Threading failed!");
974  newAlignments->push_back(newAlignment);
975  }
976 
977 cleanup2:
978  if (qrySeq) FreeQrySeq(qrySeq);
979  if (thdTbl) FreeThdTbl(thdTbl);
980 
981  ++p;
982  if (success && p != pe && options.mergeAfterEachSequence) {
983  // re-create PSSM after each merge
984  FreeSeqMtf(seqMtf);
985  if (!(seqMtf = CreateSeqMtf(masterMultiple, options.weightPSSM))) goto cleanup;
986  }
987  }
988 
989  retval = true;
990 
991 cleanup:
992  if (seqMtf) FreeSeqMtf(seqMtf);
993  if (corDef) FreeCorDef(corDef);
994  if (masterCorDef) FreeCorDef(masterCorDef);
995  if (rcxPtl) FreeRcxPtl(rcxPtl);
996  if (gibScd) FreeGibScd(gibScd);
997  if (trajectory) delete[] trajectory;
998 
999  return retval;
1000 }
1001 
1003  unsigned int row, const vector < int >& residueNumbers, const Seq_Mtf *seqMtf)
1004 {
1005  double score = 0.0;
1006  BlockMultipleAlignment::UngappedAlignedBlockList::const_iterator b, be = aBlocks.end();
1007  const Block::Range *masterRange, *dependentRange;
1008  unsigned int i;
1009 
1010  for (b=aBlocks.begin(); b!=be; ++b) {
1011  masterRange = (*b)->GetRangeOfRow(0);
1012  dependentRange = (*b)->GetRangeOfRow(row);
1013  for (i=0; i<(*b)->width; ++i)
1014  if (residueNumbers[dependentRange->from + i] >= 0)
1015  score += seqMtf->ww[masterRange->from + i][residueNumbers[dependentRange->from + i]];
1016  }
1017 
1018 // TESTMSG("PSSM score for row " << row << ": " << score);
1019  return score;
1020 }
1021 
1022 static double CalculateContactScore(const BlockMultipleAlignment *multiple,
1023  unsigned int row, const vector < int >& residueNumbers, const Fld_Mtf *fldMtf, const Rcx_Ptl *rcxPtl)
1024 {
1025  double score = 0.0;
1026  int seqIndex1, seqIndex2, resNum1, resNum2, dist, i;
1027 
1028  // for each res-res contact, convert seqIndexes of master into corresponding seqIndexes
1029  // of dependent if they're aligned; add contact energies if so
1030  for (i=0; i<fldMtf->rrc.n; ++i) {
1031  seqIndex1 = multiple->GetAlignedDependentIndex(fldMtf->rrc.r1[i], row);
1032  if (seqIndex1 < 0) continue;
1033  seqIndex2 = multiple->GetAlignedDependentIndex(fldMtf->rrc.r2[i], row);
1034  if (seqIndex2 < 0) continue;
1035 
1036  resNum1 = residueNumbers[seqIndex1];
1037  resNum2 = residueNumbers[seqIndex2];
1038  if (resNum1 < 0 || resNum2 < 0) continue;
1039 
1040  dist = fldMtf->rrc.d[i];
1041  score += rcxPtl->rre[dist][resNum1][resNum2] + rcxPtl->re[dist][resNum1] + rcxPtl->re[dist][resNum2];
1042  }
1043 
1044  // ditto for res-pep contacts - except only one dependent residue to look up; 2nd is always peptide group
1045  for (i=0; i<fldMtf->rpc.n; ++i) {
1046  seqIndex1 = multiple->GetAlignedDependentIndex(fldMtf->rpc.r1[i], row);
1047  if (seqIndex1 < 0) continue;
1048 
1049  // peptides are only counted if both contributing master residues are aligned
1050  if (fldMtf->rpc.p2[i] >= (int)multiple->GetMaster()->Length() - 1 ||
1051  !multiple->IsAligned(0U, fldMtf->rpc.p2[i]) ||
1052  !multiple->IsAligned(0U, fldMtf->rpc.p2[i] + 1)) continue;
1053 
1054  resNum1 = residueNumbers[seqIndex1];
1055  if (resNum1 < 0) continue;
1056  resNum2 = NUM_RES_TYPES - 1; // peptide group
1057 
1058  dist = fldMtf->rpc.d[i];
1059  score += rcxPtl->rre[dist][resNum1][resNum2] + rcxPtl->re[dist][resNum1] + rcxPtl->re[dist][resNum2];
1060  }
1061 
1062 // TESTMSG("Contact score for row " << row << ": " << score);
1063  return score;
1064 }
1065 
1066 bool Threader::CalculateScores(const BlockMultipleAlignment *multiple, double weightPSSM)
1067 {
1068  Seq_Mtf *seqMtf = NULL;
1069  Rcx_Ptl *rcxPtl = NULL;
1070  Fld_Mtf *fldMtf = NULL;
1072  vector < int > residueNumbers;
1073  bool retval = false;
1074  unsigned int row;
1075 
1076  // create contact lists
1077  if (weightPSSM < 1.0 && (!multiple->GetMaster()->molecule ||
1078  multiple->GetMaster()->molecule->parentSet->isAlphaOnly)) {
1079  ERRORMSG("Can't use contact potential on non-structured master, or alpha-only (virtual bond) models!");
1080  goto cleanup;
1081  }
1082  if (weightPSSM < 1.0 && !(fldMtf = CreateFldMtf(multiple->GetMaster()))) goto cleanup;
1083 
1084  // create PSSM
1085  if (weightPSSM > 0.0 && !(seqMtf = CreateSeqMtf(multiple, weightPSSM))) goto cleanup;
1086 
1087  // create potential
1088  if (weightPSSM < 1.0 && !(rcxPtl = CreateRcxPtl(1.0 - weightPSSM))) goto cleanup;
1089 
1090  // get aligned blocks
1091  multiple->GetUngappedAlignedBlocks(&aBlocks);
1092 
1093  for (row=0; row<multiple->NRows(); ++row) {
1094 
1095  // get sequence's residue numbers
1096  const Sequence *seq = multiple->GetSequenceOfRow(row);
1097  residueNumbers.resize(seq->Length());
1098  for (unsigned int i=0; i<seq->Length(); ++i)
1100 
1101  // sum score types (weightPSSM already built into seqMtf & rcxPtl)
1102  double
1103  scorePSSM = (weightPSSM > 0.0) ?
1104  CalculatePSSMScore(aBlocks, row, residueNumbers, seqMtf) : 0.0,
1105  scoreContacts = (weightPSSM < 1.0) ?
1106  CalculateContactScore(multiple, row, residueNumbers, fldMtf, rcxPtl) : 0.0,
1107  score = (scorePSSM + scoreContacts) / SCALING_FACTOR;
1108 
1109  // set score in alignment rows (for sorting and status line display)
1110  multiple->SetRowDouble(row, score);
1111  CNcbiOstrstream oss;
1112  oss << "PSSM+Contact score (PSSM x" << weightPSSM << "): " << score;
1113  multiple->SetRowStatusLine(row, (string) CNcbiOstrstreamToString(oss));
1114  }
1115 
1116  retval = true;
1117 
1118 cleanup:
1119  if (seqMtf) FreeSeqMtf(seqMtf);
1120  if (rcxPtl) FreeRcxPtl(rcxPtl);
1121  return retval;
1122 }
1123 
1126 {
1127  // create contact lists
1128  if (!multiple->GetMaster()->molecule || multiple->GetMaster()->molecule->parentSet->isAlphaOnly) {
1129  ERRORMSG("Can't use contact potential on non-structured master, or alpha-only (virtual bond) models!");
1130  return false;
1131  }
1132 
1133  violations->clear();
1134  violations->resize(multiple->NRows());
1135 
1136  // look for too-short regions between aligned blocks in unstructured sequences, using minimum loop
1137  // lengths of all structured sequences
1139  multiple->GetUngappedAlignedBlocks(&aBlocks);
1140  BlockMultipleAlignment::UngappedAlignedBlockList::const_iterator b, be = aBlocks.end(), n;
1141  unsigned int nViolations = 0, row;
1142  const Block::Range *thisRange, *nextRange;
1143  int minimumLoop;
1144  for (b=aBlocks.begin(); b!=be; ++b) {
1145  n = b;
1146  ++n;
1147  if (n == be) break;
1148 
1149  // find minimum allowed loop length between these blocks based on any structure
1150  minimumLoop = kMax_Int;
1151  for (row=0; row<multiple->NRows(); ++row) {
1152  if (multiple->GetSequenceOfRow(row)->molecule && !multiple->GetSequenceOfRow(row)->molecule->parentSet->isAlphaOnly) {
1153  thisRange = (*b)->GetRangeOfRow(row);
1154  nextRange = (*n)->GetRangeOfRow(row);
1155  Fld_Mtf *fldMtf = CreateFldMtf(multiple->GetSequenceOfRow(row));
1156  if (!fldMtf) {
1157  ERRORMSG("Can't create FldMtf for " << multiple->GetSequenceOfRow(row)->identifier->ToString());
1158  return false;
1159  }
1160  if (fldMtf->mll[thisRange->to][nextRange->from] < minimumLoop)
1161  minimumLoop = fldMtf->mll[thisRange->to][nextRange->from];
1162  }
1163  }
1164 
1165  // check for too-short loops in all unstructured rows
1166  for (row=1; row<multiple->NRows(); ++row) {
1167  if (!multiple->GetSequenceOfRow(row)->molecule || multiple->GetSequenceOfRow(row)->molecule->parentSet->isAlphaOnly) {
1168  thisRange = (*b)->GetRangeOfRow(row);
1169  nextRange = (*n)->GetRangeOfRow(row);
1170  // violation found
1171  if (nextRange->from - thisRange->to - 1 < minimumLoop) {
1172  (*violations)[row].push_back(make_pair(thisRange->to, nextRange->from));
1173  ++nViolations;
1174  }
1175  }
1176  }
1177  }
1178 
1179 // TESTMSG("Found " << nViolations << " geometry violations");
1180  return nViolations;
1181 }
1182 
1184  const BlockMultipleAlignment *toBeThreaded)
1185 {
1186  unsigned int nBlocksToAlign = 0;
1187  BlockMultipleAlignment::UngappedAlignedBlockList multipleABlocks, pairwiseABlocks;
1188  coreAlignment->GetUngappedAlignedBlocks(&multipleABlocks);
1189  toBeThreaded->GetUngappedAlignedBlocks(&pairwiseABlocks);
1190 
1191  // if a block in the multiple is *not* contained in the pairwise (looking at master coords),
1192  // then it'll probably be realigned upon threading
1193  BlockMultipleAlignment::UngappedAlignedBlockList::const_iterator
1194  m, me = multipleABlocks.end(), p, pe = pairwiseABlocks.end();
1195  const Block::Range *multipleRange, *pairwiseRange;
1196  for (m=multipleABlocks.begin(); m!=me; ++m) {
1197  multipleRange = (*m)->GetRangeOfRow(0);
1198  bool realignBlock = true;
1199  for (p=pairwiseABlocks.begin(); p!=pe; ++p) {
1200  pairwiseRange = (*p)->GetRangeOfRow(0);
1201  if (pairwiseRange->from <= multipleRange->from && pairwiseRange->to >= multipleRange->to) {
1202  realignBlock = false;
1203  break;
1204  }
1205  }
1206  if (realignBlock) ++nBlocksToAlign;
1207  }
1208 
1209  if (nBlocksToAlign <= 1)
1210  return 1;
1211  else
1212  // round to nearest integer
1213  return (int) (exp(1.5 + 0.25432 * nBlocksToAlign) + 0.5);
1214 }
1215 
1216 END_SCOPE(Cn3D)
Vector site
Definition: atom_set.hpp:62
const AtomCoord * GetAtom(const AtomPntr &atom, bool getAny=false, bool suppressWarning=false) const
Definition: atom_set.cpp:252
BlockMultipleAlignment * Clone(void) const
std::vector< const Sequence * > SequenceList
const UnalignedBlock * GetUnalignedBlockBefore(const UngappedAlignedBlock *aBlock) const
void SetRowDouble(unsigned int row, double value) const
const BLAST_Matrix * GetPSSM(void) const
void SetRowStatusLine(unsigned int row, const std::string &value) const
const Sequence * GetMaster(void) const
std::vector< const UngappedAlignedBlock * > UngappedAlignedBlockList
const Sequence * GetSequenceOfRow(unsigned int row) const
void GetUngappedAlignedBlocks(UngappedAlignedBlockList *blocks) const
bool UpdateBlockMapAndColors(bool clearRowInfo=true)
int GetAlignedDependentIndex(unsigned int masterSeqIndex, unsigned int dependentRow) const
bool IsAligned(unsigned int row, unsigned int seqIndex) const
bool AddAlignedBlockAtEnd(UngappedAlignedBlock *newBlock)
bool MergeAlignment(const BlockMultipleAlignment *newAlignment)
void SetRangeOfRow(unsigned int row, int from, int to)
unsigned int width
CB –.
Definition: B.hpp:64
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
std::string ToString(void) const
int id
Definition: molecule.hpp:84
DisulfideMap disulfideMap
Definition: molecule.hpp:95
const MoleculeIdentifier * identifier
Definition: molecule.hpp:86
ResidueMap residues
Definition: molecule.hpp:89
int id
Definition: residue.hpp:69
static const int NO_ALPHA_ID
Definition: residue.hpp:75
int alphaID
Definition: residue.hpp:76
const AtomInfoMap & GetAtomInfos(void) const
Definition: residue.hpp:127
void TurnOnEditor(void)
const Molecule * molecule
const MoleculeIdentifier * identifier
unsigned int Length(void) const
string sequenceString
Definition: cav_seqset.hpp:93
StructureSet * parentSet
bool GetParentOfType(const T **ptr, bool warnIfNotFound=true) const
double loopLengthMultiplier
std::vector< VirtualCoordinate > VirtualCoordinateList
std::list< BlockMultipleAlignment * > AlignmentList
Cor_Def * CreateCorDef(const BlockMultipleAlignment *multiple, double loopLengthMultiplier)
ContactMap contacts
std::vector< IntervalList > GeometryViolationsForRow
bool Realign(const ThreaderOptions &options, BlockMultipleAlignment *masterMultiple, const AlignmentList *originalAlignments, AlignmentList *newAlignments, unsigned int *nRowsAddedToMultiple, SequenceViewer *sequenceViewer)
AlignmentManager * alignmentManager
Threader(AlignmentManager *parentAlnMgr)
static const unsigned int SCALING_FACTOR
static const std::string ThreaderResidues
unsigned int GetGeometryViolations(const BlockMultipleAlignment *multiple, GeometryViolationsForRow *violations)
~Threader(void)
Gib_Scd * CreateGibScd(bool fast, unsigned int nRandomStarts)
Seq_Mtf * CreateSeqMtf(const BlockMultipleAlignment *multiple, double weightPSSM)
Qry_Seq * CreateQrySeq(const BlockMultipleAlignment *multiple, const BlockMultipleAlignment *pairwise, int terminalCutoff)
static unsigned int EstimateNRandomStarts(const BlockMultipleAlignment *coreAlignment, const BlockMultipleAlignment *toBeThreaded)
bool CalculateScores(const BlockMultipleAlignment *multiple, double weightPSSM)
Fld_Mtf * CreateFldMtf(const Sequence *masterSequence)
Rcx_Ptl * CreateRcxPtl(double weightContacts)
std::list< Contact > ContactList
void normalize(void)
bool EditorIsOn(void) const
USING_SCOPE(objects)
static void ReadToRowOfEnergies(ifstream &InFile, unsigned int NumResTypes)
ThreaderOptions globalThreaderOptions
static void GetVirtualResidue(const AtomSet *atomSet, const Molecule *mol, const Residue *res, Threader::VirtualCoordinate *coord)
static void GetVirtualPeptide(const AtomSet *atomSet, const Molecule *mol, const Residue *res1, const Residue *res2, Threader::VirtualCoordinate *coord)
static bool FreezeIsolatedBlocks(Cor_Def *corDef, const Cor_Def *masterCorDef, const Qry_Seq *qrySeq)
static const unsigned int NUM_RES_TYPES
static unsigned int CountWords(char *chs)
bool operator<(const Threader::Contact &c1, const Threader::Contact &c2)
static int LookupThreaderResidueNumberFromCharacterAbbrev(char r)
static double CalculatePSSMScore(const BlockMultipleAlignment::UngappedAlignedBlockList &aBlocks, unsigned int row, const vector< int > &residueNumbers, const Seq_Mtf *seqMtf)
unsigned int LookupNCBIStdaaNumberFromThreaderResidueNumber(char r)
static void TranslateContacts(const Threader::ContactList &resResContacts, const Threader::ContactList &resPepContacts, Fld_Mtf *fldMtf)
static void GetVirtualCoordinates(const Molecule *mol, const AtomSet *atomSet, Threader::VirtualCoordinateList *virtualCoordinates)
static void GetMinimumLoopLengths(const Molecule *mol, const AtomSet *atomSet, Fld_Mtf *fldMtf)
static const unsigned int MAX_DISTANCE_BIN
USING_NCBI_SCOPE
static BlockMultipleAlignment * CreateAlignmentFromThdTbl(const Thd_Tbl *thdTbl, unsigned int nResult, const Cor_Def *corDef, BlockMultipleAlignment::SequenceList *sequences, AlignmentManager *alignmentManager)
static double CalculateContactScore(const BlockMultipleAlignment *multiple, unsigned int row, const vector< int > &residueNumbers, const Fld_Mtf *fldMtf, const Rcx_Ptl *rcxPtl)
static void GetContacts(const Threader::VirtualCoordinateList &coords, Threader::ContactList *resResContacts, Threader::ContactList *resPepContacts)
static unsigned int BinDistance(const Vector &p1, const Vector &p2)
#define NO_VIRTUAL_COORDINATE(coord)
const string & GetDataDir(void)
Definition: cn3d_tools.cpp:330
#define TRACEMSG(stream)
Definition: cn3d_tools.hpp:83
#define INFOMSG(stream)
Definition: cn3d_tools.hpp:84
#define WARNINGMSG(stream)
Definition: cn3d_tools.hpp:85
#define ERRORMSG(stream)
Definition: cn3d_tools.hpp:86
Include a standard set of the NCBI C++ Toolkit most basic headers.
#define C(s)
Definition: common.h:231
string Path(const string &dir, const string &file)
Definition: fileutil.cpp:243
static void cleanup(void)
Definition: ct_dynamic.c:30
#define true
Definition: bool.h:35
int offset
Definition: replacements.h:160
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define kMax_Int
Definition: ncbi_limits.h:184
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5187
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
yy_size_t n
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::SIZE size
unsigned int a
Definition: ncbi_localip.c:102
int toupper(Uchar c)
Definition: ncbictype.hpp:73
The NCBI C++/STL use hints.
T max(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static Threader::GeometryViolationsForRow violations
#define row(bind, expected)
Definition: string_bind.c:73
int * lrfs
Definition: thrdatd.h:79
int * comn
Definition: thrdatd.h:72
int * nomn
Definition: thrdatd.h:70
struct _Cor_Def::@23 sll
int * nomx
Definition: thrdatd.h:71
struct _Cor_Def::@24 lll
struct _Cor_Def::@25 fll
int * rfpt
Definition: thrdatd.h:69
int * comx
Definition: thrdatd.h:73
int * llmx
Definition: thrdatd.h:78
int * llmn
Definition: thrdatd.h:77
int n
Definition: thrdatd.h:74
int * d
Definition: thrdatd.h:54
int * p2
Definition: thrdatd.h:59
struct _Fld_Mtf::@21 rrc
int n
Definition: thrdatd.h:50
int * r1
Definition: thrdatd.h:52
struct _Fld_Mtf::@22 rpc
int * r2
Definition: thrdatd.h:53
int ** mll
Definition: thrdatd.h:63
int ntp
Definition: thrdatd.h:113
int cet
Definition: thrdatd.h:119
int iso
Definition: thrdatd.h:122
int * lms
Definition: thrdatd.h:132
int * lmw
Definition: thrdatd.h:133
int csm
Definition: thrdatd.h:118
int nrs
Definition: thrdatd.h:114
int * nlc
Definition: thrdatd.h:129
int nts
Definition: thrdatd.h:115
int ito
Definition: thrdatd.h:123
int trg
Definition: thrdatd.h:126
int * tma
Definition: thrdatd.h:130
int * nac
Definition: thrdatd.h:128
int * lmf
Definition: thrdatd.h:134
int isl
Definition: thrdatd.h:121
int * nti
Definition: thrdatd.h:127
int als
Definition: thrdatd.h:125
int * tml
Definition: thrdatd.h:131
int cfm
Definition: thrdatd.h:117
int cef
Definition: thrdatd.h:120
int crs
Definition: thrdatd.h:116
int rsd
Definition: thrdatd.h:124
int * mx
Definition: thrdatd.h:96
int * mn
Definition: thrdatd.h:95
int * sq
Definition: thrdatd.h:92
int n
Definition: thrdatd.h:93
struct _Qry_Seq::@26 sac
int *** rrt
Definition: thrdatd.h:105
int ** re
Definition: thrdatd.h:104
int *** rre
Definition: thrdatd.h:103
int ** ww
Definition: thrdatd.h:318
int ** no
Definition: thrdatd.h:153
int nsc
Definition: thrdatd.h:160
int * tf
Definition: thrdatd.h:149
int n
Definition: thrdatd.h:159
int ** al
Definition: thrdatd.h:152
int ** co
Definition: thrdatd.h:154
float * tg
Definition: thrdatd.h:139
static int GetBLOSUM62Score(char a, char b)
Definition: type.c:6
unsigned char LookupNCBIStdaaNumberFromCharacter(char r)
Definition: su_pssm.cpp:125
Gib_Scd * NewGibScd(int NumTempSteps)
Definition: thrdatd.c:1553
Fld_Mtf * FreeFldMtf(Fld_Mtf *mtf)
Definition: thrdatd.c:1615
Thd_Tbl * NewThdTbl(int NumResults, int NumCoreElements)
Definition: thrdatd.c:1665
Rcx_Ptl * NewRcxPtl(int NumResTypes, int NumDistances, int PeptideIndex)
Definition: thrdatd.c:1503
Rcx_Ptl * FreeRcxPtl(Rcx_Ptl *pmf)
Definition: thrdatd.c:1530
Cor_Def * FreeCorDef(Cor_Def *cdf)
Definition: thrdatd.c:1313
Seq_Mtf * NewSeqMtf(int NumResidues, int AlphabetSize)
Definition: thrdatd.c:1361
void PrintCorDef(Cor_Def *cdf, FILE *pFile)
Definition: thrdatd.c:1330
int atd(Fld_Mtf *mtf, Cor_Def *cdf, Qry_Seq *qsq, Rcx_Ptl *pmf, Gib_Scd *gsp, Thd_Tbl *ttb, Seq_Mtf *psm, float *trg, int zscs, double ScalingFactor, float PSSM_Weight)
Definition: thrdatd.c:68
void PrintThdTbl(Thd_Tbl *ttb, FILE *pFile)
Definition: thrdatd.c:1736
int ThrdRound(double Num)
Definition: thrdatd.c:1913
Fld_Mtf * NewFldMtf(int NumResidues, int NumResResContacts, int NumResPepContacts)
Definition: thrdatd.c:1590
void PrintFldMtf(Fld_Mtf *mtf, FILE *pFile)
Definition: thrdatd.c:1636
Qry_Seq * FreeQrySeq(Qry_Seq *qsq)
Definition: thrdatd.c:1456
Gib_Scd * FreeGibScd(Gib_Scd *gsp)
Definition: thrdatd.c:1573
Thd_Tbl * FreeThdTbl(Thd_Tbl *ttb)
Definition: thrdatd.c:1702
Seq_Mtf * FreeSeqMtf(Seq_Mtf *psm)
Definition: thrdatd.c:1381
void PrintSeqMtf(Seq_Mtf *psm, FILE *pFile)
Definition: thrdatd.c:1398
Cor_Def * NewCorDef(int NumBlocks)
Definition: thrdatd.c:1292
Qry_Seq * NewQrySeq(int NumResidues, int NumBlocks)
Definition: thrdatd.c:1433
void PrintQrySeq(Qry_Seq *qsq, FILE *pFile)
Definition: thrdatd.c:1468
#define N
Definition: crc32.c:57
Modified on Mon May 20 04:58:26 2024 by modify_doxy.py rev. 669887