NCBI C++ ToolKit
fta_qscore.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: fta_qscore.cpp 101888 2024-02-28 18:20:15Z vasilche $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: fta_qscore.cpp
27  *
28  * Author: Mark Cavanaugh
29  *
30  * File Description:
31  * Utilities to parse quality score buffer to single or
32  * delta SeqGraph.
33  *
34  */
35 
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 
45 #include <objects/seq/Seq_inst.hpp>
48 #include <objects/seq/Seq_ext.hpp>
51 
52 #include "index.h"
53 
#include <algorithm>
#include <cerrno>
#include <cstdlib>
56 
57 
58 #include "ftaerr.hpp"
59 #include "utilfun.h"
60 
61 #ifdef THIS_FILE
62 # undef THIS_FILE
63 #endif
64 #define THIS_FILE "fta_qscore.cpp"
65 
68 
/* Defines
 */

/* True when c is a decimal digit or one of the letters 'N' / 'A', i.e. a
 * character that may appear in a score token (an integer, or "NA").
 * Note: the argument is evaluated more than once, so it must not have
 * side effects. */
#define IS_DIGIT_OR_NA(c) ((c) == 'N' || (c) == 'A' || ('0' <= (c) && (c) <= '9'))

/* Maximum length for a line of data read from a buffer of quality-score
 * data, plus one for the terminal newline character. */
#define QSBUF_MAXLINE 256

/* Maximum length for an accession read from the 'defline' at the start
 * of a buffer of quality-score data, plus one for \0 */
#define QSBUF_MAXACC 51

/* Maximum length for the title read from the 'defline' at the start of
 * a buffer of quality-score data, plus one for \0 */
#define QSBUF_MAXTITLE 101

/* Maximum number of scores expected in a line of score data read from
 * a quality-score buffer */
#define QSBUF_MAXSCORES 100

#define QS_MIN_VALID_SCORE 0 /* Minimum valid quality score value */
#define QS_MAX_VALID_SCORE 100 /* Maximum valid quality score value */
91 
92 /***********************************************************
93  *
94  * Function: QSbuf_ReadLine
95  *
96  * Description: Read a line of data from a Quality Score buffer,
97  * copying its contents (up to \n or \0) into dest_buf.
98  *
99  * Arguments: qs_buf: buffer containing Quality Score data
100  * dest_buf: destination buffer for one line of data
101  * copied from qs_buf
102  * max_len: max length for a line copied into dest_buf
103  * line: line number counter (Int4Ptr)
104  *
105  * Returns: TRUE upon success, otherwise FALSE
106  *
107  ***********************************************************/
108 static bool QSbuf_ReadLine(const char* qs_buf, char* dest_buf, Int2 max_len, int* line)
109 {
110  Int4 i;
111 
112  if (! qs_buf || ! dest_buf)
113  return false;
114 
115  for (i = 1; i <= max_len; i++, dest_buf++, qs_buf++) {
116  *dest_buf = *qs_buf;
117  if (*qs_buf == '\n' || *qs_buf == '\0')
118  break;
119  }
120  (*line)++;
121 
122  if (i == max_len) {
123  /* you read qs_buf all the way to max_len
124  * unless the last character is \n or \0, there
125  * is data remaining that did not fit into max_len
126  * characters; max_len is not sufficient to read
127  * a line of data; this is an error
128  */
129  if (*dest_buf != '\n' && *dest_buf != '\0') {
130  /* error : max_len too short for reading the lines contained
131  * in qs_buf
132  */
133  return false;
134  }
135  }
136 
137  /* you did *not* read all the way to max_len characters
138  * (or the line you read fits exactly into max_len characters)
139  *
140  * if dest_buf ends with \n, convert it to \0
141  *
142  * if dest_buf does NOT end with \n, then qs_buf ended on
143  * a \0, without a newline; this is considered an error
144  */
145  if (*dest_buf != '\n') {
146  /* error : missing newline at end of qs_buf
147  */
148  return false;
149  }
150  *dest_buf = '\0';
151  return true;
152 }
153 
154 /***********************************************************
155  *
156  * Function: QSbuf_ParseDefline
157  *
158  * Description: Parse a FASTA-like "defline" read from a buffer
159  * of quality score data.
160  *
161  * Sample: >AL031704.13 Phrap Quality (Length:40617, Min: 1, Max: 99)
162  *
163  * Arguments: qs_defline: buffer containing the Quality Score
164  * header line; must be NULL-terminated
165  * def_acc: buffer for the accession number parsed
166  * from the defline; must be allocated by
167  * the caller, and big enough to hold an
168  * accession
169  * def_ver: buffer for the sequence version number
170  * parsed from the defline; must be allocated
171  * by the caller, and big enough to hold
172  * a version number (six digits? 7?)
173  * def_title: buffer for the 'title' of the quality
174  * score data parsed from the defline;
175  * must be allocated by the caller, and big
176  * enough to hold a typical title length
177  * def_len: Int4Ptr for the sequence length parsed
178  * from the defline
179  * def_max: unsigned char* for the max score parsed from
180  * the defline
181  * def_min: unsigned char* for the min score parsed from
182  * the defline
183  *
184  * Note: Parsing of the length, max, and min values
185  * is a little more relaxed than the other fields,
186  * because the values that result can be compared
187  * to values from the rest of the quality score data.
188  *
189  * Returns: 1 : defline successfully parsed
190  * 0 : bad args to this function; defline not parsed
191  * <0 : defline cannot be parsed due to data error
192  *
193  ***********************************************************/
194 static int QSbuf_ParseDefline(char* qs_defline, char* def_acc, char* def_ver, char* def_title, unsigned int* def_len, unsigned char* def_max, unsigned char* def_min)
195 {
196  char* p;
197  char* q;
198  char* r;
199  Int4 temp; /* used for checking the defline min
200  and max scores;
201  could exceed bounds of a Uint1
202  through a data error, hence the
203  temp var */
204 
205  if (! def_acc || ! def_ver || ! def_title ||
206  ! def_len || ! def_max || ! def_min)
207  return (0);
208 
209  if (! qs_defline || *qs_defline == '\0')
210  return (-1);
211 
212  /* init the numeric values that will be parsed from the defline
213  */
214  *def_len = 0;
215  *def_max = 0;
216  *def_min = 0;
217 
218  /* skip leading whitespace
219  */
220  for (q = qs_defline; isspace(*q);)
221  q++;
222 
223  if (*q == '\0')
224  return (-2);
225 
226  /* should be an initial >
227  */
228  if (*q != '>')
229  return (-3);
230  q++;
231 
232  p = q;
233 
234  /* first token to be read is the accession number
235  */
236  while (isalnum(*q))
237  q++;
238 
239  if (*q == '\0')
240  return (-4);
241  if (*q != '.' && ! isspace(*q))
242  return (-5);
243  *q++ = '\0';
244  StringCpy(def_acc, p);
245 
246  p = q;
247  if (*q == '\0')
248  return (-6);
249 
250  /* accession may be optionally followed by a version number
251  */
252  if (isdigit(*q)) {
253  while (isdigit(*q))
254  q++;
255 
256  if (*q == '\0')
257  return (-7);
258  if (! isspace(*q))
259  return (-8);
260  *q++ = '\0';
261  StringCpy(def_ver, p);
262 
263  p = q;
264  if (*q == '\0')
265  return (-9);
266  }
267 
268  /* Ignore additional whitespace chars that might follow acc/ver
269  */
270  while (isspace(*q)) {
271  p++;
272  q++;
273  }
274  if (*q == '\0')
275  return (-10);
276 
277  /* alphanumeric and whitespace characters that follow are the
278  * "title" of the collection of quality score data
279  */
280  while (isalnum(*q) || isspace(*q))
281  q++;
282 
283  if (*q == '\0')
284  return (-11);
285  if (*q != '(')
286  return (-12);
287 
288  /* trim terminal whitespace characters from the title
289  */
290  r = q;
291  r--;
292  while (isspace(*r))
293  r--;
294  *++r = '\0';
295 
296  if (StringHasNoText(p))
297  return (-13);
298  *q++ = '\0';
299  StringCpy(def_title, p);
300 
301  if (NStr::CompareNocase(def_title, "Phrap Quality") != 0 &&
302  NStr::CompareNocase(def_title, "Gap4") != 0 &&
303  NStr::CompareNocase(def_title, "Phred Quality") != 0) {
304  ErrPostEx(SEV_ERROR, ERR_QSCORE_BadTitle, "Unrecognized title for quality score data : >%s< : should be 'Phrap Quality', 'Gap4', or 'Phred Quality'.", def_title);
305  return (-35);
306  }
307 
308  if (*q == '\0')
309  return (-14);
310 
311  /* Look for the 'Length:' token and skip past it
312  */
313  if (! StringEquNI(q, "Length:", 7))
314  return (-15);
315 
316  q = StringChr(q, ':');
317  q++;
318  p = q;
319  if (*q == '\0')
320  return (-16);
321 
322  /* Ignore additional whitespace chars that might follow 'Length:' token
323  */
324  while (isspace(*q)) {
325  p++;
326  q++;
327  }
328  if (*q == '\0')
329  return (-17);
330 
331  /* get the length value
332  */
333  while (isdigit(*q))
334  q++;
335 
336  if (*q == '\0')
337  return (-18);
338  *q++ = '\0';
339 
340  sscanf(p, "%ld", (long*)&temp);
341  *def_len = (Uint4)temp;
342  p = q;
343  if (*q == '\0')
344  return (-19);
345 
346  /* Ignore additional whitespace chars that might follow length
347  */
348  while (isspace(*q)) {
349  p++;
350  q++;
351  }
352  if (*q == '\0')
353  return (-20);
354 
355  /* Look for the 'Min:' token and skip past it
356  */
357  if (! StringEquNI(q, "Min:", 4))
358  return (-21);
359 
360  q = StringChr(q, ':');
361  q++;
362  p = q;
363  if (*q == '\0')
364  return (-22);
365 
366  /* Ignore additional whitespace chars that might follow 'Min:' token
367  */
368  while (isspace(*q)) {
369  p++;
370  q++;
371  }
372  if (*q == '\0')
373  return (-23);
374 
375  /* get the minumum score value
376  */
377  while (isdigit(*q))
378  q++;
379 
380  if (*q == '\0')
381  return (-24);
382  *q++ = '\0';
383 
384  sscanf(p, "%ld", (long*)&temp);
385  if (temp < QS_MIN_VALID_SCORE)
386  return (-25);
387  if (temp > QS_MAX_VALID_SCORE)
388  return (-26);
389 
390  *def_min = (Uint1)temp;
391 
392  p = q;
393  if (*q == '\0')
394  return (-27);
395 
396  /* Ignore additional whitespace chars that might follow minimum score
397  */
398  while (isspace(*q)) {
399  p++;
400  q++;
401  }
402  if (*q == '\0')
403  return (-28);
404 
405  /* Look for the 'Max:' token and skip past it
406  */
407  if (! StringEquNI(q, "Max:", 4))
408  return (-29);
409 
410  q = StringChr(q, ':');
411  q++;
412  p = q;
413  if (*q == '\0')
414  return (-30);
415 
416  /* Ignore additional whitespace chars that might follow 'Max:' token
417  */
418  while (isspace(*q)) {
419  p++;
420  q++;
421  }
422  if (*q == '\0')
423  return (-31);
424 
425  /* get the maximum score value
426  */
427  while (isdigit(*q))
428  q++;
429 
430  if (*q == '\0')
431  return (-32);
432  *q++ = '\0';
433 
434  sscanf(p, "%ld", (long*)&temp);
435  if (temp < QS_MIN_VALID_SCORE)
436  return (-33);
437  if (temp > QS_MAX_VALID_SCORE)
438  return (-34);
439 
440  *def_max = (Uint1)temp;
441 
442  return (1);
443 }
444 
445 /*****************************************************************************
446  *
447  * Function: QSbuf_ParseScores
448  *
449  * Description: Parse a line of data from a Quality Score buffer that supposedly
450  * contains a series of integer scores separated by whitespace.
451  * Populate an array of Uint1 with their values. This is a destructive
452  * parse in the sense that \0 are inserted into score_buf between the
453  * integer tokens.
454  *
455  * Arguments: score_buf: buffer containing integer-value Quality Scores; must be
456  * null-terminated
457  * scores: array of Uint1 to hold the scores parsed from score_buf
458  * (pointer to first element of a Uint1 array alloc'd by the caller)
459  * max_toks: maximum number of score tokens that are expected in score_buf
460  * should equal or exceed the number of elements in the scores array
461  * max_score: maximum score value encountered in score_buf (actually,
462  * the max that is encountered from multiple calls to QSbuf_ParseScores).
463  * caller should initialize to 0
464  * min_score: minimum score value encountered in score_buf (actually,
465  * the min that is encountered from multiple calls to QSbuf_ParseScores).
466  * caller should initialize to 255
467  * allow_na: when set to true, allow values of 'NA' in score_buf in
468  * addition to integers, and interpret them as scores of zero
469  *
470  * Returns: the number of scores that were written to the scores array;
471  * zero is returned for empty score_buf, or a score_buf that
472  * contains no scores; a negative value indicates that there was a
473  * problem parsing the data in score_buf
474  *
475  *****************************************************************************/
476 static Int4 QSbuf_ParseScores(char* score_buf, unsigned char* scores, Int4 max_toks, unsigned char* max_score, unsigned char* min_score, bool allow_na)
477 {
478  Char ch;
479  char *p, *q;
480  int val;
481  Int4 num_toks = 0;
482 
483  /* empty buffer, nothing to parse */
484 
485  if (! score_buf || *score_buf == '\0')
486  return 0;
487 
488  /* bad arguments */
489 
490  if (! scores || max_toks < 1)
491  return -1;
492 
493  /* Loop through score_buf a character (ch) at a time, until you reach a NULL.
494 
495  Skip whitespace characters, and save your current position. Then skip
496  digit characters. Insert a NULL at the first non-digit, and you've got a token
497  representing an integer score (from the saved position). Increment
498  beyond the non-digit, set ch to the resulting character, and then
499  try for another token.
500 
501  BUT! DDBJ data can contain non-digit tokens consisting of NA instead
502  of zero for the score values that fall in the gaps between contigs.
503  So use function IS_DIGIT_OR_NA() rather than IS_DIGIT(), then check
504  the returned token to see if it is "NA". If so, treat it as zero.
505  */
506 
507  p = score_buf;
508  ch = *p;
509 
510  while (ch != '\0') {
511  while (isspace(ch)) {
512  p++;
513  ch = *p;
514  }
515  /* score_buf might be nothing but whitespace, or might end with whitespace */
516  if (ch == '\0') {
517  break;
518  }
519 
520  q = p;
521  ch = *q;
522  while (IS_DIGIT_OR_NA(ch)) {
523  q++;
524  ch = *q;
525  }
526 
527  /* if not at buffer end, then check to see if current
528  character is whitespace; if not, then there is data
529  in the buffer other then digits and whitespace; data error */
530 
531  if (ch != '\0') {
532  if (! isspace(ch)) {
533  fprintf(stderr, "error: score_buf contains an illegal character : >%c<\n", ch);
534  return -2;
535  }
536  *q = '\0';
537  q++;
538  }
539 
540  if (*p == '\0') {
541  /* fprintf(stderr,"error: score_buf buffer contains no score values\n"); */
542  return -3;
543  }
544  if (max_toks < ++num_toks) {
545  /* fprintf(stderr,"error: score_buf contains more than >%ld< scores : problem at token >%s<\n", max_toks, p); */
546  return -4;
547  }
548 
549  /*
550  fprintf(stdout,"score token is >%s<\n", p);
551  fflush(stdout);
552  */
553 
554  if (allow_na && StringEqu(p, "NA")) {
555  *scores = 0;
556  scores++;
557  } else {
558  if (sscanf(p, "%d", &val) == 1) {
559 
560  /* fprintf(stdout,"integer value for score token is %d\n",val); */
561 
562  if (val < QS_MIN_VALID_SCORE) {
563  /* fprintf(stderr,"error: score_buf score >%d< is less than the minimum legal value >%d<\n", val, QS_MIN_VALID_SCORE); */
564  return -5;
565  } else if (val > QS_MAX_VALID_SCORE) {
566  /* fprintf(stderr,"error: score_buf score >%d< is more than the maximum legal value >%d<\n", val, QS_MAX_VALID_SCORE); */
567  return -6;
568  }
569  *scores = (Uint1)val;
570  scores++;
571 
572  *max_score = max(*max_score, (Uint1)val);
573  *min_score = min(*min_score, (Uint1)val);
574  } else {
575  /* fprintf(stderr,"error: sscan failure : score_buf score >%s< is probably not numeric\n", p); */
576  return -7;
577  }
578  }
579  p = q;
580  ch = *p;
581  }
582  return num_toks;
583 }
584 
585 /***********************************************************
586  *
587  * Function: Split_Qscore_SeqGraph_By_DeltaSeq
588  *
589  * Description: Take a single monolithic Seq-graph of quality
590  * scores and split it into a series of smaller
591  * Seq-graphs, each graph corresponding to one
592  * of the Delta-seq literals of the Bioseq to
593  * which the scores apply.
594  *
595  * Arguments: big_sgp: SeqGraphPtr for a single Seq-graph,
596  * containing basepair quality scores
597  * for every base of a sequence, including
598  * any gaps that might exist between its
599  * component contigs (scores at the gaps
600  * are presumably zero)
601  * bsp: BioseqPtr for the sequence to which
602  * the qscores apply; the bioseq must be
603  * a Delta-seq composed of a series of
604  * Seq-literals, one for each component
605  * contig of the bioseq
606  *
607  * Notes: This function cannot handle Delta-seq bioseqs
608  * that contain Seq-loc components (as opposed to
609  * Seq-literal).
610  *
611  * This function cannot handle Seq-literals with
612  * a length of zero (presumably representing a gap
613  * of unknown size).
614  *
615  * Returns: pointer to a chain of SeqGraph, created by
 * splitting big_sgp up, based on the bsp Delta-seq;
617  * otherwise NULL
618  *
619  * Warning: This function cannot handle sequences more
620  * than an Int4 in length.
621  *
622  ***********************************************************/
623 
625  CBioseq& bioseq)
626 {
627  bool is_gap = false; /* set to TRUE if the Seq-literal
628  represents a gap, rather than
629  sequence data */
630  bool problem = false; /* set to TRUE if a problem is
631  encountered while processing
632  a new_sgp */
633  Uint1 max_score = 0; /* maximum quality score encountered
634  for a Delta-seq component */
635  Uint1 min_score = 0; /* minimum quality score encountered
636  for a Delta-seq component */
637  Uint4 last_pos = 1; /* previous position along bsp */
638  Uint4 curr_pos = 1; /* current position along bsp */
639  Int2 nonzero_gap = 0; /* counter of the number of non-zero
640  scores encountered in big_bs for a
641  Delta-seq component that represents
642  a gap; scores *should* be zero
643  within gaps */
644 
645  if (bioseq.GetInst().GetRepr() != CSeq_inst::eRepr_delta ||
646  ! bioseq.GetInst().IsSetExt())
647  return;
648 
649  CSeq_graph& big_graph = *(*graphs.begin());
650  if (! big_graph.GetGraph().IsByte()) {
651  ErrPostEx(SEV_ERROR, ERR_QSCORE_NonByteGraph, "Seq-graph to be split does not contain byte qscore values : cannot be processed.");
652  return;
653  }
654 
655  CByte_graph::TValues scores_str(big_graph.GetGraph().GetByte().GetValues().begin(),
656  big_graph.GetGraph().GetByte().GetValues().end());
657  if (scores_str.empty()) {
658  ErrPostEx(SEV_ERROR, ERR_QSCORE_MissingByteStore, "Seq-graph to be split has a NULL ByteStore for the qscore values : cannot be processed.");
659  return;
660  }
661 
662  string def_title;
663  if (big_graph.IsSetTitle())
664  def_title = big_graph.GetTitle();
665 
666  nonzero_gap = 0;
667  curr_pos = 0;
668 
669  CSeq_annot::C_Data::TGraph new_graphs;
670  for (const auto& delta : bioseq.GetInst().GetExt().GetDelta().Get()) {
671  is_gap = false;
672  last_pos = curr_pos;
673  max_score = QS_MIN_VALID_SCORE;
674  min_score = QS_MAX_VALID_SCORE;
675 
676  if (delta->IsLoc()) {
677  ErrPostEx(SEV_ERROR, ERR_QSCORE_NonLiteralDelta, "Cannot process Delta-seq bioseqs with Seq-loc components.");
678  problem = true;
679  break;
680  }
681 
682  if (! delta->IsLiteral()) {
683  ErrPostEx(SEV_ERROR, ERR_QSCORE_UnknownDelta, "Encountered Delta-seq component of unknown type.");
684  problem = true;
685  break;
686  }
687 
688  const CSeq_literal& literal = delta->GetLiteral();
689 
690  if (! literal.IsSetLength() || literal.GetLength() < 1) {
691  ErrPostEx(SEV_ERROR, ERR_QSCORE_ZeroLengthLiteral, "Encountered Delta-seq Seq-literal component with length of zero (or less) : cannot be processed.");
692  problem = true;
693  break;
694  }
695 
696  CByte_graph::TValues new_scores;
697 
698  if (! literal.IsSetSeq_data()) {
699  /* this Seq-literal contains no data, so it presumably
700  * represents a gap
701  */
702  is_gap = true;
703  }
704 
705  /* read the scores from big_bs for this Delta-seq component.
706  * remember the min and max scores. check for non-zero score
707  * if the component is a gap
708  */
709  size_t scores_size = literal.GetLength();
710  for (size_t i = 0; i < scores_size; i++) {
711  Uint1 byte = static_cast<Uint1>(scores_str[curr_pos]);
712  new_scores.push_back(scores_str[curr_pos]);
713 
714  /*
715  fprintf(stdout, "byte read from ByteStore is >%i<\n", *j);
716  fflush(stdout);
717 */
718 
719  curr_pos++;
720 
721  if (byte < min_score)
722  min_score = byte;
723  else if (byte > max_score)
724  max_score = byte;
725 
726  if (is_gap && byte != 0) {
727  if (nonzero_gap < 10) {
728  ErrPostEx(SEV_WARNING, ERR_QSCORE_NonZeroInGap, "Encountered non-zero score value %i within Delta-seq gap at position %ld of bioseq", byte, curr_pos);
729  nonzero_gap++;
730  } else if (nonzero_gap == 10) {
731  ErrPostEx(SEV_WARNING, ERR_QSCORE_NonZeroInGap, "Exceeded reporting threshold (10) for non-zero score values in Delta-seq gaps : no further messages will be generated");
732  nonzero_gap++;
733  }
734  }
735  }
736 
737  /* don't create a Seq-graph for gaps
738  */
739  if (is_gap)
740  continue;
741 
742  /* allocate a SeqGraph and a ByteStore
743  */
744 
745  CRef<CSeq_graph> graph(new CSeq_graph);
746  CSeq_interval& interval = graph->SetLoc().SetInt();
747 
748  interval.SetId(*(*bioseq.SetId().begin()));
749 
750  /* Write the scores from big_bs to the new ByteStore
751  */
752  graph->SetNumval(static_cast<TSeqPos>(new_scores.size()));
753  graph->SetGraph().SetByte().SetValues().swap(new_scores);
754 
755  /* there is no "compression" for the Seq-graph; there's supposed to
756  * be a score for every base in the sequence to which the quality
757  * score buffer applies
758  */
759  // graph->SetComp(1);
760 
761  /* no scaling of values
762  */
763  // graph->SetA(1.0);
764  // graph->SetB(0);
765 
766  /* Establish the byte-graph values
767  */
768  graph->SetGraph().SetByte().SetMin(min_score);
769  graph->SetGraph().SetByte().SetMax(max_score);
770  graph->SetGraph().SetByte().SetAxis(0);
771 
772  /*
773  fprintf(stdout, "new_sgp max score is %i\n", max_score);
774  fprintf(stdout, "new_sgp min score is %i\n", min_score);
775  fflush(stdout);
776 */
777 
778  if (! def_title.empty())
779  graph->SetTitle(def_title);
780 
781  interval.SetFrom(last_pos);
782  interval.SetTo(curr_pos - 1);
783 
784  /*
785  fprintf(stdout, "new_sgp from is %ld\n", last_pos);
786  fprintf(stdout, "new_sgp to is %ld\n", curr_pos - 1);
787  fflush(stdout);
788 */
789  new_graphs.push_back(graph);
790  }
791 
792  if (! problem)
793  graphs.swap(new_graphs);
794 
795  if (curr_pos != bioseq.GetLength()) {
796  ErrPostEx(SEV_WARNING, ERR_QSCORE_OutOfScores, "Exhausted available score data in Seq-graph being split at location %ld : total length of Delta-seq bioseq is %ld .", curr_pos, bioseq.GetLength());
797  }
798 }
799 
800 /***********************************************************
801  *
802  * Function: QSbuf_To_Single_Qscore_SeqGraph
803  *
804  * Description: Read a char* buffer that contains basepair
805  * Quality Score data and convert it to an ASN.1
806  * Seq-Graph object. The buffer is assumed to
807  * start with a FASTA-like identifier, and be
808  * followed by lines that contain whitespace-separated
809  * integer values. For example:
810  *
811  * >AL031704.13 Phrap Quality (Length:40617, Min: 1, Max: 99)
812  * 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
813  *
814  * If the allow_na argument is true, then the buffer may also
815  * contain tokens consisting of 'NA' (not applicable), which
816  * are interpreted as zero. Some quality scores from DDBJ
817  * follow this convention.
818  *
819  * This function builds a **single** Seq-graph
820  * for the entire quality score buffer, regardless
821  * of whether the sequence to which it applies is
822  * comprised of contigs separated by gaps. Subsequent
823  * processing should break this monolithic Seq-graph
824  * into a series of smaller Seq-graphs, if the
825  * sequence is represented as a Delta-seq with
826  * multiple Seq-literals for the component contigs.
827  * See function Split_Qscore_SeqGraph_By_DeltaSeq() .
828  *
829  * Arguments: qs_buf: the buffer containing Quality Score
830  * data; must be NULL-terminated
831  * bsp: BioseqPtr for the record to which the
832  * data in the quality score buffer applies;
833  * the bsp->id and bsp->length slots of
834  * the Bioseq must be populated; if the
835  * Bioseq is a Delta-seq, then the bsp->length
836  * must include the length of the gaps
837  * between the pieces of sequence data
838  * def_acc: buffer with which the accession number
839  * read from the defline in the quality
840  * score buffer is returned to the caller;
841  * caller must allocate, and it must be
842  * large enough for an accession; should
843  * be compared to the accession of the
844  * record that the caller is processing,
845  * to make sure they are equal
846  * def_ver: buffer with which the sequence version
847  * number read from defline in the quality
848  * score buffer is returned to the caller;
849  * caller must allocate, and it must be
850  * large enough for largest version
851  * (6 digits? 7?); should be compared
852  * to the version number of the record
853  * that the caller is processing, to make
854  * sure they are equal
855  * check_minmax: when set to true, min/max scores from
856  * the defline in qs_buf will be compared
857  * to the min/max scores in the score data;
858  * if the values are not equal, no SeqGraphPtr
859  * will be returned
860  * allow_na : when set to true, score values of "NA" in
861  * qs_buf are allowed, and interpreted as zero
862  *
863  * Returns: pointer to SeqGraph upon success, otherwise NULL
864  *
865  ***********************************************************/
866 static void QSbuf_To_Single_Qscore_SeqGraph(const char* qs_buf,
867  CBioseq& bioseq,
868  char* def_acc,
869  char* def_ver,
870  bool check_minmax,
871  bool allow_na,
873 {
874  Int4 qs_line = 0; /* current line number within qs_buf */
875  char* my_buf = nullptr; /* copy of a line of data from
876  qs_buf */
877  int def_stat; /* return status from parsing of the
878  'defline' in the quality score
879  buffer */
880  char* def_title; /* title parsed from the quality
881  score defline */
882  Uint4 def_len = 0; /* sequence length parsed from the
883  quality score defline */
884  Uint1 def_max = 0; /* maximum quality score parsed from
885  the quality score defline */
886  Uint1 def_min = 0; /* minimum quality score parsed from
887  the quality score defline */
888  Uint1 scores[QSBUF_MAXSCORES]; /* array of Uint1 to hold the
889  scores read from one line
890  of qs_buf data */
891  Uint4 total_scores = 0;
892  Uint1 max_score = QS_MIN_VALID_SCORE; /* maximum quality
893  score encountered in
894  qs_buf score data */
895  Uint1 min_score = QS_MAX_VALID_SCORE; /* minimum quality
896  score encountered in
897  qs_buf score data */
898  bool problem = false; /* set to TRUE for various error
899  conditions encountered in the
900  qs_buf data; used to free the
901  Seq-graph and return NULL */
902 
903  if (! qs_buf || *qs_buf == '\0' || ! def_acc || ! def_ver) {
904  ErrPostEx(SEV_ERROR, ERR_QSCORE_InvalidArgs, "Missing arguments for QSbuf_To_Single_SeqGraph call.");
905  return;
906  }
907 
908  if (bioseq.GetLength() < 1) {
909  ErrPostEx(SEV_ERROR, ERR_QSCORE_BadBioseqLen, "Invalid Bioseq length : %ld", bioseq.GetLength());
910  return;
911  }
912 
913  if (! bioseq.IsSetId()) {
914  ErrPostEx(SEV_ERROR, ERR_QSCORE_BadBioseqId, "Invalid Bioseq : no Seq-ids found.");
915  return;
916  }
917 
918  /* allocate a buffer for reading qs_buf, one line at a time
919  */
920  vector<char> mybuf(QSBUF_MAXLINE);
921  my_buf = mybuf.data();
922  if (! my_buf) {
923  ErrPostEx(SEV_ERROR, ERR_QSCORE_MemAlloc, "MemNew failure for my_buf buffer");
924  return;
925  }
926 
927  /* allocate a buffer for the 'title' read from the defline in qs_buf
928  */
929  vector<char> deftitle(QSBUF_MAXTITLE);
930  def_title = deftitle.data();
931  if (! def_title) {
932  ErrPostEx(SEV_ERROR, ERR_QSCORE_MemAlloc, "MemNew failure for def_title buffer");
933  return;
934  }
935 
936  CRef<CSeq_graph> graph;
937  CByte_graph::TValues scores_str;
938 
939  while (*qs_buf != '\0') {
940  if (! QSbuf_ReadLine(qs_buf, my_buf, QSBUF_MAXLINE, &qs_line)) {
941  ErrPostEx(SEV_ERROR, ERR_QSCORE_BadQscoreRead, "QSbuf_ReadLine failure near line %ld of qscore buffer.", qs_line);
942  return;
943  }
944 
945  /*
946  fprintf(stdout, "line from qs_buf is:\n>%s<\n", my_buf);
947  fflush(stdout);
948 */
949 
950  /* \n has been replaced by \0 in the line returned by QSbuf_ReadLine
951  * we want to increment qs_buf to point to the character beyond the \0
952  *
953  * it's safe to do this only if QSbuf_ReadLine() returns true,
954  * which will happen only when the line from qs_buf ends with \n
955  * or \n\0
956  */
957  qs_buf += StringLen(my_buf) + 1;
958 
959  if (qs_line == 1) {
960  /* first line is supposed to be a 'defline' for the quality
961  * score data
962  */
963  if (*my_buf != '>') {
964  ErrPostEx(SEV_ERROR, ERR_QSCORE_BadDefline, "qscore buffer does not start with required > character.");
965  return;
966  }
967 
968  def_stat = QSbuf_ParseDefline(my_buf, def_acc, def_ver, def_title, &def_len, &def_max, &def_min);
969  if (def_stat != 1) {
970  ErrPostEx(SEV_ERROR, ERR_QSCORE_BadDefline, "QSbuf_ParseDefline failure : return value is >%d< : probable defline data/format error : defline is >%s<\n", def_stat, my_buf);
971  return;
972  }
973 
974  if (def_acc && *def_acc == '\0') {
975  ErrPostEx(SEV_ERROR, ERR_QSCORE_NoAccession, "Could not parse accession from qscore defline : defline is >%s<\n", my_buf);
976  return;
977  }
978  if (def_ver && *def_ver == '\0') {
979  ErrPostEx(SEV_ERROR, ERR_QSCORE_NoSeqVer, "Could not parse sequence version number from qscore defline : defline is >%s<\n", my_buf);
980  return;
981  }
982  if (def_title && *def_title == '\0') {
983  ErrPostEx(SEV_ERROR, ERR_QSCORE_NoTitle, "Could not parse title from qscore defline : defline is >%s<\n", my_buf);
984  return;
985  }
986  if (def_len != bioseq.GetLength()) {
987  ErrPostEx(SEV_ERROR, ERR_QSCORE_BadLength, "Sequence length from qscore defline does not match bioseq length : %ld (defline) vs %ld (bioseq)", def_len, bioseq.GetLength());
988  return;
989  }
990  if (def_max < def_min || def_min > def_max) {
991  ErrPostEx(SEV_ERROR, ERR_QSCORE_BadMinMax, "Maximum and minimum scores from qscore defline are contradictory : %ld (max) vs %ld (min)", def_max, def_min);
992  return;
993  }
994 
995  /* allocate a SeqGraph and a ByteStore
996  */
997  graph.Reset(new CSeq_graph);
998  graph->SetTitle(def_title);
999  } else {
1000  /* a small number of EMBL records have qscore data that
1001  * is terminated with a double slash; if encountered,
1002  * generate a warning message and then exit the while loop.
1003  */
1004 
1005  if (StringEqu(my_buf, "//")) {
1006  /* ErrPostEx(SEV_WARNING, ERR_QSCORE_DoubleSlash,
1007  "Encountered unusual double-slash terminator in qscore buffer : assuming it flags the end of qscore data.");*/
1008  break;
1009  }
1010 
1011  /* otherwise, this must be a line of quality score data
1012  */
1013  int qs_scores = QSbuf_ParseScores(my_buf, &scores[0], QSBUF_MAXSCORES, &max_score, &min_score, allow_na);
1014  if (qs_scores < 0) {
1015  ErrPostEx(SEV_ERROR, ERR_QSCORE_BadScoreLine, "QSbuf_ParseScores failure : return value is >%ld< : probable score data/format error : score data near >%s<\n", qs_scores, my_buf);
1016  return;
1017  }
1018 
1019  /* write the scores to the ByteStore
1020  */
1021  std::copy(scores, scores + qs_scores, std::back_inserter(scores_str));
1022  total_scores += qs_scores;
1023  }
1024  }
1025 
1026  if (graph.Empty())
1027  return;
1028 
1029  if (total_scores != def_len) {
1030  ErrPostEx(SEV_ERROR, ERR_QSCORE_ScoresVsLen, "number of scores read from qscore buffer does not equal defline sequence length : %ld (scores) vs %ld (defline)", total_scores, def_len);
1031  problem = true;
1032  }
1033  if (total_scores != bioseq.GetLength()) {
1034  ErrPostEx(SEV_ERROR, ERR_QSCORE_ScoresVsBspLen, "number of scores read from qscore buffer does not equal supplied bioseq length : %ld (scores) vs %ld (bioseq)", total_scores, bioseq.GetLength());
1035  problem = true;
1036  }
1037  if (check_minmax) {
1038  if (def_max != max_score) {
1039  ErrPostEx(SEV_ERROR, ERR_QSCORE_BadMax, "maximum score from qscore defline does not equal maximum score value : %ld (defline) vs %ld (scores)", def_max, max_score);
1040  problem = true;
1041  }
1042  if (def_min != min_score) {
1043  ErrPostEx(SEV_ERROR, ERR_QSCORE_BadMin, "minimum score from qscore defline does not equal minimum score value : %ld (defline) vs %ld (scores)", def_min, min_score);
1044  problem = true;
1045  }
1046  }
1047 
1048  /* if a problem has been encountered, free the SeqGraph and return NULL */
1049  if (problem) {
1050  return;
1051  }
1052 
1053  /* get a Seq-interval for the SeqGraph, and duplicate the Seq-id
1054  * of the Bioseq for use in the Seq-interval
1055  */
1056  CSeq_loc& loc = graph->SetLoc();
1057 
1058  /* otherwise, you can now put all the pieces of the Seq-graph together
1059  */
1060  graph->SetNumval(static_cast<TSeqPos>(scores_str.size()));
1061 
1062  /* there is no "compression" for the Seq-graph; there's supposed to
1063  * be a score for every base in the sequence to which the quality
1064  * score buffer applies
1065  */
1066 
1067  /* no scaling of values */
1068 
1069  /* Seq-graph type is "byte" */
1070  graph->SetGraph().SetByte().SetValues().swap(scores_str);
1071 
1072  /* Establish the byte-graph values
1073  */
1074  graph->SetGraph().SetByte().SetMin(min_score);
1075  graph->SetGraph().SetByte().SetMax(max_score);
1076  graph->SetGraph().SetByte().SetAxis(0);
1077 
1078 
1079  /* feature location for the Seq-graph runs from 0
1080  * to the sequence length - 1
1081  */
1082  CSeq_interval& interval = loc.SetInt();
1083  interval.SetFrom(0);
1084  interval.SetTo(bioseq.GetLength() - 1);
1085 
1086  loc.SetId(*(bioseq.GetId().front()));
1087 
1088  graphs.push_back(graph);
1089 }
1090 
1091 /**********************************************************/
1092 // TODO: functionality in this file was never tested
1093 bool QscoreToSeqAnnot(const string& qscore, CBioseq& bioseq, char* acc, Int2 ver, bool check_minmax, bool allow_na)
1094 {
1095  Char charver[100];
1096 
1097  if (qscore.empty() || ver < 1)
1098  return true;
1099 
1100  snprintf(charver, 100, "%d", (int)ver);
1101 
1103  QSbuf_To_Single_Qscore_SeqGraph(qscore.c_str(), bioseq, acc, charver, check_minmax, allow_na, graphs);
1104  if (graphs.empty())
1105  return false;
1106 
1107  if (bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_delta) {
1108  Split_Qscore_SeqGraph_By_DeltaSeq(graphs, bioseq);
1109  }
1110 
1111  if (graphs.empty())
1112  return false;
1113 
1114  CRef<CSeq_annot> annot(new CSeq_annot);
1115  annot->SetData().SetGraph().swap(graphs);
1116  annot->SetNameDesc("Graphs");
1117 
1118  bioseq.SetAnnot().push_front(annot);
1119 
1120  return true;
1121 }
1122 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
#define ERR_QSCORE_NoTitle
Definition: flat2err.h:566
#define ERR_QSCORE_BadMinMax
Definition: flat2err.h:568
#define ERR_QSCORE_BadBioseqLen
Definition: flat2err.h:560
#define ERR_QSCORE_MissingByteStore
Definition: flat2err.h:552
#define ERR_QSCORE_UnknownDelta
Definition: flat2err.h:554
#define ERR_QSCORE_BadQscoreRead
Definition: flat2err.h:562
#define ERR_QSCORE_NoSeqVer
Definition: flat2err.h:565
#define ERR_QSCORE_MemAlloc
Definition: flat2err.h:557
#define ERR_QSCORE_NonLiteralDelta
Definition: flat2err.h:553
#define ERR_QSCORE_ZeroLengthLiteral
Definition: flat2err.h:556
#define ERR_QSCORE_BadLength
Definition: flat2err.h:567
#define ERR_QSCORE_BadMax
Definition: flat2err.h:572
#define ERR_QSCORE_NonZeroInGap
Definition: flat2err.h:558
#define ERR_QSCORE_NonByteGraph
Definition: flat2err.h:576
#define ERR_QSCORE_BadBioseqId
Definition: flat2err.h:561
#define ERR_QSCORE_InvalidArgs
Definition: flat2err.h:559
#define ERR_QSCORE_BadDefline
Definition: flat2err.h:563
#define ERR_QSCORE_ScoresVsLen
Definition: flat2err.h:570
#define ERR_QSCORE_NoAccession
Definition: flat2err.h:564
#define ERR_QSCORE_BadTitle
Definition: flat2err.h:574
#define ERR_QSCORE_BadScoreLine
Definition: flat2err.h:569
#define ERR_QSCORE_OutOfScores
Definition: flat2err.h:575
#define ERR_QSCORE_ScoresVsBspLen
Definition: flat2err.h:571
#define ERR_QSCORE_BadMin
Definition: flat2err.h:573
USING_SCOPE(objects)
#define QS_MAX_VALID_SCORE
Definition: fta_qscore.cpp:79
#define QS_MIN_VALID_SCORE
Definition: fta_qscore.cpp:78
static int QSbuf_ParseDefline(char *qs_defline, char *def_acc, char *def_ver, char *def_title, unsigned int *def_len, unsigned char *def_max, unsigned char *def_min)
Definition: fta_qscore.cpp:183
#define QSBUF_MAXSCORES
Definition: fta_qscore.cpp:77
static bool QSbuf_ReadLine(const char *qs_buf, char *dest_buf, Int2 max_len, int *line)
Definition: fta_qscore.cpp:97
#define QSBUF_MAXTITLE
Definition: fta_qscore.cpp:76
static void QSbuf_To_Single_Qscore_SeqGraph(const char *qs_buf, CBioseq &bioseq, char *def_acc, char *def_ver, bool check_minmax, bool allow_na, CSeq_annot::C_Data::TGraph &graphs)
Definition: fta_qscore.cpp:855
static Int4 QSbuf_ParseScores(char *score_buf, unsigned char *scores, Int4 max_toks, unsigned char *max_score, unsigned char *min_score, bool allow_na)
Definition: fta_qscore.cpp:465
static void Split_Qscore_SeqGraph_By_DeltaSeq(CSeq_annot::C_Data::TGraph &graphs, CBioseq &bioseq)
Definition: fta_qscore.cpp:613
#define QSBUF_MAXLINE
Definition: fta_qscore.cpp:74
bool QscoreToSeqAnnot(const string &qscore, CBioseq &bioseq, char *acc, Int2 ver, bool check_minmax, bool allow_na)
#define IS_DIGIT_OR_NA(c)
Definition: fta_qscore.cpp:72
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:131
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:111
void StringCpy(char *d, const char *s)
Definition: ftacpp.hpp:89
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
bool StringHasNoText(const char *s)
Definition: ftacpp.hpp:137
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
Definition: Seq_loc.cpp:3474
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CTempString literal(const char(&str)[Size])
Templatized initialization from a string literal.
Definition: tempstr.hpp:441
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
void SetTo(TTo value)
Assign a value to To data member.
void SetId(TId &value)
Assign a value to Id data member.
void SetFrom(TFrom value)
Assign a value to From data member.
void SetTitle(const TTitle &value)
Assign a value to Title data member.
Definition: Seq_graph_.hpp:784
vector< char > TValues
Definition: Byte_graph_.hpp:89
void SetNumval(TNumval value)
Assign a value to Numval data member.
const TGraph & GetGraph(void) const
Get the Graph member data.
const TTitle & GetTitle(void) const
Get the Title member data.
Definition: Seq_graph_.hpp:775
const TByte & GetByte(void) const
Get the variant data.
Definition: Seq_graph_.cpp:153
void SetGraph(TGraph &value)
Assign a value to Graph data member.
Definition: Seq_graph_.cpp:250
const TValues & GetValues(void) const
Get the Values member data.
bool IsByte(void) const
Check if variant Byte is selected.
Definition: Seq_graph_.hpp:757
void SetLoc(TLoc &value)
Assign a value to Loc data member.
Definition: Seq_graph_.cpp:224
bool IsSetTitle(void) const
Check if a value has been assigned to Title data member.
Definition: Seq_graph_.hpp:763
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
list< CRef< CSeq_graph > > TGraph
Definition: Seq_annot_.hpp:195
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
bool IsSetExt(void) const
extensions for special types Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
bool IsSetId(void) const
equivalent identifiers Check if a value has been assigned to Id data member.
Definition: Bioseq_.hpp:278
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
int i
static void byte(MDB_val *v)
Definition: mdb_dump.c:81
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int isalnum(Uchar c)
Definition: ncbictype.hpp:62
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
T max(T x_, T y_)
T min(T x_, T y_)
Int4 delta(size_t dimension_, const Int4 *score_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
Modified on Fri Jun 14 16:48:28 2024 by modify_doxy.py rev. 669887