NCBI C++ ToolKit
alnread.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2  * $Id: alnread.c 86561 2019-05-23 18:35:07Z lavr $
3  *
4  * ===========================================================================
5  *
6  * PUBLIC DOMAIN NOTICE
7  * National Center for Biotechnology Information
8  *
9  * This software/database is a "United States Government Work" under the
10  * terms of the United States Copyright Act. It was written as part of
11  * the author's official duties as a United States Government employee and
12  * thus cannot be copyrighted. This software/database is freely available
13  * to the public for use. The National Library of Medicine and the U.S.
14  * Government have not placed any restriction on its use or reproduction.
15  *
16  * Although all reasonable efforts have been taken to ensure the accuracy
17  * and reliability of the software and data, the NLM and the U.S.
18  * Government do not and cannot warrant the performance or results that
19  * may be obtained by using this software or data. The NLM and the U.S.
20  * Government disclaim all warranties, express or implied, including
21  * warranties of performance, merchantability or fitness for any particular
22  * purpose.
23  *
24  * Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  * Authors: Colleen Bollin
29  *
30  */
31 
32 #include <util/creaders/alnread.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <ctype.h>
37 
38 #ifdef _MSC_VER
39 #define strdup _strdup
40 #endif
41 
42 static const size_t kMaxPrintedIntLen = 10;
43 #define kMaxPrintedIntLenPlusOne 11
44 
45 
46 /* ---------------------------------------------------------------------- */
47 typedef enum {
53 } EAlignFormat;
54 
55 /* ---------------------------------------------------------------------- */
56 
57 /* structures used internally */
58 typedef struct SLineInfo {
59  char * data;
60  int line_num;
63  struct SLineInfo * next;
65 
66 typedef struct SLineInfoReader {
69  char * curr_line_pos;
70  int data_pos;
72 
73 typedef struct SIntLink {
74  int ival;
75  struct SIntLink * next;
77 
78 typedef struct SStringCount {
79  char * string;
82  struct SStringCount * next;
84 
85 typedef struct SSizeInfo {
88  struct SSizeInfo * next;
90 
91 typedef struct SLengthList {
94  struct SLengthList * next;
96 
97 typedef struct SCommentLoc {
98  char * start;
99  char * end;
100  struct SCommentLoc * next;
102 
103 typedef struct SBracketedCommentList
104 {
108 
109 typedef struct SAlignRawSeq {
110  char * id;
113  struct SAlignRawSeq * next;
115 
116 typedef struct SAlignFileRaw {
128  char * alphabet;
134 
135 /* Function declarations
136  */
137 static EBool s_AfrpInitLineData(
138  SAlignRawFilePtr afrp, FReadLineFunction readfunc, void* pfile);
139 static void s_AfrpProcessFastaGap(
140  SAlignRawFilePtr afrp, SLengthListPtr * patterns, EBool * last_line_was_marked_id, char* plinestr, int overall_line_count);
141 
142 /* These functions are used for storing and transmitting information
143  * about errors encountered while reading the alignment data.
144  */
145 
146 /* This function allocates memory for a new error structure and populates
147  * the structure with default values.
148  * The new structure will be added to the end of the linked list of error
149  * structures pointed to by list.
150  */
152 {
153  TErrorInfoPtr eip, last;
154 
155  eip = (TErrorInfoPtr) malloc ( sizeof (SErrorInfo));
156  if (eip == NULL) {
157  return NULL;
158  }
159  eip->category = eAlnErr_Unknown;
160  eip->line_num = -1;
161  eip->id = NULL;
162  eip->message = NULL;
163  eip->next = NULL;
164  last = list;
165  while (last != NULL && last->next != NULL) {
166  last = last->next;
167  }
168  if (last != NULL) {
169  last->next = eip;
170  }
171  return eip;
172 }
173 
174 /* This function recursively frees the memory associated with a list of
175  * error structures as well as the member variables of the error structures.
176  */
177 extern void ErrorInfoFree (TErrorInfoPtr eip)
178 {
179  if (eip == NULL) {
180  return;
181  }
182  ErrorInfoFree (eip->next);
183  free (eip->id);
184  free (eip->message);
185  free (eip);
186 }
187 
188 /* This function creates and sends an error message regarding a NEXUS comment
189  * character.
190  */
191 static void
193 (const char * expected,
194  char seen,
195  const char * val_name,
196  FReportErrorFunction errfunc,
197  void * errdata)
198 {
199  TErrorInfoPtr eip;
200  const char * errformat = "Specified %s character does not match NEXUS"
201  " comment in file (specified %s, comment %c)";
202 
203  if (errfunc == NULL || val_name == NULL || expected == NULL) {
204  return;
205  }
206 
207  eip = ErrorInfoNew (NULL);
208  if (eip != NULL) {
210  eip->message = (char *) malloc (strlen (errformat) + strlen (val_name)
211  + strlen (expected) + 2);
212  if (eip->message != NULL) {
213  sprintf (eip->message, errformat, val_name, expected, seen);
214  }
215  errfunc (eip, errdata);
216  }
217 }
218 
219 
220 /* This function creates and sends an error message regarding a character
221  * that is unexpected in sequence data.
222  */
223 static void
225 (char * id,
226  char bad_char,
227  int num_bad,
228  int offset,
229  int line_number,
230  const char * reason,
231  FReportErrorFunction errfunc,
232  void * errdata)
233 {
234  TErrorInfoPtr eip;
235  const char * err_format =
236  "%d bad characters (%c) found at position %d (%s).";
237 
238  if (errfunc == NULL || num_bad == 0 || bad_char == 0
239  || reason == NULL) {
240  return;
241  }
242 
243  eip = ErrorInfoNew (NULL);
244  if (eip == NULL) {
245  return;
246  }
247 
248  eip->category = eAlnErr_BadData;
249  if (id != NULL) eip->id = strdup (id);
250  eip->line_num = line_number;
251  eip->message = (char*) malloc (strlen (err_format) + 2 * kMaxPrintedIntLen
252  + strlen (reason) + 3);
253  if (eip->message != NULL)
254  {
255  sprintf (eip->message, err_format, num_bad, bad_char, offset, reason);
256  }
257  errfunc (eip, errdata);
258 }
259 
260 
261 /* This function creates and sends an error message regarding an ID that
262  * was found in the wrong location.
263  */
264 static void
266 (char * id,
267  int line_number,
268  FReportErrorFunction report_error,
269  void * report_error_userdata)
270 {
271  TErrorInfoPtr eip;
272 
273  if (report_error == NULL) {
274  return;
275  }
276  eip = ErrorInfoNew (NULL);
277  if (eip == NULL) {
278  return;
279  }
281  eip->id = strdup (id);
282  eip->line_num = line_number;
283  eip->message = strdup ("Found unexpected ID");
284  report_error (eip, report_error_userdata);
285 }
286 
287 
288 /* This function creates and sends an error message regarding a line
289  * of sequence data whose formatting is inconsistent with its block.
290  */
291 static void
293 (char * id,
294  int line_number,
295  FReportErrorFunction report_error,
296  void * report_error_userdata)
297 {
298  TErrorInfoPtr eip;
299 
300  if (report_error == NULL) {
301  return;
302  }
303  eip = ErrorInfoNew (NULL);
304  if (eip == NULL) {
305  return;
306  }
308  eip->id = strdup (id);
309  eip->line_num = line_number;
310  eip->message = strdup ("Inconsistent block line formatting");
311  report_error (eip, report_error_userdata);
312 }
313 
314 
315 /* This function creates and sends an error message regarding a line of
316  * sequence data that was expected to be a different length.
317  */
/* Builds an error record for a line whose length differs from
 * expected_length and hands it to report_error together with
 * report_error_userdata.  Nothing is reported when lip or report_error
 * is NULL, or when the error record cannot be allocated.
 * NOTE(review): the embedded listing numbers jump 318->320 and 338->340,
 * so the function-name line and one statement appear to be missing from
 * this copy of the file.
 */
318 static void
320 (char * id,
321  TLineInfoPtr lip,
322  int expected_length,
323  FReportErrorFunction report_error,
324  void * report_error_userdata)
325 {
326  TErrorInfoPtr eip;
327  char * msg;
328  const char * format = "Expected line length %d, actual length %d";
329  size_t len;
330 
331  if (lip == NULL || report_error == NULL) {
332  return;
333  }
334 
335  eip = ErrorInfoNew (NULL);
336  if (eip == NULL) {
337  return;
338  }
/* NOTE(review): id is passed to strdup without a NULL check; another
 * reporter in this file guards the copy with "if (id != NULL)".  Confirm
 * that callers never pass NULL here. */
340  eip->id = strdup(id);
341  eip->line_num = lip->line_num;
/* NOTE(review): room is reserved for only one printed integer, although
 * format contains two %d conversions; the other two-integer reporters in
 * this file reserve 2 * kMaxPrintedIntLen.  Two long values could exceed
 * this buffer. */
342  msg = (char*)malloc(strlen(format) + kMaxPrintedIntLen + 1);
343  if (msg != NULL) {
/* A line without data is reported as having length 0. */
344  if (lip->data == NULL) {
345  len = 0;
346  } else {
347  len = strlen(lip->data);
348  }
/* NOTE(review): len is a size_t but is formatted with %d; the conversion
 * specifier does not match the argument type. */
349  sprintf(msg, format, expected_length, len);
350  eip->message = msg;
351  }
/* The record is reported even when the message allocation failed, in
 * which case eip->message is whatever ErrorInfoNew left it as. */
352  report_error(eip, report_error_userdata);
353 }
354 
355 
356 /* This function creates and sends an error message regarding a block of
357  * sequence data that was expected to contain more lines.
358  */
359 static void
361 (const char * id,
362  int line_num,
363  int expected_num,
364  int actual_num,
365  FReportErrorFunction report_error,
366  void * report_error_userdata)
367 {
368  TErrorInfoPtr eip;
369  const char * err_format = "Expected %d lines in block, found %d";
370 
371  if (report_error == NULL) {
372  return;
373  }
374 
375  eip = ErrorInfoNew (NULL);
376  if (eip == NULL) {
377  return;
378  }
380  eip->id = strdup (id);
381  eip->line_num = line_num;
382  eip->message = (char*)malloc(strlen(err_format) + 2 * kMaxPrintedIntLen + 1);
383  if (eip->message != NULL) {
384  sprintf (eip->message, err_format, expected_num, actual_num);
385  }
386  report_error (eip, report_error_userdata);
387 }
388 
389 
390 /* This function creates and sends an error message regarding a block of
391  * sequence data that contains duplicate IDs.
392  */
393 static void
395 (char * id,
396  int line_num,
397  FReportErrorFunction report_error,
398  void * report_error_userdata)
399 {
400  TErrorInfoPtr eip;
401  const char * err_format = "Duplicate ID! Sequences will be concatenated!";
402 
403  if (report_error == NULL) {
404  return;
405  }
406 
407  eip = ErrorInfoNew (NULL);
408  if (eip == NULL) {
409  return;
410  }
411  eip->category = eAlnErr_BadData;
412  eip->id = strdup (id);
413  eip->line_num = line_num;
414  eip->message = strdup (err_format);
415  report_error (eip, report_error_userdata);
416 }
417 
418 
419 /* This function creates and sends an error message regarding missing
420  * sequence data.
421  */
422 static void
424 (char * id,
425  FReportErrorFunction report_error,
426  void * report_error_userdata)
427 {
428  TErrorInfoPtr eip;
429 
430  if (report_error == NULL) {
431  return;
432  }
433  eip = ErrorInfoNew (NULL);
434  if (eip == NULL) {
435  return;
436  }
437  eip->category = eAlnErr_Fatal;
438  eip->id = strdup (id);
439  eip->message = strdup ("No data found");
440  report_error (eip, report_error_userdata);
441 }
442 
443 
444 /* This function creates and sends an error message indicating that the
445  * most common length of the sequences in the file does not match a comment
446  * found in the file.
447  */
448 static void
450 (char * id,
451  int expected_length,
452  int actual_length,
453  FReportErrorFunction report_error,
454  void * report_error_userdata)
455 {
456  TErrorInfoPtr eip;
457  const char * format_str = "Expected sequence length %d, actual length %d";
458 
459  if (report_error == NULL) {
460  return;
461  }
462  eip = ErrorInfoNew (NULL);
463  if (eip == NULL) {
464  return;
465  }
467  eip->id = strdup (id);
468  eip->message = (char *)malloc (strlen (format_str) + 50);
469  if (eip->message != NULL) {
470  sprintf (eip->message, format_str, expected_length, actual_length);
471  }
472  report_error (eip, report_error_userdata);
473 }
474 
475 
476 /* This function creates and sends an error message indicating that the
477  * number of sequences read does not match a comment in the alignment file.
478  */
479 static void
481 (int num_expected,
482  int num_found,
483  FReportErrorFunction report_error,
484  void * report_error_userdata)
485 {
486  TErrorInfoPtr eip;
487  const char * err_format = "Expected %d sequences, found %d";
488 
489  if (report_error == NULL) {
490  return;
491  }
492  eip = ErrorInfoNew (NULL);
493  if (eip == NULL) {
494  return;
495  }
497  eip->message = (char*)malloc(strlen(err_format) + 2 * kMaxPrintedIntLen + 1);
498 
499  if (eip->message != NULL)
500  {
501  sprintf (eip->message, err_format, num_expected, num_found);
502  }
503  report_error (eip, report_error_userdata);
504 }
505 
506 
507 static void
509 (int len_expected,
510  int len_found,
511  FReportErrorFunction report_error,
512  void * report_error_userdata)
513 {
514  TErrorInfoPtr eip;
515  const char * err_format = "Expected sequences of length %d, found %d";
516 
517  if (report_error == NULL) {
518  return;
519  }
520  eip = ErrorInfoNew (NULL);
521  if (eip == NULL) {
522  return;
523  }
524 
526  eip->message = (char*)malloc(strlen(err_format) + 2 * kMaxPrintedIntLen + 1);
527  if (eip->message != NULL) {
528  sprintf (eip->message, err_format, len_expected, len_found);
529  }
530  report_error (eip, report_error_userdata);
531 }
532 
533 
534 /* This function creates and sends an error message indicating that some or
535  * all of the organism information for the sequences is missing.
536  */
537 static void
539 (FReportErrorFunction report_error,
540  void * report_error_userdata)
541 {
542  TErrorInfoPtr eip;
543 
544  if (report_error == NULL) {
545  return;
546  }
547  eip = ErrorInfoNew (NULL);
548  if (eip == NULL) {
549  return;
550  }
551 
552  eip->category = eAlnErr_BadData;
553  eip->message = strdup ("Missing organism information");
554  report_error (eip, report_error_userdata);
555 }
556 
557 
558 /* This function creates and sends an error message regarding an ID that is
559  * used for more than one sequence.
560  */
561 static void
563 (TStringCountPtr scp,
564  FReportErrorFunction report_error,
565  void * report_error_userdata)
566 {
567  TErrorInfoPtr eip;
568  const char * err_format = "ID %s appears in the following locations:";
569  char * cp;
570  TIntLinkPtr line_number;
571 
572  if (report_error == NULL || scp == NULL || scp->string == NULL) {
573  return;
574  }
575 
576  eip = ErrorInfoNew(NULL);
577  if (eip == NULL) {
578  return;
579  }
580 
581  eip->category = eAlnErr_BadData;
582  eip->id = strdup(scp->string);
583  if (scp->line_numbers != NULL) {
584  eip->line_num = scp->line_numbers->ival;
585  }
586  eip->message = (char*)malloc(strlen(err_format)
587  + strlen(scp->string)
588  + (size_t)scp->num_appearances * 15
589  + 1);
590  if (eip->message != NULL) {
591  sprintf(eip->message, err_format, scp->string);
592  cp = eip->message + strlen (eip->message);
593  for (line_number = scp->line_numbers;
594  line_number != NULL;
595  line_number = line_number->next) {
596  sprintf(cp, " %d", line_number->ival);
597  cp += strlen (cp);
598  }
599  }
600  report_error(eip, report_error_userdata);
601 }
602 
603 
604 /* This function creates and sends an error message indicating that the file
605  * being read is an ASN.1 file.
606  */
607 static void
609 (FReportErrorFunction errfunc,
610  void * errdata)
611 {
612  TErrorInfoPtr eip;
613  const char * msg = "This is an ASN.1 file, "
614  "which cannot be read by this function.";
615 
616  if (errfunc == NULL) {
617  return;
618  }
619 
620  eip = ErrorInfoNew (NULL);
621  if (eip != NULL) {
622  eip->category = eAlnErr_BadData;
623  eip->message = (char *) malloc (strlen (msg) + 1);
624  if (eip->message != NULL) {
625  sprintf (eip->message, "%s", msg);
626  }
627  errfunc (eip, errdata);
628  }
629 }
630 
631 
632 /* This function reports that some sequences are inside brackets (indicating a segmented set)
633  * and that some sequences are outside the brackets.
634  */
635 static void
637 (TIntLinkPtr offset_list,
638  FReportErrorFunction errfunc,
639  void * errdata)
640 {
641  TErrorInfoPtr eip;
642  const char * msg = "This file contains sequences in brackets (indicating "
643  "a segmented alignment) as well as sequences not in brackets at lines "
644  "%s. Please either add or remove brackets to correct this problem.";
645  size_t num_lines = 0;
646  size_t msg_len = 0;
647  TIntLinkPtr t;
648  char * line_text_list;
649  char * line_text_list_offset;
650 
651  if (errfunc == NULL || offset_list == NULL) {
652  return;
653  }
654  for (t = offset_list; t != NULL; t = t->next) {
655  ++num_lines;
656  }
657  msg_len = num_lines * (kMaxPrintedIntLen + 2);
658  if (num_lines > 1) {
659  msg_len += 4;
660  }
661 
662  line_text_list = (char*)malloc(msg_len);
663  if (line_text_list == NULL) return;
664  line_text_list_offset = line_text_list;
665 
666  for (t = offset_list; t != NULL; t = t->next) {
667  if (t->next == NULL)
668  {
669  sprintf (line_text_list_offset, "%d", t->ival);
670  }
671  else if (num_lines == 2)
672  {
673  sprintf (line_text_list_offset, "%d and ", t->ival);
674  }
675  else if (t->next->next == NULL)
676  {
677  sprintf (line_text_list_offset, "%d, and ", t->ival);
678  }
679  else
680  {
681  sprintf (line_text_list_offset, "%d, ", t->ival);
682  }
683  line_text_list_offset += strlen (line_text_list_offset);
684  }
685 
686  msg_len += strlen(msg) + 1;
687 
688  eip = ErrorInfoNew (NULL);
689  if (eip != NULL) {
690  eip->category = eAlnErr_BadData;
691  eip->message = (char *) malloc (msg_len);
692  if (eip->message != NULL) {
693  sprintf(eip->message, msg, line_text_list);
694  }
695  errfunc(eip, errdata);
696  }
697  free(line_text_list);
698 }
699 
700 
701 /* This function reports an error if a line looks like it might contain an organism comment
702  * but is somehow improperly formatted
703  */
705 (char * linestring,
706  FReportErrorFunction errfunc,
707  void * errdata)
708 {
709  TErrorInfoPtr eip;
710  const char * msg = "This line may contain an improperly formatted organism description.\n"
711  "Organism descriptions should be of the form [org=tax name] or [organism=tax name].\n";
712 
713  if (errfunc == NULL || linestring == NULL) {
714  return;
715  }
716 
717  eip = ErrorInfoNew (NULL);
718  if (eip != NULL) {
719  eip->category = eAlnErr_BadData;
720  eip->message = (char *) malloc (strlen (msg) + strlen (linestring) + 1);
721  if (eip->message != NULL) {
722  strcpy (eip->message, msg);
723  strcat (eip->message, linestring);
724  }
725  errfunc (eip, errdata);
726  }
727 }
728 
729 
730 /* This function reports that the number of segments in an alignment of
731  * segmented sets is inconsistent.
732  */
734 (int line_num,
735  int num_seg,
736  int num_seg_exp,
737  FReportErrorFunction errfunc,
738  void * errdata)
739 {
740  TErrorInfoPtr eip;
741  const char * msg = "This segmented set contains a different number of segments (%d) than expected (%d).\n";
742 
743  if (errfunc == NULL) {
744  return;
745  }
746 
747  eip = ErrorInfoNew (NULL);
748  if (eip != NULL) {
749  eip->line_num = line_num;
750  eip->category = eAlnErr_BadData;
751  eip->message = (char*) malloc(strlen(msg) + 2 * kMaxPrintedIntLen + 1);
752  if (eip->message != NULL) {
753  sprintf (eip->message, msg, num_seg, num_seg_exp);
754  }
755  errfunc (eip, errdata);
756  }
757 }
758 
759 
760 /* This function allocates memory for a SSequenceInfo structure and
761  * initializes the member variables. It returns a pointer to the newly
762  * allocated memory.
763  */
765 {
766  TSequenceInfoPtr sip;
767 
768  sip = (TSequenceInfoPtr) malloc (sizeof (SSequenceInfo));
769  if (sip == NULL) {
770  return NULL;
771  }
772  sip->missing = strdup ("?");
773  sip->beginning_gap = strdup (".");
774  sip->middle_gap = strdup ("-");
775  sip->end_gap = strdup (".");
776  sip->match = strdup (".");
777  sip->alphabet = NULL;
778  return sip;
779 }
780 
781 
782 /* This function frees memory associated with the member variables of
783  * the SSequenceInfo structure and with the structure itself.
784  */
786 {
787  if (sip == NULL) {
788  return;
789  }
790  free (sip->missing);
791  free (sip->beginning_gap);
792  free (sip->middle_gap);
793  free (sip->end_gap);
794  free (sip->match);
795  sip->alphabet = NULL;
796  free (sip);
797 }
798 
799 
800 /* This function creates and sends an error message regarding an unused line.
801  */
/* Sends one message through errfunc saying that the line (or range of
 * lines) numbered line_num_start through line_num_stop could not be
 * assigned to an interleaved block, then sends one further message per
 * line carrying that line's text, taken from the list starting at
 * line_val.  Nothing is reported when errfunc or line_val is NULL.
 * NOTE(review): the embedded listing numbers jump 802->804, 821->823 and
 * 846->848, so the function-name line and two statements appear to be
 * missing from this copy of the file.
 */
802 static void
804 (int line_num_start,
805  int line_num_stop,
806  TLineInfoPtr line_val,
807  FReportErrorFunction errfunc,
808  void * errdata)
809 {
810  TErrorInfoPtr eip;
811  const char * errformat1 = "Line %d could not be assigned to an interleaved block";
812  const char * errformat2 = "Lines %d through %d could not be assigned to an interleaved block";
813  const char * errformat3 = "Contents of unused line: %s";
814  int skip;
815 
816  if (errfunc == NULL || line_val == NULL) {
817  return;
818  }
819 
/* Summary message: singular wording for one line, range wording otherwise. */
820  eip = ErrorInfoNew (NULL);
821  if (eip != NULL) {
823  eip->line_num = line_num_start;
824  if (line_num_start == line_num_stop) {
825  eip->message = (char*)malloc(strlen(errformat1) + kMaxPrintedIntLen + 1);
826  if (eip->message != NULL) {
827  sprintf (eip->message, errformat1, line_num_start);
828  }
829  } else {
830  eip->message = (char*)malloc(strlen(errformat2) + 2*kMaxPrintedIntLen + 1);
831  if (eip->message != NULL) {
832  sprintf (eip->message, errformat2, line_num_start,
833  line_num_stop);
834  }
835  }
836  errfunc (eip, errdata);
837  }
838  /* report contents of unused lines */
/* skip counts line numbers while line_val walks the list in step with it;
 * the loop ends at line_num_stop or at the end of the list. */
839  for (skip = line_num_start;
840  skip < line_num_stop + 1 && line_val != NULL;
841  skip++) {
/* NOTE(review): this continue jumps straight to skip++ without executing
 * "line_val = line_val->next" at the bottom of the loop, so the same
 * data-less node is tested again on every remaining iteration and no
 * later line is reported.  Confirm whether the list should advance here. */
842  if (line_val->data == NULL) {
843  continue;
844  }
845  eip = ErrorInfoNew (NULL);
846  if (eip != NULL) {
848  eip->line_num = skip;
849  eip->message = (char *) malloc (strlen (errformat3)
850  + strlen (line_val->data) + 1);
851  if (eip->message != NULL) {
852  sprintf (eip->message, errformat3, line_val->data);
853  }
854  errfunc (eip, errdata);
855  }
856  line_val = line_val->next;
857  }
858 }
859 
860 
861 /* The following functions are used to manage a linked list of integer
862  * values.
863  */
864 
865 /* This function creates a new SIntLink structure with a value of ival.
866  * The new structure will be placed at the end of list if list is not NULL.
867  * The function will return a pointer to the new structure.
868  */
869 static TIntLinkPtr
871 (int ival,
872  TIntLinkPtr list)
873 {
874  TIntLinkPtr ilp, last;
875 
876  ilp = (TIntLinkPtr) malloc (sizeof (SIntLink));
877  if (ilp == NULL) {
878  return NULL;
879  }
880  ilp->ival = ival;
881  ilp->next = NULL;
882  last = list;
883  while (last != NULL && last->next != NULL) {
884  last = last->next;
885  }
886  if (last != NULL) {
887  last->next = ilp;
888  }
889  return ilp;
890 }
891 
892 
893 /* This function recursively frees memory associated with a linked list
894  * of SIntLink structures.
895  */
896 static void s_IntLinkFree (TIntLinkPtr ilp)
897 {
898  if (ilp == NULL) {
899  return;
900  }
901  s_IntLinkFree (ilp->next);
902  free (ilp);
903 }
904 
905 
906 /* These functions are used to accumulate and retrieve information on
907  * how often a size of data (number of lines or number of characters) occurs.
908  */
909 
910 /* This function allocates space for a new SSizeInfo structure and
911  * initializes its member variables. If list is not NULL, the new structure
912  * is added to the end of the list.
913  * The function returns a pointer to the newly allocated structure.
914  */
916 {
917  TSizeInfoPtr sip, last;
918 
919  sip = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
920  if (sip == NULL) {
921  return NULL;
922  }
923 
924  sip->size_value = 0;
925  sip->num_appearances = 0;
926  sip->next = NULL;
927  last = list;
928  while (last != NULL && last->next != NULL) {
929  last = last->next;
930  }
931  if (last != NULL) {
932  last->next = sip;
933  }
934  return sip;
935 }
936 
937 
938 /* This function recursively frees the memory associated with a linked list
939  * of SSizeInfo structures.
940  */
941 static void s_SizeInfoFree (TSizeInfoPtr list)
942 {
943  if (list == NULL) {
944  return;
945  }
946  s_SizeInfoFree (list->next);
947  list->next = NULL;
948  free (list);
949 }
950 
951 
952 /* This function returns eTrue if the two SSizeInfo structures have
953  * the same size_value and number of appearances, eFalse otherwise.
954  */
955 static EBool
957 (TSizeInfoPtr s1,
958  TSizeInfoPtr s2)
959 {
960  if (s1 == NULL
961  || s2 == NULL
962  || s1->size_value != s2->size_value
963  || s1->num_appearances != s2->num_appearances) {
964  return eFalse;
965  }
966  return eTrue;
967 }
968 
969 
970 /* This function searches list for a SSizeInfo structure with the
971  * same size_value as size_value. If it finds such a structure, it
972  * adds the value of num_appearances to the num_appearances for that
973  * structure, otherwise the function creates a new structure at the end
974  * of the list with the specified values of size_value and num_appearances.
975  * The function returns a pointer to the list of SSizeInfo structures.
976  */
978 (TSizeInfoPtr list,
979  int size_value,
980  int num_appearances)
981 {
982  TSizeInfoPtr p, last;
983 
984  last = NULL;
985  for (p = list; p != NULL && p->size_value != size_value; p = p->next) {
986  last = p;
987  }
988  if (p == NULL) {
989  p = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
990  if (p == NULL) {
991  return NULL;
992  }
993  p->size_value = size_value;
994  p->num_appearances = num_appearances;
995  p->next = 0;
996  if (last == NULL) {
997  list = p;
998  } else {
999  last->next = p;
1000  }
1001  } else {
1002  p->num_appearances += num_appearances;
1003  }
1004  return list;
1005 }
1006 
1007 
1008 /* This function searches list for a SSizeInfo structure with the
1009  * same size_value as size_value. If it finds such a structure, it
1010  * adds one to the num_appearances for that structure, otherwise the
1011  * function creates a new structure at the end of the list with the
1012  * specified size_value and a num_appearances value of 1.
1013  * The function returns a pointer to the list of SSizeInfo structures.
1014  */
1015 static TSizeInfoPtr
1017 (TSizeInfoPtr list,
1018  int size_value)
1019 {
1020  return s_AddSizeInfoAppearances (list, size_value, 1);
1021 }
1022 
1023 
1024 /* This function searches list for the SSizeInfo structure with the
1025  * highest value for num_appearances. If more than one structure exists
1026  * with the highest value for num_appearances, the function chooses the
1027  * value with the highest value for size_value. The function returns a
1028  * pointer to the structure selected based on the above criteria.
1029  */
1031 {
1032  TSizeInfoPtr p, best;
1033 
1034  if (list == NULL) {
1035  return NULL;
1036  }
1037 
1038  best = list;
1039  for (p = list->next; p != NULL; p = p->next) {
1040  if (p->num_appearances > best->num_appearances
1041  || (p->num_appearances == best->num_appearances
1042  && p->size_value > best->size_value)) {
1043  best = p;
1044  }
1045  }
1046  return best;
1047 }
1048 
1049 
1050 /* This function uses s_GetMostPopularSizeInfo function to find the structure
1051  * in list that has the highest value for num_appearances and size_value.
1052  * If such a structure is found and has a num_appearances value greater than
1053  * one, the size_value for that structure will be returned, otherwise the
1054  * function returns 0.
1055  */
1057 {
1058  TSizeInfoPtr best;
1059 
1060  best = s_GetMostPopularSizeInfo (list);
1061  if (best == NULL) {
1062  return 0;
1063  }
1064  if (best->num_appearances > 1) {
1065  return best->size_value;
1066  } else {
1067  return 0;
1068  }
1069 }
1070 
1071 
1072 /* The following functions are used to keep track of patterns of line or
1073  * token lengths, which will be used to identify errors in formatting.
1074  */
1076 {
1077  SLengthListPtr llp, last;
1078 
1079  llp = (SLengthListPtr) malloc (sizeof (SLengthListData));
1080  if (llp == NULL) {
1081  return NULL;
1082  }
1083 
1084  llp->lengthrepeats = NULL;
1085  llp->num_appearances = 0;
1086  llp->next = NULL;
1087 
1088  last = list;
1089  while (last != NULL && last->next != NULL) {
1090  last = last->next;
1091  }
1092  if (last != NULL) {
1093  last->next = llp;
1094  }
1095  return llp;
1096 }
1097 
1098 
1099 /* This function recursively frees memory for a list of SLengthListData
1100  * structures and its member variables.
1101  */
1103 {
1104  if (llp == NULL) {
1105  return;
1106  }
1107  s_LengthListFree (llp->next);
1109  free (llp);
1110 }
1111 
1112 
1113 /* This function examines the last SSizeInfo structure in the
1114  * lengthrepeats member variable of llp. If the last structure
1115  * in the list has the same size_value value as the function argument
1116  * size_value, the value of num_appearances for that SSizeInfo structure
1117  * will be incremented. Otherwise a new SSizeInfo structure will be
1118  * appended to the end of the lengthrepeats list with the specified
1119  * size_value and a num_appearances value of 1.
1120  */
1121 static void
1123 (SLengthListPtr llp,
1124  int size_value)
1125 {
1126  TSizeInfoPtr p, last;
1127 
1128  if (llp == NULL) {
1129  return;
1130  }
1131 
1132  last = NULL;
1133  for (p = llp->lengthrepeats; p != NULL; p = p->next) {
1134  last = p;
1135  }
1136  if (last == NULL || last->size_value != size_value) {
1137  p = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
1138  if (p == NULL) {
1139  return;
1140  }
1141  p->size_value = size_value;
1142  p->num_appearances = 1;
1143  p->next = 0;
1144  if (last == NULL) {
1145  llp->lengthrepeats = p;
1146  } else {
1147  last->next = p;
1148  }
1149  } else {
1150  last->num_appearances ++;
1151  }
1152 }
1153 
1154 
1155 /* This function examines whether two SLengthListData structures "match" -
1156  * the structures match if each SSizeInfo structure in llp1->lengthrepeats
1157  * has the same size_value and num_appearances values as the SSizeInfo
1158  * structure in the corresponding list position in llp2->lengthrepeats.
1159  * If the two structures match, the function returns eTrue, otherwise the
1160  * function returns eFalse.
1161  */
1162 static EBool
1164 (SLengthListPtr llp1,
1165  SLengthListPtr llp2)
1166 {
1167  TSizeInfoPtr sip1, sip2;
1168 
1169  if (llp1 == NULL || llp2 == NULL
1170  || llp1->lengthrepeats == NULL
1171  || llp2->lengthrepeats == NULL) {
1172  return eFalse;
1173  }
1174  for (sip1 = llp1->lengthrepeats, sip2 = llp2->lengthrepeats;
1175  sip1 != NULL && sip2 != NULL;
1176  sip1 = sip1->next, sip2 = sip2->next) {
1177  if ( ! s_SizeInfoIsEqual (sip1, sip2)
1178  || (sip1->next == NULL && sip2->next != NULL)
1179  || (sip1->next != NULL && sip2->next == NULL)) {
1180  return eFalse;
1181  }
1182  }
1183  return eTrue;
1184 }
1185 
1186 
1187 /* This function examines a list of SLengthListData structures to see if
1188  * one of them matches llp. If so, the value of num_appearances in that
1189  * list is incremented by one and llp is freed, otherwise llp is added
1190  * to the end of the list.
1191  * The function returns a pointer to the list of SLengthListData structures.
1192  */
1193 static SLengthListPtr
1195 (SLengthListPtr list,
1196  SLengthListPtr llp)
1197 {
1198  SLengthListPtr prev_llp;
1199 
1200  if (list == NULL) {
1201  list = llp;
1202  } else {
1203  prev_llp = list;
1204  while ( prev_llp->next && ! s_DoLengthPatternsMatch (prev_llp, llp)) {
1205  prev_llp = prev_llp->next;
1206  }
1207  if (s_DoLengthPatternsMatch (prev_llp, llp)) {
1208  prev_llp->num_appearances ++;
1209  s_LengthListFree (llp);
1210  } else {
1211  prev_llp->next = llp;
1212  }
1213  }
1214  return list;
1215 }
1216 
1217 
1218 /* This set of functions is used for storing and analyzing individual lines
1219  * or tokens from an alignment file.
1220  */
1221 
1222 /* This function allocates memory for a new SLineInfo structure and
1223  * initializes the structure with a saved copy of string, a line number of
1224  * line_num + 1, and the specified value of line_offset.
1225  * The function returns a pointer to the new SLineInfo structure.
1226  */
1227 static TLineInfoPtr
1229 (const char * string,
1230  int line_num,
1231  int line_offset)
1232 {
1233  TLineInfoPtr lip;
1234 
1235  lip = (TLineInfoPtr) malloc (sizeof (SLineInfo));
1236  if (lip == NULL) {
1237  return NULL;
1238  }
1239  lip->data = strdup (string);
1240  lip->line_num = line_num + 1;
1241  lip->line_offset = line_offset;
1242  lip->delete_me = eFalse;
1243  lip->next = NULL;
1244  return lip;
1245 }
1246 
1247 
1248 /* This function recursively frees the memory associated with the structures
1249  * and members of the structures in a linked list of SLineInfo structures.
1250  */
1251 static void s_LineInfoFree (TLineInfoPtr lip)
1252 {
1253  TLineInfoPtr next_lip;
1254  if (lip == NULL) {
1255  return;
1256  }
1257  while (lip != NULL) {
1258  next_lip = lip->next;
1259  lip->next = NULL;
1260  free (lip->data);
1261  free (lip);
1262  lip = next_lip;
1263  }
1264 }
1265 
1266 
1267 /* This function deletes from a linked list of SLineInfo structures
1268  * those structures for which the delete_me flag has been set. The function
1269  * returns a pointer to the beginning of the new list.
1270  */
1272 {
1274  TLineInfoPtr lip, nextlip;
1275 
1276  lip = list;
1277  while (lip != NULL) {
1278  nextlip = lip->next;
1279  if (lip->delete_me) {
1280  if (prev != NULL) {
1281  prev->next = lip->next;
1282  } else {
1283  list = lip->next;
1284  }
1285  lip->next = NULL;
1286  s_LineInfoFree (lip);
1287  } else {
1288  prev = lip;
1289  }
1290  lip = nextlip;
1291  }
1292  return list;
1293 }
1294 
1295 
1296 /* This function creates a new SLineInfo structure, populates it with
1297  * a copy of string and the specified line_num and line_offset values,
1298  * and appends it to the end of "list" if list is not NULL.
1299  * The function will return a pointer to the newly created structure
1300  * if list is NULL, otherwise the function will return list.
1301  */
1302 static TLineInfoPtr
1304 (TLineInfoPtr list,
1305  const char * string,
1306  int line_num,
1307  int line_offset)
1308 {
1309  TLineInfoPtr lip, p;
1310 
1311  if (string == NULL) {
1312  return list;
1313  }
1314  lip = s_LineInfoNew (string, line_num, line_offset);
1315  if (lip == NULL) {
1316  return NULL;
1317  }
1318  if (list == NULL) {
1319  list = lip;
1320  } else {
1321  p = list;
1322  while (p != NULL && p->next != NULL) {
1323  p = p->next;
1324  }
1325  p->next = lip;
1326  }
1327  return list;
1328 }
1329 
1330 /* This function creates a new bracketed comment */
1333  const char * string,
1334  int line_num,
1335  int line_offset)
1336 {
1337  TBracketedCommentListPtr comment;
1338 
1339  comment = (TBracketedCommentListPtr) malloc (sizeof (SBracketedCommentList));
1340  if (comment == NULL) {
1341  return NULL;
1342  }
1343  comment->comment_lines = s_LineInfoNew (string, line_num, line_offset);
1344  comment->next = NULL;
1345 
1346  if (list != NULL) {
1347  while (list->next != NULL) {
1348  list = list->next;
1349  }
1350  list->next = comment;
1351  }
1352 
1353  return comment;
1354 }
1355 
1356 /* This function frees a bracketed comment list. */
1358 {
1359  if (list == NULL) {
1360  return;
1361  }
1363  list->next = NULL;
1364  s_LineInfoFree (list->comment_lines);
1365 }
1366 
1367 /* This function adds a line to a bracketed comment. */
1369 (TBracketedCommentListPtr comment,
1370  const char * string,
1371  int line_num,
1372  int line_offset)
1373 {
1374  if (comment == NULL) {
1375  return;
1376  }
1377 
1378  comment->comment_lines = s_AddLineInfo(comment->comment_lines, string, line_num, line_offset);
1379 }
1380 
1381 /* This function counts the sequences found in a bracketed comment. */
1383 {
1384  TLineInfoPtr lip;
1385  int num_segments = 0;
1386  EBool skipped_line_since_last_defline = eTrue;
1387 
1388  if (comment == NULL || comment->comment_lines == NULL) {
1389  return 0;
1390  }
1391 
1392  lip = comment->comment_lines;
1393  /* First line must be left bracket on a line by itself */
1394  if (lip->data[0] != '[' || strspn(lip->data + 1, " \t\r\n") != strlen (lip->data + 1))
1395  {
1396  return 0;
1397  }
1398  lip = lip->next;
1399  while (lip != NULL && lip->next != NULL)
1400  {
1401  if (lip->data[0] == '>')
1402  {
1403  if (!skipped_line_since_last_defline)
1404  {
1405  return 0;
1406  }
1407  else
1408  {
1409  ++num_segments;
1410  skipped_line_since_last_defline = eFalse;
1411  }
1412  }
1413  else
1414  {
1415  skipped_line_since_last_defline = eTrue;
1416  }
1417  lip = lip->next;
1418  }
1419  /* Last line must be right bracket on a line by itself */
1420  /* First line must be left bracket on a line by itself */
1421  if (lip != NULL &&
1422  lip->data != NULL &&
1423  (lip->data[0] != ']' || strspn (lip->data + 1, " \t\r\n") != strlen (lip->data + 1)))
1424  {
1425  return 0;
1426  }
1427 
1428  return num_segments;
1429 }
1430 
1431 /* This function counts the number of sequences that appear in
1432  * bracketed comments. If the number of sequences is inconsistent,
1433  * the function will issue error messages and return a 1, otherwise
1434  * the function will return the number of sequences that appear in
1435  * each bracketed comment.
1436  */
1438 (TBracketedCommentListPtr comment_list,
1439  FReportErrorFunction errfunc,
1440  void * errdata)
1441 {
1442  TBracketedCommentListPtr comment;
1443  TSizeInfoPtr segcount_list = NULL;
1444  int num_segments = 1;
1445  int num_segments_this_bracket;
1446  int num_segments_expected;
1447  TSizeInfoPtr best;
1448 
1449  if (comment_list == NULL)
1450  {
1451  return num_segments;
1452  }
1453 
1454  for (comment = comment_list; comment != NULL; comment = comment->next)
1455  {
1456  num_segments_this_bracket = s_CountSequencesInBracketedComment(comment);
1457  segcount_list = s_AddSizeInfoAppearances (segcount_list,
1458  num_segments_this_bracket,
1459  1);
1460  if (comment != comment_list && segcount_list->next != NULL)
1461  {
1462  best = s_GetMostPopularSizeInfo (segcount_list);
1463  num_segments_expected = best->size_value;
1464 
1465  if (num_segments_expected != num_segments_this_bracket)
1466  {
1468  num_segments_this_bracket, num_segments_expected,
1469  errfunc, errdata);
1470  }
1471  }
1472  }
1473  if (segcount_list != NULL && segcount_list->next == NULL && segcount_list->size_value > 0)
1474  {
1475  num_segments = segcount_list->size_value;
1476  }
1477  s_SizeInfoFree (segcount_list);
1478  return num_segments;
1479 }
1480 
1481 /* This function gets a list of the offsets of the
1482  * sequences in bracketed comments.
1483  */
1485 {
1486  TIntLinkPtr new_offset, offset_list = NULL;
1487  TBracketedCommentListPtr comment;
1488  TLineInfoPtr lip;
1489 
1490  if (comment_list == NULL)
1491  {
1492  return NULL;
1493  }
1494 
1495  for (comment = comment_list; comment != NULL; comment = comment->next)
1496  {
1497  if (s_CountSequencesInBracketedComment(comment) == 0)
1498  {
1499  continue;
1500  }
1501  for (lip = comment->comment_lines; lip != NULL; lip = lip->next)
1502  {
1503  if (lip->data != NULL && lip->data[0] == '>')
1504  {
1505  new_offset = s_IntLinkNew (lip->line_num + 1, offset_list);
1506  if (offset_list == NULL) offset_list = new_offset;
1507  }
1508  }
1509  }
1510  return offset_list;
1511 }
1512 
/* Re-entrant tokenizer that cuts str into pieces at any character found
 * in delimiter, writing terminators into str as it goes.
 *
 *   str       - buffer to tokenize on the first call; NULL on later calls
 *               to continue from the position saved in *last
 *   delimiter - set of separator characters; if NULL, *last is cleared
 *               and NULL is returned
 *   last      - caller-provided storage for the resume position
 *
 * Returns a pointer to the next token inside the buffer, or NULL when the
 * resume position is NULL or already at the end of the buffer.  When only
 * delimiter characters remain, the returned token is an empty string.
 */
static char * s_TokenizeString (char * str, const char *delimiter, char **last)
{
    char * token;
    char * token_end;

    /* A NULL str means "carry on where the previous call stopped". */
    token = (str != NULL) ? str : *last;

    if (delimiter == NULL) {
        *last = NULL;
        return NULL;
    }
    if (token == NULL || *token == '\0') {
        return NULL;
    }

    /* step over leading separators, then find where the token stops */
    token    += strspn (token, delimiter);
    token_end = token + strcspn (token, delimiter);

    if (*token_end == '\0') {
        /* token runs to the end of the buffer; resume there */
        *last = token_end;
    } else {
        /* terminate the token and resume just behind the separator */
        *token_end = '\0';
        *last = token_end + 1;
    }
    return token;
}
1539 
1540 
1541 /* This function creates a new list of SLineInfo structures by tokenizing
1542  * each data element from line_list into multiple tokens at whitespace.
1543  * The function returns a pointer to the new list. The original list is
1544  * unchanged.
1545  */
1547 {
1548  TLineInfoPtr first_token, lip;
1549  char * tmp;
1550  char * piece;
1551  char * last;
1552  size_t line_pos;
1553 
1554  first_token = NULL;
1555 
1556  for (lip = line_list; lip != NULL; lip = lip->next) {
1557  if (lip->data != NULL && (tmp = strdup(lip->data)) != NULL) {
1558  piece = s_TokenizeString(tmp, " \t\r", &last);
1559  while (piece != NULL) {
1560  line_pos = piece - tmp;
1561  line_pos += lip->line_offset;
1562  first_token = s_AddLineInfo (first_token, piece,
1563  lip->line_num,
1564  line_pos);
1565  piece = s_TokenizeString (NULL, " \t\r", &last);
1566  }
1567  free(tmp);
1568  }
1569  }
1570  return first_token;
1571 }
1572 
1573 
1574 /* This function takes a list of SLineInfo structures, allocates memory
1575  * to hold their contents contiguously, and stores their contents, minus
1576  * the whitespace, in the newly allocated memory.
1577  * The function returns a pointer to this newly allocated memory.
1578  */
1580 {
1581  TLineInfoPtr lip;
1582  size_t len;
1583  char * result;
1584  char * cp_to;
1585  char * cp_from;
1586 
1587  if (list == NULL) {
1588  return NULL;
1589  }
1590  len = 0;
1591  for (lip = list; lip != NULL; lip = lip->next) {
1592  if (lip->data != NULL) {
1593  len += strlen(lip->data);
1594  }
1595  }
1596  result = (char *) malloc(len + 1);
1597  if (result == NULL) {
1598  return result;
1599  }
1600  cp_to = result;
1601  for (lip = list; lip != NULL; lip = lip->next) {
1602  if (lip->data != NULL) {
1603  cp_from = lip->data;
1604  while (*cp_from != 0) {
1605  if (! isspace((unsigned char)*cp_from)) {
1606  *cp_to = *cp_from;
1607  cp_to ++;
1608  }
1609  cp_from ++;
1610  }
1611  }
1612  }
1613  *cp_to = 0;
1614  return result;
1615 }
1616 
1617 
1618 /* The following functions are used to manage the SLineInfoReader
1619  * structure. The intention is to allow the user to access the data
1620  * from a linked list of SLineInfo structures using a given position
1621  * in the data based on the number of sequence data characters rather than
1622  * any particular line number or position in the line. This is useful
1623  * for matching up a data position in a record with a match character with
1624  * the same data position in the first or master record. This is also useful
1625  * for determining how to interpret special characters that may have
1626  * context-sensitive meanings. For example, a ? could indicate a missing
1627  * character if it is inside a sequence but indicate a gap if it is outside
1628  * a sequence.
1629  */
1630 
1631 /* This function is used to advance the current data position pointer
1632  * for a SLineInfoReader structure past white space and blank lines
1633  * in sequence data.
1634  */
1636 {
1637  if (lirp->curr_line_pos == NULL) {
1638  return;
1639  }
1640  while ( isspace ((unsigned char) *lirp->curr_line_pos)
1641  || *lirp->curr_line_pos == 0) {
1642  while ( isspace ((unsigned char)*lirp->curr_line_pos)) {
1643  lirp->curr_line_pos ++;
1644  }
1645  if (*lirp->curr_line_pos == 0) {
1646  lirp->curr_line = lirp->curr_line->next;
1647  while (lirp->curr_line != NULL
1648  && lirp->curr_line->data == NULL) {
1649  lirp->curr_line = lirp->curr_line->next;
1650  }
1651  if (lirp->curr_line == NULL) {
1652  lirp->curr_line_pos = NULL;
1653  return;
1654  } else {
1655  lirp->curr_line_pos = lirp->curr_line->data;
1656  }
1657  }
1658  }
1659 }
1660 
1661 
1662 /* This function sets the current data position pointer to the first
1663  * non-whitespace character in the sequence data.
1664  */
1666 {
1667  if (lirp == NULL) {
1668  return;
1669  }
1670  lirp->curr_line = lirp->first_line;
1671 
1672  while (lirp->curr_line != NULL && lirp->curr_line->data == NULL) {
1673  lirp->curr_line = lirp->curr_line->next;
1674  }
1675  if (lirp->curr_line == NULL) {
1676  lirp->curr_line_pos = NULL;
1677  lirp->data_pos = -1;
1678  } else {
1679  lirp->curr_line_pos = lirp->curr_line->data;
1681  if (lirp->curr_line_pos == NULL) {
1682  lirp->data_pos = -1;
1683  } else {
1684  lirp->data_pos = 0;
1685  }
1686  }
1687 }
1688 
1689 
1690 /* This function creates a new SLineInfoReader structure and initializes
1691  * its member variables. The current data position pointer is set to the
1692  * first non-whitespace character in the sequence data, and the data position
1693  * counter is set to zero. The function returns a pointer to the new
1694  * LineInfoReader data structure.
1695  */
1697 {
1698  TLineInfoReaderPtr lirp;
1699 
1700  if (line_list == NULL) {
1701  return NULL;
1702  }
1703  lirp = (TLineInfoReaderPtr) malloc (sizeof (SLineInfoReader));
1704  if (lirp == NULL) {
1705  return NULL;
1706  }
1707 
1708  lirp->first_line = line_list;
1709  s_LineInfoReaderReset (lirp);
1710  return lirp;
1711 }
1712 
1713 
1714 /* This function safely interprets the current line number of the
1715  * SLineInfoReader structure. If the structure is NULL or the
1716  * current line is NULL (usually because the data position has been
1717  * advanced to the end of the available sequence data), the function
1718  * returns -1, since the current data position does not actually exist.
1719  * Otherwise, the line number of the character at the current data position
1720  * is returned.
1721  */
1723 {
1724  if (lirp == NULL || lirp->curr_line == NULL) {
1725  return -1;
1726  } else {
1727  return lirp->curr_line->line_num;
1728  }
1729 }
1730 
1731 
1732 /* This function safely interprets the position of the current data position
1733  * of the SLineInfoReader structure. If the structure is NULL or the
1734  * current line is NULL or the current line position is NULL (usually because
1735  * the data position has been advanced to the end of the available sequence
1736  * data), the function returns -1, since the current data position does not
1737  * actually exist.
1738  * Otherwise, the position within the line of the character at the current
1739  * data position is returned.
1740  */
1742 {
1743  if (lirp == NULL || lirp->curr_line == NULL
1744  || lirp->curr_line_pos == NULL) {
1745  return -1;
1746  } else {
1747  return lirp->curr_line->line_offset + lirp->curr_line_pos
1748  - lirp->curr_line->data;
1749  }
1750 }
1751 
1752 
1753 /* This function frees the memory associated with the SLineInfoReader
1754  * structure. Notice that this function does NOT free the SLineInfo list.
1755  * This is by design.
1756  */
1758 {
1759  if (lirp == NULL) {
1760  return;
1761  }
1762  free (lirp);
1763  lirp = NULL;
1764 }
1765 
1766 
1767 /* This function retrieves the "pos"th sequence data character from the lines
1768  * of sequence data. If the data position requested is greater than the
1769  * current position, the current data pointer will be advanced until the
1770  * current position is the requested position or there is no more data. If
1771  * there is no more data, the function returns a 0. If the data position
1772  * requested is lower than the current position, the current position is reset
1773  * to the beginning of the sequence and advanced from there.
1774  * As a result, it is clearly more efficient to read the data in the forward
1775  * direction, but it is still possible to access the data randomly.
1776  */
1777 static char
1779 (TLineInfoReaderPtr lirp,
1780  int pos)
1781 {
1782  if (lirp == NULL || lirp->first_line == NULL || pos < 0
1783  || lirp->data_pos == -1) {
1784  return 0;
1785  }
1786 
1787  if (lirp->data_pos == pos) {
1788  if (lirp->curr_line_pos == NULL) {
1789  return 0;
1790  } else {
1791  return *lirp->curr_line_pos;
1792  }
1793  }
1794  if (lirp->data_pos > pos) {
1795  s_LineInfoReaderReset (lirp);
1796  }
1797 
1798  while (lirp->data_pos < pos && lirp->curr_line != NULL) {
1799  lirp->curr_line_pos ++;
1800  /* skip over spaces, progress to next line if necessary */
1802  lirp->data_pos ++;
1803  }
1804  if (lirp->curr_line_pos != NULL) {
1805  return *lirp->curr_line_pos;
1806  } else {
1807  return 0;
1808  }
1809 }
1810 
1811 
1812 /* The following functions are used to manage the SStringCount structure.
1813  * These functions are useful for determining whether a string is unique
1814  * or whether only one string is used for a particular purpose.
1815  * The structure also tracks the line numbers on which a particular string
1816  * appeared.
1817  */
1818 
1819 /* This function allocates memory for a new SStringCount structure,
1820  * initializes its member variables. The function also places the
1821  * structure at the end of list if list is not NULL.
1822  * The function returns a pointer to the newly allocated SStringCount
1823  * structure.
1824  */
1826 {
1827  TStringCountPtr new_item, last;
1828 
1829  new_item = (TStringCountPtr) malloc (sizeof (SStringCount));
1830  if (new_item == NULL) {
1831  return NULL;
1832  }
1833  new_item->string = NULL;
1834  new_item->num_appearances = 0;
1835  new_item->line_numbers = NULL;
1836  new_item->next = NULL;
1837 
1838  last = list;
1839  while (last != NULL && last->next != NULL) {
1840  last = last->next;
1841  }
1842  if (last != NULL) {
1843  last->next = new_item;
1844  }
1845  return new_item;
1846 }
1847 
1848 
1849 /* This function recursively frees data associated with the structures
1850  * and structure member variables in a linked list of SStringCount
1851  * structures.
1852  */
1854 {
1855  if (list == NULL) {
1856  return;
1857  }
1858  s_StringCountFree (list->next);
1859  s_IntLinkFree (list->line_numbers);
1860  free (list);
1861 }
1862 
1863 
1864 /* This function searches list to see if the string matches any of the
1865  * existing entries. If so, the num_appearances value for that entry is
1866  * increased and the line_num is added to that entry's list of line numbers.
1867  * Otherwise a new entry is created at the end of the list.
1868  * The function returns list if list was not NULL, or a pointer to the
1869  * newly created SStringCount structure otherwise.
1870  */
1872  char * string,
1873  int line_num,
1874  TStringCountPtr list
1875 )
1876 {
1877  TStringCountPtr add_to, last = NULL;
1878  TIntLinkPtr new_offset;
1879 
1880  if (string == NULL) {
1881  for (add_to = list;
1882  add_to != NULL && add_to->string != NULL;
1883  add_to = add_to->next) {
1884  last = add_to;
1885  }
1886  } else {
1887  for (add_to = list;
1888  add_to != NULL
1889  && (add_to->string == NULL
1890  || strcmp (string, add_to->string) != 0);
1891  add_to = add_to->next) {
1892  last = add_to;
1893  }
1894  }
1895 
1896  if (add_to == NULL) {
1897  add_to = s_StringCountNew (last);
1898  if (list == NULL) list = add_to;
1899  if (add_to != NULL) {
1900  add_to->string = string;
1901  }
1902  }
1903  if (add_to != NULL) {
1904  add_to->num_appearances ++;
1905  new_offset = s_IntLinkNew (line_num, add_to->line_numbers);
1906  if (add_to->line_numbers == NULL) {
1907  add_to->line_numbers = new_offset;
1908  }
1909  }
1910  return list;
1911 }
1912 
1913 /* The following functions are replacements for strncasecmp and strcasecmp */
1914 
/* Case-insensitive comparison of at most cmp_count leading characters of
 * str1 and str2.
 *
 * Returns 0 when the compared portions are equal.  At the first position
 * where the uppercased characters differ, the difference
 * (str1 character minus str2 character) is returned, so the result is
 * negative or positive rather than strictly -1 or 1.  If one string ends
 * before cmp_count characters were compared, the shorter string sorts
 * first: -1 when str1 is shorter, 1 when str2 is shorter.
 * A NULL string sorts before any non-NULL string; two NULLs are equal.
 */
static int s_StringNICmp (const char * str1, const char *str2, int cmp_count)
{
    int idx;
    int delta;

    if (str1 == NULL) {
        return (str2 == NULL) ? 0 : -1;
    }
    if (str2 == NULL) {
        return 1;
    }

    for (idx = 0;
         idx < cmp_count && str1 [idx] != 0 && str2 [idx] != 0;
         idx++) {
        delta = toupper ((unsigned char) str1 [idx])
              - toupper ((unsigned char) str2 [idx]);
        if (delta != 0) {
            return delta;
        }
    }

    if (idx == cmp_count) {
        /* the requested number of characters matched */
        return 0;
    }
    if (str1 [idx] == 0) {
        return (str2 [idx] == 0) ? 0 : -1;
    }
    return (str2 [idx] == 0) ? 1 : 0;
}
1956 
1957 
/* Case-insensitive comparison of the complete strings str1 and str2.
 *
 * Returns 0 when the strings are equal.  At the first position where the
 * uppercased characters differ, the difference (str1 character minus str2
 * character) is returned, so the result is negative or positive rather
 * than strictly -1 or 1.  If one string is a proper prefix of the other,
 * the shorter one sorts first: -1 when str1 is shorter, 1 when str2 is.
 * A NULL string sorts before any non-NULL string; two NULLs are equal.
 */
static int s_StringICmp (const char * str1, const char *str2)
{
    size_t idx;
    int    delta;

    if (str1 == NULL) {
        return (str2 == NULL) ? 0 : -1;
    }
    if (str2 == NULL) {
        return 1;
    }

    for (idx = 0; str1 [idx] != 0 && str2 [idx] != 0; idx++) {
        delta = toupper ((unsigned char) str1 [idx])
              - toupper ((unsigned char) str2 [idx]);
        if (delta != 0) {
            return delta;
        }
    }

    /* at least one string has ended; the one with characters left is greater */
    if (str1 [idx] != 0) {
        return 1;
    }
    return (str2 [idx] != 0) ? -1 : 0;
}
1994 
1995 
1996 /* The following functions are used to analyze specific kinds of lines
1997  * found in alignment files for information regarding the number of
1998  * expected sequences, the expected length of those sequences, and the
1999  * characters used to indicate missing, gap, and match characters.
2000  */
2001 
2002 /* This function reads two numbers separated by whitespace from the
2003  * beginning of the string and uses them to set the expected number of
2004  * sequences and the expected number of characters per sequence.
2005  */
2006 static void
2008 (char * str,
2009  SAlignRawFilePtr afrp)
2010 {
2011  char * cp;
2012  char * cpend;
2013  char replace;
2014  int first, second;
2015 
2016  if (str == NULL || afrp == NULL) {
2017  return;
2018  }
2019  cp = str;
2020  while (! isdigit ((unsigned char)*cp) && *cp != 0) {
2021  cp++;
2022  }
2023 
2024  cpend = cp;
2025  while (isdigit ((unsigned char)*cpend) && *cpend != 0) {
2026  cpend++;
2027  }
2028  if (cp == cpend) {
2029  return;
2030  }
2031  replace = *cpend;
2032  *cpend = 0;
2033  first = atol (cp);
2034  *cpend = replace;
2035 
2036  cp = cpend;
2037  while (! isdigit ((unsigned char)*cp) && *cp != 0) {
2038  cp++;
2039  }
2040 
2041  cpend = cp;
2042  while (isdigit ((unsigned char)*cpend) && *cpend != 0) {
2043  cpend++;
2044  }
2045  if (cp == cpend) {
2046  return;
2047  }
2048  replace = *cpend;
2049  *cpend = 0;
2050  second = atol (cp);
2051  *cpend = replace;
2052 
2053  if (first > 0 && second > 0) {
2054  afrp->expected_num_sequence = first;
2055  afrp->expected_sequence_len = second;
2056  }
2057 
2058 }
2059 
2060 
2061 /* This function examines the string str to see if it begins with two
2062  * numbers separated by whitespace. The function returns eTrue if so,
2063  * otherwise it returns eFalse.
2064  */
2066 {
2067  const char * cp;
2068  EBool found_first_number = eFalse;
2069  EBool found_dividing_space = eFalse;
2070  EBool found_second_number = eFalse;
2071  EBool found_second_number_end = eFalse;
2072 
2073  if (str == NULL) {
2074  return eFalse;
2075  }
2076  cp = str;
2077  while (*cp != 0) {
2078  if (! isdigit ((unsigned char)*cp) && ! isspace ((unsigned char)*cp)) {
2079  return eFalse;
2080  }
2081  if (! found_first_number) {
2082  if (! isdigit ((unsigned char)*cp)) {
2083  return eFalse;
2084  }
2085  found_first_number = eTrue;
2086  } else if (! found_dividing_space) {
2087  if ( isspace ((unsigned char) *cp)) {
2088  found_dividing_space = eTrue;
2089  } else if ( ! isdigit ((unsigned char)*cp)) {
2090  return eFalse;
2091  }
2092  } else if (! found_second_number) {
2093  if ( isdigit ((unsigned char)*cp)) {
2094  found_second_number = eTrue;
2095  } else if (! isspace ((unsigned char) *cp)) {
2096  return eFalse;
2097  }
2098  } else if (! found_second_number_end) {
2099  if ( isspace ((unsigned char) *cp)) {
2100  found_second_number_end = eTrue;
2101  } else if (! isdigit ((unsigned char)*cp)) {
2102  return eFalse;
2103  }
2104  } else if (! isspace ((unsigned char) *cp)) {
2105  return eFalse;
2106  }
2107  cp++;
2108  }
2109  if (found_second_number) {
2110  return eTrue;
2111  }
2112  return eFalse;
2113 }
2114 
2115 
2116 /* This function finds a value name in a string, looks for an equals sign
2117  * after the value name, and then looks for an integer value after the
2118  * equals sign. If the integer value is found, the function copies the
2119  * integer value into the val location and returns eTrue, otherwise the
2120  * function returns eFalse.
2121  */
2122 static EBool
2124 (const char * str,
2125  const char * valname,
2126  int * val)
2127 {
2129  char * cpstart;
2130  char * cpend;
2131  size_t maxlen;
2132 
2133  if (str == NULL || valname == NULL || val == NULL) {
2134  return eFalse;
2135  }
2136 
2137  cpstart = (char*) strstr (str, valname);
2138  if (cpstart == NULL) {
2139  return eFalse;
2140  }
2141  cpstart += strlen (valname);
2142  while (*cpstart != 0 && isspace ((unsigned char)*cpstart)) {
2143  cpstart++;
2144  }
2145  if (*cpstart != '=') {
2146  return eFalse;
2147  }
2148  cpstart ++;
2149  while (*cpstart != 0 && isspace ((unsigned char)*cpstart)) {
2150  cpstart++;
2151  }
2152 
2153  if (! isdigit ((unsigned char)*cpstart)) {
2154  return eFalse;
2155  }
2156  cpend = cpstart + 1;
2157  while ( *cpend != 0 && isdigit ((unsigned char)*cpend)) {
2158  cpend ++;
2159  }
2160  maxlen = cpend - cpstart;
2161  if (maxlen > kMaxPrintedIntLen)
2162  maxlen = kMaxPrintedIntLen;
2163 
2164  strncpy(buf, cpstart, maxlen);
2165  buf [maxlen] = 0;
2166  *val = atoi (buf);
2167  return eTrue;
2168 }
2169 
2170 
2171 /* This function looks for Nexus-style comments to indicate the number of
2172  * sequences and the number of characters per sequence expected from this
2173  * alignment file. If the function finds these comments, it stores the
2174  * values in afrp and sets *found_ntax and/or *found_nchar to eTrue.
2175  */
2176 static void
2178 (const char * str,
2179  EBool * found_ntax,
2180  EBool * found_nchar,
2181  SAlignRawFilePtr afrp)
2182 {
2183  int num_sequences;
2184  int num_chars;
2185 
2186  if (str == NULL || found_nchar == NULL
2187  || found_ntax == NULL || afrp == NULL) {
2188  return;
2189  }
2190  if (! *found_ntax &&
2191  (s_GetOneNexusSizeComment (str, "ntax", &num_sequences)
2192  || s_GetOneNexusSizeComment (str, "NTAX", &num_sequences))) {
2193  afrp->expected_num_sequence = num_sequences;
2194  afrp->align_format_found = eTrue;
2195  *found_ntax = eTrue;
2196  }
2197  if (! *found_nchar &&
2198  (s_GetOneNexusSizeComment (str, "nchar", &num_chars)
2199  || s_GetOneNexusSizeComment (str, "NCHAR", &num_chars))) {
2200  afrp->expected_sequence_len = num_chars;
2201  afrp->align_format_found = eTrue;
2202  *found_nchar = eTrue;
2203  }
2204 }
2205 
2206 
/* Extracts the character assigned to val_name in a Nexus-style comment
 * (used for values such as the match, missing and gap characters).
 *
 * str must contain a semicolon, and val_name must occur in str no later
 * than the first semicolon.  After val_name, optional whitespace and an
 * equals sign are expected; the function then skips whitespace and single
 * quotes and returns the character it stops on.
 * Returns 0 if str or val_name is NULL, if there is no semicolon, if
 * val_name is not found before it, or if no equals sign follows val_name.
 */
static char s_GetNexusTypechar (const char * str, const char * val_name)
{
    const char * terminator;
    const char * scan;

    if (str == NULL || val_name == NULL) {
        return 0;
    }

    terminator = strchr (str, ';');
    if (terminator == NULL) {
        return 0;
    }

    scan = strstr (str, val_name);
    if (scan == NULL || scan > terminator) {
        return 0;
    }

    /* move behind the name and any blanks that precede the equals sign */
    for (scan += strlen (val_name); isspace ((unsigned char) *scan); scan++) {
        /* nothing to do */
    }
    if (*scan != '=') {
        return 0;
    }

    /* step over '=' and then over blanks and quote marks around the value */
    do {
        scan++;
    } while (isspace ((unsigned char) *scan) || *scan == '\'');

    return *scan;
}
2242 
2243 
2244 /* This function reads a Nexus-style comment line for the characters
2245  * specified for missing, match, and gap and compares the characters from
2246  * the comment with the characters specified in sequence_info. If any
2247  * discrepancies are found, the function reports the errors and returns eFalse,
2248  * otherwise the function returns eTrue.
2249  */
2251 (const char * str,
2252  TSequenceInfoPtr sequence_info,
2253  FReportErrorFunction errfunc,
2254  void * errdata)
2255 {
2256  const char * cp;
2257  char c;
2258 
2259  if (str == NULL || sequence_info == NULL) {
2260  return eFalse;
2261  }
2262 
2263  cp = strstr (str, "format ");
2264  if (cp == NULL) {
2265  cp = strstr (str, "FORMAT ");
2266  }
2267  if (cp == NULL) {
2268  return eFalse;
2269  }
2270 
2271  if (errfunc == NULL) {
2272  return eTrue;
2273  }
2274 
2275  c = s_GetNexusTypechar (cp + 7, "missing");
2276  if (c == 0) {
2277  c = s_GetNexusTypechar (cp + 7, "MISSING");
2278  }
2279  if (c != 0 && sequence_info->missing != NULL
2280  && strchr (sequence_info->missing, c) == NULL)
2281  {
2282  s_ReportCharCommentError (sequence_info->missing, c, "MISSING",
2283  errfunc, errdata);
2284  }
2285 
2286  c = s_GetNexusTypechar (cp + 7, "gap");
2287  if (c == 0) {
2288  c = s_GetNexusTypechar (cp + 7, "GAP");
2289  }
2290  if (c != 0 && sequence_info->middle_gap != NULL
2291  && strchr (sequence_info->middle_gap, c) == NULL)
2292  {
2293  s_ReportCharCommentError (sequence_info->middle_gap, c, "GAP",
2294  errfunc, errdata);
2295  }
2296 
2297  c = s_GetNexusTypechar (cp + 7, "match");
2298  if (c == 0) {
2299  c = s_GetNexusTypechar (cp + 7, "MATCH");
2300  }
2301  if (c != 0 && sequence_info->match != NULL
2302  && strchr (sequence_info->match, c) == NULL)
2303  {
2304  s_ReportCharCommentError (sequence_info->match, c, "MATCH",
2305  errfunc, errdata);
2306  }
2307  return eTrue;
2308 }
2309 
2310 
/* Makes sure that str is a heap string consisting of exactly the single
 * character c.
 *
 *   str - heap-allocated string owned by the caller, or NULL
 *   c   - the character the string should consist of
 *
 * If str already holds exactly that one character (or is already the empty
 * string when c is 0), it is returned unchanged.  Otherwise str is freed
 * and a newly allocated two-byte string containing c is returned; callers
 * must therefore always replace their pointer with the return value.
 * Returns NULL if the replacement string cannot be allocated.
 */
static char * s_ReplaceNexusTypeChar (char *str, char c)
{
    /* When c is 0 and str is empty, str [1] must not be inspected: the
     * buffer may be only one byte long.
     */
    if (str != NULL
        && str [0] == c
        && (c == 0 || str [1] == 0))
    {
        return str;
    }

    free (str);
    str = (char *) malloc (2 * sizeof (char));
    if (str != NULL)
    {
        str [0] = c;
        str [1] = 0;
    }
    return str;
}
2330 
2331 /* This function reads a Nexus-style comment line for the characters
2332  * specified for missing, match, and gap and sets those values in sequence_info.
2333  * The function returns eTrue if a Nexus comment was found, eFalse otherwise.
2334  */
2336 (const char * str,
2337  TSequenceInfoPtr sequence_info)
2338 {
2339  char * cp;
2340  char c;
2341 
2342  if (str == NULL || sequence_info == NULL) {
2343  return eFalse;
2344  }
2345 
2346  cp = (char*) strstr (str, "format ");
2347  if (cp == NULL) {
2348  cp = (char*) strstr (str, "FORMAT ");
2349  }
2350  if (cp == NULL) {
2351  return eFalse;
2352  }
2353 
2354  c = s_GetNexusTypechar (cp + 7, "missing");
2355  if (c == 0) {
2356  c = s_GetNexusTypechar (cp + 7, "MISSING");
2357  }
2358  sequence_info->missing = s_ReplaceNexusTypeChar (sequence_info->missing, c);
2359 
2360  c = s_GetNexusTypechar (cp + 7, "gap");
2361  if (c == 0) {
2362  c = s_GetNexusTypechar (cp + 7, "GAP");
2363  }
2364  sequence_info->beginning_gap = s_ReplaceNexusTypeChar (sequence_info->beginning_gap, c);
2365  sequence_info->middle_gap = s_ReplaceNexusTypeChar (sequence_info->middle_gap, c);
2366  sequence_info->end_gap = s_ReplaceNexusTypeChar (sequence_info->end_gap, c);
2367 
2368  c = s_GetNexusTypechar (cp + 7, "match");
2369  if (c == 0) {
2370  c = s_GetNexusTypechar (cp + 7, "MATCH");
2371  }
2372  sequence_info->match = s_ReplaceNexusTypeChar (sequence_info->match, c);
2373 
2374  return eTrue;
2375 }
2376 
2377 
2378 /* This function examines the string str to see if it consists entirely of
2379  * asterisks, colons, periods, and whitespace. If so, this line is assumed
2380  * to be a Clustal-style consensus line and the function returns eTrue.
2381  * otherwise the function returns false;
2382  */
2383 static EBool s_IsConsensusLine (char * str)
2384 {
2385  if (str == NULL
2386  || strspn (str, "*:. \t\r\n") < strlen (str)
2387  || (strchr (str, '*') == NULL
2388  && strchr (str, ':') == NULL
2389  && strchr (str, '.') == NULL)) {
2390  return eFalse;
2391  } else {
2392  return eTrue;
2393  }
2394 }
2395 
2396 
2397 /* This function identifies lines that begin with a NEXUS keyword and end
2398  * with a semicolon - they will not contain sequence data. The function
2399  * returns eTrue if the line contains only a NEXUS comment, eFalse otherwise.
2400  */
2402 {
2403  char * last_semicolon;
2404 
2405  if (str == NULL) {
2406  return eFalse;
2407  }
2408  last_semicolon = strrchr (str, ';');
2409  if (last_semicolon == NULL
2410  || strspn (last_semicolon + 1, " \t\r") != strlen (last_semicolon + 1)
2411  || strchr (str, ';') != last_semicolon) {
2412  return eFalse;
2413  }
2414  if (s_StringNICmp (str, "format ", 7) == 0
2415  || s_StringNICmp (str, "dimensions ", 11) == 0
2416  || s_StringNICmp (str, "options ", 8) == 0
2417  || s_StringNICmp (str, "begin characters", 16) == 0
2418  || s_StringNICmp (str, "begin data", 10) == 0
2419  || s_StringNICmp (str, "begin ncbi", 10) == 0) {
2420  return eTrue;
2421  } else {
2422  return eFalse;
2423  }
2424 }
2425 
2426 
2428 {
2429  if (str == NULL) {
2430  return eFalse;
2431  }
2432 
2433  while (*str != 0) {
2434  if (!isspace (*str) && !isdigit(*str)) {
2435  return eFalse;
2436  }
2437  ++str;
2438  }
2439  return eTrue;
2440 }
2441 
2442 
2443 /* This function determines whether the contents of str are "skippable"
2444  * in that they do not contain sequence data and therefore should not be
2445  * considered part of any block patterns or sequence data.
2446  */
2447 static EBool s_SkippableString (char * str)
2448 {
2449  if (str == NULL
2450  || s_StringNICmp (str, "matrix", 6) == 0
2451  || s_StringNICmp (str, "sequin", 6) == 0
2452  || s_StringNICmp (str, "#NEXUS", 6) == 0
2453  || s_StringNICmp (str, "CLUSTAL W", 9) == 0
2457  || s_IsConsensusLine (str)
2458  || str [0] == ';') {
2459  return eTrue;
2460  } else {
2461  return eFalse;
2462  }
2463 }
2464 
2465 
2466 /* This function determines whether str contains an indication
2467  * that this is real alignment format (nexus, clustal, etc.)
2468  */
2470 {
2471  if (s_StringNICmp (str, "matrix", 6) == 0
2472  || s_StringNICmp (str, "#NEXUS", 6) == 0
2473  || s_StringNICmp (str, "CLUSTAL W", 9) == 0
2476  || s_IsConsensusLine (str)) {
2477  return eTrue;
2478  } else {
2479  return eFalse;
2480  }
2481 }
2482 
2483 
2484 /* This function determines whether or not str contains a blank line.
2485  */
2486 static EBool s_IsBlank (char * str)
2487 {
2488  size_t len;
2489 
2490  if (str == NULL) {
2491  return eTrue;
2492  }
2493  len = strspn (str, " \t\r");
2494  if (len == strlen (str)) {
2495  return eTrue;
2496  }
2497  return eFalse;
2498 }
2499 
2500 
2501 /* This function determines whether or not linestring contains a line
2502  * indicating the end of sequence data (organism information and definition
2503  * lines may occur after this line).
2504  */
2505 static EBool s_FoundStopLine (char * linestring)
2506 {
2507  if (linestring == NULL) {
2508  return eFalse;
2509  }
2510  if (s_StringNICmp (linestring, "endblock", 8) == 0
2511  || s_StringNICmp (linestring, "end;", 4) == 0) {
2512  return eTrue;
2513  }
2514  return eFalse;
2515 }
2516 
2517 
2518 /* This function identifies the beginning line of an ASN.1 file, which
2519  * cannot be read by the alignment reader.
2520  */
2521 static EBool s_IsASN1 (char * linestring)
2522 {
2523  if (linestring != NULL && strstr (linestring, "::=") != NULL) {
2524  return eTrue;
2525  } else {
2526  return eFalse;
2527  }
2528 }
2529 
2530 
2531 /* The following functions are used to locate and read comments enclosed
2532  * in brackets. These comments sometimes include organism information.
2533  */
2534 
2535 /* This function frees memory associated with a SCommentLoc structure. */
2537 {
2538  if (clp == NULL) {
2539  return;
2540  }
2541  s_CommentLocFree (clp->next);
2542  free (clp);
2543 }
2544 
2545 
2546 /* This function finds the first comment enclosed in brackets and creates
2547  * a SCommentLoc structure to indicate the position of the comment
2548  * in the string. The function returns a pointer to this structure if a
2549  * comment is found or a NULL if the string does not contain a bracketed
2550  * comment.
2551  */
2552 static TCommentLocPtr s_FindComment (char * string)
2553 {
2554  char * cp_start;
2555  char * cp_end;
2556  TCommentLocPtr clp;
2557 
2558  if (string == NULL) {
2559  return NULL;
2560  }
2561  cp_start = strstr (string, "[");
2562  if (cp_start != NULL) {
2563  cp_end = strstr (cp_start, "]");
2564  if (cp_end != NULL) {
2565  clp = (TCommentLocPtr) malloc (sizeof (SCommentLoc));
2566  if (clp == NULL) {
2567  return NULL;
2568  }
2569  clp->start = cp_start;
2570  clp->end = cp_end;
2571  clp->next = NULL;
2572  return clp;
2573  }
2574  }
2575  return NULL;
2576 }
2577 
2578 
/* This function removes every bracketed comment from linestring, editing
 * the buffer in place.  A comment runs from the first '[' in the line to
 * the first ']' at or after it; the search is repeated from the start of
 * the line until no complete comment remains.  An unmatched '[' is left
 * untouched.
 *
 * After the comments are gone, two clean-ups are applied:
 *   - if the line starts with '>' and holds nothing but whitespace after
 *     it, the line is emptied so the arrow character does not end up in
 *     the sequence data;
 *   - if the line holds only spaces, tabs, and carriage returns, it is
 *     emptied.
 *
 * A NULL linestring is ignored.
 */
static void s_RemoveCommentFromLine (char * linestring)
{
    char * open_bracket;
    char * close_bracket;
    size_t offset;

    if (linestring == NULL) {
        return;
    }

    for (;;) {
        open_bracket = strchr (linestring, '[');
        if (open_bracket == NULL) {
            break;
        }
        close_bracket = strchr (open_bracket, ']');
        if (close_bracket == NULL) {
            break;
        }
        /* The tail is shifted down over the comment, so source and
         * destination may overlap: memmove is required, strcpy would be
         * undefined behavior here.  The +1 carries the terminator along.
         */
        memmove (open_bracket, close_bracket + 1,
                 strlen (close_bracket + 1) + 1);
    }

    /* if we have read an organism comment and that's all there was on the
     * line, get rid of the arrow character as well so it doesn't end up
     * in the sequence data
     */
    if (linestring [0] == '>') {
        offset = 1;
        /* cast keeps isspace defined for bytes with the high bit set */
        while (isspace ((unsigned char) linestring [offset])) {
            offset++;
        }
        if (linestring [offset] == 0) {
            linestring [0] = 0;
        }
    }

    /* if the line now contains only space, truncate it */
    if (linestring [strspn (linestring, " \t\r")] == 0) {
        linestring [0] = 0;
    }
}
2616 
2617 
2618 /* This function determines whether or not a comment describes an organism
2619  * by looking for org= or organism= inside the brackets.
2620  */
2622 {
2623  int len;
2624  char * cp;
2625  char * cp_end;
2626 
2627  if (clp == NULL || clp->start == NULL || clp->end == NULL) {
2628  return eFalse;
2629  }
2630 
2631  cp = clp->start;
2632  if (*cp != '[') {
2633  return eFalse;
2634  }
2635  cp ++;
2636  len = strspn ( clp->start, " \t\r");
2637  cp = cp + len;
2638  cp_end = strstr (cp, "=");
2639  if (cp_end == NULL) {
2640  return eFalse;
2641  }
2642  cp_end --;
2643  while (cp_end > cp && isspace ((unsigned char)*cp_end)) {
2644  cp_end --;
2645  }
2646  cp_end ++;
2647  if ((cp_end - cp == 3 && s_StringNICmp (cp, "org", 3) == 0)
2648  || (cp_end - cp == 8 && s_StringNICmp (cp, "organism", 8) == 0)) {
2649  return eTrue;
2650  }
2651  return eFalse;
2652 }
2653 
2654 
2655 /* This function finds an organism comment, which includes the first bracketed
2656  * comment with org= or organism=, plus any additional bracketed comments.
2657  * The function returns a pointer to a SCommentLoc structure describing
2658  * the location of the organism comment.
2659  */
2661 {
2662  TCommentLocPtr clp, next_clp;
2663 
2664  if (string == NULL) {
2665  return NULL;
2666  }
2667 
2668  clp = s_FindComment (string);
2669  while (clp != NULL && ! s_IsOrganismComment (clp)) {
2670  char * pos = clp->end;
2671  free(clp);
2672  clp = s_FindComment (pos);
2673  }
2674 
2675  if (clp == NULL) {
2676  return NULL;
2677  }
2678 
2679  next_clp = s_FindComment (clp->end);
2680  while (next_clp != NULL &&
2681  !s_IsOrganismComment(next_clp))
2682  {
2683  clp->end = next_clp->end;
2684  free(next_clp);
2685  next_clp = s_FindComment (clp->end);
2686  }
2687  free(next_clp);
2688  return clp;
2689 }
2690 
2691 
2692 /* This function removes an organism comment from a line. */
2693 static void s_RemoveOrganismCommentFromLine (char * string)
2694 {
2695  TCommentLocPtr clp;
2696  char pbuf1024[1024];
2697 
2698  while ((clp = s_FindOrganismComment (string)) != NULL) {
2699  if (clp->end != NULL) {
2700  const char* to = clp->start;
2701  const char* from = clp->end + 1;
2702  size_t diff = from - to;
2703  size_t len = strlen(from);
2704  if (diff < len-1) {
2705  char* pbuf = pbuf1024;
2706  if (len > sizeof(pbuf1024)-1) {
2707  pbuf = (char*) malloc(len + 1);
2708  }
2709  strcpy(pbuf, clp->end + 1);
2710  strcpy(clp->start, pbuf);
2711  if (pbuf != pbuf1024) {
2712  free(pbuf);
2713  }
2714  }
2715  else {
2716  strcpy (clp->start, clp->end + 1);
2717  }
2718  }
2719  s_CommentLocFree (clp);
2720  }
2721 }
2722 
2723 
2724 /* This function creates an ordered list of comments within an organism
2725  * comment and returns a pointer to the first item in the linked list.
2726  * In an ordered org name, the org= value appears first, followed by other
2727  * bracketed values in alphabetical order.
2728  */
2730 {
2731  TCommentLocPtr clp, prev_clp, next_clp, clp_list, ordered_start;
2732  int next_len, this_len, len;
2733 
2734  if (org_clp == NULL) {
2735  return NULL;
2736  }
2737 
2738  clp_list = s_FindComment (org_clp->start); /* this is the org= */
2739  prev_clp = NULL;
2740  ordered_start = s_FindComment (clp_list->end);
2741  if (s_IsOrganismComment (ordered_start))
2742  {
2743  s_CommentLocFree (ordered_start);
2744  ordered_start = NULL;
2745  }
2746  if (ordered_start == NULL) {
2747  return clp_list;
2748  }
2749  clp = s_FindComment (ordered_start->end);
2750  while (clp != NULL && clp->start < org_clp->end) {
2751  /* insert new comment into list */
2752  prev_clp = NULL;
2753  next_clp = ordered_start;
2754  next_len = next_clp->end - next_clp->start;
2755  this_len = clp->end - clp->start;
2756  len = next_len > this_len ? next_len : this_len;
2757  while (next_clp != NULL
2758  && strncmp (next_clp->start, clp->start, len) < 0)
2759  {
2760  prev_clp = next_clp;
2761  next_clp = next_clp->next;
2762  if (next_clp != NULL) {
2763  next_len = next_clp->end - next_clp->start;
2764  len = next_len > this_len ? next_len : this_len;
2765  }
2766  }
2767  if (prev_clp == NULL) {
2768  clp->next = ordered_start;
2769  ordered_start = clp;
2770  } else {
2771  clp->next = prev_clp->next;
2772  prev_clp->next = clp;
2773  }
2774  clp = s_FindComment (clp->end);
2775  }
2776  clp_list->next = ordered_start;
2777  return clp_list;
2778 }
2779 
2780 
2781 /* This function creates an ordered organism name based on the bracketed
2782  * comments contained in the location described by org_clp.
2783  */
2785 {
2786  TCommentLocPtr clp, clp_list;
2787  char * ordered_org_name;
2788  char * cp;
2789 
2790  if (org_clp == NULL) {
2791  return NULL;
2792  }
2793 
2794  ordered_org_name = (char *)malloc (org_clp->end - org_clp->start + 2);
2795  if (ordered_org_name == NULL) {
2796  return NULL;
2797  }
2798  ordered_org_name [0] = 0;
2799  clp_list = s_CreateOrderedOrgCommentList (org_clp);
2800  cp = ordered_org_name;
2801  for (clp = clp_list; clp != NULL; clp = clp->next) {
2802  strncpy (cp, clp->start, clp->end - clp->start + 1);
2803  cp += clp->end - clp->start + 1;
2804  *cp = 0;
2805  }
2806 
2807  s_CommentLocFree (clp_list);
2808 
2809  return ordered_org_name;
2810 }
2811 
2813 (char *defline,
2814  int line_num,
2815  int defline_offset,
2816  SAlignRawFilePtr afrp)
2817 {
2818  TLineInfoPtr lip;
2819  int org_num, defline_num, new_len;
2820  char *empty_defline, *new_defline;
2821 
2822  if (afrp == NULL || defline == NULL) {
2823  return;
2824  }
2825 
2826  /* make sure that we are adding the definition line to the correct position
2827  * in the list - should match last organism name */
2828  lip = afrp->organisms;
2829  org_num = 0;
2830  while (lip != NULL)
2831  {
2832  org_num++;
2833  lip = lip->next;
2834  }
2835 
2836  lip = afrp->deflines;
2837  defline_num = 0;
2838  while (lip != NULL && defline_num < org_num) {
2839  lip = lip->next;
2840  defline_num ++;
2841  }
2842 
2843  if (defline_num == org_num && lip != NULL) {
2844  /* if previous defline is empty, replace with new defline */
2845  if (strlen (lip->data) == 0)
2846  {
2847  free (lip->data);
2848  lip->data = defline;
2849  }
2850  else
2851  {
2852  /* append defline to the end of the existing entry */
2853  new_len = strlen (lip->data) + strlen (defline) + 2;
2854  new_defline = (char *) malloc (new_len * sizeof (char));
2855  if (new_defline != NULL)
2856  {
2857  strcpy (new_defline, lip->data);
2858  strcat (new_defline, " ");
2859  strcat (new_defline, defline);
2860  free (lip->data);
2861  lip->data = new_defline;
2862  free (defline);
2863  defline = NULL;
2864  }
2865  }
2866  /* use new line numbers */
2867  lip->line_num = line_num + 1;
2868  lip->line_offset = defline_offset;
2869  lip->delete_me = eFalse;
2870  }
2871  else
2872  {
2873  /* add empty deflines to get to the correct position */
2874  while (defline_num < org_num - 1)
2875  {
2876  empty_defline = (char *) malloc (sizeof (char));
2877  if (empty_defline != NULL)
2878  {
2879  *empty_defline = 0;
2880  afrp->deflines = s_AddLineInfo (afrp->deflines,
2881  empty_defline, 0,
2882  0);
2883  afrp->num_deflines ++;
2884  }
2885  defline_num++;
2886  }
2887  /* now add new defline in correct position */
2888  afrp->deflines = s_AddLineInfo (afrp->deflines, defline,
2889  line_num, defline_offset);
2890  afrp->num_deflines ++;
2891  }
2892 }
2893 
2894 /* This function is used to read any organism names that may appear in
2895  * string, including any modifiers that may appear after the organism name.
2896  */
2898 (char * string,
2899  int line_num,
2900  SAlignRawFilePtr afrp)
2901 {
2902  TCommentLocPtr clp;
2903  char * org_name;
2904  char * cp;
2905  char * defline;
2906  char * comment_end;
2907  int defline_offset;
2908 
2909  if (string == NULL || string[0] != '>' || afrp == NULL) {
2910  return;
2911  }
2912 
2913  clp = s_FindOrganismComment (string);
2914  if (clp == NULL && (strstr (string, "org=") != NULL || strstr (string, "organism=") != NULL))
2915  {
2917  }
2918  if (clp == NULL) {
2919  // if the line does not come with an organism mod and a defline
2920  // we still need to create and record dummies to remain in sync
2921  // with the sequence data:
2922  //
2923  char dummy = '\0';
2924  const int linelen = strlen(string);
2925  afrp->organisms = s_AddLineInfo(
2926  afrp->organisms, &dummy, line_num, linelen);
2927  afrp->num_organisms ++;
2928  s_AddDeflineFromOrganismLine(&dummy, line_num, linelen, afrp);
2929  return;
2930  }
2931  while (clp != NULL) {
2932 
2933  org_name = s_CreateOrderedOrgName (clp);
2934  afrp->organisms = s_AddLineInfo (afrp->organisms, org_name, line_num,
2935  clp->start - string);
2936  free (org_name);
2937  afrp->num_organisms ++;
2938  defline = NULL;
2939  defline_offset = 0;
2940  if (*clp->end != 0) {
2941  cp = clp->end + 1;
2942  cp += strspn (cp, " \t\r\n");
2943  if (*cp != 0) {
2944  defline = clp->end + 1;
2945  defline_offset = clp->end - string + 1;
2946  }
2947  }
2948  s_AddDeflineFromOrganismLine (defline, line_num, defline_offset, afrp);
2949 
2950  comment_end = clp->end;
2951  s_CommentLocFree (clp);
2952  clp = s_FindOrganismComment (comment_end);
2953  }
2954 }
2955 
2956 
2957 /* The following group of functions manages the SAlignRawSeq structure,
2958  * which is used to track the IDs of sequences in the file, the sequence
2959  * characters for those IDs, and the locations of the IDs and sequence
2960  * characters.
2961  */
2962 
2963 /* This function allocates memory for an SAlignRawSeq structure,
2964  * initializes its member variables, and returns a pointer to the newly
2965  * allocated structure.
2966  */
2968 {
2969  TAlignRawSeqPtr arsp, last;
2970 
2971  arsp = (TAlignRawSeqPtr)malloc (sizeof (SAlignRawSeq));
2972  if (arsp == NULL) {
2973  return NULL;
2974  }
2975  arsp->id = NULL;
2976  arsp->sequence_data = NULL;
2977  arsp->id_lines = NULL;
2978  arsp->next = NULL;
2979 
2980  last = list;
2981  while (last != NULL && last->next != NULL) {
2982  last = last->next;
2983  }
2984  if (last != NULL) {
2985  last->next = arsp;
2986  }
2987  return arsp;
2988 }
2989 
2990 
2991 /* This function frees the memory associated with an SAlignRawSeq
2992  * structure's member variables and with the structure itself.
2993  */
2995 {
2996  if (arsp == NULL) {
2997  return;
2998  }
2999  s_AlignRawSeqFree (arsp->next);
3000  free (arsp->id);
3001  s_LineInfoFree (arsp->sequence_data);
3002  s_IntLinkFree (arsp->id_lines);
3003  free (arsp);
3004 }
3005 
3006 
3007 /* This function returns a pointer to the sequence in list with the specified
3008  * ID, unless there is no such sequence, in which case the function returns
3009  * NULL.
3010  */
3011 static TAlignRawSeqPtr
3013 (TAlignRawSeqPtr list,
3014  char * id)
3015 {
3016  TAlignRawSeqPtr arsp;
3017 
3018  for (arsp = list; arsp != NULL; arsp = arsp->next) {
3019  if (strcmp (arsp->id, id) == 0) {
3020  return arsp;
3021  }
3022  }
3023  return NULL;
3024 }
3025 
3026 
3027 /* This function finds the position of a given ID in the sequence list,
3028  * unless the ID is not found in the list, in which case the function returns
3029  * -1.
3030  */
3031 static int
3033 (TAlignRawSeqPtr list,
3034  char * id)
3035 {
3036  TAlignRawSeqPtr arsp;
3037  int offset;
3038 
3039  for (arsp = list, offset = 0; arsp != NULL; arsp = arsp->next, offset++) {
3040  if (strcmp (arsp->id, id) == 0) {
3041  return offset;
3042  }
3043  }
3044  return -1;
3045 }
3046 
3047 
3048 /* This function returns a pointer to the memory in which the ID for the
3049  * Nth sequence is stored, unless there aren't that many sequences, in which
3050  * case NULL is returned.
3051  */
3052 static char *
3054 (TAlignRawSeqPtr list,
3055  int offset)
3056 {
3057  TAlignRawSeqPtr arsp;
3058  int index;
3059 
3060  arsp = list;
3061  index = 0;
3062  while ( arsp != NULL && index != offset ) {
3063  arsp = arsp->next;
3064  index++;
3065  }
3066  if (index == offset && arsp != NULL) {
3067  return arsp->id;
3068  } else {
3069  return NULL;
3070  }
3071 }
3072 
3073 
3074 /* This function adds data to a sequence by looking for the specified ID in
3075  * the list. If the id is not found, a new sequence with that ID is added to
3076  * the end of the list.
3077  * The function returns a pointer to the first item in the list.
3078  */
3079 static TAlignRawSeqPtr
3081 (TAlignRawSeqPtr list,
3082  char * id,
3083  char * data,
3084  int id_line_num,
3085  int data_line_num,
3086  int data_line_offset)
3087 {
3088  TAlignRawSeqPtr arsp;
3089  TIntLinkPtr ilp;
3090 
3091  arsp = s_FindAlignRawSeqById (list, id);
3092  if (arsp == NULL) {
3093  arsp = s_AlignRawSeqNew (list);
3094  if (arsp == NULL) {
3095  return NULL;
3096  }
3097  if (list == NULL) list = arsp;
3098  arsp->id = strdup (id);
3099  }
3100  arsp->sequence_data = s_AddLineInfo (arsp->sequence_data,
3101  data,
3102  data_line_num,
3103  data_line_offset);
3104  ilp = s_IntLinkNew (id_line_num, arsp->id_lines);
3105  if (arsp->id_lines == NULL) arsp->id_lines = ilp;
3106  return list;
3107 }
3108 
3109 
3110 /* This function adds data to the Nth sequence in the sequence list and
3111  * returns eTrue, unless there aren't that many sequences in the list, in
3112  * which case the function returns eFalse.
3113  */
3114 static EBool
3116 (TAlignRawSeqPtr list,
3117  int index,
3118  char * data,
3119  int data_line_num,
3120  int data_line_offset)
3121 {
3122  TAlignRawSeqPtr arsp;
3123  int curr;
3124 
3125  curr = 0;
3126  for (arsp = list; arsp != NULL && curr < index; arsp = arsp->next) {
3127  curr++;
3128  }
3129  if (arsp == NULL) {
3130  return eFalse;
3131  } else {
3132  arsp->sequence_data = s_AddLineInfo (arsp->sequence_data,
3133  data,
3134  data_line_num,
3135  data_line_offset);
3136  return eTrue;
3137  }
3138 }
3139 
3140 
3141 /* This function frees memory associated with the SAlignRawFileData structure.
3142  */
3144 {
3145  if (afrp == NULL) {
3146  return;
3147  }
3148 
3149  s_LineInfoFree (afrp->organisms);
3150  s_LineInfoFree (afrp->deflines);
3151  s_LineInfoFree (afrp->line_list);
3152  s_AlignRawSeqFree (afrp->sequences);
3153  s_IntLinkFree (afrp->offset_list);
3154  free (afrp->alphabet);
3155  free (afrp);
3156 }
3157 
3158 
3159 /* This function allocates memory for an SAlignRawFileData structure and
3160  * initializes its member variables. The function returns a pointer to
3161  * the newly allocated structure.
3162  */
3164 {
3165  SAlignRawFilePtr afrp;
3166 
3167  afrp = (SAlignRawFilePtr)malloc (sizeof (SAlignRawFileData));
3168  if (afrp == NULL) {
3169  return NULL;
3170  }
3171  afrp->marked_ids = eFalse;
3172  afrp->line_list = NULL;
3173  afrp->organisms = NULL;
3174  afrp->num_organisms = 0;
3175  afrp->deflines = NULL;
3176  afrp->num_deflines = 0;
3177  afrp->block_size = 0;
3178  afrp->offset_list = NULL;
3179  afrp->sequences = NULL;
3180  afrp->report_error = NULL;
3181  afrp->report_error_userdata = NULL;
3182  afrp->alphabet = NULL;
3183  afrp->expected_num_sequence = 0;
3184  afrp->expected_sequence_len = 0;
3185  afrp->num_segments = 1;
3186  afrp->align_format_found = eFalse;
3187  return afrp;
3188 }
3189 
3190 
3191 /* The following functions are used to analyze the structure of a file and
3192  * assemble the sequences listed in the file.
3193  * Sequence data in a file is organized in one of two general formats -
3194  * interleaved or contiguous. Interleaved data can be recognized by looking
3195  * for repeated blocks of the same number of lines within a file separated
3196  * by blank or skippable lines from other lines in the file. The first of
3197  * these blocks must have at least two elements separated by whitespace
3198  * in each line, the first of these elements is the ID for the sequence in
3199  * that row and for the sequences in that position within the block for the
3200  * remainder of the file.
3201  * Contiguous data can be recognized by either looking for "marked" sequence
3202  * IDs, which begin with a '>' character, or by looking for repeated patterns
3203  * of lines with the same numbers of characters.
3204  */
3205 
3206 /* The following functions are used to analyze interleaved data. */
3207 
3208 /* This function creates a SLengthListData structure that describes the pattern
3209  * of character lengths in the string pointed to by cp.
3210  */
3211 static SLengthListPtr s_GetBlockPattern (const char * cp)
3212 {
3213  SLengthListPtr this_pattern;
3214  int len;
3215 
3216  this_pattern = s_LengthListNew (NULL);
3217  if (this_pattern == NULL) {
3218  return NULL;
3219  }
3220 
3221  this_pattern->num_appearances = 1;
3222  while (*cp != 0) {
3223  len = strcspn (cp, " \t\r");
3224  s_AddLengthRepeat (this_pattern, len);
3225  cp += len;
3226  cp += strspn (cp, " \t\r");
3227  }
3228  return this_pattern;
3229 }
3230 
3231 
3232 /* This function attempts to predict whether the following lines will be
3233  * an interleaved block. If so, the function returns the location of the
3234  * beginning of the block, otherwise the function returns -1.
3235  */
3236 static int
3238 (SLengthListPtr pattern_list,
3239  TIntLinkPtr next_offset,
3240  int line_start,
3241  int block_size)
3242 {
3243  int line_counter;
3244  SLengthListPtr llp;
3245 
3246  line_counter = line_start;
3247  if (next_offset != NULL
3248  && next_offset->ival - line_counter < block_size) {
3249  return -1;
3250  }
3251 
3252  for (llp = pattern_list;
3253  llp != NULL
3254  && (next_offset == NULL || line_counter < next_offset->ival - 1)
3255  && line_counter - line_start < block_size;
3256  llp = llp->next)
3257  {
3258  if (llp->lengthrepeats == NULL) {
3259  return -1;
3260  }
3261  line_counter += llp->num_appearances;
3262  }
3263  if (line_counter - line_start == block_size) {
3264  /* we've found a combination of groups of similarly sized lines
3265  * that add up to the desired block size - is the next line blank,
3266  * or are there additional non-blank lines?
3267  */
3268  if (llp == NULL /* The block ended with the last line in the file */
3269  || llp->lengthrepeats == NULL) { /* or the next line is blank */
3270  return line_start;
3271  }
3272  }
3273  return -1;
3274 }
3275 
3276 
3277 /* This function looks for malformed blocks between the identified blocks
3278  * indicated by the offset_list. It returns a pointer to the list with the
3279  * new locations inserted at the appropriate locations.
3280  */
3281 static TIntLinkPtr
3283 (SLengthListPtr pattern_list,
3284  TIntLinkPtr offset_list,
3285  int block_size)
3286 {
3287  int line_counter;
3288  SLengthListPtr llp;
3289  TIntLinkPtr next_offset, prev_offset, new_offset;
3290  int forecast_pos;
3291 
3292  prev_offset = NULL;
3293  next_offset = offset_list;
3294  line_counter = 0;
3295  llp = pattern_list;
3296  while (llp != NULL) {
3297  if (next_offset != NULL && line_counter == next_offset->ival) {
3298  prev_offset = next_offset;
3299  next_offset = next_offset->next;
3300  /* skip past the lines for this block */
3301  while (line_counter - prev_offset->ival < block_size
3302  && llp != NULL)
3303  {
3304  line_counter += llp->num_appearances;
3305  llp = llp->next;
3306  }
3307  } else {
3308  forecast_pos = s_ForecastBlockPattern (llp, next_offset,
3309  line_counter,
3310  block_size);
3311  if (forecast_pos > 0) {
3312  new_offset = s_IntLinkNew (forecast_pos, NULL);
3313  if (new_offset == NULL) {
3314  return NULL;
3315  }
3316  if (prev_offset == NULL) {
3317  new_offset->next = offset_list;
3318  offset_list = new_offset;
3319  } else {
3320  new_offset->next = next_offset;
3321  prev_offset->next = new_offset;
3322  }
3323  prev_offset = new_offset;
3324  /* skip past the lines for this block */
3325  while (line_counter - prev_offset->ival < block_size
3326  && llp != NULL)
3327  {
3328  line_counter += llp->num_appearances;
3329  llp = llp->next;
3330  }
3331  } else {
3332  line_counter += llp->num_appearances;
3333  llp = llp->next;
3334  }
3335  }
3336  }
3337  return offset_list;
3338 }
3339 
3340 
3341 /* This function looks for lines that could not be assigned to an interleaved
3342  * block. It returns eTrue if it finds any such lines after the first offset,
3343  * eFalse otherwise, and reports all instances of unused lines as errors.
3344  */
3345 static EBool
3347 (SLengthListPtr pattern_list,
3348  SAlignRawFilePtr afrp)
3349 {
3351  SLengthListPtr llp;
3352  int line_counter;
3353  int block_line_counter;
3354  EBool rval = eFalse;
3355  TLineInfoPtr line_val;
3356  int skip;
3357 
3358  if (pattern_list == NULL || afrp == NULL
3359  || afrp->offset_list == NULL || afrp->block_size < 2) {
3360  return eFalse;
3361  }
3362 
3363  offset = afrp->offset_list;
3364  llp = pattern_list;
3365  line_counter = 0;
3366  line_val = afrp->line_list;
3367 
3368  while (llp != NULL && line_val != NULL) {
3369  while (llp != NULL && line_val != NULL
3370  && (offset == NULL || line_counter < offset->ival)) {
3371  if (llp->lengthrepeats != NULL) {
3372  s_ReportUnusedLine (line_counter,
3373  line_counter + llp->num_appearances - 1,
3374  line_val,
3375  afrp->report_error,
3376  afrp->report_error_userdata);
3377  if (offset != afrp->offset_list) {
3378  rval = eTrue;
3379  }
3380  }
3381  line_counter += llp->num_appearances;
3382  for (skip = 0;
3383  skip < llp->num_appearances && line_val != NULL;
3384  skip++) {
3385  line_val = line_val->next;
3386  }
3387  llp = llp->next;
3388  }
3389  block_line_counter = 0;
3390  while (block_line_counter < afrp->block_size && llp != NULL) {
3391  block_line_counter += llp->num_appearances;
3392  line_counter += llp->num_appearances;
3393  for (skip = 0;
3394  skip < llp->num_appearances && line_val != NULL;
3395  skip++) {
3396  line_val = line_val->next;
3397  }
3398  llp = llp->next;
3399  }
3400  if (offset != NULL) {
3401  offset = offset->next;
3402  }
3403  }
3404  return rval;
3405 }
3406 
3407 
3408 /* This function examines a list of line lengths, looking for interleaved
3409  * blocks. If it finds them, it will set the SAlignRawFileData offset_list
3410  * member variable to point to a list of locations for the blocks.
3411  */
3412 static void
3414 (SLengthListPtr pattern_list,
3415  SAlignRawFilePtr afrp)
3416 {
3417  SLengthListPtr llp, llp_next;
3418  TSizeInfoPtr size_list, best_ptr;
3419  TIntLinkPtr new_offset;
3420  int line_counter;
3421 
3422  afrp->block_size = 0;
3423  size_list = NULL;
3424  afrp->offset_list = NULL;
3425  for (llp = pattern_list; llp != NULL; llp = llp->next) {
3426  llp_next = llp->next;
3427  if (llp->num_appearances > 1
3428  && (llp_next == NULL || llp_next->lengthrepeats == NULL)) {
3429  size_list = s_AddSizeInfo (size_list, llp->num_appearances);
3430  }
3431  }
3432  if (size_list == NULL) {
3433  return;
3434  }
3435  best_ptr = s_GetMostPopularSizeInfo (size_list);
3436  if (best_ptr != NULL
3437  && (best_ptr->num_appearances > 1 ||
3438  (size_list->next == NULL && size_list->size_value > 1))) {
3439  afrp->block_size = best_ptr->size_value;
3440  line_counter = 0;
3441  for (llp = pattern_list; llp != NULL; llp = llp->next) {
3442  llp_next = llp->next;
3443  if (llp->num_appearances == afrp->block_size
3444  && (llp_next == NULL || llp_next->lengthrepeats == NULL))
3445  {
3446  new_offset = s_IntLinkNew (line_counter, afrp->offset_list);
3447  if (new_offset == NULL) {
3448  return;
3449  }
3450  if (afrp->offset_list == NULL) afrp->offset_list = new_offset;
3451  }
3452  line_counter += llp->num_appearances;
3453  }
3454  afrp->offset_list = s_AugmentBlockPatternOffsetList (pattern_list,
3455  afrp->offset_list,
3456  afrp->block_size);
3457  }
3458  if (s_FindUnusedLines (pattern_list, afrp)) {
3459  s_IntLinkFree (afrp->offset_list);
3460  afrp->offset_list = NULL;
3461  afrp->block_size = 0;
3462  } else {
3463  afrp->align_format_found = eTrue;
3464  }
3465  s_SizeInfoFree (size_list);
3466 
3467 }
3468 
/* This function strips leading and trailing whitespace (space, tab,
 * carriage return, newline) from the string that *ppline points to.
 * The string is edited in place: *ppline itself is never changed, nothing
 * is allocated or freed, and the call therefore cannot fail.  A string
 * consisting only of whitespace becomes the empty string.
 * Nothing is done when ppline or *ppline is NULL.
 */
static void s_TrimSpace(char** ppline)
{
    static const char kBlanks[] = " \t\r\n";
    char*  line;
    size_t len;
    size_t lead;

    if (ppline == NULL || *ppline == NULL) {
        return;
    }
    line = *ppline;

    /* drop trailing blanks; indexing by length avoids ever forming a
     * pointer in front of the buffer when the string is empty
     */
    len = strlen (line);
    while (len > 0 && strchr (kBlanks, line [len - 1]) != NULL) {
        len--;
    }
    line [len] = 0;

    /* drop leading blanks by sliding the rest down; the regions overlap,
     * hence memmove, and the +1 carries the terminator
     */
    lead = strspn (line, kBlanks);
    if (lead > 0) {
        memmove (line, line + lead, len - lead + 1);
    }
}
3491 
3492 static EBool
3494  SAlignRawFilePtr afrp,
3495  FReadLineFunction readfunc,
3496  void* pfile)
3497 {
3498  int overall_line_count = 0;
3499  EBool in_taxa_comment = eFalse;
3500  char* linestring = readfunc (pfile);
3501  TLineInfoPtr last_line = NULL, next_line = NULL;
3502 
3503  if (s_IsASN1 (linestring)) {
3505  return eFalse;
3506  }
3507 
3508  while (linestring != NULL && linestring [0] != EOF) {
3509  s_TrimSpace (&linestring);
3510  if (!in_taxa_comment && s_FoundStopLine(linestring)) {
3511  linestring [0] = 0;
3512  }
3513  if (in_taxa_comment) {
3514  if (strncmp (linestring, "end;", 4) == 0) {
3515  in_taxa_comment = eFalse;
3516  }
3517  linestring [0] = 0;
3518  } else if (strncmp (linestring, "begin taxa;", 11) == 0) {
3519  linestring [0] = 0;
3520  in_taxa_comment = eTrue;
3521  afrp->align_format_found = eTrue;
3522  }
3523  next_line = s_LineInfoNew (linestring, overall_line_count, 0);
3524  if (last_line == NULL) {
3525  afrp->line_list = next_line;
3526  } else {
3527  last_line->next = next_line;
3528  }
3529  last_line = next_line;
3530 
3531  free (linestring);
3532  linestring = readfunc (pfile);
3533  overall_line_count ++;
3534  }
3535  return eTrue;
3536 }
3537 
/* Handles one input line while the reader is in FASTA-with-gaps mode.
 *
 *   afrp                     raw-file record; marked_ids and offset_list
 *                            are updated for ID lines
 *   patterns                 head pointer of the length-pattern list (its
 *                            declaration line is missing from this listing;
 *                            the body dereferences it as a list head)
 *   last_line_was_marked_id  in/out flag: was the previous line an ID line
 *   linestr                  text of the current line
 *   overall_line_count       index of the current line
 *
 * Does nothing when patterns or last_line_was_marked_id is NULL.
 *
 * A line starting with '>' is an ID line: afrp->marked_ids becomes eFalse
 * if the previous line was an ID line too, eTrue otherwise, and a link
 * holding overall_line_count + 1 is created with s_IntLinkNew().
 * Any other line is a data line: its block pattern is computed and either
 * merged into the last pattern of the list or appended after it.
 */
3538 static void
3540  SAlignRawFilePtr afrp,
3542  EBool * last_line_was_marked_id,
3543  char* linestr,
3544  int overall_line_count)
3545 {
3546  TIntLinkPtr new_offset = NULL;
3547  SLengthListPtr this_pattern = NULL;
3548  int len = 0;
3549  char* cp;
3550  SLengthListPtr last_pattern;
3551 
3552  if (patterns == NULL || last_line_was_marked_id == NULL) {
3553  return;
3554  }
3555  last_pattern = *patterns;
3556 
3557  /* find last pattern in list */
3558  if (last_pattern != NULL) {
3559  while (last_pattern->next != NULL) {
3560  last_pattern = last_pattern->next;
3561  }
3562  }
3563 
3564  /* ID line
3565  */
3566  if (linestr [0] == '>') {
3567  /* this could be a block of organism lines in a
3568  * NEXUS file. If there is no sequence data between
3569  * the lines, don't process this file for marked IDs.
3570  */
3571  if (*last_line_was_marked_id)
3572  {
3573  afrp->marked_ids = eFalse;
3574 // eFormat = ALNFMT_UNKNOWN;
3575  }
3576  else
3577  {
3578  afrp->marked_ids = eTrue;
3579 // eFormat = ALNFMT_FASTAGAP;
3580  }
      /* the new link is given the existing list; it only becomes the head
       * here when the list was empty (presumably s_IntLinkNew appends
       * otherwise -- TODO confirm) */
3581  new_offset = s_IntLinkNew (overall_line_count + 1,
3582  afrp->offset_list);
3583  if (afrp->offset_list == NULL) afrp->offset_list = new_offset;
3584  *last_line_was_marked_id = eTrue;
3585  return;
3586  }
3587 
3588  /* Data line
3589  */
3590  *last_line_was_marked_id = eFalse;
3591  /* add to length list for interleaved block search */
      /* The pattern is taken from the text after the first
       * whitespace-delimited token; the whole line is used when the line
       * starts with whitespace or holds only one token. */
3592  len = strcspn (linestr, " \t\r");
3593  if (len > 0) {
3594  cp = linestr + len;
3595  len = strspn (cp, " \t\r");
3596  if (len > 0) {
3597  cp = cp + len;
3598  }
3599  if (*cp == 0) {
3600  this_pattern = s_GetBlockPattern (linestr);
3601  } else {
3602  this_pattern = s_GetBlockPattern (cp);
3603  }
3604  } else {
3605  this_pattern = s_GetBlockPattern (linestr);
3606  }
3607 
      /* first pattern becomes the head; a pattern matching the last one only
       * bumps that one's count and is freed; anything else is appended */
3608  if (last_pattern == NULL) {
3609  *patterns = this_pattern;
3610  } else if (s_DoLengthPatternsMatch (last_pattern, this_pattern)) {
3611  last_pattern->num_appearances ++;
3612  s_LengthListFree (this_pattern);
3613  } else {
3614  last_pattern->next = this_pattern;
3615  }
3616 }
3617 
/* Reads the whole input into a new raw alignment-file record and performs
 * the first analysis pass over its lines. (The identifier line of this
 * definition is missing from this listing.)
 *
 *   readfunc, userdata   line reader and its context, handed to
 *                        s_AfrpInitLineData()
 *   sequence_info        supplies the alphabet; also passed to the Nexus
 *                        character-info helpers
 *   use_nexus_file_info  selects s_UpdateNexusCharInfo() (eTrue) or
 *                        s_CheckNexusCharInfo() (eFalse)
 *   errfunc, errdata     error callback and its context, stored in the record
 *   pformat              in/out alignment format; read on every line and set
 *                        to ALNFMT_FASTAGAP or ALNFMT_UNKNOWN on '>' lines
 *
 * Per line the pass: collects organism names, picks up expected sequence
 * count/length, handles Nexus character info, strips bracketed comments
 * (multi-line ones are collected in a separate list), and records a
 * length pattern for the later interleaved-block search. Once *pformat is
 * ALNFMT_FASTAGAP each line is handed to s_AfrpProcessFastaGap() instead.
 *
 * Returns the new record, or NULL when readfunc or sequence_info is NULL,
 * when s_AlignFileRawNew() fails, when s_AfrpInitLineData() returns eFalse,
 * or when more than one segment is found while offset_list is already set.
 *
 * NOTE(review): pformat is dereferenced without a NULL check, and the result
 * of strdup() for the alphabet is not checked.
 */
3618 static SAlignRawFilePtr
3620 (FReadLineFunction readfunc,
3621  void * userdata,
3622  TSequenceInfoPtr sequence_info,
3623  EBool use_nexus_file_info,
3624  FReportErrorFunction errfunc,
3625  void * errdata,
3626  EAlignFormat* pformat)
3627 {
3628  char * linestring;
3629  SAlignRawFilePtr afrp;
3630  int overall_line_count;
3631  EBool found_expected_ntax = eFalse;
3632  EBool found_expected_nchar = eFalse;
3633  EBool found_char_comment = eFalse;
3634  SLengthListPtr pattern_list = NULL;
3635  SLengthListPtr this_pattern, last_pattern = NULL;
3636  char * cp;
3637  size_t len;
3638  TIntLinkPtr new_offset;
3639  EBool in_bracketed_comment = eFalse;
3640  TBracketedCommentListPtr comment_list = NULL, last_comment = NULL;
3641  EBool last_line_was_marked_id = eFalse;
3642  TLineInfoPtr next_line;
3643 
3644  if (readfunc == NULL || sequence_info == NULL) {
3645  return NULL;
3646  }
3647 
3648  afrp = s_AlignFileRawNew ();
3649  if (afrp == NULL) {
3650  return NULL;
3651  }
3652 
3653  afrp->alphabet = strdup (sequence_info->alphabet);
3654  afrp->report_error = errfunc;
3655  afrp->report_error_userdata = errdata;
3656 
      /* load every input line into afrp->line_list before analysing */
3657  if (eFalse == s_AfrpInitLineData(afrp, readfunc, userdata)) {
3658  s_AlignFileRawFree (afrp);
3659  return NULL;
3660  }
3661 
3662  for (next_line = afrp->line_list; next_line != NULL; next_line = next_line->next) {
3663  linestring = next_line->data;
3664  overall_line_count = next_line->line_num-1;
3665 
3666  s_ReadOrgNamesFromText (linestring, overall_line_count, afrp);
      /* once in FASTA-gap mode the rest of this loop body is bypassed */
3667  if (*pformat == ALNFMT_FASTAGAP) {
3668  s_AfrpProcessFastaGap(afrp, & pattern_list, & last_line_was_marked_id, linestring, overall_line_count);
3669  continue;
3670  }
3671  /* we want to remove the comment from the line for the purpose
3672  * of looking for blank lines and skipping,
3673  * but save comments for storing in array if line is not skippable or
3674  * blank
3675  */
3676 
3677  if (! found_expected_ntax || ! found_expected_nchar) {
3678  if (s_IsTwoNumbersSeparatedBySpace (linestring)) {
3679  s_GetFASTAExpectedNumbers (linestring, afrp);
3680  found_expected_ntax = eTrue;
3681  found_expected_nchar = eTrue;
3682  afrp->align_format_found = eTrue;
3683  } else {
3684  s_GetNexusSizeComments (linestring, &found_expected_ntax,
3685  &found_expected_nchar, afrp);
3686  }
3687  }
3688  if (! found_char_comment) {
3689  if (use_nexus_file_info) {
3690  found_char_comment = s_UpdateNexusCharInfo (linestring, sequence_info);
3691  } else {
3692  found_char_comment = s_CheckNexusCharInfo (linestring, sequence_info,
3693  afrp->report_error,
3694  afrp->report_error_userdata);
3695  }
3696  }
3697 
3698  /* remove complete single-line bracketed comments from line
3699  *before checking for multiline bracketed comments */
3700  s_RemoveCommentFromLine (linestring);
3701 
      /* Multi-line bracketed comments: every line of one is appended to the
       * current comment record and then blanked; the comment ends on the
       * first line that contains ']'. */
3702  if (in_bracketed_comment) {
3703  len = strspn (linestring, " \t\r\n");
3704  if (last_comment != NULL)
3705  {
3706  s_BracketedCommentListAddLine (last_comment, linestring + len,
3707  overall_line_count, len);
3708  }
3709  if (strchr (linestring, ']') != NULL) {
3710  in_bracketed_comment = eFalse;
3711  }
3712  linestring [0] = 0;
3713  } else if (linestring [0] == '[' && strchr (linestring, ']') == NULL) {
3714  in_bracketed_comment = eTrue;
      /* the line starts with '[', so this strspn() always yields 0 */
3715  len = strspn (linestring, " \t\r\n");
3716  last_comment = s_BracketedCommentListNew (comment_list,
3717  linestring + len,
3718  overall_line_count, len);
3719  if (comment_list == NULL)
3720  {
3721  comment_list = last_comment;
3722  }
3723  linestring [0] = 0;
3724  }
3725 
3726  if (!afrp->align_format_found) {
3727  afrp->align_format_found = s_IsAlnFormatString (linestring);
3728  }
3729  if (s_SkippableString (linestring)) {
3730  linestring[0] = 0;
3731  }
3732 
3733  /* "junk" line: Just record the empty pattern to keep line counts in sync.
3734  */
3735  if (linestring[0] == 0) {
3736  last_line_was_marked_id = eFalse;
3737  this_pattern = s_GetBlockPattern ("");
3738  if (pattern_list == NULL) {
3739  pattern_list = this_pattern;
3740  last_pattern = this_pattern;
3741  } else {
3742  last_pattern->next = this_pattern;
3743  last_pattern = this_pattern;
3744  }
3745  continue;
3746  }
3747 
3748  /* Presumably fasta ID:
3749  */
3750  if (linestring [0] == '>') {
3751  /* this could be a block of organism lines in a
3752  * NEXUS file. If there is no sequence data between
3753  * the lines, don't process this file for marked IDs.
3754  */
3755  if (last_line_was_marked_id)
3756  {
3757  afrp->marked_ids = eFalse;
3758  *pformat = ALNFMT_UNKNOWN;
3759  }
3760  else
3761  {
      /* first '>' after non-ID content: switch to FASTA-gap mode and
       * let the helper record this ID line */
3762  *pformat = ALNFMT_FASTAGAP;
3763  s_AfrpProcessFastaGap(afrp, & pattern_list, & last_line_was_marked_id, linestring, overall_line_count);
3764  continue;
3765  }
3766  new_offset = s_IntLinkNew (overall_line_count + 1,
3767  afrp->offset_list);
3768  if (afrp->offset_list == NULL) afrp->offset_list = new_offset;
3769  last_line_was_marked_id = eTrue;
3770  continue;
3771  }
3772 
3773  /* default case: some real data at last ...
3774  */
3775  last_line_was_marked_id = eFalse;
3776  /* add to length list for interleaved block search */
      /* pattern comes from the text after the first whitespace-delimited
       * token, or from the whole line when there is no second token */
3777  len = strcspn (linestring, " \t\r");
3778  if (len > 0) {
3779  cp = linestring + len;
3780  len = strspn (cp, " \t\r");
3781  if (len > 0) {
3782  cp = cp + len;
3783  }
3784  if (*cp == 0) {
3785  this_pattern = s_GetBlockPattern (linestring);
3786  } else {
3787  this_pattern = s_GetBlockPattern (cp);
3788  }
3789  } else {
3790  this_pattern = s_GetBlockPattern (linestring);
3791  }
3792 
3793  if (pattern_list == NULL) {
3794  pattern_list = this_pattern;
3795  last_pattern = this_pattern;
3796  } else if (s_DoLengthPatternsMatch (last_pattern, this_pattern)) {
3797  last_pattern->num_appearances ++;
3798  s_LengthListFree (this_pattern);
3799  } else {
3800  last_pattern->next = this_pattern;
3801  last_pattern = this_pattern;
3802  }
3803  }
      /* Segment handling: with more than one segment the offsets must come
       * from the bracketed comments, so offsets already collected from '>'
       * lines are treated as an error (the start of the report call is
       * missing from this listing). */
3804  afrp->num_segments = s_GetNumSegmentsInAlignment (comment_list, errfunc, errdata);
3805  if (afrp->num_segments > 1)
3806  {
3807  if (afrp->offset_list != NULL)
3808  {
3810  errfunc, errdata);
3811  s_AlignFileRawFree (afrp);
3812  s_LengthListFree (pattern_list);
3813  s_BracketedCommentListFree (comment_list);
3814  return NULL;
3815  }
3816  else
3817  {
3818  afrp->offset_list = GetSegmentOffsetList (comment_list);
3819  afrp->marked_ids = eTrue;
3820  }
3821  }
      /* without marked IDs the block offsets are derived from the patterns */
3822  if (! afrp->marked_ids) {
3823  s_FindInterleavedBlocks (pattern_list, afrp);
3824  }
3825  s_LengthListFree (pattern_list);
3826  s_BracketedCommentListFree (comment_list);
3827  return afrp;
3828 }
3829 
3830 
3831 /* This function analyzes a block to see if it contains, as the first element
3832  * of any of its lines, one of the sequence IDs already identified. If
3833  * one of the lines does begin with a sequence ID, all of the lines are
3834  * assumed to begin with sequence IDs and the function returns eTrue, otherwise
3835  * the function returns eFalse.
3836  */
/* Details:
 *   afrp                record whose sequences list is searched
 *   first_line          first line of the block
 *   num_lines_in_block  maximum number of lines examined
 *
 * Returns eTrue straight away when afrp->sequences is NULL (nothing has been
 * recorded yet). Otherwise only lines whose first token is non-empty and is
 * followed by more text are considered; the token is looked up with
 * s_FindAlignRawSeqById() and the first hit returns eTrue.
 * Returns eFalse when no line matches or when the temporary ID buffer
 * cannot be allocated.
 * afrp itself is dereferenced without a NULL check.
 */
3837 static EBool
3839 (SAlignRawFilePtr afrp,
3840  TLineInfoPtr first_line,
3841  int num_lines_in_block)
3842 {
3843  TLineInfoPtr lip;
3844  char * linestring;
3845  char * this_id;
3846  TAlignRawSeqPtr arsp;
3847  size_t len;
3848  int block_offset;
3849 
3850  if (afrp->sequences == NULL) {
3851  return eTrue;
3852  }
3853 
3854  for (lip = first_line, block_offset = 0;
3855  lip != NULL && block_offset < num_lines_in_block;
3856  lip = lip->next, block_offset++)
3857  {
3858  linestring = lip->data;
3859  if (linestring != NULL) {
3860  len = strcspn (linestring, " \t\r");
      /* a token spanning the whole line cannot be "ID + data" */
3861  if (len > 0 && len < strlen (linestring)) {
3862  this_id = (char *) malloc (len + 1);
3863  if (this_id == NULL) {
3864  return eFalse;
3865  }
3866  strncpy (this_id, linestring, len);
3867  this_id [len] = 0;
3868  arsp = s_FindAlignRawSeqById (afrp->sequences, this_id);
3869  free (this_id);
3870  if (arsp != NULL) {
3871  return eTrue;
3872  }
3873  }
3874  }
3875  }
3876  return eFalse;
3877 }
3878 
3879 
3880 /* This function analyzes the lines of the block to see if the pattern of
3881  * the lengths of the whitespace-separated pieces of sequence data matches
3882  * for all lines within the block. The function returns eTrue if this is so,
3883  * otherwise the function returns eFalse.
3884  */
/* Details:
 *   afrp                record providing the known sequences and the error
 *                       callback
 *   first_line          first line of the block
 *   num_lines_in_block  number of lines in the block
 *   has_ids             eTrue when each line starts with a sequence ID
 *   first_block         eTrue for the first block of the file; ID-order
 *                       mismatches are not reported for it
 *
 * Works in two passes. Pass 1 collects the length pattern of every line
 * (after the ID, if any) with s_AddLengthList() and, when has_ids is set,
 * checks that each ID sits at the same position as in afrp->sequences.
 * The pattern with the highest num_appearances is then chosen, and pass 2
 * reports every line whose pattern does not match it.
 *
 * Returns eTrue when no inconsistency was reported, eFalse otherwise and
 * also when the temporary ID buffer cannot be allocated.
 *
 * NOTE(review):
 *  - In the PHYLIP case len is forced to 10 even if the line is shorter, so
 *    "cp += len" can move past the string terminator -- verify.
 *  - The early returns on malloc failure do not free `list` (nor, in the
 *    second pass, `this_pattern` is yet allocated, but `list` still is).
 *  - lip->data is used without a NULL check.
 *  - The identifier line and the start of the error-report call in the
 *    second pass are missing from this listing.
 */
3885 static EBool
3887 (SAlignRawFilePtr afrp,
3888  TLineInfoPtr first_line,
3889  int num_lines_in_block,
3890  EBool has_ids,
3891  EBool first_block)
3892 {
3893  TLineInfoPtr lip;
3894  SLengthListPtr list, this_pattern, best;
3895  int len, block_offset, id_offset;
3896  char * tmp_id;
3897  EBool rval;
3898  char * cp;
3899 
3900  rval = eTrue;
3901  list = NULL;
      /* pass 1: check ID order and tally the per-line patterns */
3902  for (lip = first_line, block_offset = 0;
3903  lip != NULL && block_offset < num_lines_in_block;
3904  lip = lip->next, block_offset ++)
3905  {
3906  cp = lip->data;
3907  if (has_ids) {
3908  len = strcspn (cp, " \t\r");
3909  if (first_block && len == strlen (cp)) {
3910  /* PHYLIP IDs are exactly 10 characters long
3911  * and may not have a space between the ID and
3912  * the sequence.
3913  */
3914  len = 10;
3915  }
3916  tmp_id = (char *) malloc ( (len + 1) * sizeof (char));
3917  if (tmp_id == NULL) {
3918  return eFalse;
3919  }
3920  strncpy (tmp_id, cp, len);
3921  tmp_id [len] = 0;
3922  id_offset = s_FindAlignRawSeqOffsetById (afrp->sequences, tmp_id);
3923  if (id_offset != block_offset && ! first_block) {
3924  rval = eFalse;
3925  s_ReportInconsistentID (tmp_id, lip->line_num,
3926  afrp->report_error,
3927  afrp->report_error_userdata);
3928  }
3929  free (tmp_id);
      /* skip the ID and the whitespace after it */
3930  cp += len;
3931  cp += strspn (cp, " \t\r");
3932  }
3933  this_pattern = s_GetBlockPattern (cp);
3934  list = s_AddLengthList (list, this_pattern);
3935  }
3936 
3937  /* Now find the pattern with the most appearances */
3938  best = NULL;
3939  for (this_pattern = list;
3940  this_pattern != NULL;
3941  this_pattern = this_pattern->next)
3942  {
3943  if (this_pattern->num_appearances == 0) continue;
3944  if (best == NULL
3945  || this_pattern->num_appearances > best->num_appearances)
3946  {
3947  best = this_pattern;
3948  }
3949  }
3950 
3951  /* now identify and report inconsistent lines */
3952  for (lip = first_line, block_offset = 0;
3953  lip != NULL && block_offset < num_lines_in_block;
3954  lip = lip->next, block_offset ++)
3955  {
3956  cp = lip->data;
3957  if (has_ids) {
3958  len = strcspn (cp, " \t\r");
3959  if (first_block && len == strlen (cp)) {
3960  /* PHYLIP IDs are exactly 10 characters long
3961  * and may not have a space between the ID and
3962  * the sequence.
3963  */
3964  len = 10;
3965  }
3966  tmp_id = (char *) malloc ( (len + 1) * sizeof (char));
3967  if (tmp_id == NULL) {
3968  return eFalse;
3969  }
3970  strncpy (tmp_id, cp, len);
3971  tmp_id [len] = 0;
3972  cp += len;
3973  cp += strspn (cp, " \t\r");
3974  } else {
      /* ID comes from the sequence list; it is not freed below, so it is
       * presumably owned by that list -- TODO confirm */
3975  tmp_id = s_GetAlignRawSeqIDByOffset (afrp->sequences, block_offset);
3976  }
3977  this_pattern = s_GetBlockPattern (cp);
3978  if ( ! s_DoLengthPatternsMatch (this_pattern, best)) {
3979  rval = eFalse;
3981  afrp->report_error,
3982  afrp->report_error_userdata);
3983  }
3984  s_LengthListFree (this_pattern);
3985  if (has_ids) {
3986  free (tmp_id);
3987  }
3988  }
3989  s_LengthListFree (list);
3990  return rval;
3991 }
3992 
3993 
3994 /* This function processes a block of lines and adds the sequence data from
3995  * each line in the block to the appropriate sequence in the list.
3996  */
/* Details:
 *   afrp                record whose sequences list receives the data
 *   lines               first line of the block
 *   num_lines_in_block  number of lines in the block
 *   first_block         eTrue for the first block; enables the PHYLIP
 *                       10-character ID rule and the duplicate-ID check
 *
 * s_DoesBlockHaveIds() decides how lines are attributed: by the ID at the
 * start of each line, or by the line's position within the block.
 * s_BlockIsConsistent() is called for the errors it reports; its return
 * value is not used. NULL data lines are skipped. The function returns
 * early, without reporting, if the temporary ID buffer cannot be allocated.
 *
 * NOTE(review): in the PHYLIP case len is forced to 10 even if the line is
 * shorter, so "cp = linestring + len" can point past the terminator --
 * verify. The identifier line and the start of the error-report call in the
 * by-index branch are missing from this listing.
 */
3997 static void
3999 (SAlignRawFilePtr afrp,
4000  TLineInfoPtr lines,
4001  int num_lines_in_block,
4002  EBool first_block)
4003 {
4004  TLineInfoPtr lip;
4005  char * linestring;
4006  char * cp;
4007  char * this_id;
4008  int len;
4009  int line_number;
4010  EBool this_block_has_ids;
4011  TAlignRawSeqPtr arsp;
4012 
4013  this_block_has_ids = s_DoesBlockHaveIds (afrp, lines, num_lines_in_block);
4014  s_BlockIsConsistent (afrp, lines, num_lines_in_block, this_block_has_ids,
4015  first_block);
4016  for (lip = lines, line_number = 0;
4017  lip != NULL && line_number < num_lines_in_block;
4018  lip = lip->next, line_number ++)
4019  {
4020  linestring = lip->data;
4021  if (linestring != NULL) {
4022  if (this_block_has_ids) {
4023  len = strcspn (linestring, " \t\r");
4024  if (first_block && len == strlen (linestring)) {
4025  /* PHYLIP IDs are exactly ten characters long,
4026  * and may not have a space before the start of
4027  * the sequence data.
4028  */
4029  len = 10;
4030  }
4031  this_id = (char *) malloc (len + 1);
4032  if (this_id == NULL) {
4033  return;
4034  }
4035  strncpy (this_id, linestring, len);
4036  this_id [len] = 0;
      /* cp ends up at the first data character after the ID */
4037  cp = linestring + len;
4038  len = strspn (cp, " \t\r");
4039  cp += len;
4040 
4041  /* Check for duplicate IDs in the first block */
4042  if (first_block)
4043  {
4044  arsp = s_FindAlignRawSeqById (afrp->sequences, this_id);
4045  if (arsp != NULL)
4046  {
4047  s_ReportDuplicateIDError (this_id, lip->line_num,
4048  afrp->report_error,
4049  afrp->report_error_userdata);
4050  }
4051  }
      /* the data is still added after a duplicate has been reported;
       * the last argument is the column at which the data starts */
4052  afrp->sequences = s_AddAlignRawSeqById (afrp->sequences,
4053  this_id, cp,
4054  lip->line_num,
4055  lip->line_num,
4056  lip->line_offset + cp - linestring);
4057  free (this_id);
4058  } else {
      /* no IDs in this block: attribute the line by its position */
4059  if (! s_AddAlignRawSeqByIndex (afrp->sequences, line_number,
4060  linestring,
4061  lip->line_num, lip->line_offset))
4062  {
4064  afrp->block_size,
4065  line_number,
4066  afrp->report_error,
4067  afrp->report_error_userdata);
4068  }
4069  }
4070  }
4071  }
4072 }
4073 
4074 
4075 /* This function removes comments from the lines of an interleaved block of
4076  * data.
4077  */
/* Details:
 *   first_line          first line of the block
 *   num_lines_in_block  intended number of lines to visit
 *
 * The identifier line and the single statement of the loop body are missing
 * from this listing.
 *
 * NOTE(review): block_offset is initialised to 0 and never incremented in
 * the visible loop header, so the bound "block_offset < num_lines_in_block"
 * never changes. Unless the missing body line advances it, the loop visits
 * every line up to the end of the list whenever num_lines_in_block > 0 --
 * confirm against the full source.
 */
4078 static void
4080 (TLineInfoPtr first_line,
4081  int num_lines_in_block)
4082 {
4083  TLineInfoPtr lip;
4084  int block_offset;
4085 
4086  for (lip = first_line, block_offset = 0;
4087  lip != NULL && block_offset < num_lines_in_block;
4088  lip = lip->next)
4089  {
4091  }
4092 }
4093 
4094 
4095 /* This function processes the interleaved block of data found at each
4096  * location listed in afrp->offset_list.
4097  */
4099 {
      /* The signature line is missing from this listing; the body uses a
       * single record pointer named afrp and returns nothing.
       *
       * Walks afrp->line_list with a running line index. Whenever the index
       * equals the value of the current entry of afrp->offset_list, a block
       * of afrp->block_size lines starting there is handed to
       * s_ProcessBlockLines(), and the walk moves to the next offset. Only
       * the first block processed is flagged as first_block.
       *
       * The walk ends at the end of either list, or at a line accepted by
       * s_FoundStopLine() unless that line lies inside a "begin taxa;" ...
       * "end;" section.
       *
       * NOTE(review): inside a taxa section lip->data is passed to strncmp()
       * without the NULL check used in the other branch. One statement
       * before the s_ProcessBlockLines() call is also missing here.
       */
4100  int line_counter;
4101  TIntLinkPtr offset_ptr;
4102  TLineInfoPtr lip;
4103  EBool first_block = eTrue;
4104  EBool in_taxa_comment = eFalse;
4105 
4106  if (afrp == NULL) {
4107  return;
4108  }
4109 
4110  line_counter = 0;
4111  offset_ptr = afrp->offset_list;
4112  lip = afrp->line_list;
4113  while (lip != NULL && offset_ptr != NULL
4114  && (in_taxa_comment || ! s_FoundStopLine (lip->data))) {
4115  if (in_taxa_comment) {
4116  if (strncmp (lip->data, "end;", 4) == 0) {
4117  in_taxa_comment = eFalse;
4118  }
4119  } else if (lip->data != NULL
4120  && strncmp (lip->data, "begin taxa;", 11) == 0) {
4121  in_taxa_comment = eTrue;
4122  }
4123  if (line_counter == offset_ptr->ival) {
4125  s_ProcessBlockLines (afrp, lip, afrp->block_size, first_block);
4126  first_block = eFalse;
4127  offset_ptr = offset_ptr->next;
4128  }
4129  lip = lip->next;
4130  line_counter ++;
4131  }
4132 }
4133 
4134 
4135 /* The following functions are used to analyze contiguous data. */
4136 
/* Attaches contiguous sequence data to IDs, using one anchor pattern per
 * segment to decide how many lines belong to each ID. (The identifier line
 * of this definition is missing from this listing.)
 *
 *   token_list     lines to walk
 *   offset_list    one entry per sequence; the ID line is the line at index
 *                  ival - 1
 *   anchorpattern  array with one pattern per segment
 *                  (afrp->num_segments entries are read)
 *   afrp           record receiving the sequences and providing the error
 *                  callback
 *   gen_local_ids  when set, each ID line is replaced by
 *                  "lcl|<n> <original text without its first character>"
 *
 * Returns without doing anything when a pointer argument is NULL or when
 * any segment's pattern is NULL or has no lengthrepeats.
 *
 * For each offset the lines after the ID are consumed according to the
 * segment's lengthrepeats (num_appearances lines per entry), never going
 * past the next ID line. Lines starting with '[' or ']' are consumed but
 * not added. A line whose length differs from the expected size_value is
 * reported and still added. If pattern entries remain when consumption
 * stops, a block-length error is reported.
 *
 * NOTE(review):
 *  - next_local_id is a function-level static, so numbering continues
 *    across calls for the lifetime of the process.
 *  - The malloc() result for replacement_id is used without a NULL check.
 */
4137 static void
4139 (TLineInfoPtr token_list,
4140  TIntLinkPtr offset_list,
4141  SLengthListPtr * anchorpattern,
4142  SAlignRawFilePtr afrp,
4143  EBool gen_local_ids)
4144 {
4145  TLineInfoPtr lip;
4146  int line_counter;
4147  TIntLinkPtr offset_ptr, next_offset_ptr;
4148  char * curr_id;
4149  TSizeInfoPtr sip;
4150  int pattern_line_counter;
4151  int curr_seg;
4152 
4153  static int next_local_id = 1;
4154 
4155  if (token_list == NULL || offset_list == NULL
4156  || anchorpattern == NULL
4157  || afrp == NULL)
4158  {
4159  return;
4160  }
4161  for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
4162  {
4163  if (anchorpattern [curr_seg] == NULL || anchorpattern [curr_seg]->lengthrepeats == NULL)
4164  {
4165  return;
4166  }
4167  }
4168 
4169  line_counter = 0;
4170  lip = token_list;
4171  curr_seg = 0;
4172 
4173  for (offset_ptr = offset_list;
4174  offset_ptr != NULL && lip != NULL;
4175  offset_ptr = offset_ptr->next)
4176  {
4177  next_offset_ptr = offset_ptr->next;
      /* advance to the ID line of this sequence */
4178  while (line_counter < offset_ptr->ival - 1 && lip != NULL) {
4179  lip = lip->next;
4180  line_counter ++;
4181  }
4182  if (lip != NULL) {
4183  if (gen_local_ids) {
      /* 32 spare bytes cover "lcl|", the printed counter, the space
       * and the terminator */
4184  char * replacement_id = (char *) malloc(32 +strlen(lip->data));
4185  sprintf(replacement_id, "lcl|%d %s", next_local_id++, lip->data+1);
4186  free(lip->data);
4187  lip->data = replacement_id;
4188  }
4189  curr_id = lip->data;
4190  lip = lip->next;
4191  line_counter ++;
      /* consume data lines as dictated by the segment's pattern, stopping
       * before the next sequence's ID line */
4192  for (sip = anchorpattern[curr_seg]->lengthrepeats;
4193  sip != NULL
4194  && lip != NULL
4195  && (next_offset_ptr == NULL
4196  || line_counter < next_offset_ptr->ival - 1);
4197  sip = sip->next)
4198  {
4199  for (pattern_line_counter = 0;
4200  pattern_line_counter < sip->num_appearances
4201  && lip != NULL
4202  && (next_offset_ptr == NULL
4203  || line_counter < next_offset_ptr->ival - 1);
4204  pattern_line_counter ++)
4205  {
4206  if (lip->data [0] != ']' && lip->data [0] != '[') {
4207  if ((int) strlen (lip->data) != sip->size_value) {
4208  s_ReportLineLengthError (curr_id, lip,
4209  sip->size_value,
4210  afrp->report_error,
4211  afrp->report_error_userdata);
4212  }
4213  afrp->sequences = s_AddAlignRawSeqById (afrp->sequences,
4214  curr_id,
4215  lip->data,
4216  lip->line_num,
4217  lip->line_num,
4218  lip->line_offset);
4219  }
4220  lip = lip->next;
4221  line_counter ++;
4222  }
4223  }
      /* pattern entries left over with lines still available */
4224  if (sip != NULL && lip != NULL) {
4225  s_ReportBlockLengthError (curr_id, lip->line_num,
4226  afrp->block_size,
4227  line_counter - offset_ptr->ival,
4228  afrp->report_error,
4229  afrp->report_error_userdata);
4230  }
4231  }
      /* segments are used round-robin */
4232  curr_seg ++;
4233  if (curr_seg >= afrp->num_segments)
4234  {
4235  curr_seg = 0;
4236  }
4237  }
4238 }
4239 
4240 
4241 /* The following functions are used for analyzing contiguous data with
4242  * marked IDs.
4243  */
4244 
4245 /* This function creates a new LengthList pattern for each marked ID.
4246  * After each new list is created, the function checks to see if the
4247  * new pattern matches any pattern already in the list of patterns seen.
4248  * If so, the function deletes the new pattern and increments
4249  * num_appearances for the pattern in the list, otherwise the function
4250  * adds the new pattern to the list.
4251  * When the list is complete, the function finds the pattern with the
4252  * most appearances and returns that pattern as the anchor pattern to use
4253  * when checking sequence data blocks for consistency with one another.
4254  */
/* Details:
 *   afrp  record providing line_list and num_segments
 *
 * Returns a malloc'ed array of afrp->num_segments + 1 pattern pointers, one
 * anchor pattern per segment followed by a NULL entry; the caller owns the
 * array and the patterns in it. Returns NULL when afrp is NULL, when
 * num_segments < 1, when an allocation fails, or when any segment ends up
 * without an anchor pattern.
 *
 * Lines are scanned up to the first one accepted by s_FoundStopLine().
 * Each '>' line starts a new pattern and files the previous one under the
 * current segment (segments are used round-robin). Lines starting with
 * '[' or ']' and NULL lines are ignored. Any other line adds the length of
 * its text, after leading whitespace and digits, to the current pattern.
 *
 * NOTE(review): when the allocation of `best` fails the function returns
 * without freeing `list`.
 */
4255 static SLengthListPtr *
4257 (SAlignRawFilePtr afrp)
4258 {
4259  SLengthListPtr * list;
4260  SLengthListPtr * best;
4261  SLengthListPtr this_pattern;
4262  char * cp;
4263  TLineInfoPtr lip;
4264  int curr_seg;
4265 
4266  if (afrp == NULL || afrp->num_segments < 1) {
4267  return NULL;
4268  }
4269 
4270  /* initialize length lists */
4271  list = (SLengthListPtr *) malloc (afrp->num_segments * sizeof (SLengthListPtr));
4272  if (list == NULL)
4273  {
4274  return NULL;
4275  }
4276  for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
4277  {
4278  list[curr_seg] = NULL;
4279  }
4280  /* initialize best ptrs */
4281  /* list is one element longer, to hold null terminator */
4282  best = (SLengthListPtr *) malloc ((afrp->num_segments + 1) * sizeof (SLengthListPtr));
4283  if (best == NULL)
4284  {
4285  return NULL;
4286  }
4287  for (curr_seg = 0; curr_seg < afrp->num_segments + 1; curr_seg ++)
4288  {
4289  best[curr_seg] = NULL;
4290  }
4291 
4292  /* initialize pattern */
4293  this_pattern = NULL;
4294 
4295  curr_seg = 0;
4296  for (lip = afrp->line_list;
4297  lip != NULL && ! s_FoundStopLine (lip->data);
4298  lip = lip->next)
4299  {
4300  if (lip->data == NULL) continue;
4301  if (lip->data [0] == ']' || lip->data [0] == '[') continue;
4302  if (lip->data [0] == '>') {
      /* a new ID closes the pattern collected for the previous one */
4303  if (this_pattern != NULL) {
4304  list [curr_seg] = s_AddLengthList (list [curr_seg], this_pattern);
4305  curr_seg ++;
4306  if (curr_seg >= afrp->num_segments)
4307  {
4308  curr_seg = 0;
4309  }
4310  }
4311  this_pattern = s_LengthListNew (NULL);
4312  if (this_pattern == NULL) {
4313  for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
4314  {
4315  s_LengthListFree (list [curr_seg]);
4316  }
4317  free (list);
4318  free (best);
4319  return NULL;
4320  }
4321  this_pattern->num_appearances = 1;
4322  } else if (this_pattern != NULL) {
4323  /* This section gets rid of base pair number comments */
4324  cp = lip->data;
4325  while ( isspace ((unsigned char)*cp) || isdigit ((unsigned char)*cp)) {
4326  cp++;
4327  }
4328  s_AddLengthRepeat (this_pattern, strlen (cp));
4329  }
4330  }
      /* file the pattern of the last ID */
4331  if (this_pattern != NULL) {
4332  list[curr_seg] = s_AddLengthList (list [curr_seg], this_pattern);
4333  }
4334 
4335  /* Now find the pattern with the most appearances for each segment*/
4336  for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg++)
4337  {
4338  for (this_pattern = list [curr_seg];
4339  this_pattern != NULL;
4340  this_pattern = this_pattern->next)
4341  {
4342  if (this_pattern->num_appearances == 0) continue;
4343  if (best [curr_seg] == NULL
4344  || this_pattern->num_appearances > best[curr_seg]->num_appearances)
4345  {
4346  best[curr_seg] = this_pattern;
4347  }
4348 
4349  }
4350 
4351  /* free all patterns before and after anchor pattern */
4352  if (best [curr_seg] != NULL) {
4353  s_LengthListFree (best [curr_seg]->next);
4354  best [curr_seg]->next = NULL;
4355  }
4356 
      /* detach the chain in front of the anchor before freeing it, so the
       * anchor itself survives */
4357  if (best [curr_seg] != list [curr_seg]) {
4358  this_pattern = list [curr_seg];
4359  while ( this_pattern != NULL && this_pattern->next != best[curr_seg] ) {
4360  this_pattern = this_pattern->next;
4361  }
4362  if (this_pattern != NULL) {
4363  this_pattern->next = NULL;
4364  s_LengthListFree (list [curr_seg]);
4365  }
4366  }
4367  }
4368 
4369  /* free list. note that all of the elements of list that are not pointed to by best
4370  have already been freed*/
4371  free (list);
4372 
4373 
      /* every segment needs an anchor; otherwise release everything. The
       * inner loop reuses curr_seg, which is harmless only because the
       * function returns right after it. */
4374  for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
4375  {
4376  if (best[curr_seg] == NULL)
4377  {
4378  for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
4379  {
4380  s_LengthListFree (best [curr_seg]);
4381  }
4382  free (best);
4383  return NULL;
4384  }
4385  }
4386 
4387  return best;
4388 }
4389 
4390 
4391 /* This function removes base pair count comments from the data sections
4392  * for contiguous marked ID sequences.
4393  */
4395 {
      /* The signature line is missing from this listing; the body uses a
       * single record pointer named afrp and returns nothing.
       *
       * For every entry of afrp->offset_list, the lines from index ival up
       * to (but not including) index next->ival - 1 -- or to the end of the
       * list for the last entry -- have any leading run of whitespace and
       * decimal digits removed in place. Lines outside those ranges are
       * left untouched. Does nothing when afrp or its offset_list is NULL.
       *
       * NOTE(review): strcpy() is called with a source that points into the
       * destination buffer; the C standard leaves copying between
       * overlapping objects undefined, so memmove() would be the safe
       * call -- verify.
       */
4396  TIntLinkPtr this_offset, next_offset;
4397  TLineInfoPtr lip;
4398  int line_count;
4399  char * cp;
4400 
4401  if (afrp == NULL || afrp->offset_list == NULL) {
4402  return;
4403  }
4404  this_offset = afrp->offset_list;
4405  next_offset = this_offset->next;
4406  lip = afrp->line_list;
4407  line_count = 0;
4408  while (lip != NULL && this_offset != NULL) {
4409  if (line_count == this_offset->ival) {
      /* data section of the current sequence */
4410  while (lip != NULL &&
4411  (next_offset == NULL
4412  || line_count < next_offset->ival - 1)) {
4413  cp = lip->data;
4414  if (cp != NULL) {
4415  cp += strspn (cp, " \t\r\n1234567890");
4416  if (cp != lip->data) {
4417  strcpy (lip->data, cp);
4418  }
4419  }
4420  line_count ++;
4421  lip = lip->next;
4422  }
4423  this_offset = this_offset->next;
4424  if (this_offset != NULL) {
4425  next_offset = this_offset->next;
4426  }
4427  } else {
4428  line_count ++;
4429  lip = lip->next;
4430  }
4431  }
4432 }
4433 
4434 
4435 /* This function assumes that the offset_list has already been populated
4436  * with the locations of the data blocks. It analyzes the blocks of data
4437  * to find the most frequently occurring pattern of lengths of data and then
4438  * uses that pattern to attach the data to the correct IDs and report any
4439  * errors in formatting.
4440  */
4442  SAlignRawFilePtr afrp,
4443  EBool gen_local_ids)
4444 {
      /* The first line of the signature and two statements of the body
       * (listing lines 4451 and 4456) are missing from this listing. The
       * second missing statement is a call that receives anchorpattern,
       * afrp and gen_local_ids as its last arguments.
       *
       *   afrp           record to process; nothing happens when it is NULL
       *   gen_local_ids  forwarded unchanged to that call
       *
       * NOTE(review):
       *  - When s_CreateAnchorPatternForMarkedIDs() succeeds but
       *    afrp->offset_list is NULL, the function returns without
       *    releasing anchorpattern.
       *  - Only *anchorpattern (the first element) is passed to
       *    s_LengthListFree(), although the array holds one pattern per
       *    segment; with more than one segment the others look leaked --
       *    verify.
       */
4445  SLengthListPtr * anchorpattern;
4446 
4447  if (afrp == NULL) {
4448  return;
4449  }
4450 
4452  anchorpattern = s_CreateAnchorPatternForMarkedIDs (afrp);
4453  if (anchorpattern == NULL || afrp->offset_list == NULL) {
4454  return;
4455  }
4457  anchorpattern, afrp, gen_local_ids);
4458  s_LengthListFree(*anchorpattern);
4459  free(anchorpattern);
4460 }
4461 
4462 
4463 /* The following functions are used for analyzing contiguous sequence data
4464  * without marked IDs.
4465  */
4466 
4467 /* This function left-shifts a string, character by character. */
/* Details:
 *   cp_from  destination: where the shifted text will start
 *   cp_to    source: start of the text to move, up to its terminator
 *
 * Note that the parameter names read the other way round from their roles:
 * characters are copied from cp_to to cp_from. The copy runs forward one
 * character at a time and then writes a terminator, so it is safe for the
 * intended use where cp_from lies before cp_to in the same buffer.
 * Does nothing when the pointers are equal or either one is NULL.
 */
4468 static void
4470 (char * cp_from,
4471  char * cp_to)
4472 {
4473  if (cp_from == cp_to || cp_from == NULL || cp_to == NULL) {
4474  return;
4475  }
4476  while (*cp_to != 0) {
4477  *cp_from = *cp_to;
4478  cp_from++;
4479  cp_to++;
4480  }
4481  *cp_from = 0;
4482 }
4483 
4484 
4485 /* This function removes bracketed comments from a linked list of
4486  * SLineInfo structures. The function returns a pointer to the
4487  * list without the comments.
4488  */
4490 {
      /* The signature line is missing from this listing; the body takes the
       * head of a line list named `list` and returns the head that
       * s_DeleteLineInfos() hands back.
       *
       * Per line:
       *  - a NULL data pointer marks the line for deletion;
       *  - each ']' closes a comment: with a '[' seen earlier on this line
       *    (cp_r) the "[...]" span is cut out, otherwise everything up to
       *    and including the ']' is dropped (the comment began on an
       *    earlier line);
       *  - num_comment_starts counts '[' not yet closed and carries over
       *    to the following lines;
       *  - after the scan, a line inside a still-open comment is marked for
       *    deletion, and a line that opens a comment it does not close is
       *    truncated at its first '[';
       *  - a line left empty is marked for deletion.
       *
       * NOTE(review):
       *  - After a "[...]" span is cut out, scanning resumes one character
       *    past cp_r, so a bracket that lands exactly at cp_r is not seen.
       *    Tracing "x[a][b]y" by hand gives "y" rather than "xy" -- verify.
       *  - "cp = lip->data - 1" and the backwards search for '[' step to a
       *    position one before the start of the buffer.
       */
4491  TLineInfoPtr lip;
4492  int num_comment_starts;
4493  char * cp_r;
4494  char * cp;
4495  EBool in_comment;
4496 
4497  num_comment_starts = 0;
4498  in_comment = eFalse;
4499  for (lip = list; lip != NULL; lip = lip->next) {
4500  if (lip->data == NULL) {
4501  lip->delete_me = eTrue;
4502  } else {
4503  cp_r = NULL;
4504  for (cp = lip->data; *cp != 0; cp++) {
4505  if (*cp == ']') {
4506  if (cp_r == NULL) {
      /* no opener on this line: drop the line's head and rescan
       * from its start */
4507  s_StringLeftShift (lip->data, cp + 1);
4508  cp = lip->data - 1;
4509  } else {
      /* cut "[...]" and fall back to the previous '[' on this
       * line, if there is one */
4510  s_StringLeftShift (cp_r, cp + 1);
4511  cp = cp_r;
4512  if (cp_r > lip->data) {
4513  cp_r --;
4514  while (cp_r >= lip->data && *cp_r != '[') {
4515  cp_r --;
4516  }
4517  if (cp_r < lip->data) {
4518  cp_r = NULL;
4519  }
4520  } else {
4521  cp_r = NULL;
4522  }
4523  }
4524  if (num_comment_starts > 0) {
4525  num_comment_starts --;
4526  }
4527  } else if (*cp == '[') {
4528  cp_r = cp;
4529  num_comment_starts ++;
4530  }
4531  }
4532  if (in_comment) {
4533  if (num_comment_starts == 0) {
4534  in_comment = eFalse;
4535  } else {
4536  lip->delete_me = eTrue;
4537  }
4538  } else if (num_comment_starts > 0) {
4539  cp_r = strchr (lip->data, '[');
4540  if (cp_r != NULL) {
4541  *cp_r = 0;
4542  }
4543  in_comment = eTrue;
4544  }
4545  if (lip->data [0] == 0) {
4546  lip->delete_me = eTrue;
4547  }
4548  }
4549  }
4550  list = s_DeleteLineInfos (list);
4551  return list;
4552 }
4553 
4554 
4555 /* This function removes Nexus comments from a linked list of SLineInfo
4556  * structures. The function returns a pointer to the list without the
4557  * comments.
4558  */
4560 {
      /* The signature line is missing from this listing; the body takes the
       * head of a line list named `list` and returns the head that
       * s_DeleteLineInfos() hands back.
       *
       * Every run of lines from one for which s_StringICmp() against
       * "#NEXUS" returns 0 through the next one for which it returns 0
       * against "matrix" is marked for deletion, both end lines included
       * (presumably a case-insensitive whole-string comparison, going by
       * the helper's name -- TODO confirm). The search then continues after
       * the "matrix" line. When no "matrix" line follows a "#NEXUS" line,
       * nothing is marked for that run.
       */
4561  TLineInfoPtr lip, start_lip, end_lip;
4562 
4563  lip = list;
4564  start_lip = NULL;
4565  end_lip = NULL;
4566  while (lip != NULL) {
4567  if (s_StringICmp (lip->data, "#NEXUS") == 0) {
4568  start_lip = lip;
4569  end_lip = lip;
4570  while (end_lip != NULL
4571  && s_StringICmp (end_lip->data, "matrix") != 0) {
4572  end_lip = end_lip->next;
4573  }
4574  if (end_lip != NULL) {
4575  while (start_lip != end_lip) {
4576  start_lip->delete_me = eTrue;
4577  start_lip = start_lip->next;
4578  }
4579  end_lip->delete_me = eTrue;
4580  lip = end_lip->next;
4581  } else {
4582  lip = lip->next;
4583  }
4584  } else {
4585  lip = lip->next;
4586  }
4587  }
4588  list = s_DeleteLineInfos (list);
4589  return list;
4590 }
4591 
4592 
4593 /* This function finds the number of characters that occur most frequently
4594  * in a token and returns a pointer to a SSizeInfo structure that
4595  * describes the most common length and the number of times it appears.
4596  */
/* Details:
 *   list           size tallies to examine; not modified
 *   not_this_size  size value to leave out of the comparison
 *
 * A temporary tally is built from every entry of `list` whose size_value
 * differs from not_this_size, and s_GetMostPopularSizeInfo() picks the
 * winner from it. The result is a separately allocated record holding the
 * winner's size_value and num_appearances, which the caller must free.
 * Returns NULL when no entry qualifies or the result cannot be allocated.
 * (The identifier line of this definition is missing from this listing.)
 */
4597 static TSizeInfoPtr
4599 (TSizeInfoPtr list,
4600  int not_this_size)
4601 {
4602  TSizeInfoPtr list_ptr, new_list, best_ptr, return_best;
4603 
4604  new_list = NULL;
4605  for (list_ptr = list; list_ptr != NULL; list_ptr = list_ptr->next) {
4606  if (not_this_size != list_ptr->size_value) {
4607  new_list = s_AddSizeInfoAppearances (new_list,
4608  list_ptr->size_value,
4609  list_ptr->num_appearances);
4610  }
4611  }
4612  best_ptr = s_GetMostPopularSizeInfo (new_list);
4613  return_best = NULL;
      /* copy the winner out: best_ptr is released together with new_list */
4614  if (best_ptr != NULL) {
4615  return_best = s_SizeInfoNew (NULL);
4616  if (return_best != NULL) {
4617  return_best->size_value = best_ptr->size_value;
4618  return_best->num_appearances = best_ptr->num_appearances;
4619  }
4620  }
4621  s_SizeInfoFree (new_list);
4622  return return_best;
4623 }
4624 
4625 
4626 /* This function examines all instances of an anchor pattern in the data
4627  * and checks to see if the line immediately after the anchor pattern should
4628  * be used as part of the anchor pattern. This function exists because
4629  * frequently, but not always, contiguous data will consist of multiple lines
4630  * of data of the same length (for example, 80 characters), followed by one
4631  * shorter line with the remaining data. We must also make sure that we do
4632  * not accidentally include the ID of the next sequence in the data of the
4633  * previous sequence.
4634  */
/* Details:
 *   anchorpattern  pattern to extend in place; only its first lengthrepeats
 *                  entry is consulted
 *   line_lengths   run-length list of the line lengths found in the data
 *
 * Does nothing when anchorpattern, its lengthrepeats, or line_lengths is
 * NULL. For every entry of line_lengths that s_SizeInfoIsEqual() matches
 * against the anchor's first entry, the following entry is tallied as a
 * candidate "last line" length when its size is positive, differs from the
 * anchor length, and is not itself followed by an anchor-length entry.
 * The most popular candidate, if positive, is appended to the anchor
 * pattern with s_AddLengthRepeat().
 * (The identifier line of this definition is missing from this listing.)
 */
4635 static void
4637 (SLengthListPtr anchorpattern,
4638  TSizeInfoPtr line_lengths)
4639 {
4640  TSizeInfoPtr last_line_lengths, sip, sip_next, twoafter;
4641  int best_last_line_length;
4642  int anchor_line_length;
4643 
4644  if (anchorpattern == NULL
4645  || anchorpattern->lengthrepeats == NULL
4646  || line_lengths == NULL) {
4647  return;
4648  }
4649 
4650  last_line_lengths = NULL;
4651  anchor_line_length = anchorpattern->lengthrepeats->size_value;
4652 
4653  /* also check to make sure that there's more than one line between
4654  * this pattern and the next pattern, otherwise the next line is the
4655  * ID for the next pattern and shouldn't be included in the anchor
4656  */
4657  for (sip = line_lengths; sip != NULL; sip = sip->next) {
4658  if (s_SizeInfoIsEqual (sip, anchorpattern->lengthrepeats)) {
4659  sip_next = sip->next;
4660  if (sip_next != NULL
4661  && sip_next->size_value > 0
4662  && sip_next->size_value != anchor_line_length
4663  && ((twoafter = sip_next->next) == NULL
4664  || twoafter->size_value != anchor_line_length))
4665  {
4666  last_line_lengths = s_AddSizeInfo (last_line_lengths,
4667  sip_next->size_value);
4668  }
4669  }
4670  }
4671  best_last_line_length = s_GetMostPopularSize (last_line_lengths);
4672  if (best_last_line_length > 0) {
4673  s_AddLengthRepeat (anchorpattern, best_last_line_length);
4674  }
4675  s_SizeInfoFree (last_line_lengths);
4676 }
4677 
4678 
4679 /* This function looks for the most frequently occurring pattern, where a
4680  * pattern is considered to be N contiguous tokens of M characters. The
4681  * function then checks to see if there is usually a token of a particular
4682  * length that immediately follows this pattern that is not the ID for the
4683  * next sequence. If so, this line length is added to the pattern.
4684  * The function returns a pointer to this pattern.
4685  */
4687 {
4688  SLengthListPtr patternlist, newpattern;
4689  TSizeInfoPtr sip, popular_line_length;
4690  SLengthListPtr index, best;
4691  int not_this_length;
4692 
4693  patternlist = NULL;
4694  for (sip = list; sip != NULL; sip = sip->next) {
4695  if (sip->size_value > 0) {
4696  newpattern = s_LengthListNew (NULL);
4697  if (newpattern == NULL) {
4698  s_LengthListFree (patternlist);
4699  return NULL;
4700  }
4701  newpattern->num_appearances = 1;
4702  newpattern->lengthrepeats = s_SizeInfoNew (NULL);
4703  if (newpattern->lengthrepeats == NULL) {
4704  s_LengthListFree (patternlist);
4705  return NULL;
4706  }
4707  newpattern->lengthrepeats->size_value = sip->size_value;
4708  newpattern->lengthrepeats->num_appearances = sip->num_appearances;
4709  patternlist = s_AddLengthList (patternlist, newpattern);
4710  }
4711  }
4712  if (patternlist == NULL) {
4713  return NULL;
4714  }
4715 
4716  best = NULL;
4717  for (index = patternlist; index != NULL; index = index->next) {
4718  if (index->lengthrepeats->num_appearances < 2) {
4719  continue;
4720  }
4721  if (best==NULL || best->num_appearances < index->num_appearances) {
4722  best = index;
4723  } else if (best->num_appearances == index->num_appearances
4724  && best->lengthrepeats->size_value <
4725  index->lengthrepeats->size_value) {
4726  best = index;
4727  }
4728  }
4729 
4730  /* Free data in list before best pattern */
4731  index = patternlist;
4732  while ( index != NULL && index->next != best ) {
4733  index = index->next;
4734  }
4735  if (index != NULL) {
4736  index->next = NULL;
4737  s_LengthListFree (patternlist);
4738  }
4739  /* Free data in list after best pattern */
4740  if (best != NULL) {
4741  s_LengthListFree (best->next);
4742  best->next = NULL;
4743  }
4744 
4745  popular_line_length = s_FindMostFrequentlyOccurringTokenLength (list, 0);
4746 
4747  if (best != NULL && best->lengthrepeats != NULL
4748  && popular_line_length != NULL
4749  && best->lengthrepeats->size_value == popular_line_length->size_value)
4750  {
4751  not_this_length = popular_line_length->size_value;
4752  s_SizeInfoFree (popular_line_length);
4753  popular_line_length = s_FindMostFrequentlyOccurringTokenLength (list,
4754  not_this_length);
4755  }
4756 
4757  if (best == NULL
4758  || (popular_line_length != NULL
4759  && popular_line_length->size_value > best->lengthrepeats->size_value
4760  && popular_line_length->num_appearances > best->num_appearances))
4761  {
4762  if (best == NULL) {
4763  best = s_LengthListNew (NULL);
4764  if (best == NULL) {
4765  return NULL;
4766  }
4767  }
4768  best->lengthrepeats = s_SizeInfoNew (NULL);
4769  if (best->lengthrepeats == NULL) {
4770  return NULL;
4771  }
4772  best->lengthrepeats->size_value = popular_line_length->size_value;
4773  best->lengthrepeats->num_appearances = 1;
4774  } else {
4775  /* extend anchor pattern to include best length of last line */
4776  s_ExtendAnchorPattern (best, list);
4777  }
4778 
4779  s_SizeInfoFree (popular_line_length);
4780 
4781  return best;
4782 }
4783 
4784 
4785 /* This function creates an SIntLink list to describe the locations
4786  * of occurrences of the anchorpattern in the SSizeInfo list.
4787  * The function returns a pointer to the SIntLink list.
4788  */
4789 static TIntLinkPtr
4791 (TSizeInfoPtr list,
4792  SLengthListPtr anchorpattern)
4793 {
4794  int line_counter;
4795  TIntLinkPtr offset_list, new_offset;
4796  TSizeInfoPtr sip;
4797 
4798  if (list == NULL || anchorpattern == NULL) {
4799  return NULL;
4800  }
4801  line_counter = 0;
4802  offset_list = NULL;
4803  for (sip = list; sip != NULL; sip = sip->next) {
4804  if (s_SizeInfoIsEqual (sip, anchorpattern->lengthrepeats)) {
4805  new_offset = s_IntLinkNew (line_counter, offset_list);
4806  if (new_offset == NULL) {
4807  s_IntLinkFree (offset_list);
4808  return NULL;
4809  }
4810  if (offset_list == NULL) {
4811  offset_list = new_offset;
4812  }
4813  }
4814 
4815  line_counter += sip->num_appearances;
4816  }
4817  return offset_list;
4818 }
4819 
4820 
4821 /* This function determines whether or not the number of expected sequence
4822  * characters are available starting at a token after line_start and stopping
4823  * at least one token before the next known sequence data block in the list.
4824  * If so, the function returns the number of the token at which the sequence
4825  * data begins. Otherwise the function returns -1.
4826  */
4827 static int
4829 (int line_start,
4830  int pattern_length,
4831  TIntLinkPtr next_offset,
4832  int sip_offset,
4833  TSizeInfoPtr list)
4834 {
4835  int offset, end_offset;
4836  TSizeInfoPtr sip;
4837  int line_counter, num_chars;
4838 
4839  if (list == NULL) {
4840  return -1;
4841  }
4842 
4843  for (offset = sip_offset; offset < list->num_appearances; offset++) {
4844  line_counter = line_start + offset;
4845  num_chars = list->size_value * (list->num_appearances - offset);
4846  sip = list;
4847  while (num_chars < pattern_length
4848  && (next_offset == NULL || line_counter < next_offset->ival)
4849  && sip->next != NULL)
4850  {
4851  sip = sip->next;
4852  for (end_offset = 0;
4853  end_offset < sip->num_appearances
4854  && num_chars < pattern_length
4855  && (next_offset == NULL
4856  || line_counter < next_offset->ival);
4857  end_offset ++)
4858  {
4859  num_chars += sip->size_value;
4860  line_counter ++;
4861  }
4862  }
4863  if (num_chars == pattern_length) {
4864  return line_start + offset;
4865  }
4866  }
4867  return -1;
4868 }
4869 
4870 
4871 /* This function examines the offset list and searches for holes where blocks
4872  * of sequence data without the exact expected formatting might exist. The
4873  * function adds the offsets of any new blocks to the list and returns a
4874  * pointer to the augmented offset list.
4875  */
4876 static TIntLinkPtr
4878 (TIntLinkPtr offset_list,
4879  TSizeInfoPtr list,
4880  SLengthListPtr anchorpattern)
4881 {
4882  int pattern_length;
4883  TSizeInfoPtr sip;
4884  TIntLinkPtr prev_offset, next_offset, new_offset;
4885  int line_counter, forecast_position, line_skip;
4886  EBool skipped_previous = eFalse;
4887  int num_chars;
4888  int num_additional_offsets = 0;
4889  int max_additional_offsets = 5000; /* if it's that bad, forget it */
4890 
4891  if (list == NULL || anchorpattern == NULL) {
4892  return offset_list;
4893  }
4894 
4895  pattern_length = 0;
4896  for (sip = anchorpattern->lengthrepeats; sip != NULL; sip = sip->next) {
4897  pattern_length += (sip->size_value * sip->num_appearances);
4898  }
4899  if (pattern_length == 0) {
4900  return offset_list;
4901  }
4902 
4903  prev_offset = NULL;
4904  next_offset = offset_list;
4905  line_counter = 0;
4906  sip = list;
4907  while (sip != NULL && num_additional_offsets < max_additional_offsets) {
4908  /* if we are somehow out of synch, don't get caught in infinite loop */
4909  if (next_offset != NULL && line_counter > next_offset->ival) {
4910  next_offset = next_offset->next;
4911  } else if (next_offset != NULL && line_counter == next_offset->ival) {
4912  skipped_previous = eFalse;
4913  prev_offset = next_offset;
4914  next_offset = next_offset->next;
4915  /* advance sip and line counter past the end of this pattern */
4916  num_chars = 0;
4917  while (num_chars < pattern_length && sip != NULL) {
4918  num_chars += sip->size_value * sip->num_appearances;
4919  line_counter += sip->num_appearances;
4920  sip = sip->next;
4921  }
4922  } else if (skipped_previous) {
4923  line_skip = 0;
4924  while (sip != NULL && line_skip < sip->num_appearances
4925  && num_additional_offsets < max_additional_offsets
4926  && (next_offset == NULL
4927  || line_counter < next_offset->ival)) {
4928  /* see if we can build a pattern that matches the pattern
4929  * length we want
4930  */
4931  forecast_position = s_ForecastPattern (line_counter,
4932  pattern_length,
4933  next_offset, line_skip,
4934  sip);
4935  if (forecast_position > 0) {
4936  new_offset = s_IntLinkNew (forecast_position, NULL);
4937  num_additional_offsets++;
4938  if (new_offset == NULL) {
4939  return NULL;
4940  }
4941  if (prev_offset == NULL) {
4942  new_offset->next = offset_list;
4943  offset_list = new_offset;
4944  } else {
4945  new_offset->next = next_offset;
4946  prev_offset->next = new_offset;
4947  }
4948  prev_offset = new_offset;
4949  /* now advance sip and line counter past the end
4950  * of the pattern we have just created
4951  */
4952  num_chars = 0;
4953  while (num_chars < pattern_length && sip != NULL) {
4954  for (line_skip = 0;
4955  line_skip < sip->num_appearances
4956  && num_chars < pattern_length;
4957  line_skip++)
4958  {
4959  num_chars += sip->size_value;
4960  line_counter ++;
4961  }
4962  if (line_skip == sip->num_appearances) {
4963  sip = sip->next;
4964  line_skip = 0;
4965  }
4966  }
4967  } else {
4968  line_counter += sip->num_appearances;
4969  sip = sip->next;
4970  line_skip = 0;
4971  }
4972  }
4973  } else {
4974  skipped_previous = eTrue;
4975  line_counter += sip->num_appearances;
4976  sip = sip->next;
4977  }
4978  }
4979  if (num_additional_offsets >= max_additional_offsets)
4980  {
4981  s_IntLinkFree (offset_list);
4982  offset_list = NULL;
4983  }
4984  return offset_list;
4985 }
4986 
4987 
4988 /* This function finds the most frequently occurring distance between
4989  * two sequence data blocks and returns that value.
4990  */
4992 {
4993  int line_counter, best_length;
4994  TSizeInfoPtr pattern_length_list;
4996 
4997  if (offset_list == NULL) {
4998  return -1;
4999  }
5000 
5001  line_counter = -1;
5002  pattern_length_list = NULL;
5003  for (offset = offset_list; offset != NULL; offset = offset->next) {
5004  if (line_counter != -1) {
5005  pattern_length_list = s_AddSizeInfo (pattern_length_list,
5006  offset->ival - line_counter);
5007  }
5008  line_counter = offset->ival;
5009  }
5010  best_length = s_GetMostPopularSize (pattern_length_list);
5011  s_SizeInfoFree (pattern_length_list);
5012  return best_length;
5013 }
5014 
5015 
5016 /* This function finds the most frequently appearing number of characters
5017  * in a block of sequence data and returns that value.
5018  */
5019 static int
5021 (TLineInfoPtr token_list,
5022  TIntLinkPtr offset_list,
5023  int block_length)
5024 {
5025  TLineInfoPtr lip;
5026  TIntLinkPtr prev_offset, new_offset;
5027  int line_diff, num_chars, best_num_chars;
5028  TSizeInfoPtr pattern_length_list = NULL;
5029 
5030  if (token_list == NULL || offset_list == NULL || block_length < 1) {
5031  return -1;
5032  }
5033  /* get length of well-formatted block size */
5034  lip = token_list;
5035  prev_offset = NULL;
5036  for (new_offset = offset_list;
5037  new_offset != NULL && lip != NULL;
5038  new_offset = new_offset->next)
5039  {
5040  if (prev_offset == NULL) {
5041  /* skip first tokens */
5042  for (line_diff = 0;
5043  line_diff < new_offset->ival && lip != NULL;
5044  line_diff ++)
5045  {
5046  lip = lip->next;
5047  }
5048  }
5049  if (prev_offset != NULL) {
5050  num_chars = 0;
5051  for (line_diff = 0;
5052  line_diff < new_offset->ival - prev_offset->ival
5053  && lip != NULL;
5054  line_diff ++)
5055  {
5056  if (line_diff < new_offset->ival - prev_offset->ival - 1) {
5057  num_chars += strlen (lip->data);
5058  }
5059  lip = lip->next;
5060  }
5061  if (new_offset->ival - prev_offset->ival == block_length) {
5062  pattern_length_list = s_AddSizeInfo (pattern_length_list,
5063  num_chars);
5064  }
5065  }
5066  prev_offset = new_offset;
5067  }
5068  best_num_chars = s_GetMostPopularSize (pattern_length_list);
5069  if (best_num_chars == 0 && pattern_length_list != NULL) {
5070  best_num_chars = pattern_length_list->size_value;
5071  }
5072  s_SizeInfoFree (pattern_length_list);
5073  pattern_length_list = NULL;
5074  return best_num_chars;
5075 }
5076 
5077 
5078 static int
5080 (TLineInfoPtr list,
5081  int distance,
5082  int desired_num_chars)
5083 {
5084  int line_diff;
5085  size_t num_chars, total_chars, pattern_length, num_starts;
5086  TLineInfoPtr lip;
5087  TIntLinkPtr length_list, start_list, start_ptr, length;
5088  int start_of_unknown;
5089  int num_additional_offsets_needed;
5090 
5091  if (list == NULL || distance == 0 || desired_num_chars == 0) {
5092  return 0;
5093  }
5094 
5095  /* because the first offset is the start of a known pattern, we should
5096  * skip to the end of that pattern and start looking for additional
5097  * offsets
5098  */
5099  total_chars = 0;
5100  for (lip = list, line_diff = 0;
5101  lip != NULL && line_diff < distance
5102  && total_chars < desired_num_chars;
5103  lip = lip->next, line_diff++) {
5104  num_chars = strlen (lip->data);
5105  total_chars += num_chars;
5106  }
5107  while (lip != NULL && line_diff < distance && s_IsBlank (lip->data)) {
5108  lip = lip->next;
5109  line_diff++;
5110  }
5111  /* skip over line we would need for ID */
5112  if (lip != NULL) {
5113  lip = lip->next;
5114  line_diff++;
5115  }
5116 
5117  if (lip == NULL || line_diff == distance) {
5118  return 0;
5119  }
5120 
5121  list = lip->next;
5122  start_of_unknown = line_diff;
5123 
5124  length_list = NULL;
5125  total_chars = 0;
5126  for (lip = list;
5127  lip != NULL && line_diff < distance;
5128  lip = lip->next, line_diff++)
5129  {
5130  num_chars = strlen (lip->data);
5131  length = s_IntLinkNew (num_chars, length_list);
5132  if (length_list == NULL) {
5133  length_list = length;
5134  }
5135  total_chars += num_chars;
5136  }
5137 
5138  /* how many offsets do we need? */
5139  num_additional_offsets_needed = (total_chars / desired_num_chars);
5140  if (num_additional_offsets_needed == 0) {
5141  return 0;
5142  }
5143 
5144  /* Find all the places you could start and get the exact right number
5145  * of characters
5146  */
5147  start_list = NULL;
5148  num_starts = 0;
5149  pattern_length = 0;
5150  for (start_ptr = length_list, line_diff = start_of_unknown;
5151  start_ptr != NULL && line_diff < distance
5152  && pattern_length < distance - line_diff ;
5153  start_ptr = start_ptr->next, line_diff++) {
5154  num_chars = start_ptr->ival;
5155  pattern_length = 1;
5156  length = start_ptr->next;
5157  while (num_chars < desired_num_chars
5158  && pattern_length + line_diff < distance
5159  && length != NULL)
5160  {
5161  num_chars += length->ival;
5162  pattern_length ++;
5163  length = length->next;
5164  }
5165  if (num_chars == desired_num_chars) {
5166  length = s_IntLinkNew (line_diff, start_list);
5167  if (start_list == NULL) {
5168  start_list = length;
5169  }
5170  num_starts ++;
5171  }
5172  }
5173 
5174  /* now select best set of start points */
5175 
5176  s_IntLinkFree (length_list);
5177  s_IntLinkFree (start_list);
5178  return 0;
5179 }
5180 
5181 
5182 /* This function inserts new block locations into the offset_list
5183  * by looking for likely starts of abnormal patterns.
5184  */
5186 (TLineInfoPtr token_list,
5187  TIntLinkPtr offset_list,
5188  int block_length,
5189  int best_num_chars,
5190  char * alphabet)
5191 {
5192  TLineInfoPtr lip;
5193  TIntLinkPtr prev_offset, new_offset, splice_offset;
5194  int line_diff, num_chars, line_start;
5195 
5196  if (token_list == NULL || offset_list == NULL
5197  || block_length < 1 || best_num_chars < 1)
5198  {
5199  return;
5200  }
5201 
5202  lip = token_list;
5203  prev_offset = NULL;
5204  for (new_offset = offset_list;
5205  new_offset != NULL && lip != NULL;
5206  new_offset = new_offset->next) {
5207  if (prev_offset == NULL) {
5208  /* just advance through tokens */
5209  for (line_diff = 0;
5210  line_diff < new_offset->ival && lip != NULL;
5211  line_diff ++) {
5212  lip = lip->next;
5213  }
5214  } else {
5215  if (new_offset->ival - prev_offset->ival == block_length) {
5216  /* just advance through tokens */
5217  for (line_diff = 0;
5218  line_diff < new_offset->ival - prev_offset->ival
5219  && lip != NULL;
5220  line_diff ++) {
5221  lip = lip->next;
5222  }
5223  } else {
5224  /* look for intermediate breaks */
5225  num_chars = 0;
5226  for (line_diff = 0;
5227  line_diff < new_offset->ival - prev_offset->ival
5228  && lip != NULL && num_chars < best_num_chars;
5229  line_diff ++) {
5230  num_chars += strlen (lip->data);
5231  lip = lip->next;
5232  }
5233  if (lip == NULL) {
5234  return;
5235  }
5236  /* set new offset at first line of next pattern */
5237  line_diff ++;
5238  lip = lip->next;
5239  if (line_diff < new_offset->ival - prev_offset->ival) {
5240  line_start = line_diff + prev_offset->ival;
5241  /* advance token pointer to new piece */
5242  while (line_diff < new_offset->ival - prev_offset->ival
5243  && lip != NULL)
5244  {
5245  lip = lip->next;
5246  line_diff ++;
5247  }
5248  /* insert new offset value */
5249  splice_offset = s_IntLinkNew (line_start, NULL);
5250  if (splice_offset == NULL) {
5251  return;
5252  }
5253  splice_offset->next = new_offset;
5254  prev_offset->next = splice_offset;
5255 
5257  new_offset->ival - splice_offset->ival,
5258  best_num_chars);
5259  }
5260  }
5261  }
5262  prev_offset = new_offset;
5263  }
5264 
5265  /* iterate through the last block */
5266  for (line_diff = 0;
5267  line_diff < block_length && lip != NULL;
5268  line_diff ++) {
5269  lip = lip->next;
5270  }
5271 
5272  /* if we have room for one more sequence, or even most of one more sequence, add it */
5273  if (lip != NULL && ! s_SkippableString (lip->data)) {
5274  s_IntLinkNew (line_diff + prev_offset->ival, prev_offset);
5275  }
5276 }
5277 
5278 
5279 /* This function returns true if the string contains digits, false otherwise */
5281 {
5282  char *cp;
5283 
5284  if (data == NULL) return eFalse;
5285  for (cp = data; *cp != 0; cp++) {
5286  if (isdigit ((unsigned char)(*cp))) {
5287  return eTrue;
5288  }
5289  }
5290  return eFalse;
5291 }
5292 
5293 
5294 /* This function processes the alignment file data by dividing the original
5295  * lines into pieces based on whitespace and looking for patterns of length
5296  * in the data.
5297  */
5299 {
5300  TLineInfoPtr token_list;
5301  SLengthListPtr list;
5302  TLineInfoPtr lip;
5303  SLengthListPtr anchorpattern[2];
5304  TIntLinkPtr offset_list;
5305  int best_length;
5306  int best_num_chars;
5307 
5308  if (afrp == NULL || afrp->line_list == NULL) {
5309  return;
5310  }
5311 
5312  token_list = s_BuildTokenList (afrp->line_list);
5313  token_list = s_RemoveCommentsFromTokens (token_list);
5314  token_list = s_RemoveNexusCommentsFromTokens (token_list);
5315 
5316  list = s_LengthListNew ( NULL );
5317  for (lip = token_list;
5318  lip != NULL && ! s_FoundStopLine (lip->data);
5319  lip = lip->next)
5320  {
5321  if (s_SkippableString (lip->data) || s_ContainsDigits(lip->data)) {
5322  s_AddLengthRepeat (list, 0);
5323  } else {
5324  s_AddLengthRepeat (list, strlen (lip->data));
5325  }
5326  }
5327 
5328  anchorpattern [0] = s_FindMostPopularPattern (list->lengthrepeats);
5329  anchorpattern [1] = NULL;
5330  if (anchorpattern [0] == NULL || anchorpattern[0]->lengthrepeats == NULL) {
5331  s_LengthListFree (list);
5332  return;
5333  }
5334 
5335  /* find anchor patterns in original list,
5336  * find distances between anchor patterns
5337  */
5338  offset_list = s_CreateOffsetList (list->lengthrepeats, anchorpattern[0]);
5339  offset_list = s_AugmentOffsetList (offset_list,
5340  list->lengthrepeats,
5341  anchorpattern[0]);
5342 
5343  /* resolve unusual distances between anchor patterns */
5344  best_length = s_GetMostPopularPatternLength (offset_list);
5345  if (best_length < 1 && offset_list != NULL && offset_list->next != NULL) {
5346  best_length = offset_list->next->ival - offset_list->ival;
5347  }
5348  best_num_chars = s_GetBestCharacterLength (token_list, offset_list,
5349  best_length);
5350  s_InsertNewOffsets (token_list, offset_list, best_length, best_num_chars,
5351  afrp->alphabet);
5352 
5353  /* use token before each anchor pattern as ID, use tokens for distance
5354  * between anchor patterns for sequence data
5355  */
5356  s_CreateSequencesBasedOnTokenPatterns (token_list, offset_list,
5357  anchorpattern, afrp, eFalse);
5358 
5359  s_LengthListFree (anchorpattern[0]);
5360  s_LengthListFree (list);
5361  s_LineInfoFree (token_list);
5362 }
5363 
5364 
5365 /* The following functions are used to convert data from the internal
5366  * representation into the form that will be passed to the calling
5367  * program. Information from the ID strings is parsed to remove
5368  * definition lines and organism information, the gap characters are
5369  * standardized to '-', the missing characters are standardized to 'N',
5370  * match characters are replaced with characters from the first record,
5371  * and bad characters are reported.
5372  */
5373 
5374 /* This function allocates memory for a new AligmentFileData structure
5375  * and initializes its member variables.
5376  */
5378 {
5379  TAlignmentFilePtr afp;
5380 
5381  afp = (TAlignmentFilePtr) malloc (sizeof (SAlignmentFile));
5382  if (afp == NULL) {
5383  return NULL;
5384  }
5385  afp->num_sequences = 0;
5386  afp->num_organisms = 0;
5387  afp->num_deflines = 0;
5388  afp->num_segments = 0;
5389  afp->ids = NULL;
5390  afp->sequences = NULL;
5391  afp->organisms = NULL;
5392  afp->deflines = NULL;
5393  return afp;
5394 }
5395 
5396 
5397 /* This function frees the memory associated with an AligmentFileData
5398  * structure and its member variables.
5399  */
5401 {
5402  int index;
5403 
5404  if (afp == NULL) {
5405  return;
5406  }
5407  if (afp->ids != NULL) {
5408  for (index = 0; index < afp->num_sequences; index++) {
5409  free (afp->ids [index]);
5410  }
5411  free (afp->ids);
5412  afp->ids = NULL;
5413  }
5414  if (afp->sequences != NULL) {
5415  for (index = 0; index < afp->num_sequences; index++) {
5416  free (afp->sequences [index]);
5417  }
5418  free (afp->sequences);
5419  afp->sequences = NULL;
5420  }
5421  if (afp->organisms != NULL) {
5422  for (index = 0; index < afp->num_organisms; index++) {
5423  free (afp->organisms [index]);
5424  }
5425  free (afp->organisms);
5426  afp->sequences = NULL;
5427  }
5428  if (afp->deflines != NULL) {
5429  for (index = 0; index < afp->num_deflines; index++) {
5430  free (afp->deflines [index]);
5431  }
5432  free (afp->deflines);
5433  afp->deflines = NULL;
5434  }
5435  free (afp);
5436 }
5437 
5438 
/* This function parses the identifier string used by the alignment file
 * to identify a sequence to find the portion of the string that is actually
 * an ID, as opposed to organism information or definition line.
 *
 * Leading spaces, tabs, and '>' characters are skipped; the ID is the run
 * of characters that follows, up to the first space, tab, CR, or LF.
 * The function returns a newly allocated copy of the ID that the caller
 * must free, or NULL if str is NULL, no ID is present, or memory cannot
 * be allocated.
 */
static char * s_GetIdFromString (char * str)
{
    char * cp;
    char * id;
    size_t len;

    if (str == NULL) {
        return NULL;
    }

    cp = str + strspn (str, " >\t");
    len = strcspn (cp, " \t\r\n");
    if (len == 0) {
        return NULL;
    }
    id = (char *) malloc (len + 1);
    if (id == NULL) {
        return NULL;
    }
    /* exactly len characters are copied, then the terminator is added */
    memcpy (id, cp, len);
    id [len] = 0;
    return id;
}
5467 
5468 
/* This function pulls defline information from the ID string, if there is
 * any.
 *
 * The ID is located the same way as in s_GetIdFromString; the defline is
 * everything after the whitespace that follows the ID, through the end of
 * the string.  The function returns a newly allocated copy that the caller
 * must free, or NULL if str is NULL, there is no ID, nothing follows the
 * ID, or memory cannot be allocated.
 */
static char * s_GetDeflineFromIdString (char * str)
{
    char * cp;
    char * defline;
    size_t len;

    if (str == NULL) {
        return NULL;
    }

    /* step over the leading markers and the ID itself */
    cp = str + strspn (str, " >\t");
    len = strcspn (cp, " \t\r\n");
    if (len == 0) {
        return NULL;
    }
    cp += len;

    /* step over the whitespace separating the ID from the defline */
    len = strspn (cp, " \t\r\n");
    if (len == 0) {
        return NULL;
    }
    cp += len;
    if (*cp == 0) {
        return NULL;
    }

    /* copy the remainder, including its terminator */
    len = strlen (cp) + 1;
    defline = (char *) malloc (len);
    if (defline == NULL) {
        return NULL;
    }
    memcpy (defline, cp, len);
    return defline;
}
5498 
5499 
5500 /* This function takes the ID strings read from the file and parses them
5501  * to obtain a defline (if there is extra text after the ID and/or
5502  * organism information) and to obtain the actual ID for the sequence.
5503  */
5505 {
5506  TStringCountPtr list, scp;
5507  TAlignRawSeqPtr arsp;
5508  TLineInfoPtr lip;
5509  char * id;
5510  int line_num;
5511  EBool rval = eTrue;
5512  char * defline;
5513 
5514  if (afrp == NULL) {
5515  return eFalse;
5516  }
5517 
5518  list = NULL;
5519  lip = afrp->deflines;
5520  for (arsp = afrp->sequences; arsp != NULL; arsp = arsp->next) {
5521  if (arsp->id_lines != NULL) {
5522  line_num = arsp->id_lines->ival;
5523  } else {
5524  line_num = -1;
5525  }
5527  id = s_GetIdFromString (arsp->id);
5528  if (lip == NULL) {
5529  defline = s_GetDeflineFromIdString (arsp->id);
5530  afrp->deflines = s_AddLineInfo (afrp->deflines, defline,
5531  line_num, 0);
5532  free (defline);
5533  afrp->num_deflines ++;
5534  }
5535  free (arsp->id);
5536  arsp->id = id;
5537  list = s_AddStringCount (arsp->id, line_num, list);
5538  }
5539 
5540  for (scp = list; scp != NULL; scp = scp->next) {
5541  if (scp->num_appearances > 1) {
5542  rval = eFalse;
5543  s_ReportRepeatedId (scp, afrp->report_error,
5544  afrp->report_error_userdata);
5545  }
5546  }
5547  /* free string count list */
5548  s_StringCountFree (list);
5549  return rval;
5550 }
5551 
5552 
5553 /* This function reports unacceptable characters in a sequence. Frequently
5554  * there will be more than one character of the same kind (for instance,
5555  * when the user has incorrectly specified a gap character), so repeated
5556  * characters are reported together. The function advances the data
5557  * position in the SLineInfoReader structure lirp, and returns the
5558  * current data position for lirp.
5559  */
5560 static int
5562 (TLineInfoReaderPtr lirp,
5563  char * id,
5564  const char * reason,
5565  FReportErrorFunction report_error,
5566  void * report_error_userdata)
5567 {
5568  int bad_line_num, bad_line_offset;
5569  int num_bad_chars;
5570  char bad_char, curr_char;
5571  int data_position;
5572 
5573  bad_line_num = s_LineInfoReaderGetCurrentLineNumber (lirp);
5574  bad_line_offset = s_LineInfoReaderGetCurrentLineOffset (lirp);
5575  bad_char = *lirp->curr_line_pos;
5576  num_bad_chars = 1;
5577  data_position = lirp->data_pos + 1;
5578  while ((curr_char = s_FindNthDataChar (lirp, data_position)) == bad_char) {
5579  num_bad_chars ++;
5580  data_position ++;
5581  }
5582  s_ReportBadCharError (id, bad_char, num_bad_chars,
5583  bad_line_offset, bad_line_num, reason,
5584  report_error, report_error_userdata);
5585  return data_position;
5586 }
5587 
5588 
5589 /* This function does context-sensitive replacement of the missing,
5590  * match, and gap characters and also identifies bad characters.
5591  * Gap characters found in the wrong location in the sequence are
5592  * considered an error. Characters that are not missing, match, or
5593  * gap characters and are not in the specified sequence alphabet are
5594  * reported as errors. Match characters in the first sequence are also
5595  * reported as errors.
5596  * The function will return eTrue if any errors were found, or eFalse
5597  * if there were no errors.
5598  */
5599 static EBool
5601 (TAlignRawSeqPtr arsp,
5602  TAlignRawSeqPtr master_arsp,
5603  TSequenceInfoPtr sip,
5604  int num_segments,
5605  FReportErrorFunction report_error,
5606  void * report_error_userdata)
5607 {
5608  TLineInfoReaderPtr lirp, master_lirp;
5609  int data_position;
5610  int middle_start = 0;
5611  int middle_end = 0;
5612  char curr_char, master_char;
5613  EBool found_middle_start;
5614  EBool rval = eFalse;
5615  EBool match_not_in_beginning_gap;
5616  EBool match_not_in_end_gap;
5617 
5618  char beginning_gap = '-';
5619  char middle_gap = '-';
5620  char end_gap = '-';
5621 
5622  if (strlen(sip->beginning_gap) > 0 &&
5623  strchr(sip->beginning_gap, '-') == NULL){
5624  beginning_gap = sip->beginning_gap[0];
5625  }
5626 
5627  if (strlen(sip->middle_gap) > 0 &&
5628  strchr(sip->middle_gap, '-') == NULL){
5629  middle_gap = sip->middle_gap[0];
5630  }
5631 
5632  if (strlen(sip->end_gap) > 0 &&
5633  strchr(sip->end_gap, '-') == NULL){
5634  end_gap = sip->end_gap[0];
5635  }
5636 
5637 
5638  if (arsp == NULL || master_arsp == NULL || sip == NULL) {
5639  return eTrue;
5640  }
5641  lirp = s_LineInfoReaderNew (arsp->sequence_data);
5642  if (lirp == NULL) {
5643  return eTrue;
5644  }
5645  if (arsp != master_arsp) {
5646  master_lirp = s_LineInfoReaderNew (master_arsp->sequence_data);
5647  if (master_lirp == NULL) {
5648  s_LineInfoReaderFree (lirp);
5649  return eTrue;
5650  }
5651  } else {
5652  master_lirp = NULL;
5653  }
5654 
5655  if (strcspn (sip->beginning_gap, sip->match)
5656  == strlen (sip->beginning_gap)) {
5657  match_not_in_beginning_gap = eTrue;
5658  } else {
5659  match_not_in_beginning_gap = eFalse;
5660  }
5661 
5662  if (strcspn (sip->end_gap, sip->