NCBI C++ ToolKit
gene_info_writer.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gene_info_writer.cpp 91825 2020-12-14 18:07:00Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Vahram Avagyan
27  *
28  */
29 
30 //==========================================================================//
31 #include <ncbi_pch.hpp>
33 #include "gene_file_defs.hpp"
34 
35 #include <algorithm>
36 
38 
39 //==========================================================================//
40 // General file processing
41 
43 (
45  CLineProcessor* pLineProcessor,
46  TTwoIntRecordVec& vecRecords,
47  int nMinLineLength
48 )
49 {
50  string buffer;
51  while (getline(in, buffer))
52  {
53  if (int(buffer.length()) >= nMinLineLength)
54  {
55  pLineProcessor->Process(buffer, vecRecords);
56  }
57  }
58 }
59 
60 //==========================================================================//
61 // Data conversion and comparison functions
62 
65  string& strName)
66 {
67  if (m_seqDb.Empty())
68  {
69  m_seqDb.Reset(new CSeqDBExpert());
70  }
71 
72  try
73  {
74  SSeqDBTaxInfo taxInfo;
75  m_seqDb->GetTaxInfo(nTaxId, taxInfo);
76  strName = taxInfo.scientific_name;
77  }
78  catch (CException)
79  {
80  strName = "unknown";
81  }
82 }
83 
84 bool CGeneFileWriter::x_GetOffsetForGeneId(int geneId, int& nOffset)
85 {
86  TIntToIntMap::iterator itIdToOffset =
87  m_mapIdToOffset.find(geneId);
88  if (itIdToOffset != m_mapIdToOffset.end())
89  {
90  nOffset = itIdToOffset->second;
91  return true;
92  }
93  return false;
94 }
95 
97 {
98  int nPubMedLinks = 0;
100  m_mapIdToNumPMIDs.find(geneId);
101  if (itPM != m_mapIdToNumPMIDs.end())
102  {
103  nPubMedLinks = itPM->second;
104  }
105  return nPubMedLinks;
106 }
107 
110  const STwoIntRecord& record2)
111 {
112  return record1.n1 < record2.n1 ||
113  (record1.n1 == record2.n1 &&
114  record1.n2 < record2.n2);
115 }
116 
119  const TFourIntRecord& record2)
120 {
121  return (record1.n[0] < record2.n[0] ||
122  (record1.n[0] == record2.n[0] &&
123  (record1.n[1] < record2.n[1] ||
124  (record1.n[1] == record2.n[1] &&
125  (record1.n[2] < record2.n[2] ||
126  (record1.n[2] == record2.n[2] &&
127  (record1.n[3] < record2.n[3])))))));
128 }
129 
130 //==========================================================================//
131 // Gene->Accession file processing
132 
134  x_Gene2Accn_ParseLine(const string& strLine,
135  SGene2AccnLine& lineData)
136 {
137  if (NStr::StartsWith(strLine, "#"))
138  return false;
139 
140  vector<string> strItems;
141  NStr::SplitByPattern(strLine, "\t", strItems);
142 
143  if (strItems.size() != GENE_2_ACCN_NUM_ITEMS)
144  {
145  CNcbiOstrstream oss;
146  oss << "Gene2Accession file format not recognized: found ";
147  oss << strItems.size() << " elements per line instead of ";
148  oss << GENE_2_ACCN_NUM_ITEMS << " in ";
150  NCBI_THROW(CGeneInfoException, eDataFormatError,
152  }
153 
154  // read taxId
155  if (strItems[GENE_2_ACCN_TAX_ID_INDEX] != "-")
156  lineData.nTaxId =
157  NStr::StringToNumeric<TTaxId>(strItems[GENE_2_ACCN_TAX_ID_INDEX]);
158  else
159  lineData.nTaxId = ZERO_TAX_ID;
160 
161  // read geneId
162  if (strItems[GENE_2_ACCN_GENE_ID_INDEX] != "-")
163  lineData.geneId =
165  else
166  lineData.geneId = 0;
167 
168  // read RNA nucleotide Gi
169  if (strItems[GENE_2_ACCN_RNA_GI_INDEX] != "-")
170  lineData.giRNANucl =
172  else
173  lineData.giRNANucl = 0;
174 
175  // read protein Gi
176  if (strItems[GENE_2_ACCN_PROT_GI_INDEX] != "-")
177  lineData.giProt =
179  else
180  lineData.giProt = 0;
181 
182  // read genomic nucleotide Gi
183  if (strItems[GENE_2_ACCN_GENOMIC_GI_INDEX] != "-")
184  lineData.giGenomicNucl =
186  else
187  lineData.giGenomicNucl = 0;
188 
189  return true;
190 }
191 
194  TTwoIntRecordVec& vecRecords)
195 {
196  if (lineData.nTaxId > ZERO_TAX_ID && lineData.geneId > 0)
197  {
198  STwoIntRecord record;
199  record.n2 = lineData.geneId;
200 
201  TFourIntRecord recordGeneIdToGi;
202  recordGeneIdToGi.n[0] = lineData.geneId;
203  recordGeneIdToGi.n[1] = 0;
204  recordGeneIdToGi.n[2] = 0;
205  recordGeneIdToGi.n[3] = 0;
206 
207  if (lineData.giRNANucl > 0)
208  {
209  record.n1 = lineData.giRNANucl;
210  vecRecords.push_back(record);
211 
213  eRNAGi));
214  m_nTotalGis++;
215  m_nRNAGis++;
216 
217  recordGeneIdToGi.n[1] = lineData.giRNANucl;
218  }
219  if (lineData.giProt > 0)
220  {
221  record.n1 = lineData.giProt;
222  vecRecords.push_back(record);
223 
225  eProtGi));
226  m_nTotalGis++;
227  m_nProtGis++;
228 
229  recordGeneIdToGi.n[2] = lineData.giProt;
230  }
231  if (lineData.giGenomicNucl > 0)
232  {
233  record.n1 = lineData.giGenomicNucl;
234  vecRecords.push_back(record);
235 
237  (lineData.giGenomicNucl, eGenomicGi));
238  m_nTotalGis++;
239  m_nGenomicGis++;
240 
241  recordGeneIdToGi.n[3] = lineData.giGenomicNucl;
242  }
243 
244  m_vecGeneIdToGiRecords.push_back(recordGeneIdToGi);
245  }
246 }
247 
249  Process(const string& strLine,
250  TTwoIntRecordVec& vecRecords)
251 {
252  SGene2AccnLine lineData;
253  if (m_pThis->x_Gene2Accn_ParseLine(strLine, lineData))
254  {
255  m_pThis->x_Gene2Accn_LineToRecord(lineData, vecRecords);
256  }
257 }
258 
260  x_Gene2Accn_Filter(const TTwoIntRecordVec& vecRecords,
261  size_t iRec,
262  bool& bUnique,
263  TTwoIntRecordVec& vecFiltered)
264 {
265  STwoIntRecord recordToAdd = vecRecords[iRec - 1];
266 
267  // has this record been added already?
268  bool bHasBeenAdded = false;
269  if (vecFiltered.size() > 0)
270  if (vecFiltered.back().n1 == recordToAdd.n1 &&
271  vecFiltered.back().n2 == recordToAdd.n2)
272  bHasBeenAdded = true;
273 
274  // should we add all records with this Gi?
275  bool bAddAllForGi =
277  m_mapGiToType[recordToAdd.n1] == eRNAGi) ||
279  m_mapGiToType[recordToAdd.n1] == eProtGi) ||
281  m_mapGiToType[recordToAdd.n1] == eGenomicGi);
282 
283  // is this record the last one in its group of same-Gi records?
284  bool bLastInGroup = false;
285  if (iRec < vecRecords.size())
286  bLastInGroup = vecRecords[iRec].n1 != recordToAdd.n1;
287  else
288  bLastInGroup = true;
289 
290  // should we add this record?
291  bool bAddPrev = false;
292  if (bAddAllForGi)
293  bAddPrev = !bHasBeenAdded;
294  else
295  bAddPrev = bUnique && bLastInGroup;
296 
297  if (bAddPrev)
298  vecFiltered.push_back(recordToAdd);
299 
300  if (iRec < vecRecords.size())
301  {
302  // update parameters for the next coming record (iRec)
303  if (bLastInGroup)
304  {
305  bUnique = true;
306  }
307  else if (bUnique)
308  {
309  bUnique = vecRecords[iRec].n2 == recordToAdd.n2;
310  }
311  }
312 }
313 
315 {
316  if (!bOverwrite &&
319  {
320  return; // files exist, do not overwrite
321  }
322 
323  // open the original and processed files
324 
325  CNcbiIfstream inGene2Accn;
326  CNcbiOfstream outGi2Gene, outGi2Offset, outGene2Gi;
327 
328  if (!OpenTextInputFile(m_strGene2AccessionFile, inGene2Accn))
329  {
330  NCBI_THROW(CGeneInfoException, eFileNotFoundError,
331  "Cannot open Gene2Accession file for reading.");
332  }
333  if (!OpenBinaryOutputFile(m_strGi2GeneFile, outGi2Gene))
334  {
335  NCBI_THROW(CGeneInfoException, eFileNotFoundError,
336  "Cannot open Gi2Gene file for writing.");
337  }
338  if (!OpenBinaryOutputFile(m_strGi2OffsetFile, outGi2Offset))
339  {
340  NCBI_THROW(CGeneInfoException, eFileNotFoundError,
341  "Cannot open Gi2Offset file for writing.");
342  }
343  if (!OpenBinaryOutputFile(m_strGene2GiFile, outGene2Gi))
344  {
345  NCBI_THROW(CGeneInfoException, eFileNotFoundError,
346  "Cannot open Gene2Gi file for writing.");
347  }
348 
349  // estimate the number of records we will have
350 
351  Int8 nTotalLenght = GetLength(m_strGene2AccessionFile);
352  TSeqPos nNumLinesEstimate = (TSeqPos)nTotalLenght / GENE_2_ACCN_LINE_MIN;
353 
354  // create the array of (gi, geneId) records
355 
356  TTwoIntRecordVec vecRecords;
357  vecRecords.reserve(nNumLinesEstimate);
358 
359  m_vecGeneIdToGiRecords.reserve(nNumLinesEstimate);
360 
361  // parse each line and populate the records array
362  // also populate the m_mapGiToType map with the Gi types
363 
364  unique_ptr<CLineProcessor> proc(new CGene2AccnProcessor(this));
366  (inGene2Accn,
367  proc.get(),
368  vecRecords,
370 
371  // sort the records, remove all those Gis linking to multiple Gene Ids
372 
373  sort(vecRecords.begin(), vecRecords.end(),
375 
376  TTwoIntRecordVec vecFiltered;
377  vecFiltered.reserve(vecRecords.size());
378 
379  if (vecRecords.size() <= 1)
380  {
381  NCBI_THROW(CGeneInfoException, eDataFormatError,
382  "Less than 2 records in the Gene2Accession file.");
383  }
384 
385  size_t iRec;
386  bool bUnique = true;
387  for (iRec = 1; iRec <= vecRecords.size(); iRec++) // (!) yes, <=
388  {
389  x_Gene2Accn_Filter(vecRecords, iRec,
390  bUnique, vecFiltered);
391  }
392 
393  // write the filtered records to the gi->geneId file
394  // and the corresponding (gi, offset) pairs to the gi->offset file
395 
396  STwoIntRecord recordGiToOffset;
397  for (iRec = 0; iRec < vecFiltered.size(); iRec++)
398  {
399  WriteRecord(outGi2Gene, vecFiltered[iRec]);
400 
401  if (x_GetOffsetForGeneId(vecFiltered[iRec].n2,
402  recordGiToOffset.n2))
403  {
404  recordGiToOffset.n1 = vecFiltered[iRec].n1;
405  WriteRecord(outGi2Offset, recordGiToOffset);
406  }
407  else
408  {
409  NCBI_THROW(CGeneInfoException, eDataFormatError,
410  "Offset not found for gene Id: " +
411  NStr::IntToString(vecFiltered[iRec].n2));
412  }
413  }
414 
415  // sort the (GeneId, RNAGi, ProteinGi, GenomicGi) records
416 
419 
420  // write the (GeneId, RNAGi, ProteinGi, GenomicGi) records
421  // to the GeneId->Gi file
422 
423  for (iRec = 0; iRec < m_vecGeneIdToGiRecords.size(); iRec++)
424  {
425  WriteRecord(outGene2Gi, m_vecGeneIdToGiRecords[iRec]);
426  }
427 }
428 
429 //==========================================================================//
430 // Gene Info file processing
431 
433  x_GeneInfo_ParseLine(const string& strLine,
434  SGeneInfoLine& lineData)
435 {
436  if (NStr::StartsWith(strLine, "#"))
437  return false;
438 
439  vector<string> strItems;
440  NStr::SplitByPattern(strLine, "\t", strItems);
441 
442  if (strItems.size() != GENE_INFO_NUM_ITEMS)
443  {
444  NCBI_THROW(CGeneInfoException, eDataFormatError,
445  "GeneInfo file format not recognized.\nLine: " + strLine +
446  "\nFound " + NStr::SizetToString(strItems.size()) + " items.");
447  }
448 
449  // read geneId
450  if (strItems[GENE_INFO_GENE_ID_INDEX] != "-")
451  lineData.geneId =
453  else
454  lineData.geneId = 0;
455 
456  // read taxId
457  if (strItems[GENE_INFO_TAX_ID_INDEX] != "-")
458  lineData.nTaxId =
459  NStr::StringToNumeric<TTaxId>(strItems[GENE_INFO_TAX_ID_INDEX]);
460  else
461  lineData.nTaxId = ZERO_TAX_ID;
462 
463  // read gene name
464  if (strItems[GENE_INFO_SYMBOL_INDEX] != "-")
465  lineData.strSymbol = strItems[GENE_INFO_SYMBOL_INDEX];
466  else
467  lineData.strSymbol = "n/a";
468 
469  // read gene description
470  if (strItems[GENE_INFO_DESCRIPTION_INDEX] != "-")
471  lineData.strDescription = strItems[GENE_INFO_DESCRIPTION_INDEX];
472  else
473  lineData.strDescription = "n/a";
474 
475  return true;
476 }
477 
480  TTwoIntRecordVec& vecRecords)
481 {
482  if (lineData.nTaxId > ZERO_TAX_ID && lineData.geneId > 0)
483  {
484  STwoIntRecord record;
485  record.n1 = lineData.geneId;
486  record.n2 = m_nCurrentOffset;
487  vecRecords.push_back(record);
488 
489  m_mapIdToOffset.insert(make_pair(record.n1, record.n2));
490 
491  string strOrgname;
492  x_GetOrgnameForTaxId(lineData.nTaxId, strOrgname);
493 
494  int nPubMedLinks =
496 
498  new CGeneInfo(lineData.geneId,
499  lineData.strSymbol,
500  lineData.strDescription,
501  strOrgname,
502  nPubMedLinks));
503 
505 
506  m_nGeneIds++;
507  }
508 }
509 
511  Process(const string& strLine,
512  TTwoIntRecordVec& vecRecords)
513 {
514  SGeneInfoLine lineData;
515  if (m_pThis->x_GeneInfo_ParseLine(strLine, lineData))
516  {
517  m_pThis->x_GeneInfo_LineToRecord(lineData, vecRecords);
518  }
519 }
520 
522 {
523  if (!bOverwrite &&
526  {
527  return; // files exist, do not overwrite
528  }
529 
530  // open the original and processed files
531 
532  CNcbiIfstream inGeneInfo;
533  CNcbiOfstream outGene2Offset;
534 
535  if (!OpenTextInputFile(m_strGeneInfoFile, inGeneInfo))
536  {
537  NCBI_THROW(CGeneInfoException, eFileNotFoundError,
538  "Cannot open Gene Info file for reading.");
539  }
540  if (!OpenBinaryOutputFile(m_strGene2OffsetFile, outGene2Offset))
541  {
542  NCBI_THROW(CGeneInfoException, eFileNotFoundError,
543  "Cannot open Gene2Offset file for writing.");
544  }
546  {
547  NCBI_THROW(CGeneInfoException, eFileNotFoundError,
548  "Cannot open the Gene Data file for writing.");
549  }
550 
551  // estimate the number of records we will have
552 
553  Int8 nTotalLenght = GetLength(m_strGeneInfoFile);
554  TSeqPos nNumLinesEstimate = (TSeqPos)nTotalLenght / GENE_INFO_LINE_MIN;
555 
556  // create the array of (geneId, offset) records
557  // and clear the corresponding map
558 
559  TTwoIntRecordVec vecRecords;
560  vecRecords.reserve(nNumLinesEstimate);
561 
563 
564  // parse each line and populate the records array
565  // also, write the combined gene data to m_outAllData
566  // and populate the m_mapIdToOffset map with records
567  // (side effects within x_GeneInfo_LineToRecord)
568 
569  m_nCurrentOffset = 0;
570  unique_ptr<CLineProcessor> proc(new CGeneInfoProcessor(this));
572  (inGeneInfo,
573  proc.get(),
574  vecRecords,
576 
577  // sort the vector of records and output them to the file
578 
579  sort(vecRecords.begin(), vecRecords.end(),
581 
582  for (size_t iRec = 0; iRec < vecRecords.size(); iRec++)
583  {
584  WriteRecord(outGene2Offset, vecRecords[iRec]);
585  }
586 }
587 
588 //==========================================================================//
589 // Gene->PubMed file processing
590 
592  x_Gene2PM_ParseLine(const string& strLine,
593  SGene2PMLine& lineData)
594 {
595  if (NStr::StartsWith(strLine, "#"))
596  return false;
597 
598  vector<string> strItems;
599  NStr::SplitByPattern(strLine, "\t", strItems);
600 
601  if (strItems.size() != GENE_2_PM_NUM_ITEMS)
602  {
603  CNcbiOstrstream oss;
604  oss << "Gene2Pubmed file format not recognized: found ";
605  oss << strItems.size() << " elements per line instead of ";
606  oss << GENE_2_PM_NUM_ITEMS << " in ";
607  oss << m_strGene2PubMedFile;
608  NCBI_THROW(CGeneInfoException, eDataFormatError,
610  }
611 
612  // read geneId
613  if (strItems[GENE_2_PM_GENE_ID_INDEX] != "-")
614  lineData.geneId =
616  else
617  lineData.geneId = 0;
618 
619  // read PMID
620  if (strItems[GENE_2_PM_PMID_INDEX] != "-")
621  lineData.nPMID =
623  else
624  lineData.nPMID = 0;
625 
626  return true;
627 }
628 
630  x_Gene2PM_LineToRecord(const SGene2PMLine& lineData,
631  TTwoIntRecordVec& vecRecords)
632 {
633  if (lineData.geneId > 0)
634  {
635  STwoIntRecord record;
636  record.n1 = lineData.geneId;
637  record.n2 = lineData.nPMID;
638  vecRecords.push_back(record);
639  }
640 }
641 
643  Process(const string& strLine,
644  TTwoIntRecordVec& vecRecords)
645 {
646  SGene2PMLine lineData;
647  if (m_pThis->x_Gene2PM_ParseLine(strLine, lineData))
648  {
649  m_pThis->x_Gene2PM_LineToRecord(lineData, vecRecords);
650  }
651 }
652 
655 {
656  // open the original file
657 
658  CNcbiIfstream inGene2PM;
659  if (!OpenTextInputFile(m_strGene2PubMedFile, inGene2PM))
660  {
661  NCBI_THROW(CGeneInfoException, eFileNotFoundError,
662  "Cannot open Gene2PubMed file for reading.");
663  }
664 
665  // estimate the number of records we will have
666 
667  Int8 nTotalLenght = GetLength(m_strGene2PubMedFile);
668  TSeqPos nNumLinesEstimate = (TSeqPos)nTotalLenght / GENE_2_PM_LINE_MIN;
669 
670  // create the array of Gene Id to PMID lines
671 
672  TTwoIntRecordVec vecRecords;
673  vecRecords.reserve(nNumLinesEstimate);
674 
675  // parse each line and populate the records array
676 
677  unique_ptr<CLineProcessor> proc(new CGene2PMProcessor(this));
679  (inGene2PM,
680  proc.get(),
681  vecRecords,
683 
684  if (vecRecords.size() == 0)
685  return; // no PubMed data
686 
687  // sort the records
688 
689  sort(vecRecords.begin(), vecRecords.end(),
691 
692  // write the records to the geneId->PMIDs map
693 
695 
696  int geneIdCur = vecRecords[0].n1;
697  int nCountCur = 1;
698  for (size_t iRec = 1; iRec < vecRecords.size(); iRec++)
699  {
700  if (vecRecords[iRec].n1 == geneIdCur)
701  {
702  nCountCur++;
703  }
704  else
705  {
706  m_mapIdToNumPMIDs.insert(make_pair(geneIdCur, nCountCur));
707  geneIdCur = vecRecords[iRec].n1;
708  nCountCur = 1;
709  }
710  }
711  m_mapIdToNumPMIDs.insert(make_pair(geneIdCur, nCountCur));
712 }
713 
714 //==========================================================================//
715 // Main interface
716 
718  CGeneFileWriter(const string& strGene2AccessionFile,
719  const string& strGeneInfoFile,
720  const string& strGene2PubMedFile,
721  const string& strOutputDirPath)
722 : m_strGene2AccessionFile(strGene2AccessionFile),
723  m_strGeneInfoFile(strGeneInfoFile),
724  m_strGene2PubMedFile(strGene2PubMedFile)
725 {
727  {
728  NCBI_THROW(CGeneInfoException, eFileNotFoundError,
729  "Gene2Accession file not found.");
730  }
732  {
733  NCBI_THROW(CGeneInfoException, eFileNotFoundError,
734  "GeneInfo file not found.");
735  }
737  {
738  NCBI_THROW(CGeneInfoException, eFileNotFoundError,
739  "Gene2PubMed file not found.");
740  }
741 
742  m_strGi2GeneFile = strOutputDirPath +
744  m_strGene2OffsetFile = strOutputDirPath +
746  m_strGi2OffsetFile = strOutputDirPath +
748  m_strGene2GiFile = strOutputDirPath +
750  m_strAllGeneDataFile = strOutputDirPath +
752  m_strInfoFile = strOutputDirPath +
754 
758 
760  {
761  NCBI_THROW(CGeneInfoException, eFileNotFoundError,
762  "Cannot open the info/stats text file for writing.");
763  }
764 
765  m_nTotalGis = 0;
766  m_nRNAGis = 0;
767  m_nProtGis = 0;
768  m_nGenomicGis = 0;
769  m_nGeneIds = 0;
770 }
771 
773 {}
774 
777 {
778  m_bAllowMultipleIds_RNAGis = bEnable;
779 
780  if (bEnable)
781  m_outInfo << "Multiple GeneID's for RNA Gi's are enabled."
782  << endl;
783 }
784 
787 {
788  m_bAllowMultipleIds_ProtGis = bEnable;
789 
790  if (bEnable)
791  m_outInfo << "Multiple GeneID's for Protein Gi's are enabled."
792  << endl;
793 }
794 
797 {
799 
800  if (bEnable)
801  m_outInfo << "Multiple GeneID's for Genomic Gi's are enabled."
802  << endl;
803 }
804 
805 void CGeneFileWriter::ProcessFiles(bool bOverwrite)
806 {
808  x_GeneInfo_ProcessFile(bOverwrite);
809  x_Gene2Accn_ProcessFile(bOverwrite);
810 
811  m_outInfo << "\nTotal number of GeneID's accepted: "
812  << m_nGeneIds << endl;
813  m_outInfo << "Total number of Gi's processed: "
814  << m_nTotalGis << endl;
815  m_outInfo << "\nGi types encountered:" << endl;
816  m_outInfo << "\tRNA - " << m_nRNAGis << endl;
817  m_outInfo << "\tProtein - " << m_nProtGis << endl;
818  m_outInfo << "\tGenomic - " << m_nGenomicGis << endl;
819 }
820 
821 //==========================================================================//
822 
static bool OpenTextOutputFile(const string &strFileName, CNcbiOfstream &out)
Open the given text file for writing.
Definition: file_utils.cpp:101
static bool OpenBinaryOutputFile(const string &strFileName, CNcbiOfstream &out)
Open the given binary file for writing.
Definition: file_utils.cpp:112
static bool OpenTextInputFile(const string &strFileName, CNcbiIfstream &in)
Open the given text file for reading.
Definition: file_utils.cpp:75
static void WriteGeneInfo(CNcbiOfstream &out, CRef< CGeneInfo > info, int &nCurrentOffset)
Write a Gene info object to the file.
Definition: file_utils.cpp:123
static Int8 GetLength(const string &strFile)
Get the length of a file, given its name.
Definition: file_utils.cpp:66
static bool CheckExistence(const string &strFile)
Check if a file exists, given its name.
Definition: file_utils.cpp:60
static void WriteRecord(CNcbiOfstream &out, STwoIntRecord &record)
Write a pair of integers to the file.
Definition: file_utils.hpp:156
virtual void Process(const string &strLine, TTwoIntRecordVec &vecRecords)
Parse the given line and populate the vector of records.
virtual void Process(const string &strLine, TTwoIntRecordVec &vecRecords)
Parse the given line and populate the vector of records.
virtual void Process(const string &strLine, TTwoIntRecordVec &vecRecords)
Parse the given line and populate the vector of records.
Line processor base class.
CGeneFileWriter * m_pThis
Pointer to the calling instance of CGeneFileWriter.
virtual void Process(const string &strLine, TTwoIntRecordVec &vecRecords)=0
Parse the given line and populate the vector of records.
CNcbiOfstream m_outAllData
Temporary output file stream for the Gene Data file.
CNcbiOfstream m_outInfo
Temporary output file stream for the general info/stats file.
bool x_Gene2PM_ParseLine(const string &strLine, SGene2PMLine &lineData)
Parse a Gene->PubMed line.
void x_Gene2PM_ProcessFile()
Process the Gene->PubMed text file.
string m_strAllGeneDataFile
Path to Gene Data output file.
static bool x_CompareFourIntRecords(const TFourIntRecord &record1, const TFourIntRecord &record2)
Compare four-integer records.
friend class CGene2AccnProcessor
TIntToIntMap m_mapGiToType
Temporary map storing Gi types.
bool m_bAllowMultipleIds_ProtGis
Are multiple Gene IDs allowed for Protein Gis.
void x_GeneInfo_ProcessFile(bool bOverwrite)
Process the Gene Info text file.
int m_nRNAGis
Total number of RNA Gis, for the info/stats file.
static bool x_CompareTwoIntRecords(const STwoIntRecord &record1, const STwoIntRecord &record2)
Compare two-integer records.
void x_Gene2PM_LineToRecord(const SGene2PMLine &lineData, TTwoIntRecordVec &vecRecords)
Convert a parsed Gene->PubMed line to a record.
CRef< CSeqDBExpert > m_seqDb
SeqDB object used to convert taxID to organism name.
TFourIntRecordVec m_vecGeneIdToGiRecords
Temporary vector storing all the records from gene->accession file in the form (GeneId, RNAGi, ProteinGi, GenomicGi).
string m_strGi2GeneFile
Path to Gi to GeneID output file.
bool x_GeneInfo_ParseLine(const string &strLine, SGeneInfoLine &lineData)
Parse a Gene Info line.
string m_strGene2PubMedFile
Path to Gene to PubMed input file.
string m_strGene2GiFile
Path to GeneID to Gi output file.
void x_Gene2Accn_Filter(const TTwoIntRecordVec &vecRecords, size_t iRec, bool &bUnique, TTwoIntRecordVec &vecFiltered)
Filtering step for processing Gene->Accession records.
void EnableMultipleGeneIdsForGenomicGis(bool bEnable)
Enable/disable storing multiple Gene IDs for Genomic Gis.
friend class CGene2PMProcessor
int m_nProtGis
Total number of Protein Gis, for the info/stats file.
string m_strInfoFile
Path to the general info/stats output file.
bool x_GetOffsetForGeneId(int geneId, int &nOffset)
Get Gene Data offset given the Gene ID.
bool m_bAllowMultipleIds_RNAGis
Are multiple Gene IDs allowed for RNA Gis.
bool m_bAllowMultipleIds_GenomicGis
Are multiple Gene IDs allowed for Genomic Gis.
virtual ~CGeneFileWriter()
Destructor.
CGeneFileWriter(const string &strGene2AccessionFile, const string &strGeneInfoFile, const string &strGene2PubMedFile, const string &strOutputDirPath)
Construct using direct paths.
bool x_Gene2Accn_ParseLine(const string &strLine, SGene2AccnLine &lineData)
Parse a Gene->Accession line.
TIntToIntMap m_mapIdToNumPMIDs
Temporary map for GeneID to PMID conversion.
int m_nTotalGis
Total number of Gis, for the info/stats file.
TIntToIntMap m_mapIdToOffset
Temporary map for GeneID to Offset conversion.
int m_nCurrentOffset
Current offset into the Gene Data file.
void x_Gene2Accn_ProcessFile(bool bOverwrite)
Process the Gene->Accession text file.
void x_GeneInfo_LineToRecord(const SGeneInfoLine &lineData, TTwoIntRecordVec &vecRecords)
Convert a parsed Gene Info line to a record.
vector< STwoIntRecord > TTwoIntRecordVec
Vector type for two-integer records.
string m_strGeneInfoFile
Path to Gene Info input text file.
int x_GetNumPubMedLinksForGeneId(int geneId)
Get number of PubMed links given the Gene ID.
int m_nGenomicGis
Total number of Genomic Gis, for the info/stats file.
int m_nGeneIds
Total number of Gene IDs, for the info/stats file.
string m_strGi2OffsetFile
Path to Gi to Offset output file.
void EnableMultipleGeneIdsForRNAGis(bool bEnable)
Enable/disable storing multiple Gene IDs for RNA Gis.
void EnableMultipleGeneIdsForProteinGis(bool bEnable)
Enable/disable storing multiple Gene IDs for Protein Gis.
friend class CGeneInfoProcessor
void ProcessFiles(bool bOverwrite=false)
Process all the input files and generate the binary files.
void x_GetOrgnameForTaxId(TTaxId nTaxId, string &strName)
Get the scientific name of the organism given its TaxID.
void x_ReadAndProcessFile(CNcbiIfstream &in, CLineProcessor *pLineProcessor, TTwoIntRecordVec &vecRecords, int nMinLineLength)
Process a text file and generate an array of records.
string m_strGene2AccessionFile
Path to Gene to Accession input text file.
void x_Gene2Accn_LineToRecord(const SGene2AccnLine &lineData, TTwoIntRecordVec &vecRecords)
Convert a parsed Gene->Accession line to one or more records.
string m_strGene2OffsetFile
Path to GeneID to Offset output file.
CGeneInfoException.
Definition: gene_info.hpp:63
CGeneInfo.
Definition: gene_info.hpp:107
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CRef –.
Definition: ncbiobj.hpp:618
CSeqDBExpert.
Definition: seqdbexpert.hpp:55
static void GetTaxInfo(TTaxId taxid, SSeqDBTaxInfo &info)
Get taxonomy information.
Definition: seqdb.cpp:1105
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
container_type::value_type value_type
Definition: map.hpp:52
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Defines constants for reading and processing the Gene files.
#define GENE_2_PM_PMID_INDEX
Index of the PubMed ID item on a Gene->PubMed line.
#define GENE_INFO_NUM_ITEMS
Number of items on a valid Gene Info line.
#define GENE_2_PM_LINE_MIN
Minimum valid length of a Gene->PubMed line.
#define GENE_2_ACCN_GENE_ID_INDEX
Index of the Gene ID item on a Gene->Accession line.
#define GENE_2_ACCN_PROT_GI_INDEX
Index of the Protein Gi item on a Gene->Accession line.
#define GENE_2_PM_NUM_ITEMS
Number of items on a valid Gene->PubMed line.
#define GENE_INFO_GENE_ID_INDEX
Index of the Gene ID item on a Gene Info line.
#define GENE_INFO_LINE_MIN
Minimum valid length of a Gene Info line.
#define GENE_INFO_SYMBOL_INDEX
Index of the Gene Symbol item on a Gene Info line.
#define GENE_2_ACCN_GENOMIC_GI_INDEX
Index of the Genomic Gi item on a Gene->Accession line.
#define GENE_2_ACCN_TAX_ID_INDEX
Index of the taxonomy ID item on a Gene->Accession line.
#define GENE_2_PM_GENE_ID_INDEX
Index of the Gene ID item on a Gene->PubMed line.
#define GENE_2_ACCN_NUM_ITEMS
Number of items on a valid Gene->Accession line.
#define GENE_INFO_TAX_ID_INDEX
Index of the taxonomy ID item on a Gene Info line.
#define GENE_2_ACCN_LINE_MIN
Minimum valid length of a Gene->Accession line.
#define GENE_INFO_DESCRIPTION_INDEX
Index of the Gene Description item on a Gene Info line.
#define GENE_2_ACCN_RNA_GI_INDEX
Index of the RNA Gi item on a Gene->Accession line.
#define GENE_GENE2OFFSET_FILE_NAME
Name of the processed "GeneID to Offset" file.
#define GENE_GENERAL_INFO_FILE_NAME
Name of the general information/statistics file.
#define GENE_GI2GENE_FILE_NAME
Name of the processed "Gi to GeneID" file.
#define GENE_ALL_GENE_DATA_FILE_NAME
Name of the combined "Gene Data" file.
#define GENE_GI2OFFSET_FILE_NAME
Name of the processed "Gi to Offset" file.
#define GENE_GENE2GI_FILE_NAME
Name of the processed "Gene ID to Gi" file.
Defines a class for processing Gene files.
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static list< string > & SplitByPattern(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Variation of Split() with fSplit_ByPattern flag applied by default.
Definition: ncbistr.cpp:3503
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
if(yy_accept[yy_current_state])
static MDB_envinfo info
Definition: mdb_load.c:37
constexpr auto sort(_Init &&init)
std::istream & in(std::istream &in_, double &x_)
static pcre_uint8 * buffer
Definition: pcretest.c:1051
static const char * proc
Definition: stats.c:21
SMultiIntRecord - an n-tuple of integers.
Definition: file_utils.hpp:84
int n[k_nFields]
Array of integer fields of the record.
Definition: file_utils.hpp:86
STwoIntRecord - a pair of integers.
Definition: file_utils.hpp:70
int n1
First integer field of the record.
Definition: file_utils.hpp:72
int n2
Second integer field of the record.
Definition: file_utils.hpp:75
Structure representing a parsed gene->accession line.
int giRNANucl
RNA Gi corresponding to this Gene ID (0 if none).
int giProt
Protein Gi corresponding to this Gene ID (0 if none).
int giGenomicNucl
Genomic Gi corresponding to this Gene ID (0 if none).
Structure representing a parsed gene->pubmed line.
Structure representing a parsed gene info line.
string strDescription
Gene Description (plain text, may include several sentences).
SSeqDBTaxInfo.
string scientific_name
Scientific name, such as "Aotus vociferans".
Modified on Sat Dec 02 09:24:14 2023 by modify_doxy.py rev. 669887