53 if (
int(
buffer.length()) >= nMinLineLength)
90 nOffset = itIdToOffset->second;
103 nPubMedLinks = itPM->second;
112 return record1.
n1 < record2.
n1 ||
113 (record1.
n1 == record2.
n1 &&
114 record1.
n2 < record2.
n2);
121 return (record1.
n[0] < record2.
n[0] ||
122 (record1.
n[0] == record2.
n[0] &&
123 (record1.
n[1] < record2.
n[1] ||
124 (record1.
n[1] == record2.
n[1] &&
125 (record1.
n[2] < record2.
n[2] ||
126 (record1.
n[2] == record2.
n[2] &&
127 (record1.
n[3] < record2.
n[3])))))));
140 vector<string> strItems;
146 oss <<
"Gene2Accession file format not recognized: found ";
147 oss << strItems.size() <<
" elements per line instead of ";
202 recordGeneIdToGi.
n[0] = lineData.
geneId;
203 recordGeneIdToGi.
n[1] = 0;
204 recordGeneIdToGi.
n[2] = 0;
205 recordGeneIdToGi.
n[3] = 0;
210 vecRecords.push_back(record);
222 vecRecords.push_back(record);
229 recordGeneIdToGi.
n[2] = lineData.
giProt;
234 vecRecords.push_back(record);
268 bool bHasBeenAdded =
false;
269 if (vecFiltered.size() > 0)
270 if (vecFiltered.back().n1 == recordToAdd.
n1 &&
271 vecFiltered.back().n2 == recordToAdd.
n2)
272 bHasBeenAdded =
true;
284 bool bLastInGroup =
false;
285 if (iRec < vecRecords.size())
286 bLastInGroup = vecRecords[iRec].n1 != recordToAdd.
n1;
291 bool bAddPrev =
false;
293 bAddPrev = !bHasBeenAdded;
295 bAddPrev = bUnique && bLastInGroup;
298 vecFiltered.push_back(recordToAdd);
300 if (iRec < vecRecords.size())
309 bUnique = vecRecords[iRec].n2 == recordToAdd.
n2;
331 "Cannot open Gene2Accession file for reading.");
336 "Cannot open Gi2Gene file for writing.");
341 "Cannot open Gi2Offset file for writing.");
346 "Cannot open Gene2Gi file for writing.");
357 vecRecords.reserve(nNumLinesEstimate);
373 sort(vecRecords.begin(), vecRecords.end(),
377 vecFiltered.reserve(vecRecords.size());
379 if (vecRecords.size() <= 1)
382 "Less than 2 records in the Gene2Accession file.");
387 for (iRec = 1; iRec <= vecRecords.size(); iRec++)
390 bUnique, vecFiltered);
397 for (iRec = 0; iRec < vecFiltered.size(); iRec++)
402 recordGiToOffset.
n2))
404 recordGiToOffset.
n1 = vecFiltered[iRec].n1;
410 "Offset not found for gene Id: " +
439 vector<string> strItems;
445 "GeneInfo file format not recognized.\nLine: " + strLine +
487 vecRecords.push_back(record);
515 if (m_pThis->x_GeneInfo_ParseLine(strLine, lineData))
517 m_pThis->x_GeneInfo_LineToRecord(lineData, vecRecords);
538 "Cannot open Gene Info file for reading.");
543 "Cannot open Gene2Offset file for writing.");
548 "Cannot open the Gene Data file for writing.");
560 vecRecords.reserve(nNumLinesEstimate);
579 sort(vecRecords.begin(), vecRecords.end(),
582 for (
size_t iRec = 0; iRec < vecRecords.size(); iRec++)
598 vector<string> strItems;
604 oss <<
"Gene2Pubmed file format not recognized: found ";
605 oss << strItems.size() <<
" elements per line instead of ";
638 vecRecords.push_back(record);
647 if (m_pThis->x_Gene2PM_ParseLine(strLine, lineData))
649 m_pThis->x_Gene2PM_LineToRecord(lineData, vecRecords);
662 "Cannot open Gene2PubMed file for reading.");
673 vecRecords.reserve(nNumLinesEstimate);
684 if (vecRecords.size() == 0)
689 sort(vecRecords.begin(), vecRecords.end(),
696 int geneIdCur = vecRecords[0].n1;
698 for (
size_t iRec = 1; iRec < vecRecords.size(); iRec++)
700 if (vecRecords[iRec].n1 == geneIdCur)
707 geneIdCur = vecRecords[iRec].n1;
719 const string& strGeneInfoFile,
720 const string& strGene2PubMedFile,
721 const string& strOutputDirPath)
729 "Gene2Accession file not found.");
734 "GeneInfo file not found.");
739 "Gene2PubMed file not found.");
762 "Cannot open the info/stats text file for writing.");
781 m_outInfo <<
"Multiple GeneID's for RNA Gi's are enabled."
791 m_outInfo <<
"Multiple GeneID's for Protein Gi's are enabled."
801 m_outInfo <<
"Multiple GeneID's for Genomic Gi's are enabled."
811 m_outInfo <<
"\nTotal number of GeneID's accepted: "
813 m_outInfo <<
"Total number of Gi's processed: "
815 m_outInfo <<
"\nGi types encountered:" << endl;
static bool OpenTextOutputFile(const string &strFileName, CNcbiOfstream &out)
Open the given text file for writing.
static bool OpenBinaryOutputFile(const string &strFileName, CNcbiOfstream &out)
Open the given binary file for writing.
static bool OpenTextInputFile(const string &strFileName, CNcbiIfstream &in)
Open the given text file for reading.
static void WriteGeneInfo(CNcbiOfstream &out, CRef< CGeneInfo > info, int &nCurrentOffset)
Write a Gene info object to the file.
static Int8 GetLength(const string &strFile)
Get the length of a file, given its name.
static bool CheckExistence(const string &strFile)
Check if a file exists, given its name.
static void WriteRecord(CNcbiOfstream &out, STwoIntRecord &record)
Write a pair of integers to the file.
virtual void Process(const string &strLine, TTwoIntRecordVec &vecRecords)
Parse the given line and populate the vector of records.
virtual void Process(const string &strLine, TTwoIntRecordVec &vecRecords)
Parse the given line and populate the vector of records.
virtual void Process(const string &strLine, TTwoIntRecordVec &vecRecords)
Parse the given line and populate the vector of records.
Line processor base class.
CGeneFileWriter * m_pThis
Pointer to the calling instance of CGeneFileWriter.
virtual void Process(const string &strLine, TTwoIntRecordVec &vecRecords)=0
Parse the given line and populate the vector of records.
CNcbiOfstream m_outAllData
Temporary output file stream for the Gene Data file.
CNcbiOfstream m_outInfo
Temporary output file stream for the general info/stats file.
bool x_Gene2PM_ParseLine(const string &strLine, SGene2PMLine &lineData)
Parse a Gene->PubMed line.
void x_Gene2PM_ProcessFile()
Process the Gene->PubMed text file.
string m_strAllGeneDataFile
Path to Gene Data output file.
static bool x_CompareFourIntRecords(const TFourIntRecord &record1, const TFourIntRecord &record2)
Compare four-integer records.
friend class CGene2AccnProcessor
TIntToIntMap m_mapGiToType
Temporary map storing Gi types.
bool m_bAllowMultipleIds_ProtGis
Are multiple Gene IDs allowed for Protein Gis.
void x_GeneInfo_ProcessFile(bool bOverwrite)
Process the Gene Info text file.
int m_nRNAGis
Total number of RNA Gis, for the info/stats file.
static bool x_CompareTwoIntRecords(const STwoIntRecord &record1, const STwoIntRecord &record2)
Compare two-integer records.
void x_Gene2PM_LineToRecord(const SGene2PMLine &lineData, TTwoIntRecordVec &vecRecords)
Convert a parsed Gene->PubMed line to a record.
CRef< CSeqDBExpert > m_seqDb
SeqDB object used to convert taxID to organism name.
TFourIntRecordVec m_vecGeneIdToGiRecords
Temporary vector storing all the records from gene->accession file in the form (GeneId,...
string m_strGi2GeneFile
Path to Gi to GeneID output file.
bool x_GeneInfo_ParseLine(const string &strLine, SGeneInfoLine &lineData)
Parse a Gene Info line.
string m_strGene2PubMedFile
Path to Gene to PubMed input file.
string m_strGene2GiFile
Path to GeneID to Gi output file.
void x_Gene2Accn_Filter(const TTwoIntRecordVec &vecRecords, size_t iRec, bool &bUnique, TTwoIntRecordVec &vecFiltered)
Filtering step for processing Gene->Accession records.
void EnableMultipleGeneIdsForGenomicGis(bool bEnable)
Enable/disable storing multiple Gene IDs for Genomic Gis.
friend class CGene2PMProcessor
int m_nProtGis
Total number of Protein Gis, for the info/stats file.
string m_strInfoFile
Path to the general info/stats output file.
bool x_GetOffsetForGeneId(int geneId, int &nOffset)
Get Gene Data offset given the Gene ID.
bool m_bAllowMultipleIds_RNAGis
Are multiple Gene IDs allowed for RNA Gis.
bool m_bAllowMultipleIds_GenomicGis
Are multiple Gene IDs allowed for Genomic Gis.
virtual ~CGeneFileWriter()
Destructor.
CGeneFileWriter(const string &strGene2AccessionFile, const string &strGeneInfoFile, const string &strGene2PubMedFile, const string &strOutputDirPath)
Construct using direct paths.
bool x_Gene2Accn_ParseLine(const string &strLine, SGene2AccnLine &lineData)
Parse a Gene->Accession line.
TIntToIntMap m_mapIdToNumPMIDs
Temporary map for GeneID to PMID conversion.
int m_nTotalGis
Total number of Gis, for the info/stats file.
TIntToIntMap m_mapIdToOffset
Temporary map for GeneID to Offset conversion.
int m_nCurrentOffset
Current offset into the Gene Data file.
void x_Gene2Accn_ProcessFile(bool bOverwrite)
Process the Gene->Accession text file.
void x_GeneInfo_LineToRecord(const SGeneInfoLine &lineData, TTwoIntRecordVec &vecRecords)
Convert a parsed Gene Info line to a record.
vector< STwoIntRecord > TTwoIntRecordVec
Vector type for two-integer records.
string m_strGeneInfoFile
Path to Gene Info input text file.
int x_GetNumPubMedLinksForGeneId(int geneId)
Get number of PubMed links given the Gene ID.
int m_nGenomicGis
Total number of Genomic Gis, for the info/stats file.
int m_nGeneIds
Total number of Gene IDs, for the info/stats file.
string m_strGi2OffsetFile
Path to Gi to Offset output file.
void EnableMultipleGeneIdsForRNAGis(bool bEnable)
Enable/disable storing multiple Gene IDs for RNA Gis.
void EnableMultipleGeneIdsForProteinGis(bool bEnable)
Enable/disable storing multiple Gene IDs for Protein Gis.
friend class CGeneInfoProcessor
void ProcessFiles(bool bOverwrite=false)
Process all the input files and generate the binary files.
void x_GetOrgnameForTaxId(TTaxId nTaxId, string &strName)
Get the scientific name of the organism given its TaxID.
void x_ReadAndProcessFile(CNcbiIfstream &in, CLineProcessor *pLineProcessor, TTwoIntRecordVec &vecRecords, int nMinLineLength)
Process a text file and generate an array of records.
string m_strGene2AccessionFile
Path to Gene to Accession input text file.
void x_Gene2Accn_LineToRecord(const SGene2AccnLine &lineData, TTwoIntRecordVec &vecRecords)
Convert a parsed Gene->Accession line to one or more records.
string m_strGene2OffsetFile
Path to GeneID to Offset output file.
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
static void GetTaxInfo(TTaxId taxid, SSeqDBTaxInfo &info)
Get taxonomy information.
container_type::iterator iterator
const_iterator end() const
iterator_bool insert(const value_type &val)
container_type::value_type value_type
const_iterator find(const key_type &key) const
Defines constants for reading and processing the Gene files.
#define GENE_2_PM_PMID_INDEX
Index of the PubMed ID item on a Gene->PubMed line.
#define GENE_INFO_NUM_ITEMS
Number of items on a valid Gene Info line.
#define GENE_2_PM_LINE_MIN
Minimum valid length of a Gene->PubMed line.
#define GENE_2_ACCN_GENE_ID_INDEX
Index of the Gene ID item on a Gene->Accession line.
#define GENE_2_ACCN_PROT_GI_INDEX
Index of the Protein Gi item on a Gene->Accession line.
#define GENE_2_PM_NUM_ITEMS
Number of items on a valid Gene->PubMed line.
#define GENE_INFO_GENE_ID_INDEX
Index of the Gene ID item on a Gene Info line.
#define GENE_INFO_LINE_MIN
Minimum valid length of a Gene Info line.
#define GENE_INFO_SYMBOL_INDEX
Index of the Gene Symbol item on a Gene Info line.
#define GENE_2_ACCN_GENOMIC_GI_INDEX
Index of the Genomic Gi item on a Gene->Accession line.
#define GENE_2_ACCN_TAX_ID_INDEX
Index of the taxonomy ID item on a Gene->Accession line.
#define GENE_2_PM_GENE_ID_INDEX
Index of the Gene ID item on a Gene->PubMed line.
#define GENE_2_ACCN_NUM_ITEMS
Number of items on a valid Gene->Accession line.
#define GENE_INFO_TAX_ID_INDEX
Index of the taxonomy ID item on a Gene Info line.
#define GENE_2_ACCN_LINE_MIN
Minimum valid length of a Gene->Accession line.
#define GENE_INFO_DESCRIPTION_INDEX
Index of the Gene Description item on a Gene Info line.
#define GENE_2_ACCN_RNA_GI_INDEX
Index of the RNA Gi item on a Gene->Accession line.
#define GENE_GENE2OFFSET_FILE_NAME
Name of the processed "GeneID to Offset" file.
#define GENE_GENERAL_INFO_FILE_NAME
Name of the general information/statistics file.
#define GENE_GI2GENE_FILE_NAME
Name of the processed "Gi to GeneID" file.
#define GENE_ALL_GENE_DATA_FILE_NAME
Name of the combined "Gene Data" file.
#define GENE_GI2OFFSET_FILE_NAME
Name of the processed "Gi to Offset" file.
#define GENE_GENE2GI_FILE_NAME
Name of the processed "Gene ID to Gi" file.
Defines a class for processing Gene files.
unsigned int TSeqPos
Type for sequence locations and lengths.
SStrictId_Tax::TId TTaxId
Taxon id type.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
void Reset(void)
Reset reference object.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
int64_t Int8
8-byte (64-bit) signed integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static list< string > & SplitByPattern(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Variation of Split() with fSplit_ByPattern flag applied by default.
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
if(yy_accept[yy_current_state])
constexpr auto sort(_Init &&init)
std::istream & in(std::istream &in_, double &x_)
static pcre_uint8 * buffer
SMultiIntRecord - an n-tuple of integers.
int n[k_nFields]
Array of integer fields of the record.
STwoIntRecord - a pair of integers.
int n1
First integer field of the record.
int n2
Second integer field of the record.
Structure representing a parsed gene->accession line.
int giRNANucl
RNA Gi corresponding to this Gene ID (0 if none).
int giProt
Protein Gi corresponding to this Gene ID (0 if none).
int giGenomicNucl
Genomic Gi corresponding to this Gene ID (0 if none).
TTaxId nTaxId
Taxonomy ID.
Structure representing a parsed gene->pubmed line.
Structure representing a parsed gene info line.
string strSymbol
Gene Symbol.
string strDescription
Gene Description (plain text, may include several sentences).
TTaxId nTaxId
Taxonomy ID.
string scientific_name
Scientific name, such as "Aotus vociferans".