70 # error COMP_LOG was already defined
74 #define COMP_LOG(msg) \
76 if( x_IsLogFileOpen() ) { \
77 *m_pLoadLogFile << msg << endl; \
103 const bool bAccnIsProtOnly = (
107 if( bAccnIsProtOnly || ! bSeqIdIsFound )
113 const bool bLocalSeqIdIsfound = (
115 if( bLocalSeqIdIsfound ) {
117 cerr <<
"Warning: '" <<
str <<
"' was used as an accession, "
118 "so the local component was ignored." << endl;
131 CCustomAgpToSeqEntry(
CScope * pScope)
139 return s_CustomGetSeqIdFromStr(
str, m_pScope.GetPointer());
160 const std::list<std::string> & files,
172 list<string> compAndObjFiles;
173 list<string> agpFiles;
174 ITERATE( std::list<std::string>, file_iter, files ) {
175 const string &
file = *file_iter;
180 compAndObjFiles.push_back(
file);
183 agpFiles.push_back(
file);
188 if( ! loadlog.empty() ) {
193 if( ! agp_as_fasta_file.empty() ) {
210 COMP_LOG(
"Component seq-id from AGP file(s): "
211 << seq_id_it->AsString());
214 COMP_LOG(
"Object seq-id from AGP file(s): "
215 << seq_id_it->AsString());
221 unique_ptr<CTmpFile> ldsdb_file;
234 lds_mgr->SetFastaFlags(fasta_flags);
236 list<string> objfiles;
237 ITERATE( list<string>, file_iter, compAndObjFiles ) {
242 COMP_LOG(
"Object file: " << *file_iter);
243 objfiles.push_back(*file_iter);
247 ifstream file_strm( file_iter->c_str() );
257 SIZE_TYPE after_seq_id_pos = line.find_first_of(
" \t");
258 if( after_seq_id_pos == string::npos ) {
259 after_seq_id_pos = line.length();
261 string acc_long = line.substr(1, (after_seq_id_pos - 1));
265 COMP_LOG(
"Sample accession from " << *file_iter
267 if( compSeqIds.
find(acc_h) != compSeqIds.
end() ) {
270 COMP_LOG(
"Component file: " << *file_iter);
271 lds_mgr->AddDataFile( *file_iter );
273 }
else if( objSeqIds.
find(acc_h) != objSeqIds.
end() ) {
275 COMP_LOG(
"Object file: " << *file_iter);
276 objfiles.push_back(*file_iter);
284 cerr <<
"Warning: This file seems to be unused: '"
285 << *file_iter <<
"'" << endl;
288 lds_mgr->UpdateData();
300 unique_ptr<CTmpSeqVecStorage> temp_dir;
301 if( diffs_to_find > 0 ) {
307 if( agpFiles.empty() ) {
308 cerr <<
"error: could not find any agp files" << endl;
315 if( objfiles.empty() ) {
316 cerr <<
"error: could not find any obj files" << endl;
340 for ( ; iter1 != iter1_end && iter2 != iter2_end; ) {
341 if (iter1->first < iter2->first) {
342 copy( iter1->second.begin(), iter1->second.end(),
343 inserter(vSeqIdFASTAOnly, vSeqIdFASTAOnly.
begin() ) );
346 else if (iter2->first < iter1->first) {
347 copy( iter2->second.begin(), iter2->second.end(),
348 inserter(vSeqIdAGPOnly, vSeqIdAGPOnly.
begin() ) );
351 else if( iter1->second != iter2->second ) {
353 set_difference( iter1->second.begin(), iter1->second.end(),
354 iter2->second.begin(), iter2->second.end(),
355 inserter(vSeqIdFASTAOnly,
356 vSeqIdFASTAOnly.
begin() ) );
359 set_difference( iter2->second.begin(), iter2->second.end(),
360 iter1->second.begin(), iter1->second.end(),
361 inserter(vSeqIdAGPOnly,
362 vSeqIdAGPOnly.
begin() ) );
373 for ( ; iter1 != iter1_end; ++iter1) {
374 copy( iter1->second.begin(), iter1->second.end(),
375 inserter(vSeqIdFASTAOnly, vSeqIdFASTAOnly.
begin() ) );
378 for ( ; iter2 != iter2_end; ++iter2) {
379 copy( iter2->second.begin(), iter2->second.end(),
380 inserter(vSeqIdAGPOnly, vSeqIdAGPOnly.
begin() ) );
389 const bool bThereWereDifferences = (
390 ( ! vSeqIdFASTAOnly.
empty() &&
392 ( ! vSeqIdAGPOnly.
empty() &&
394 if( ! bThereWereDifferences ) {
397 if( bThereWereDifferences ) {
401 if( bThereWereDifferences && diffs_to_find > 0 &&
402 ! seqIdIntersection.
empty() )
414 m_dir( x_GetTmpDir() )
417 throw std::runtime_error(
"Temp dir already exists: " +
m_dir.
GetPath() );
421 throw std::runtime_error(
"Could not create temp dir: " +
m_dir.
GetPath() );
427 if( ! m_dir.Remove() ) {
428 cerr <<
"Warning: could not delete temporary dir "
429 << m_dir.GetPath() << endl;
439 ofstream output_stream( GetFileName(
type, idh).c_str() );
445 int bytes_copied = 0;
446 for( ; iter != vec.
end(); ++iter, ++bytes_copied ) {
447 if( bytes_copied > 0 && (bytes_copied % 60) == 0 ) {
449 output_stream <<
'\n';
451 output_stream << *iter;
453 output_stream << endl;
460 std::stringstream file_name_strm;
465 file_name_strm <<
"agp";
468 file_name_strm <<
"obj";
473 file_name_strm <<
"UNKNOWN";
477 file_name_strm <<
'.';
482 const string initial_seq_id = idh.
AsString();
483 std::stringstream final_seq_id;
484 ITERATE(
string, ch_iter, initial_seq_id) {
485 const unsigned char ch = *ch_iter;
489 final_seq_id <<
'_' << setfill(
'0') << setw(3) << ch;
492 file_name_strm << final_seq_id.str();
495 return file_name_strm.str();
500 std::stringstream dir_strm;
505 return dir_strm.str();
510 int * in_out_pUniqueBioseqsLoaded,
511 int * in_out_pBioseqsSkipped,
515 in_out_pUniqueBioseqsLoaded !=
NULL &&
516 in_out_pBioseqsSkipped !=
NULL );
527 if( ! vec.
CanGetRange(0, bioseq_it->GetBioseqLength()) ) {
528 LOG_POST(
Error <<
" Skipping one: could not load due to error "
530 "(length issue or does not include range [1, "
531 << bioseq_it->GetBioseqLength() <<
"] or "
532 "doesn't exist) for " << idh
533 <<
" (though issue could be due to failure to resolve "
534 "one of the contigs. "
535 "Are all necessary components in GenBank or in files "
536 "specified on the command-line?)." );
546 LOG_POST(
Error <<
" Skipping one: could not load due to error, "
547 "probably in AGP file, possibly a length issue, for "
549 <<
"Raw technical information about error: " << ex.
what() );
554 if( pDataOutFile !=
NULL ) {
565 pair<TSeqIdSet::iterator, bool> insert_result =
567 if( ! insert_result.second ) {
568 LOG_POST(
Error <<
" Error: skipping sequence with same name and values: " << idh);
576 os << setw(2) << setfill(
'0') <<
hex << (
int)((
unsigned char)*
i);
580 <<
" / " <<
key.second);
583 ++*in_out_pUniqueBioseqsLoaded;
586 *in_out_pBioseqsSkipped = ( total - *in_out_pUniqueBioseqsLoaded);
596 dataOutFile << '>
' << idh << endl;
598 const SIZE_TYPE data_len = data.length();
599 SIZE_TYPE next_idx = 0;
600 for( ; next_idx < data_len ; next_idx += kFastaWidth ) {
601 SIZE_TYPE chars_to_copy = min( kFastaWidth, (data_len - next_idx) );
602 dataOutFile.write( data.c_str() + next_idx, chars_to_copy );
// Posts diagnostic messages (via LOG_POST) describing problems found while
// walking the delta-ext of the given bioseq.
//
// Visible behavior in this listing (several original lines are missing, so
// brace structure and some branches cannot be seen):
//  - the delta data is fetched from the complete bioseq of bioseq_h; if the
//    pointer stays NULL the kBugInAgpFastaCompare message is posted;
//  - for each delta element that reaches GetLoc().GetInt(), the larger of the
//    interval endpoints is compared against the length of the bioseq that the
//    interval id resolves to in the scope of bioseq_h;
//  - separate messages are posted when that bioseq cannot be found, when its
//    length is not set, and when the endpoint is >= its length;
//  - a std::exception handler posts kBugInAgpFastaCompare together with an
//    ASN.1 text dump of the bioseq.
// NOTE(review): what happens for IsLiteral() elements is not visible here.
//
// @param bioseq_h  handle of the bioseq whose delta components are inspected
607 void CAgpFastaComparator::x_PrintDetailsOfLengthIssue(
608 CBioseq_Handle bioseq_h )
610 const static string kBugInAgpFastaCompare(
611 " This is probably a bug in agp_fasta_compare: could not get "
612 "information on the bioseq with an error" );
614 const CDelta_ext::Tdata *p_delta_data = NULL;
// Scope of the outer bioseq; used below to resolve the ids of the intervals.
616 CScope &scope = bioseq_h.GetScope();
618 p_delta_data = &bioseq_h.GetCompleteBioseq()->GetInst().GetExt().GetDelta().Get();
620 if( p_delta_data == NULL ) {
621 LOG_POST(Error << kBugInAgpFastaCompare);
626 // put it in a reference to make it easier to work with
627 const CDelta_ext::Tdata &delta_data = *p_delta_data;
629 ITERATE( CDelta_ext::Tdata, delta_iter, delta_data ) {
630 if( (*delta_iter)->IsLiteral() ) {
634 const CSeq_interval & seq_int = (*delta_iter)->GetLoc().GetInt();
// Larger of the two endpoints; checked against the referenced bioseq length.
636 const TSeqPos highest_pnt =
637 max( seq_int.GetFrom(), seq_int.GetTo() );
638 CSeq_id_Handle seq_id_h =
639 CSeq_id_Handle::GetHandle(seq_int.GetId());
641 CBioseq_Handle inner_bioseq_h;
643 inner_bioseq_h = scope.GetBioseqHandle(seq_id_h);
644 if( ! inner_bioseq_h ) {
645 LOG_POST(Error << " Couldn't find bioseq
for "
647 << ". Maybe you need to specify component
file(s).
" );
648 } else if( ! inner_bioseq_h.IsSetInst_Length() ) {
649 LOG_POST(Error << " Could not get length of bioseq
for "
652 const TSeqPos bioseq_len = inner_bioseq_h.GetInst_Length();
653 if( highest_pnt >= bioseq_len ) {
654 LOG_POST(Error << " For
"
656 << " length is
" << bioseq_len
657 << " but user tries to access the point
"
658 << (highest_pnt+1) ); // "+1
" because user sees 1-based
662 LOG_POST(Error << " Could not find bioseq
for "
664 << ". Maybe you need to specify component
file(s).
" );
667 } catch(std::exception & ex) {
668 CNcbiOstrstream bioseq_strm;
669 bioseq_strm << MSerial_AsnText << *bioseq_h.GetCompleteBioseq();
670 LOG_POST(Error << kBugInAgpFastaCompare << ":
"
672 << "Raw technical information about
error:
" << Endl()
675 << " Bioseq
ASN.1:
" << (string)CNcbiOstrstreamToString(bioseq_strm) );
678 CNcbiOstrstream bioseq_strm;
679 bioseq_strm << MSerial_AsnText << *bioseq_h.GetCompleteBioseq();
680 LOG_POST(Error << kBugInAgpFastaCompare << ":
"
682 << " Bioseq
ASN.1:
" << (string)CNcbiOstrstreamToString(bioseq_strm) );
// Scans the given AGP files with a minimal hand-written parser and collects
// the object and component Seq-ids they mention.
//
// For every line read with NcbiGetline:
//  - empty lines and lines starting with # are tested for (the comment in the
//    code says they are skipped; the skipping statement is not visible here);
//  - the line is split on kDelim into vecLineTokens;
//  - if there are not more than kMaxColUsed tokens, or the component-type
//    column (index kCompTypeCol) is not exactly one character long,
//    kNotAGPErr and the file name are written to cerr;
//  - the upper-cased component type is compared with N and U (the action
//    taken on a match is not visible in this listing);
//  - the Seq-id built from column kObjSeqIdCol is inserted into out_objSeqIds
//    and the one built from column kCompSeqIdCol into out_compSeqIds.
//
// @param out_compSeqIds  receives handles of component Seq-ids
// @param out_objSeqIds   receives handles of object Seq-ids
// @param agpFiles        paths of the AGP files to scan
// @return bool; the return statements are not visible in this listing
//         (presumably false after a not-an-AGP error -- TODO confirm).
687 bool CAgpFastaComparator::x_GetCompAndObjSeqIds(
688 TSeqIdSet & out_compSeqIds,
689 TSeqIdSet & out_objSeqIds,
690 const std::list<std::string> & agpFiles )
692 const static CTempString kDelim("\
t");
694 const static CTempString kNotAGPErr(
697 // what is held in some of the AGP columns
698 const static int kObjSeqIdCol = 0;
699 const static int kCompTypeCol = 4;
700 const static int kCompSeqIdCol = 5;
701 const static int kMaxColUsed = kCompSeqIdCol;
703 vector<CTempString> vecLineTokens;
705 // for speed, we do the parsing ourselves with only very minimal
707 ITERATE( std::list<std::string>, file_iter, agpFiles ) {
708 ifstream file_strm(file_iter->c_str());
710 while( NcbiGetline(file_strm, line, "\
r\
n") ) {
711 // skip comment lines
712 if( line.empty() || line[0] == '#' ) {
716 vecLineTokens.clear();
717 NStr::Split(line, kDelim, vecLineTokens, 0);
719 // are there enough columns for an AGP file?
720 if( vecLineTokens.size() <= kMaxColUsed ){
721 cerr << kNotAGPErr << *file_iter << endl;
726 CTempString sComponentType = vecLineTokens[kCompTypeCol];
727 if( sComponentType.length() != 1 ) {
728 cerr << kNotAGPErr << *file_iter << endl;
731 const char chCompType = toupper(sComponentType[0]);
732 if( chCompType == 'N' || chCompType == 'U' )
739 CRef<CSeq_id> objSeqId = s_CustomGetSeqIdFromStr(
740 vecLineTokens[kObjSeqIdCol], NULL);
741 out_objSeqIds.insert(
742 CSeq_id_Handle::GetHandle(*objSeqId));
744 // get component Seq-id
745 CRef<CSeq_id> comp_seq_id =
746 s_CustomGetSeqIdFromStr(
747 vecLineTokens[kCompSeqIdCol], NULL);
748 out_compSeqIds.insert(
749 CSeq_id_Handle::GetHandle(*comp_seq_id) );
// Loads every object file, feeds each contained Seq-entry to x_Process and
// writes it to temp_dir with type eType_Obj.
//
// The format of each file is guessed with CFormatGuess:
//  - eFasta: entries are read with CFastaReader (fAddMods) via ReadOneSeq;
//  - eBinaryASN / eTextASN: the stream is configured by x_SetBinaryVsText and
//    read as a Seq-submit (which must have entrys, otherwise an error is
//    posted) and also as a plain Seq-entry; how the code chooses between the
//    two paths is not visible in this listing;
//  - anything else: an error naming the best-guess format is posted.
// Each entry gets its own fresh CScope. x_Process is always called with a
// NULL output file here.
// CObjReaderParseException with code eEOF is treated as normal end of file;
// other CObjReaderParseException / CException errors are posted.
// Finally the loaded count, and the skipped count when > 0, are posted.
//
// NOTE(review): the skipped-count message says FASTA although ASN.1 inputs
// feed the same counter.
// NOTE(review): lines between x_Process and temp_dir->WriteData are missing;
// presumably WriteData is guarded by a check on temp_dir -- TODO confirm.
//
// @param filenames  object files (FASTA or ASN.1) to load
// @param fasta_ids  passed through to x_Process for every entry
// @param temp_dir   storage that receives the sequence data
756 void CAgpFastaComparator::x_ProcessObjects(
757 const list<string> & filenames,
758 TUniqueSeqs& fasta_ids,
759 CTmpSeqVecStorage *temp_dir )
764 LOG_POST(Error << "Processing
object file(s)...
");
765 COMP_LOG("Processing
object file(s)...
");
766 ITERATE( list<string>, file_iter, filenames ) {
767 const string &filename = *file_iter;
769 CFormatGuess guesser( filename );
770 const CFormatGuess::EFormat format =
771 guesser.GuessFormat();
773 if( format == CFormatGuess::eFasta ) {
774 CNcbiIfstream file_istrm( filename.c_str(), ios::binary );
775 CFastaReader reader(file_istrm, CFastaReader::fAddMods);
777 CRef<CSeq_entry> entry = reader.ReadOneSeq();
779 CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
780 CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry);
781 x_Process(seh, fasta_ids, &iNumLoaded, &iNumSkipped, NULL );
783 temp_dir->WriteData( CTmpSeqVecStorage::eType_Obj, seh );
786 } else if( format == CFormatGuess::eBinaryASN ||
787 format == CFormatGuess::eTextASN )
789 // see if it's a submit
790 CRef<CSeq_submit> submit( new CSeq_submit );
792 CNcbiIfstream file_istrm( filename.c_str(), ios::binary );
793 x_SetBinaryVsText( file_istrm, format );
794 file_istrm >> *submit;
799 if( ! submit->IsEntrys() ) {
800 LOG_POST(Error << "Seq-submits must have
'entrys'.
");
805 ITERATE( CSeq_submit::C_Data::TEntrys, entry_iter,
806 submit->GetData().GetEntrys() )
808 const CSeq_entry &entry = **entry_iter;
810 CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
811 CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(entry);
812 x_Process(seh, fasta_ids, &iNumLoaded, &iNumSkipped, NULL );
814 temp_dir->WriteData( CTmpSeqVecStorage::eType_Obj, seh );
820 CRef<CSeq_entry> entry( new CSeq_entry );
822 CNcbiIfstream file_istrm( filename.c_str(), ios::binary );
823 x_SetBinaryVsText( file_istrm, format );
824 file_istrm >> *entry;
826 CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
827 CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry);
828 x_Process(seh, fasta_ids, &iNumLoaded, &iNumSkipped, NULL );
830 temp_dir->WriteData( CTmpSeqVecStorage::eType_Obj, seh );
834 LOG_POST(Error << "Could not determine
format of
" << filename
835 << ", best guess is:
" << CFormatGuess::GetFormatName(format) );
840 catch(CObjReaderParseException & ex ) {
841 if( ex.GetErrCode() == CObjReaderParseException::eEOF ) {
842 // end of file; no problem
844 LOG_POST(Error << "Error reading
object file:
" << ex.what() );
849 catch (CException& ex ) {
850 LOG_POST(Error << "Error reading
object file:
" << ex.what() );
856 LOG_POST(Error << "Loaded
" << iNumLoaded << " object file sequence(s).
");
857 if( iNumSkipped > 0 ) {
858 LOG_POST(Error << " Skipped
" << iNumSkipped << " FASTA sequence(s).
");
// Reads every AGP file, converts it to Seq-entries and feeds each entry to
// x_Process, then writes it to temp_dir with type eType_AGP.
//
// One CScope with defaults added (pAgpToSeqEntryScope) is shared by all
// CCustomAgpToSeqEntry readers. A non-zero result from ReadStream is posted
// together with the reader error message (whether processing of that file
// then stops is not visible in this listing). Every resulting entry is added
// to its own fresh CScope, which also gets AddDefaults(), and is passed to
// x_Process along with m_pAgpAsFastaFile.get() as the optional output file.
// Finally the loaded count, and the skipped count when > 0, are posted.
//
// NOTE(review): the line between x_Process and temp_dir->WriteData is
// missing; presumably WriteData is guarded by a check on temp_dir -- TODO
// confirm.
//
// @param filenames  AGP files to read
// @param agp_ids    passed through to x_Process for every entry
// @param temp_dir   storage that receives the sequence data
863 void CAgpFastaComparator::x_ProcessAgps(const list<string> & filenames,
864 TUniqueSeqs& agp_ids,
865 CTmpSeqVecStorage *temp_dir )
870 LOG_POST(Error << "Processing AGP...
");
871 COMP_LOG("Processing AGP...
");
873 CRef<CScope> pAgpToSeqEntryScope(new CScope(*CObjectManager::GetInstance()));
874 pAgpToSeqEntryScope->AddDefaults();
876 ITERATE( list<string>, file_iter, filenames ) {
877 const string &filename = *file_iter;
878 CNcbiIfstream istr( filename.c_str() );
880 CCustomAgpToSeqEntry agp_reader(pAgpToSeqEntryScope.GetPointer());
881 int err_code = agp_reader.ReadStream( istr ); // loads entries
882 if( err_code != 0 ) {
883 LOG_POST(Error << "Error occurred reading AGP
file:
"
884 << agp_reader.GetErrorMessage() );
888 ITERATE (vector< CRef<CSeq_entry> >, it, agp_reader.GetResult() ) {
889 CRef<CSeq_entry> entry = *it;
891 CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
892 CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry);
893 scope->AddDefaults();
895 x_Process(seh, agp_ids, &iNumLoaded, &iNumSkipped, m_pAgpAsFastaFile.get() );
897 temp_dir->WriteData( CTmpSeqVecStorage::eType_AGP, seh );
902 LOG_POST(Error << "Loaded
" << iNumLoaded << " AGP sequence(s).
");
903 if( iNumSkipped > 0 ) {
904 LOG_POST(Error << " Skipped
" << iNumSkipped << " AGP sequence(s).
");
// Posts three lists of Seq-ids derived from the two input sets.
//
// Per the comments in the code, the three sections find the ids present in
// both sets, the ids in the FASTA (object file) set only, and the ids in the
// AGP set only. The algorithm calls themselves are on lines missing from
// this listing; only their iterator arguments and inserters are visible.
//  - The ids found in both sets are stored in out_seqIdIntersection and, when
//    non-empty, posted as differing between object file and AGP.
//  - The object-file-only list is posted unless diffs_to_hide has
//    fDiffsToHide_ObjfileOnly set.
//  - The AGP-only list is posted unless diffs_to_hide has
//    fDiffsToHide_AGPOnly set.
//
// @param vSeqIdFASTAOnly        ids attributed to the object-file side
// @param vSeqIdAGPOnly          ids attributed to the AGP side
// @param diffs_to_hide          bit flags suppressing one or both one-sided lists
// @param out_seqIdIntersection  receives the ids present in both input sets
908 void CAgpFastaComparator::x_OutputDifferingSeqIds(
909 const TSeqIdSet & vSeqIdFASTAOnly,
910 const TSeqIdSet & vSeqIdAGPOnly,
911 TDiffsToHide diffs_to_hide,
912 TSeqIdSet & out_seqIdIntersection )
914 // find the ones in both
916 vSeqIdFASTAOnly.begin(), vSeqIdFASTAOnly.end(),
917 vSeqIdAGPOnly.begin(), vSeqIdAGPOnly.end(),
918 inserter(out_seqIdIntersection, out_seqIdIntersection.begin()) );
919 if( ! out_seqIdIntersection.empty() ) {
920 LOG_POST(Error << " These
" << out_seqIdIntersection.size()
921 << " differ between
object file and AGP:
");
922 ITERATE( TSeqIdSet, id_iter, out_seqIdIntersection ) {
923 LOG_POST(Error << " " << *id_iter);
927 // find the ones in FASTA only
928 TSeqIdSet vSeqIdTempSet;
930 vSeqIdFASTAOnly.begin(), vSeqIdFASTAOnly.end(),
931 vSeqIdAGPOnly.begin(), vSeqIdAGPOnly.end(),
932 inserter(vSeqIdTempSet, vSeqIdTempSet.begin()) );
933 if( ! vSeqIdTempSet.empty() && ! (diffs_to_hide & fDiffsToHide_ObjfileOnly) ) {
934 LOG_POST(Error << " These
" << vSeqIdTempSet.size()
935 << " are
in Object
file only:
" << "\n"
936 << " (
Check above: were some AGP sequences skipped due
"
938 ITERATE( TSeqIdSet, id_iter, vSeqIdTempSet ) {
939 LOG_POST(Error << " " << *id_iter);
943 // find the ones in AGP only
944 vSeqIdTempSet.clear();
946 vSeqIdAGPOnly.begin(), vSeqIdAGPOnly.end(),
947 vSeqIdFASTAOnly.begin(), vSeqIdFASTAOnly.end(),
948 inserter(vSeqIdTempSet, vSeqIdTempSet.begin()) );
949 if( ! vSeqIdTempSet.empty() && ! (diffs_to_hide & fDiffsToHide_AGPOnly) ) {
950 LOG_POST(Error << " These
" << vSeqIdTempSet.size()
951 << " are
in AGP only:
" << "\
n"
952 << " (
Check above: were some FASTA sequences skipped due
"
954 ITERATE( TSeqIdSet, id_iter, vSeqIdTempSet ) {
955 LOG_POST(Error << " " << *id_iter);
// Posts a warning for every entry of unique_ids whose id set holds more than
// one Seq-id, i.e. several ids mapped to the same key.
//
// The warning names file_type and lists each id of the set in quotes.
// Nothing is modified; unique_ids is only iterated.
//
// @param unique_ids  map whose mapped value is a set of Seq-ids
// @param file_type   label inserted into the warning text
960 void CAgpFastaComparator::x_CheckForDups( TUniqueSeqs & unique_ids,
961 const string & file_type )
963 ITERATE( TUniqueSeqs, unique_id_iter, unique_ids ) {
964 const TSeqIdSet & id_set = unique_id_iter->second;
965 if( id_set.size() > 1 ) {
966 CNcbiOstrstream errmsg;
967 errmsg << "WARNING: Identical sequences
in " << file_type << ":
";
968 ITERATE( TSeqIdSet, id_iter, id_set ) {
969 errmsg << " '" << *id_iter << "'";
971 LOG_POST( Error << (string)CNcbiOstrstreamToString(errmsg) );
// Shows textual differences between the AGP and object versions of every
// Seq-id in seqIdIntersection by running an external diff piped into awk.
//
// If kDiff or kAwk is not executable, a message is written to cerr (the
// statement that follows each message is not visible in this listing).
// For each id, the two per-id files are located through
// temp_dir.GetFileName, a heading is written to cout, and a shell command is
// assembled and run with CExec::System. The awk program embedded in the
// command counts lines starting with < and > and prints them only while the
// respective counter is <= diffs_to_find; diff stderr goes to /dev/null.
//
// NOTE(review): the file names are placed inside single-quote characters in
// the shell command, so this relies on GetFileName never producing a name
// that contains one -- verify against GetFileName.
// NOTE(review): the first parameter line (number 977 in the original file)
// is missing from this listing; diffs_to_find is used in the body.
//
// @param seqIdIntersection  ids whose two versions are compared
// @param temp_dir           storage holding the per-id sequence files
976 void CAgpFastaComparator::x_OutputSeqDifferences(
978 const TSeqIdSet & seqIdIntersection,
979 CTmpSeqVecStorage & temp_dir )
981 const static string kDiff = "/usr/bin/diff
";
982 if( ! CExec::IsExecutable(kDiff) ) {
983 cerr << "No differences shown because cannot run
" << kDiff << endl;
987 const static string kAwk = "/usr/bin/awk
";
988 if( ! CExec::IsExecutable(kAwk) ) {
989 cerr << "No differences shown because cannot run
" << kAwk << endl;
993 ITERATE( TSeqIdSet, id_iter, seqIdIntersection ) {
994 const CSeq_id_Handle & idh = *id_iter;
995 const string agp_file = temp_dir.GetFileName( CTmpSeqVecStorage::eType_AGP, idh );
996 const string obj_file = temp_dir.GetFileName( CTmpSeqVecStorage::eType_Obj, idh );
999 cout << "##### Comparing
" << idh << " for AGP (
'<') and Obj ('>'):" << endl;
1012 std::stringstream cmd_strm;
1013 cmd_strm << kDiff << " '" << agp_file << "' '" << obj_file << "' 2> /dev/
null | " << kAwk << " '
BEGIN { max_lines =
" << diffs_to_find << "; left_seen = 0; right_seen = 0; }
"
1014 << "/^</ { left_seen += 1;
if( left_seen <= max_lines ) { print } }
"
1015 << "/^>/ { right_seen += 1;
if( right_seen <= max_lines ) { print } }
"
1016 << "/^[0-9]/ {
if( left_seen > right_seen ) { right_seen = left_seen }
else { left_seen = right_seen }
if( left_seen >= max_lines && right_seen >= max_lines) {
exit } ; print }
"
1017 << "/^-/ { print }
'";
1018 CExec::System( cmd_strm.str().c_str() );
// Applies the serialization-format manipulator matching guess_format to
// file_istrm, so that a following extraction of a serial object reads the
// right encoding.
//
// @param file_istrm    stream to configure
// @param guess_format  format previously guessed for the file; only
//                      eBinaryASN and eTextASN have a visible effect
1022 void CAgpFastaComparator::x_SetBinaryVsText( CNcbiIstream & file_istrm,
1023 CFormatGuess::EFormat guess_format )
1025 // set binary vs. text
1026 switch( guess_format ) {
1027 case CFormatGuess::eBinaryASN:
// Apply the MSerial_AsnBinary manipulator to the stream.
1028 file_istrm >> MSerial_AsnBinary;
1030 case CFormatGuess::eTextASN:
// Apply the MSerial_AsnText manipulator to the stream.
1031 file_istrm >> MSerial_AsnText;
1035 // a format where binary vs. text is irrelevant
// Guesses the type of a file from its first non-blank line.
//
// At most 100 read attempts are made to find a line that is non-empty after
// trimming. The checks are then applied in this order:
//   no such line found           -> eFileType_Unknown
//   first character is >         -> eFileType_FASTA
//   line contains ::=            -> eFileType_ASN1
//   first character is #         -> eFileType_AGP
//   line has at least 7 tabs     -> eFileType_AGP
//   otherwise                    -> eFileType_Unknown
//
// @param filename  path of the file to inspect
// @return the guessed EFileType
1039 CAgpFastaComparator::EFileType CAgpFastaComparator::x_GuessFileType( const string & filename )
1041 // To prevent us from reading huge files
1042 int iterations_remaining = 100;
1044 ifstream file_strm(filename.c_str());
1047 // find first non-blank line
1048 while( file_strm && line.empty() &&
1049 iterations_remaining-- > 0 )
1051 // get line and trim it
1052 NcbiGetline(file_strm, line, "\r\n");
1053 NStr::TruncateSpacesInPlace( line );
// Still empty: the stream ended or failed, or the iteration budget ran out.
1056 if( line.empty() ) {
1057 return eFileType_Unknown;
1060 if( line[0] == '>
' ) {
1061 return eFileType_FASTA;
1064 if( line.find("::=") != NPOS ) {
1065 return eFileType_ASN1;
1068 if( line[0] == '#
' ) {
1069 return eFileType_AGP;
1073 // did not use std::count because Sun WorkShop compiler defines it in
1074 // a non-standard way and this is cleaner than preprocessor directives
1075 ITERATE( string, str_iter, line ) {
1076 if( *str_iter == '\t' ) {
1080 if( num_tabs >= 7 ) {
1081 return eFileType_AGP;
1084 return eFileType_Unknown;
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Checksum and hash calculation classes.
void WriteData(EType type, objects::CSeq_entry_Handle seh)
string GetFileName(EType type, objects::CSeq_id_Handle idh)
EFileType x_GuessFileType(const string &filename)
bool x_GetCompAndObjSeqIds(TSeqIdSet &out_compSeqIds, TSeqIdSet &out_objSeqIds, const std::list< std::string > &agpFiles)
void x_CheckForDups(TUniqueSeqs &unique_ids, const string &file_type)
CAgpFastaComparator(void)
EResult Run(const std::list< std::string > &files, const std::string &loadlog, const std::string &agp_as_fasta_file, TDiffsToHide diffsToHide, int diffs_to_find)
void x_Process(const objects::CSeq_entry_Handle seh, TUniqueSeqs &seqs, int *in_out_pUniqueBioseqsLoaded, int *in_out_pBioseqsSkipped, CNcbiOfstream *pDataOutFile)
unique_ptr< CNcbiOfstream > m_pLoadLogFile
void x_ProcessObjects(const list< string > &filenames, TUniqueSeqs &fasta_ids, CTmpSeqVecStorage *temp_dir)
void x_ProcessAgps(const list< string > &filenames, TUniqueSeqs &agp_ids, CTmpSeqVecStorage *temp_dir)
@ fDiffsToHide_ObjfileOnly
pair< string, TSeqPos > TKey
void x_OutputSeqDifferences(int diffs_to_find, const TSeqIdSet &seqIdIntersection, CTmpSeqVecStorage &temp_dir)
void x_PrintDetailsOfLengthIssue(objects::CBioseq_Handle bioseq_h)
bool x_IsLogFileOpen(void)
void x_OutputDifferingSeqIds(const TSeqIdSet &vSeqIdFASTAOnly, const TSeqIdSet &vSeqIdAGPOnly, TDiffsToHide diffs_to_hide, TSeqIdSet &out_seqIdIntersection)
void x_WriteDataAsFasta(CNcbiOfstream &dataOutFile, const objects::CSeq_id_Handle &idh, const std::string &data)
unique_ptr< CNcbiOfstream > m_pAgpAsFastaFile
This class is used to turn an AGP file into a vector of Seq-entry's.
static CRef< objects::CSeq_id > s_DefaultSeqIdFromStr(const std::string &str)
This is the default method used to turn strings into Seq-ids in AGP contexts.
virtual CRef< objects::CSeq_id > x_GetSeqIdFromStr(const std::string &str)
If you must change exactly how strings are turned into Seq-ids, you can override this in a subclass.
static CRef< objects::CSeq_id > s_LocalSeqIdFromStr(const std::string &str)
Turn a string into a local Seq-id (removing "lcl|" from the beginning if needed)
CChecksum – Checksum calculator.
Base class for reading FASTA sequences.
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CObjectManager::EIsDefault is_default=CObjectManager::eNonDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Argument-less loader - for compatibility only, unusable.
Class for managing LDS2 database and related data files.
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
SeqVector related exceptions.
container_type::const_iterator const_iterator
const_iterator begin() const
const_iterator end() const
iterator_bool insert(const value_type &val)
const_iterator begin() const
const_iterator find(const key_type &key) const
const_iterator end() const
Operators to edit gaps in sequences.
static const char * str(char *buf, int n)
static void md5(const char *src, const char *out)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
void AddLine(const char *line, size_t len)
void GetMD5Digest(unsigned char digest[16]) const
Return calculated MD5 digest.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
void Error(CExceptionArgs_Base &args)
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
static string GetTmpDir(void)
Get temporary directory.
virtual bool Exists(void) const
Check if directory "dirname" exists.
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
bool Create(TCreateFlags flags=fCreate_Default) const
Create the directory using "dirname" passed in the constructor.
const string & GetPath(void) const
Get entry path.
long TFlags
binary OR of EFlags
@ fAddMods
Parse defline mods and add to SeqEntry.
@ fNoSeqData
Parse the deflines but skip the data.
@ fDisableParseRange
No ranges in seq-ids. Ranges part of seq-id instead.
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
EAccessionInfo
For IdentifyAccession (below)
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
string AsString(void) const
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
bool CanGetRange(TSeqPos start, TSeqPos stop) const
Check if the sequence data is available for the interval [start, stop).
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
const_iterator begin(void) const
const_iterator end(void) const
void Reset(void)
Reset reference object.
static TPid GetPid(void)
Get process identifier (pid) for the current process.
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
CNcbiIstream & NcbiGetline(CNcbiIstream &is, string &str, char delim, string::size_type *count=NULL)
Read from "is" to "str" up to the delimiter symbol "delim" (or EOF)
const char * Endl(void)
Platform-specific EndOfLine.
NCBI_NS_STD::string::size_type SIZE_TYPE
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
@ eCurrent
Use current time. See also CCurrentTime.
bool IsLocal(void) const
Check if variant Local is selected.
@ eMol_na
just a nucleic acid
unsigned int
A callback function used to compare two keys in a database.
static void hex(unsigned char c)
const struct ncbi::grid::netcache::search::fields::KEY key
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
Defines a portable execute class.
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
CRef< objects::CObjectManager > om