39 #include "../core/blast_psi_priv.h"
71 const char* matrix_name ,
74 unsigned int query_length ,
77 unsigned int msa_master_idx )
78 : m_Query(0), m_GapExistence(gap_existence), m_GapExtension(gap_extension)
82 m_MsaDimensions.query_length = query_length;
83 m_Query.reset(
new Uint1[query_length]);
84 memcpy((
void*) m_Query.get(), (
void*)
query, query_length);
88 m_Opts.ignore_unaligned_positions =
true;
91 if ( !m_Query || msa_master_idx != 0) {
92 x_ExtractQueryFromMsa(msa_master_idx);
94 x_ValidateQueryInMsa();
96 _ASSERT(m_MsaDimensions.query_length);
98 m_MsaDimensions.num_seqs =
static_cast<Uint4>(m_AsciiMsa.size() - 1);
103 m_MatrixName =
string(matrix_name ? matrix_name :
"");
106 *m_DiagnosticsRequest = *diags;
108 m_DiagnosticsRequest =
NULL;
125 reader.
Read(
false,
true);
130 (
NStr::Find(e.GetMsg(),
"Not all sequences have same length") !=
NPOS)) {
131 string msg(
"Repeated Seq-IDs detected in multiple sequence ");
132 msg +=
"alignment file, please ensure all Seq-IDs are unique ";
133 msg +=
"before proceeding.";
154 retval.
Set().reserve(query_length);
160 back_inserter(retval.
Set()));
164 back_inserter(retval.
Set()));
217 "Multiple alignment data structure");
228 const size_t kAligmentLength =
m_AsciiMsa.front().size();
233 for (; seq_idx <
m_AsciiMsa.size(); seq_idx++) {
234 size_t query_idx = 0;
235 for (
size_t align_idx = 0;
245 if (query_res == kMaskingRes && kCurrentRes ==
'U') {
246 query_res = kCurrentRes;
248 if (query_res != kCurrentRes) {
263 for (
size_t align_idx = 0; align_idx < kAligmentLength; align_idx++) {
267 string msg(
"No sequence in the multiple sequence alignment provided ");
268 msg +=
"matches the query sequence";
278 oss <<
"Invalid master sequence index, please use a value between 1 "
290 const unsigned int kQueryLength =
static_cast<unsigned int>(
kQuery.size() - kNumGaps);
294 unsigned int query_idx = 0;
303 _ASSERT(query_idx == kQueryLength);
314 const string& ascii_query =
m_AsciiMsa.front();
316 unsigned int query_idx = 0;
317 ITERATE(
string, residue, ascii_query) {
332 const size_t kAlignmentLength =
m_AsciiMsa.front().size();
336 for (; seq_index <
m_AsciiMsa.size(); seq_index++) {
337 size_t query_idx = 0;
338 for (
size_t align_idx = 0; align_idx < kAlignmentLength; align_idx++) {
344 const char kCurrentRes =
m_AsciiMsa[seq_index][align_idx];
355 const int kGapResidue = 0;
356 const int kLongGapLen = 10;
361 while (i < m_Msa->dimensions->query_length &&
369 while (i < m_Msa->dimensions->query_length) {
370 while (i < m_Msa->dimensions->query_length &&
375 int k =
static_cast<int>(
i) + 1;
376 while (k < m_Msa->dimensions->query_length &&
381 if (k -
i >= kLongGapLen) {
382 for (
int j=
static_cast<int>(
i);j < k;j++) {
Declares the BLAST exception class.
const Uint1 kProtMask
NCBISTDAA element used to mask residues in BLAST.
PSIMsa * PSIMsaFree(PSIMsa *msa)
Deallocates the PSIMsa structure.
PSIDiagnosticsRequest * PSIDiagnosticsRequestNew(void)
Allocates a PSIDiagnosticsRequest structure, setting all fields to false.
PSIMsa * PSIMsaNew(const PSIMsaDimensions *dimensions)
Allocates and initializes the multiple sequence alignment data structure for use as input to the PSSM...
PSIDiagnosticsRequest * PSIDiagnosticsRequestFree(PSIDiagnosticsRequest *diags_request)
Deallocates the PSIDiagnosticsRequest structure passed in.
const unsigned int kQueryIndex
Index into multiple sequence alignment structure for the query sequence.
class CAlnReader supports importing a large variety of text-based alignment formats into standard dat...
const vector< string > & GetSeqs(void) const
void Read(bool guess, bool generate_local_ids=false, objects::ILineErrorListener *pErrorListener=nullptr)
void SetClustal(EAlphabet alpha)
CRef< objects::CSeq_entry > GetSeqEntry(TFastaFlags fasta_flags=objects::CFastaReader::fAddMods, objects::ILineErrorListener *pErrorListener=nullptr)
Defines BLAST error codes (user errors included)
Defines system exceptions occurred while running BLAST.
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
@ eFormat
Some of these are pretty specialized.
static TSeqPos Convert(const CSeq_data &in_seq, CSeq_data *out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx=0, TSeqPos uLength=0, bool bAmbig=false, Uint4 seed=17734276)
static void s_GetQuerySequenceData(const CBioseq &bioseq, size_t query_length, CNCBIstdaa &retval)
Auxiliary function to retrieve the sequence data in NCBI-stdaa format from the bioseq.
static const char kGapChar('-')
The representation of a gap in ASCII format.
PSIMsaDimensions m_MsaDimensions
Multiple sequence alignment dimensions.
void x_ValidateQueryInMsa()
Searches the query sequence (m_Query) in the aligned sequences (m_AsciiMsa) and moves the first insta...
unsigned int GetQueryLength()
Get the query's length.
void Process()
The work to process the alignment is done here.
CRef< objects::CSeq_entry > m_SeqEntry
CSeq_entry obtained from the multiple sequence alignment.
void x_CopyQueryToMsa()
Copies query sequence data to multiple alignment data structure.
TAutoUint1ArrayPtr m_Query
Pointer to query sequence.
void x_ExtractQueryForPssm()
Extracts the query bioseq from m_SeqEntry.
static bool s_AreSequencesEqual(const CNCBIstdaa &sequence, Uint1 *query)
Returns true iff sequence is identical to query.
CRef< objects::CBioseq > m_QueryBioseq
Query as CBioseq for PSSM.
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
void x_ReadAsciiMsa(CNcbiIstream &input_file)
Reads the multiple sequence alignment from the input file.
PSIDiagnosticsRequest * m_DiagnosticsRequest
Diagnostics request structure.
void x_ExtractAlignmentData()
Populates the multiple alignment data structure.
PSIMsa * m_Msa
Structure representing the multiple sequence alignment.
vector< string > m_AsciiMsa
The raw multiple sequence alignment in ASCII read from the input file.
const char NCBISTDAA_TO_AMINOACID[]
Translates between ncbieaa and ncbistdaa.
virtual ~CPsiBlastInputClustalW()
virtual destructor
void x_ExtractQueryFromMsa(unsigned int msa_master_idx=0)
Extracts the query sequence from the multiple sequence alignment, assuming it's the first one,...
void reset(element_type *p=0, EOwnership ownership=eTakeOwnership)
Reset will delete the old pointer (if owned), set content to the new value, and assume the ownership ...
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
element_type * get(void) const
Get pointer.
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
const TPrim & Get(void) const
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
uint8_t Uint1
1-byte (8-bit) unsigned integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define USING_SCOPE(ns)
Use the specified namespace.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
TErrCode GetErrCode(void) const
Get error code.
const TInst & GetInst(void) const
Get the Inst member data.
bool IsNcbistdaa(void) const
Check if variant Ncbistdaa is selected.
const TNcbistdaa & GetNcbistdaa(void) const
Get the variant data.
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
@ e_Ncbistdaa
consecutive codes for std aas
unsigned int
A callback function used to compare two keys in a database.
const struct ncbi::grid::netcache::search::fields::SIZE size
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
Boolean is_aligned
Is this letter part of the alignment?
Uint1 letter
Preferred letter at this position, in ncbistdaa encoding.
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Uint4 query_length
Length of the query.
PSIMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
PSIMsaDimensions * dimensions
dimensions of the msa