124 "annotated by transcript or proteomic data";
126 "The sequence of the model RefSeq transcript was modified relative "
127 "to this genomic sequence to represent the inferred CDS";
129 "The sequence of the model RefSeq protein was modified relative "
130 "to this genomic sequence to represent the inferred CDS";
156 for ( ; desc_iter; ++desc_iter) {
169 , m_intron_stitch_threshold_flags(fBoth)
170 , m_min_intron(kDefaultMinIntron)
171 , m_allowed_unaligned(kDefaultAllowedUnaligned)
173 , m_is_best_refseq(
false)
187 :
m_impl(new SImplementation(scope))
223 return m_impl->CleanAlignment(align_in);
232 return m_impl->ConvertAlignToAnnot(align, annot, seqs, gene_id, cdregion,
false);
240 m_impl->ConvertAlignToAnnot(aligns, annot, seqs);
244 const objects::CSeq_loc &loc,
245 objects::CSeq_annot& annot,
246 objects::CBioseq_set& seqs,
254 "Can't find genomic sequence " + loc.GetId()->AsFastaString());
266 size_t new_id_num = counter.
Add(1);
269 string str(
"lcl|MRNA_");
276 string str(
"lcl|PROT_");
286 fake_align.
SetSegs().SetSpliced().SetProduct_id().Assign(*rna_id);
287 fake_align.
SetSegs().SetSpliced().SetGenomic_id().Assign(*loc.GetId());
289 fake_align.
SetSegs().SetSpliced().SetGenomic_strand(loc.GetStrand());
290 fake_align.
SetSegs().SetSpliced().SetProduct_type(
296 exon->SetProduct_start().SetNucpos(product_pos);
297 product_pos += loc_it.GetRange().GetLength();
298 exon->SetProduct_end().SetNucpos(product_pos-1);
299 exon->SetGenomic_start(loc_it.GetRange().GetFrom());
300 exon->SetGenomic_end(loc_it.GetRange().GetTo());
302 match->SetMatch(loc_it.GetRange().GetLength());
303 exon->SetParts().push_back(
match);
304 fake_align.
SetSegs().SetSpliced().SetExons().push_back(exon);
306 fake_align.
SetSegs().SetSpliced().SetProduct_length(product_pos);
309 cdregion.
SetData().SetCdregion().SetFrame(frame);
314 "Non-standard frame specified with 5'-complete location");
338 if (product_pos % 3) {
340 "Non-whole number of codons with 3'-complete location");
348 cdregion.
SetData().SetCdregion().SetCode().Set().push_back(
code);
354 m_impl->ConvertAlignToAnnot(fake_align, annot, seqs, 0, &cdregion,
false);
363 m_impl->SetFeatureExceptions(feat, align);
371 m_impl->SetPartialFlags(gene_feat, mrna_feat, cds_feat);
376 m_impl->RecomputePartialFlags(annot);
383 : m_aln(aln), m_scope(scope), m_genomic_row(-1)
384 , m_allowed_unaligned(allowed_unaligned), m_opts(opts)
397 "CreateGeneModelFromAlign(): "
398 "failed to create consistent alignment");
414 "CreateGeneModelFromAlign(): "
415 "More than one genomic row in alignment");
421 "CreateGeneModelFromAlign(): "
422 "No genomic sequence found in alignment");
430 if(m_aln.GetSegs().IsSpliced()) {
431 rna_loc = x_GetLocFromSplicedExons(m_aln);
433 const CSeq_id&
id = m_aln.GetSeq_id(GetRnaRow());
440 rna_loc = x_Mapper()->Map(*range_loc);
448 return m_genomic_row;
454 return GetGenomicRow() == 0 ? 1 : 0;
465 x_Mapper()->IncludeSourceLocs(
b);
470 x_Mapper()->SetMergeNone();
514 if(donor_ok || !acceptor_ok) {
517 if(acceptor_ok || !donor_ok) {
526 prev_int = genomic_int;
564 vector<SExon> orig_exons =
GetExons(*align);
573 if (
GetExons(*align) != orig_exons) {
596 model_num.erase(model_num.size()-2, 2);
613 bool found_start_codon =
false;
614 bool found_stop_codon =
false;
617 if ((*mod_it)->IsStart_codon_found()) {
618 found_start_codon = (*mod_it)->GetStart_codon_found();
620 if ((*mod_it)->IsStop_codon_found()) {
621 found_stop_codon = (*mod_it)->GetStop_codon_found();
629 "Can't find genomic sequence " +
634 fake_transcript_align->
Assign(*align);
635 align.
Reset(fake_transcript_align);
645 size_t new_id_num = counter.
Add(1);
647 string str(
"lcl|MRNA_");
654 fake_transcript_align->
SetSegs().SetSpliced().SetProduct_id(
657 fake_transcript_align->
SetSegs().SetSpliced().SetProduct_type(
660 fake_transcript_align->
SetSegs().SetSpliced().SetExons())
667 fake_transcript_align->
SetSegs().SetSpliced().SetExons().back();
668 bool aligned_to_the_end =
669 last_exon->GetProduct_end().GetNucpos()+1==
672 fake_transcript_align->
SetSegs().SetSpliced().SetProduct_length() =
674 (((found_stop_codon && aligned_to_the_end) || !aligned_to_the_end)?3:0);
676 if (found_stop_codon && aligned_to_the_end) {
677 bool is_minus = last_exon->IsSetGenomic_strand() ?
680 . IsSetGenomic_strand() &&
686 ? last_exon->GetGenomic_start()
687 : genomic_length - last_exon->GetGenomic_end() - 1);
688 if (space_for_codon < 3) {
691 "Stop codon goes outside genomic sequence");
694 new_exon->SetProduct_start().SetNucpos(
695 last_exon->GetProduct_end().GetNucpos() + space_for_codon + 1);
696 new_exon->SetProduct_end().SetNucpos(
697 last_exon->GetProduct_end().GetNucpos() + 3);
698 new_exon->SetGenomic_start(
699 is_minus ? genomic_length - 3 + space_for_codon : 0);
700 new_exon->SetGenomic_end(
701 is_minus ? genomic_length - 1 : 2 - space_for_codon);
702 if (last_exon->IsSetProduct_strand()) {
703 new_exon->SetProduct_strand(last_exon->GetProduct_strand());
705 if (last_exon->IsSetGenomic_strand()) {
706 new_exon->SetGenomic_strand(last_exon->GetGenomic_strand());
708 fake_transcript_align->
SetSegs().SetSpliced().SetExons()
709 . push_back(new_exon);
713 last_exon->SetProduct_end().SetNucpos() += space_for_codon;
715 last_exon->SetGenomic_start() -= space_for_codon;
717 last_exon->SetGenomic_end() += space_for_codon;
719 if (last_exon->IsSetParts() && space_for_codon) {
722 match_stop_codon->SetMatch(space_for_codon);
723 last_exon->SetParts().push_back(match_stop_codon);
728 cd_feat->
SetData().SetCdregion();
731 fake_transcript_align->
SetSegs().SetSpliced().SetProduct_id(),
733 if (!found_start_codon &&
734 fake_transcript_align->
SetSegs().SetSpliced().SetExons().front()->GetProduct_start().GetNucpos()==0) {
737 if (!found_stop_codon && aligned_to_the_end) {
746 cd_feat->
SetData().SetCdregion().SetCode().Set().push_back(
code);
757 transcribed_rna_id.
Assign(query_rna_id);
758 if (cds_feat_on_genome_with_translated_product &&
759 cds_feat_on_genome_with_translated_product->
CanGetProduct() &&
760 cds_feat_on_query_mrna &&
762 CSeq_id* translated_protein_id =
const_cast<CSeq_id*
>(cds_feat_on_genome_with_translated_product->
SetProduct().GetId());
773 const CSeq_feat* cds_feat_on_query_mrna_ptr,
774 bool call_on_align_list)
784 if (is_protein_align) {
812 if (cds_feat_on_query_mrna_ptr) {
814 cds_feat_on_query_mrna->
Assign(*cds_feat_on_query_mrna_ptr);
817 if (cdregion_handle) {
824 vector<CMappedFeat> ncRNAs;
827 if (query_rna_handle) {
829 feat_iter; ++feat_iter) {
830 const CSeq_loc &rna_loc = feat_iter->GetLocation();
831 if (feat_iter->GetData().GetSubtype() !=
833 ++rna_loc.
begin() == rna_loc.
end() &&
837 full_length_rna = *feat_iter;
838 }
else if (feat_iter->GetData().GetSubtype() ==
841 ncRNAs.push_back(*feat_iter);
848 size_t model_num = counter.
Add(1);
852 rna_feat_loc_on_genome->Assign(mapper.GetRnaLoc());
855 list<CRef<CSeq_loc> > transcribed_mrna_seqloc_refs;
863 cds_feat_on_query_mrna, cds_feat_on_transcribed_mrna);
870 *align, rna_feat_loc_on_genome, opts)
872 *transcribed_rna_id, cds_feat_on_query_mrna);
873 if (mrna_feat_on_genome_with_translated_product &&
874 !mrna_feat_on_genome_with_translated_product->
IsSetProduct()) {
876 mrna_feat_on_genome_with_translated_product->
877 SetProduct().SetWhole().
Assign(*transcribed_rna_id);
882 transcribed_mrna_seqloc_refs,
883 *align, rna_feat_loc_on_genome, time, model_num, seqs, opts);
891 *mrna_feat_on_genome_with_translated_product,
892 cds_feat_on_genome_with_translated_product.
GetPointer());
898 if(!call_on_align_list){
903 rna_feat_loc_on_genome, genomic_id, gene_id);
907 annot.
SetData().SetFtable().push_back(gene_feat);
909 gene =
genes.
insert(make_pair(gene_id,gene_feat)).first;
911 gene_feat = gene->second;
913 &mrna_feat_on_genome_with_translated_product->
GetLocation()));
917 genexref->SetId(*gene_feat->
SetIds().front());
920 mrnaxref->SetId(*mrna_feat_on_genome_with_translated_product->
SetIds().front());
922 gene_feat->
SetXref().push_back(mrnaxref);
923 mrna_feat_on_genome_with_translated_product->
SetXref().push_back(genexref);
927 rna_feat_loc_on_genome, genomic_id);
930 annot.
SetData().SetFtable().push_back(gene_feat);
935 if (mrna_feat_on_genome_with_translated_product) {
938 annot.
SetData().SetFtable().push_back(mrna_feat_on_genome_with_translated_product);
943 if(cds_feat_on_genome_with_translated_product.
NotNull()) {
944 propagated_features.push_back(cds_feat_on_genome_with_translated_product);
946 if (cds_feat_on_query_mrna && cds_feat_on_query_mrna->
CanGetProduct()) {
950 for (
CFeat_CI feat_iter(prot_handle,
952 feat_iter; ++feat_iter) {
954 feat_iter->GetData().GetProt();
959 prot_xref->SetData().SetProt().SetName()
960 . push_back(prot_ref.
GetName().front());
961 cds_feat_on_genome_with_translated_product->
SetXref().push_back(prot_xref);
969 ITERATE(vector<CMappedFeat>, it, ncRNAs){
973 propagated_features.push_back(ncrna_feat);
978 annot.
SetData().SetFtable().push_back(*it);
982 if ((*it)->IsSetIds()) {
983 propagatedxref->SetId(*(*it)->SetIds().front());
987 mrnaxref->SetId(*mrna_feat_on_genome_with_translated_product->
SetIds().front());
989 (*it)->SetXref().push_back(mrnaxref);
990 mrna_feat_on_genome_with_translated_product->
SetXref().push_back(propagatedxref);
994 if(!call_on_align_list){
995 if(propagated_features.empty()){
1000 SetPartialFlags(gene_feat, mrna_feat_on_genome_with_translated_product, *it);
1006 if (mrna_feat_on_genome_with_translated_product) {
1007 mrna_feat_on_genome_with_translated_product->
SetProduct().SetWhole().Assign(query_rna_id);
1009 if (cds_feat_on_genome_with_translated_product) {
1011 cds_feat_on_genome_with_translated_product->
1013 cds_feat_on_transcribed_mrna->
1017 seq_id->Assign(query_rna_id);
1018 cds_feat_on_transcribed_mrna->
SetLocation().SetId(*seq_id);
1020 (*loc)->SetId(*seq_id);
1025 if (!query_rna_handle) {
1027 cds_feat_on_query_mrna, cds_feat_on_genome_with_translated_product);
1031 if (mrna_feat_on_genome_with_translated_product) {
1033 m_scope->GetBioseqHandle(query_rna_id);
1040 cds_feat_on_genome_with_translated_product.
GetPointer(),
1045 m_scope->RemoveTopLevelSeqEntry(rna_seh);
1048 if (cds_feat_on_genome_with_translated_product) {
1056 TSeqPos clean_match_count = 0;
1060 &transcribed_mrna_seqloc_refs,
1061 &clean_match_count);
1062 if (!clean_match_count) {
1064 annot.
SetData().SetFtable().remove(cds_feat_on_genome_with_translated_product);
1065 cds_feat_on_genome_with_translated_product =
NULL;
1068 m_scope->RemoveTopLevelSeqEntry(prot_seh);
1073 RenameGeneratedBioseqs(query_rna_id, *transcribed_rna_id, cds_feat_on_query_mrna, cds_feat_on_genome_with_translated_product);
1078 m_scope->AddTopLevelSeqEntry(**it);
1087 for (CBioseq_set::TSeq_set::iterator bioseq_it =
1091 if (((*bioseq_it)->GetSeq().IsNa() &&
1093 ((*bioseq_it)->GetSeq().IsAa() &&
1096 bioseq_it = seqs.
SetSeq_set().erase(bioseq_it);
1105 if (loc->IsPacked_int() && loc->GetPacked_int().Get().size()==1) {
1107 loc->SetInt(*interval);
1110 return is_protein_align ? cds_feat_on_genome_with_translated_product : mrna_feat_on_genome_with_translated_product;
1133 const CSeq_id& genomic_id = clean_align->
GetSeq_id(mapper.GetGenomicRow());
1137 else if(!(gene_handle == genomic_id))
1139 "Bad list of alignments to ConvertAlignToAnnot(); alignments on different genes");
1142 loc->Assign(mapper.GetRnaLoc());
1153 gene_annot.
SetData().SetFtable().push_front(gene_feat);
1155 annot.
SetData().SetFtable().splice(annot.
SetData().SetFtable().end(),
1156 gene_annot.
SetData().SetFtable());
1173 if (!inst.
SetExt().SetDelta().Set().empty()) {
1198 inst.
SetExt().SetDelta().AddLiteral(seq, mol_class);
1208 inst.
SetExt().SetDelta().AddLiteral(seq, mol_class);
1218 bool add_unaligned_parts,
1219 bool mark_transcript_deletions,
1235 int prev_product_to = -1;
1236 bool prev_fuzz =
false;
1246 if ((prev_product_to > -1 &&
1249 if (has_gap !=
NULL) {
1253 inst.
SetExt().SetDelta().AddLiteral
1257 int gap_len = add_unaligned_parts ? mrna_loc->
GetTotalRange().
GetFrom()-(prev_product_to+1) : 0;
1259 seq_size += gap_len;
1260 prev_product_to += gap_len;
1261 inst.
SetExt().SetDelta().AddLiteral(gap_len);
1263 inst.
SetExt().SetDelta().Set().back()
1268 unsigned part_count = 0;
1269 unsigned mapped_exon_len = 0;
1270 for (
CSeq_loc_CI part_it(*mrna_loc); part_it; ++part_it) {
1272 if (prev_product_to<0) {
1273 prev_product_to = part_it.GetRange().GetFrom()-1;
1274 if (add_unaligned_parts && part_it.GetRange().GetFrom() > 0) {
1275 seq_size = part_it.GetRange().GetFrom();
1276 inst.
SetExt().SetDelta().AddLiteral(seq_size);
1279 int deletion_len = part_it.GetRange().GetFrom()-(prev_product_to+1);
1285 if (deletion_len > 0) {
1286 if (mark_transcript_deletions && part_count == 1) {
1290 deletion_loc.
SetInt().SetId().Assign(part_it.GetSeq_id());
1291 deletion_loc.
SetInt().SetFrom(prev_product_to+1);
1292 deletion_loc.
SetInt().SetTo(part_it.GetRange().GetFrom()-1);
1297 if (deletion_len > 0 && (mark_transcript_deletions || part_count > 1)) {
1298 if (has_indel !=
NULL) {
1301 string deletion(deletion_len,
'N');
1303 seq_size += deletion.size();
1311 mapped_exon_len += it.GetRange().GetLength();
1320 seq_size += vec.
size();
1322 prev_product_to = part_it.GetRange().GetTo();
1324 if (has_indel !=
NULL &&
1326 mapped_exon_len != loc_it.GetRange().GetLength())) {
1337 if (seq_size < (
int)length) {
1339 inst.
SetExt().SetDelta().AddLiteral
1343 inst.
SetExt().SetDelta().AddLiteral(length-seq_size);
1378 }
else if (cds_feat_on_query_mrna.
IsNull()) {
1397 bioseq.
SetInst().SetHist().SetAssembly().push_back(assembly);
1402 string str(
"lcl|CDNA_");
1408 transcribed_rna_id->
Set(
str);
1410 bioseq.
SetId().push_back(transcribed_rna_id);
1412 if (cds_feat_on_query_mrna.
NotNull()) {
1418 cds_feat_on_transcribed_mrna->
Assign(*cds_feat_on_query_mrna);
1419 cds_feat_on_transcribed_mrna->
SetLocation().SetId(*transcribed_rna_id);
1421 annot->
SetData().SetFtable().push_back(cds_feat_on_transcribed_mrna);
1425 cds_feat_on_transcribed_mrna->
SetData().SetCdregion();
1428 (*it)->SetLoc().SetId(*transcribed_rna_id);
1439 return transcribed_rna_id;
1446 code_break->
SetAa().SetNcbieaa(ncbieaa);
1448 feat.
SetData().SetCdregion().SetCode_break().push_back(code_break);
1466 string str(
"lcl|PROT_");
1473 cds_feat_on_transcribed_mrna->
SetProduct().SetWhole(*translated_protein_id);
1475 bioseq.
SetId().push_back(translated_protein_id);
1496 bioseq.
SetDescr().Set().push_back(desc);
1510 bool final_code_break =
false;
1512 final_code_break = (strprot[strprot.size()-1] !=
'*');
1514 strprot.resize(strprot.size()-1);
1521 seq_inst.
SetExt().SetDelta();
1541 bool starts_with_code_break =
false;
1545 starts_with_code_break =
true;
1553 size_t skip_5_prime = 0;
1554 size_t skip_3_prime = 0;
1555 unsigned count_internal_stops = 0;
1558 int codon_start_pos = (
int)ci.GetPosition() + frame;
1559 int len =
int(ci.GetLength()) - frame;
1561 _ASSERT( -3 < frame && frame < 3 );
1565 (ci.IsUnknownLength() || !ci.IsSetData()) &&
1575 bool stop_codon_included = e > strprot.size();
1576 if (stop_codon_included) {
1588 if (ci.IsUnknownLength()) {
1589 seq_inst.
SetExt().SetDelta().AddLiteral(
len);
1591 }
else if (!ci.IsSetData()) {
1592 if (
b==skip_5_prime &&
1594 skip_5_prime += e-
b;
1595 }
else if (stop_codon_included &&
b==e) {
1599 if (strprot[
b] !=
'X') {
1604 seq_inst.
SetExt().SetDelta().AddLiteral(e-
b);
1608 if (stop_codon_included && final_code_break) {
1611 stop_codon_on_mrna->
SetInt().SetFrom(pos_on_mrna);
1612 stop_codon_on_mrna->
SetInt().SetTo(pos_on_mrna + 2);
1613 AddCodeBreak(*cds_feat_on_transcribed_mrna, *stop_codon_on_mrna,
'*');
1614 transcribed_mrna_seqloc_refs.push_back(stop_codon_on_mrna);
1618 if (
b==0 && strprot[
b] !=
'M' &&
1619 !starts_with_code_break &&
1624 start_codon_on_mrna->
SetInt().SetFrom(pos_on_mrna);
1625 start_codon_on_mrna->
SetInt().SetTo(pos_on_mrna + 2);
1626 AddCodeBreak(*cds_feat_on_transcribed_mrna, *start_codon_on_mrna,
'M');
1627 transcribed_mrna_seqloc_refs.push_back(start_codon_on_mrna);
1631 size_t stop_aa_pos =
b-1;
1632 while ((stop_aa_pos = strprot.find(
'*', stop_aa_pos+1)) < e) {
1633 strprot[stop_aa_pos] =
'X';
1637 internal_stop_on_mrna->
SetInt().SetFrom(pos_on_mrna);
1638 internal_stop_on_mrna->
SetInt().SetTo(pos_on_mrna + 2);
1639 AddCodeBreak(*cds_feat_on_transcribed_mrna, *internal_stop_on_mrna,
'X');
1640 transcribed_mrna_seqloc_refs.push_back(internal_stop_on_mrna);
1641 ++count_internal_stops;
1650 _ASSERT( -2 <= frame && frame <= 0 );
1654 align_info->
SetType().SetStr(
"AlignInfo");
1655 align_info->
AddField(
"num_internal_stop_codon", (
int)count_internal_stops);
1656 cds_feat_on_transcribed_mrna->
AddExt(align_info);
1660 if (
b < strprot.size() && strprot[
b] !=
'X') {
1669 strprot.size() <=
b + (frame==0?0:1) );
1674 skip_3_prime += seq_inst.
GetExt().
GetDelta().
Get().back()->GetLiteral().GetLength();
1675 seq_inst.
SetExt().SetDelta().Set().pop_back();
1679 if (skip_5_prime || skip_3_prime) {
1685 prot_loc->
SetInt().SetFrom(skip_5_prime);
1686 prot_loc->
SetInt().SetTo(
b-skip_3_prime-1+(skip_3_prime?0:1));
1690 cds_feat_on_transcribed_mrna->
SetLocation(*to_mrna.
Map(*prot_loc));
1693 seq_inst.
SetLength(
b-skip_5_prime-skip_3_prime);
1695 if (seq_inst.
SetExt().SetDelta().Set().size() == 1 && seq_inst.
SetExt().SetDelta().Set().back()->GetLiteral().IsSetSeq_data()) {
1698 dprot->
Assign(seq_inst.
SetExt().SetDelta().Set().back()->GetLiteral().GetSeq_data());
1707 cds_feat_on_assembly_mrna->
Assign(*cds_feat_on_transcribed_mrna);
1711 cds_feat_on_assembly_mrna->
SetLocation().SetInt().SetTo() -= 3;
1717 prot_assembly->
SetSegs().SetSpliced().SetProduct_length(seq_inst.
GetLength());
1719 seq_inst.
SetHist().SetAssembly().push_back(prot_assembly);
1731 m_scope->RemoveTopLevelSeqEntry(prot_seh);
1734 m_scope->RemoveTopLevelSeqEntry(mrna_seh);
1755 if (!gnomon_model_num.empty()) {
1757 obj_id->
SetStr(
"rna." + gnomon_model_num);
1760 mrna_feat->
SetIds().push_back(feat_id);
1763 mrna_feat->
SetProduct().SetWhole().Assign(transcribed_rna_id);
1768 switch (
info->GetBiomol()) {
1792 if (
info->IsSetGbmoltype()) {
1793 RNA_class =
info->GetGbmoltype();
1810 if (!RNA_class.empty()) {
1811 mrna_feat->
SetData().SetRna().SetExt().SetGen().SetClass(RNA_class);
1814 if (!name.empty()) {
1815 if (!RNA_class.empty()) {
1816 mrna_feat->
SetData().SetRna().SetExt().SetGen().SetProduct(name);
1818 mrna_feat->
SetData().SetRna().SetExt().SetName(name);
1840 bool update_existing_gene = gene_feat;
1841 string gene_id_str =
"gene.";
1846 if (!update_existing_gene) {
1847 if (feat_iter && feat_iter.
GetSize()) {
1855 gene_feat->
SetData().SetGene();
1859 obj_id->
SetStr(gene_id_str);
1862 gene_feat->
SetIds().push_back(feat_id);
1875 }
else if (feat_iter && feat_iter.
GetSize()) {
1885 if (feat_iter && feat_iter.
GetSize() == 1 && update_existing_gene) {
1891 tag->Assign(**xref_it);
1892 bool duplicate =
false;
1897 if((*previous_xref_it)->Match(**xref_it)){
1910 gene_feat->
SetData().SetGene().SetDesc(gene_id_str);
1934 align, loc, opts,
offset);
1936 if (cds_feat_on_genome) {
1942 loc_ranges += loc_it.GetRange();
1948 string gnomon_model_num;
1955 if (!gnomon_model_num.empty()) {
1957 obj_id->
SetStr(
"cds." + gnomon_model_num);
1960 cds_feat_on_transcribed_mrna->
SetIds().push_back(feat_id);
1963 transcribed_mrna_seqloc_refs,
1964 time, model_num, seqs);
1967 cds_feat->
Assign(*cds_feat_on_transcribed_mrna);
1975 if (is_partial_5prime &&
offset) {
1978 orig_frame = cds_feat->
GetData()
1984 int frame = (
offset - orig_frame) % 3;
1988 frame = (3 - frame) % 3;
1989 if (frame != orig_frame) {
1992 cds_feat->
SetData().SetCdregion()
1996 cds_feat->
SetData().SetCdregion()
2000 cds_feat->
SetData().SetCdregion()
2006 "mod 3 out of bounds");
2011 if (!gnomon_model_num.empty() && !is_partial_5prime) {
2013 if (cds_start >= 3) {
2021 vec.
GetSeqData(cds_start % 3, cds_start, mrna);
2030 SIZE_TYPE stop_5prime = strprot.rfind(
'*');
2031 if (stop_5prime !=
NPOS) {
2032 stop_5prime = stop_5prime*3+cds_start%3;
2034 stop_5prime_feature->
SetData().SetImp().SetKey(
"misc_feature");
2035 stop_5prime_feature->
SetComment(
"upstream in-frame stop codon");
2037 stop_5prime_location->
SetInt().SetFrom(stop_5prime);
2038 stop_5prime_location->
SetInt().SetTo(stop_5prime+2);
2041 stop_5prime_feature->
SetLocation(*stop_5prime_location);
2058 cds_feat->
SetData().SetCdregion();
2059 CCdregion::TCode_break::iterator it =
2063 code_break_loc.
Assign((*it)->GetLoc());
2069 new_cb_loc = new_cb_loc->
GetEquiv().
Get().front();
2073 if (new_cb_loc && !new_cb_loc->
IsNull()) {
2075 new_cb_ranges += loc_it.GetRange();
2077 new_cb_ranges &= loc_ranges;
2080 (*it)->SetLoc(*new_cb_loc);
2101 name = sequence::CDeflineGenerator().GenerateDefline(handle);
2113 if (feat_iter && feat_iter.
GetSize() &&
2119 size_t last_comma = name.rfind(
',');
2120 if (last_comma != string::npos) {
2121 name.erase(last_comma);
2140 non_const_loc->
Assign(*loc);
2142 align, non_const_loc, opts,
offset);
2161 list< CRef< CSeq_loc > >& a_list = a_mix->
SetMix().Set();
2162 const list< CRef< CSeq_loc > >& b_list = b_mix->
GetMix().
Get();
2165 for (list<
CRef< CSeq_loc > >::iterator a_i = a_list.begin(); a_i != a_list.end();) {
2168 a_list.splice(a_i, diff->
SetMix().Set());
2169 a_i = a_list.erase(a_i);
2172 if (a_list.size() == 1) {
2173 return a_list.front();
2202 for (
CSeq_loc_CI loc_it(feature_on_mrna->GetLocation());
2222 "failed to find requisite parts of "
2227 if ( !this_loc_mapped ||
2228 this_loc_mapped->
IsNull() ||
2229 this_loc_mapped->
IsEmpty() ) {
2233 if ( !mapped_loc ) {
2237 feature_on_mrna->GetLocation().GetTotalRange().GetFrom();
2240 bool is_partial_5prime =
2242 bool is_partial_3prime =
2246 bool last_range = !++it1;
2247 if (is_partial_3prime && last_range &&
2250 feature_on_mrna->GetData().IsCdregion() &&
2255 equiv->
GetEquiv().
Get().back()->GetTotalRange().GetTo();
2256 if (missing_end < 3) {
2259 is_partial_3prime =
false;
2273 sub.
SetInt().SetId().Assign(*this_loc_mapped->
GetId());
2277 bool cross_origin = (left > right);
2284 half->
SetTo(genomic_size-1);
2296 if (this_loc_mapped->
IsMix()) {
2300 if (subloc_it.GetRangeAsSeq_loc()->
2303 mrna_fuzzy_boundaries.
insert(
2304 subloc_it.GetRange().GetFrom());
2306 if (subloc_it.GetRangeAsSeq_loc()->
2309 mrna_fuzzy_boundaries.
insert(
2310 subloc_it.GetRange().GetTo());
2315 this_loc_mapped->
SetMix().Set())
2317 (*subloc_it)->SetPartialStart(
2318 mrna_fuzzy_boundaries.count(
2321 (*subloc_it)->SetPartialStop(
2322 mrna_fuzzy_boundaries.count(
2335 mapped_loc->
SetMix().Set().push_back(this_loc_mapped);
2347 if (mapped_loc && feature_on_mrna->GetData().IsRna())
2386 if (mapped_loc && feature_on_mrna->GetData().IsCdregion()) {
2391 for (; vec.
IsInGap(start_gap); ++start_gap);
2392 if (start_gap > 0 && start_gap < vec.
size()) {
2399 orig_mapped_loc.
Assign(*mapped_loc);
2402 while (mapped_loc->
SetPacked_int().Set().front()->GetLength()
2405 start_gap -= mapped_loc->
SetPacked_int().Set().front()->GetLength();
2412 first_exon.
SetTo() -= start_gap;
2414 first_exon.
SetFrom() += start_gap;
2420 loc->
Assign(*SubtractPreserveBiologicalOrder(*loc, *SubtractPreserveBiologicalOrder(orig_mapped_loc, *mapped_loc)));
2425 for (; vec.
IsInGap(vec.
size() - 1 - end_gap); ++end_gap);
2426 if (end_gap > 0 && end_gap < vec.
size()) {
2431 orig_mapped_loc.
Assign(*mapped_loc);
2434 while (mapped_loc->
SetPacked_int().Set().back()->GetLength() <= end_gap)
2436 end_gap -= mapped_loc->
SetPacked_int().Set().back()->GetLength();
2443 last_exon.
SetFrom() += end_gap;
2445 last_exon.
SetTo() -= end_gap;
2450 loc->
Assign(*SubtractPreserveBiologicalOrder(*loc, *SubtractPreserveBiologicalOrder(orig_mapped_loc, *mapped_loc)));
2459 mapped_feat->
Assign(*feature_on_mrna);
2472 if(propagated_feat){
2487 if (mrna_feat && propagated_feat)
2519 if (gene_feat && mrna_feat){
2532 if (gene_feat && propagated_feat && !mrna_feat){
2569 feature::CFeatTree
tree(sah);
2570 vector<CMappedFeat> top_level_features =
tree.GetChildren(
CMappedFeat());
2573 vector< vector<CMappedFeat> > top_level_features_by_type;
2576 ITERATE(vector<CMappedFeat>, it, top_level_features)
2577 top_level_features_by_type[it->GetData().Which()].push_back(*it);
2584 ITERATE(vector<CMappedFeat>, gene_it,
2594 vector<CMappedFeat> gene_children =
2595 gene_feat ?
tree.GetChildren(*gene_it)
2597 sort(gene_children.begin(), gene_children.end());
2599 ITERATE(vector<CMappedFeat>, child_it, gene_children){
2608 }
else if(!child_feat || child_feat->
GetData().
IsRna()){
2609 vector<CMappedFeat> rna_children =
2610 child_feat ?
tree.GetChildren(*child_it)
2616 while((child_it+1) != gene_children.end() &&
2619 (child_it+1)->GetTotalRange())){
2620 rna_children.push_back(*(++child_it));
2622 if(rna_children.empty()){
2626 ITERATE(vector<CMappedFeat>, rna_child_it, rna_children){
2644 !propagated_feature || !propagated_feature->
IsSetDbxref())
2650 if((*gene_xref_it)->GetDb() !=
"miRBase")
2652 if((*gene_xref_it)->GetDb() == (*propagated_xref_it)->GetDb() &&
2653 !(*gene_xref_it)->Match(**propagated_xref_it))
2655 string propagated_feature_desc;
2657 propagated_feature_desc =
"corresponding cdregion";
2660 "Unexpected propagated feature type");
2661 propagated_feature_desc =
"propagated ncRNA feature";
2667 <<
" and " << propagated_feature_desc
2668 <<
" have " << (*gene_xref_it)->GetDb()
2669 <<
" dbxrefs with inconsistent tags");
2684 for (
CFeat_CI feat_iter(handle, sel); feat_iter; ++feat_iter) {
2686 feat->
Assign(feat_iter->GetOriginalFeature());
2688 mapper.
Map(feat_iter->GetLocation());
2693 annot.
SetData().SetFtable().push_back(feat);
2723 if ( !(*it)->IsSetId() ) {
2728 const CFeat_id& feat_id = (*it)->GetId();
2751 (
"rearrangement required for product");
2777 for ( ; align_iter; ++align_iter) {
2784 al.
Reset(&this_align);
2790 bool has_length_mismatch =
false;
2792 bool has_incomplete_polya_tail =
false;
2793 bool partial_unaligned_section =
false;
2808 has_length_mismatch =
true;
2827 partial_unaligned_section =
true;
2836 switch ((*part_it)->Which()) {
2838 pos += (*part_it)->GetMatch();
2842 TSeqRange(pos, pos+(*part_it)->GetMismatch()-1);
2843 pos += (*part_it)->GetMismatch();
2846 pos += (*part_it)->GetDiag();
2850 delete_sizes[pos] = (*part_it)->GetGenomic_ins();
2854 TSeqRange(pos, pos+(*part_it)->GetProduct_ins()-1);
2855 pos += (*part_it)->GetProduct_ins();
2867 if (
r.GetFrom() != 0) {
2869 partial_unaligned_section =
true;
2885 if (
r.GetTo() + 1 < max_align_len) {
2887 partial_unaligned_section =
true;
2889 insert_locs +=
TSeqRange(
r.GetTo()+1, max_align_len-1);
2901 if ( insert_locs.
empty() && delete_locs.
empty() && !partial_unaligned_section)
2917 mismatch_locs.
clear();
2919 for ( ; prod_it != prod_end && genomic_it != genomic_end;
2920 ++prod_it, ++genomic_it) {
2921 if (*prod_it != *genomic_it) {
2926 unsigned tail_len =
Convert(prod_end - prod_it);
2928 for ( ; prod_it != prod_end; ++prod_it) {
2929 if (*prod_it ==
'A') {
2934 if (tail_len && count_a >= tail_len * 0.8) {
2936 if (count_a < tail_len * 0.95) {
2937 has_incomplete_polya_tail =
true;
2940 else if (tail_len) {
2942 partial_unaligned_section =
true;
2945 insert_locs +=
TSeqRange(end_pos-tail_len+1, end_pos);
2951 if (!insert_locs.
empty() ||
2952 !delete_locs.
empty() ||
2953 has_length_mismatch ||
2954 has_incomplete_polya_tail ||
2955 partial_unaligned_section) {
2956 except_text =
"unclassified transcription discrepancy";
2958 else if (!mismatch_locs.
empty()) {
2959 except_text =
"mismatches in transcription";
2963 x_SetComment(feat, cds_feat, cds_feat_on_mrna, align, mismatch_locs,
2964 insert_locs, delete_locs, delete_sizes,
2965 partial_unaligned_section);
2976 if (range_it->GetLength() > pos) {
2977 pos += range_it->GetFrom();
2980 pos -= range_it->GetLength();
2983 CSeq_loc base_loc(*mapped_protein_id, pos, pos);
2985 mapped = to_genomic->
Map(*mrna_loc);
2995 const CSeq_feat* cds_feat_on_query_mrna,
2996 const CSeq_feat* cds_feat_on_transcribed_mrna,
3001 || ( cds_feat_on_query_mrna && !cds_feat_on_query_mrna->
IsSetProduct() )
3014 if ( !(*it)->IsSetId() ) {
3019 const CFeat_id& feat_id = (*it)->GetId();
3042 (
"rearrangement required for product");
3059 bool has_start =
false;
3060 bool has_stop =
false;
3062 bool has_gap =
false;
3063 bool has_indel =
false;
3069 if (cds_feat_on_query_mrna) {
3074 corrected_cds_feat_on_query_mrna->
Assign(*cds_feat_on_query_mrna);
3078 corrected_cds_feat_on_transcribed_mrna->
Assign(*cds_feat_on_transcribed_mrna);
3082 int cds_start_on_mrna = 0;
3083 int frame_on_mrna = 0;
3084 bool filled_by_polya =
false;
3086 if (align !=
NULL) {
3098 string except_text =
"unclassified translation discrepancy";
3101 if (clean_match_count) {
3102 *clean_match_count = seq.
size();
3111 int missing_end = 0;
3112 if (cds_feat_on_query_mrna) {
3134 seq.
GetSeqData(cds_start_on_mrna + frame_on_mrna, cds_start_on_mrna + cds_len_on_query_mrna, mrna);
3135 if ((missing_end == 1 || missing_end == 2) &&
3141 filled_by_polya =
true;
3142 for (
size_t pos = mrna.size() - 1 - missing_end;
3143 pos < mrna.size(); ++pos)
3159 if (xlate.size() && xlate[0] ==
'-') {
3162 string first_codon = mrna.substr(0,3);
3166 xlate[0] = first_aa[0];
3176 const CSeq_loc& cb_on_genome = (*it)->GetLoc();
3178 if (!cb_on_mrna)
continue;
3181 if (
r.GetLength() != 3) {
3190 switch ((*it)->GetAa().Which()) {
3192 src += (char)(*it)->GetAa().GetNcbieaa();
3197 src += (char)(*it)->GetAa().GetNcbistdaa();
3202 src += (char)(*it)->GetAa().GetNcbi8aa();
3214 xlate[pos] = dst[0];
3226 if (corrected_cds_feat_on_transcribed_mrna) {
3248 if (cds_feat_on_transcribed_mrna) {
3251 CSeq_loc cds_feat_on_transcribed_mrna_loc;
3252 cds_feat_on_transcribed_mrna_loc.
Assign(corrected_cds_feat_on_transcribed_mrna->
GetLocation());
3254 cds_feat_on_transcribed_mrna_loc.
FlipStrand();
3260 product_ranges.
clear();
3262 product_ranges += loc_it.GetRange();
3271 product_ranges.
GetTo());
3275 if ((xlate.size() == product_ranges.
GetTo() + (filled_by_polya ? 1 : 2) ||
3277 xlate[xlate.size() - 1] ==
'*')
3279 xlate.resize(xlate.size() - 1);
3288 if ( (product_ranges.
GetFrom()==0 && xlate.size() && xlate[0] ==
'M') ||
3293 if (product_ranges.
Empty()) {
3298 if (product_ranges[0].IsWhole()) {
3301 string xlate_trimmed;
3303 actual +=
whole.substr(range_it->GetFrom(), range_it->GetLength());
3304 xlate_trimmed += xlate.substr(range_it->GetFrom(), range_it->GetLength());
3306 xlate = xlate_trimmed;
3308 if (actual !=
whole) {
3323 string::const_iterator it1 = actual.begin();
3324 string::const_iterator it1_end = actual.end();
3325 string::const_iterator it2 = xlate.begin();
3326 string::const_iterator it2_end = xlate.end();
3328 for ( ; it1 != it1_end && it2 != it2_end; ++it1, ++it2) {
3331 mapped_protein_id, product_ranges, to_mrna, to_genomic);
3334 if (!mapped->
IsInt()) {
3351 "fTrustProteinSeq & fForceTranslateCds combination not implemented");
3354 char actual_aa = *it1;
3355 code_break->
SetAa().SetNcbieaa(actual_aa);
3357 }
else if (*it2 ==
'-' || *it2 ==
'*') {
3359 }
else if (*it1 != *it2) {
3361 }
else if (clean_match_count && (!mapped ||
3364 ++*clean_match_count;
3368 if (has_stop && filled_by_polya) {
3371 product_ranges, to_mrna, to_genomic);
3379 feat.
SetComment() +=
"stop codon completed by the addition of "
3380 "3' A residues to the mRNA";
3391 (feat.
GetComment().find(
"indel") != string::npos ||
3392 feat.
GetComment().find(
"inserted") != string::npos ||
3393 feat.
GetComment().find(
"deleted") != string::npos))
3398 if (actual.size() != xlate.size() ||
3399 !has_stop || !has_start ||
3400 has_gap || has_indel) {
3401 except_text =
"unclassified translation discrepancy";
3403 else if (mismatch_count) {
3404 except_text =
"mismatches in translation";
3413 string except_text =
text;
3415 list<string> except_toks;
3419 for (list<string>::iterator it = except_toks.begin();
3420 it != except_toks.end(); ) {
3423 *it ==
"annotated by transcript or proteomic data" ||
3424 *it ==
"unclassified transcription discrepancy" ||
3425 *it ==
"mismatches in transcription" ||
3426 *it ==
"unclassified translation discrepancy" ||
3427 *it ==
"mismatches in translation") {
3428 except_toks.erase(it++);
3436 if ( !except_text.empty() ) {
3440 if(it->GetSeqId()->IsOther() &&
3441 it->GetSeqId()->GetOther().GetAccession()[0] ==
'N' &&
3442 string(
"MRP").find(it->GetSeqId()->GetOther().GetAccession()[1]) != string::npos)
3444 except_text =
"annotated by transcript or proteomic data";
3447 string product_type_string;
3449 product_type_string =
"AA sequence";
3452 product_type_string =
"RNA sequence";
3455 product_type_string +=
", mRNA";
3458 qualifier->
SetQual(
"inference");
3459 qualifier->
SetVal(
"similar to " + product_type_string +
" (same species):RefSeq:" +
3460 it->GetSeqId()->GetOther().GetAccession() +
'.' +
3462 feat.
SetQual().push_back(qualifier);
3465 except_toks.push_back(except_text);
3469 if (except_text.empty()) {
3485 string product_type_string =
"RNA sequence";
3489 product_type_string +=
", mRNA";
3501 qualifier->
SetQual(
"inference");
3502 qualifier->
SetVal(
"similar to " + product_type_string +
" (same species):"+db+
":" +
3504 feat.
SetQual().push_back(qualifier);
3511 const CSeq_feat* cds_feat_on_query_mrna,
3512 const CSeq_feat* cds_feat_on_transcribed_mrna,
3519 align_ref.
Reset(align);
3528 for (CSeq_feat::TQual::iterator it = feat.
SetQual().begin();
3531 if ((*it)->CanGetQual() && (*it)->GetQual() ==
"inference") {
3532 it = feat.
SetQual().erase(it);
3556 cds_feat_on_query_mrna, cds_feat_on_transcribed_mrna,
3557 transcribed_mrna_seqloc_refs,
3583 static string s_Count(
unsigned num,
const string &item_name)
3596 bool partial_unaligned_section)
3598 if (mismatch_locs.
empty() && insert_locs.
empty() && delete_locs.
empty() &&
3599 !partial_unaligned_section &&
3606 string rna_comment, cds_comment;
3615 inserts_in_cds &= insert_locs;
3616 deletes_in_cds &= delete_locs;
3618 if (cds_feat_on_mrna) {
3622 cds_ranges += loc_it.GetRange();
3627 align_info->
SetType().SetStr(
"AlignInfo");
3630 unsigned indel_count =
Convert(insert_locs.
size() + delete_locs.
size());
3631 unsigned frameshift_count = 0;
3632 unsigned pct_coverage = 100, cds_pct_coverage = 100;
3633 if (partial_unaligned_section) {
3640 if (cds_feat && cds_feat_on_mrna) {
3641 unsigned cds_indel_count = 0;
3643 ++(it->GetLength() % 3 ? frameshift_count : cds_indel_count);
3646 ++(delete_sizes[it->GetFrom()] % 3 ? frameshift_count
3649 indel_count -= frameshift_count;
3650 unsigned cds_mismatch_count = 0;
3651 bool start_codon_mismatch =
false;
3666 if (!single_interval_product) {
3668 "product is required to be a single interval");
3670 for (
TSeqPos pos = start_pos; pos < start_pos +
prot.size(); ++pos)
3672 CSeq_loc aa_loc(*cds_id, pos, pos);
3677 if (codon.
size() == 3) {
3679 codon[0], codon[1], codon[2]);
3680 char translated_codon = pos == 0
3683 if (translated_codon !=
prot[pos]) {
3684 ++cds_mismatch_count;
3689 start_codon_mismatch =
true;
3694 if (cds_mismatch_count || cds_indel_count || frameshift_count || cds_pct_coverage < 100)
3696 cds_comment =
"The RefSeq protein";
3697 if (cds_mismatch_count) {
3698 cds_comment +=
" has "
3699 +
s_Count(cds_mismatch_count,
"substitution");
3701 if (frameshift_count) {
3702 cds_comment += (cds_mismatch_count ?
", " :
" has ")
3703 +
s_Count(frameshift_count,
"frameshift");
3705 if (cds_indel_count) {
3706 cds_comment += (cds_mismatch_count || frameshift_count ?
", " :
" has ")
3707 +
s_Count(cds_indel_count,
"non-frameshifting indel");
3709 if (cds_pct_coverage < 100) {
3710 if (cds_mismatch_count || cds_indel_count || frameshift_count) {
3711 cds_comment +=
" and";
3713 cds_comment +=
" aligns at "
3717 cds_comment +=
" compared to this genomic sequence";
3719 if (start_codon_mismatch) {
3720 align_info->
AddField(
"start_codon_mismatches", 1);
3723 rna_comment =
"The RefSeq transcript";
3724 if (!mismatch_locs.
empty()) {
3725 rna_comment +=
" has " +
3729 if (frameshift_count) {
3730 rna_comment += (mismatch_locs.
empty() ?
" has " :
", ") +
3731 s_Count(frameshift_count,
"frameshift");
3732 align_info->
AddField(
"num_frameshifts", (
int)frameshift_count);
3735 rna_comment += (mismatch_locs.
empty() && !frameshift_count?
" has " :
", ") +
3736 s_Count(indel_count,
"non-frameshifting indel");
3737 align_info->
AddField(
"num_nonframeshift_indel", (
int)indel_count);
3739 if (partial_unaligned_section) {
3740 if (!mismatch_locs.
empty() || indel_count || frameshift_count) {
3741 rna_comment +=
" and";
3743 rna_comment +=
" aligns at "
3747 if (rna_comment ==
"The RefSeq transcript") {
3748 rna_comment.clear();
3750 rna_comment +=
" compared to this genomic sequence";
3756 deleted_bases = 0, cds_deleted_bases = 0,
3760 "Delete locations should always be one base");
3761 deleted_bases += delete_sizes.
find(delete_it->GetFrom())->second;
3764 for (
TSeqPos pos = insert_it->GetFrom();
3765 pos <= insert_it->GetTo(); ++pos)
3772 "Delete locations should always be one base");
3773 delete_codons.
insert((delete_it->GetFrom() -
3775 cds_deleted_bases +=
3776 delete_sizes.
find(delete_it->GetFrom())->second;
3783 switch ((*it)->GetAa().Which()) {
3785 aa = (*it)->GetAa().GetNcbieaa();
3790 string src_string(1, (*it)->GetAa().GetNcbistdaa()),
3801 string src_string(1, (*it)->GetAa().GetNcbi8aa()),
3818 unsigned insert_codons_count =
Convert(insert_codons.
size()),
3819 delete_codons_count =
Convert(delete_codons.
size());
3820 if (inserted_bases || deleted_bases) {
3823 if (inserted_bases) {
3824 rna_comment +=
": inserted " +
s_Count(inserted_bases,
"base")
3825 +
" in " +
s_Count(insert_codons_count,
"codon");
3827 if (deleted_bases) {
3829 +
" deleted " +
s_Count(deleted_bases,
"base")
3830 +
" in " +
s_Count(delete_codons_count,
"codon");
3832 if (cds_inserted_bases || cds_deleted_bases || code_breaks) {
3835 if (cds_inserted_bases) {
3836 cds_comment +=
": inserted " +
s_Count(cds_inserted_bases,
"base")
3837 +
" in " +
s_Count(insert_codons_count,
"codon");
3839 if (cds_deleted_bases) {
3841 +
" deleted " +
s_Count(cds_deleted_bases,
"base")
3842 +
" in " +
s_Count(delete_codons_count,
"codon");
3846 +
" substituted " +
s_Count(code_breaks,
"base")
3847 +
" at " +
s_Count(code_breaks,
"genomic stop codon");
3851 if (!rna_comment.empty()) {
3855 }
else if (rna_feat.
GetComment().find(rna_comment) == string::npos) {
3859 if (!cds_comment.empty()) {
3863 }
else if (cds_feat->
GetComment().find(cds_comment) == string::npos) {
3864 cds_feat->
SetComment() +=
"; " + cds_comment;
3867 if (!align_info->
GetData().empty()) {
3868 rna_feat.
AddExt(align_info);
3886 }
else if (feat.
GetComment().find(comment) == string::npos) {
3892 comment =
" added " +
s_Count(insert_length,
"base") +
" not found in genome assembly";
3904 string ensembl_match_rna, ensembl_match_cds;
3905 vector<string> keywords;
3909 for (
CSeqdesc_CI desc(rna_handle, desc_types); desc; ++desc) {
3910 if (desc->IsGenbank() && desc->GetGenbank().IsSetKeywords()) {
3911 for (
const string &keyword : desc->GetGenbank().GetKeywords()) {
3913 (keyword ==
"MANE Select" || keyword ==
"MANE Plus"
3914 || keyword ==
"MANE Plus Clinical"))
3917 if (keyword ==
"MANE Select") {
3918 keywords.push_back(
"RefSeq Select");
3919 }
else if (keyword ==
"MANE Plus Clinical") {
3920 keywords.push_back(
"RefSeq Plus Clinical");
3923 keywords.push_back(keyword);
3926 }
else if (desc->IsUser() &&
3927 desc->GetUser().HasField(
"MANE Ensembl match"))
3931 "/", ensembl_match_rna, ensembl_match_cds);
3934 }
else if (desc->IsUser() && desc->GetUser().GetType().IsStr() &&
3935 desc->GetUser().GetType().GetStr() ==
"RefGeneTracking" &&
3936 need_location_check)
3938 if (desc->GetUser().HasField(
"EnsemblLocation")) {
3940 desc->GetUser().GetField(
"EnsemblLocation"));
3941 }
else if (desc->GetUser().HasField(
"SelectGeneLocation")) {
3945 desc->GetUser().GetField(
"SelectGeneLocation")));
3950 if ((match_found >=
eOverlap || !need_location_check) && !keywords.empty())
3959 if (match_found ==
eExact && !drop && !ensembl_match_rna.empty()) {
3961 rna_ensembl_ref->
SetDb(
"Ensembl");
3962 rna_ensembl_ref->
SetTag().SetStr(ensembl_match_rna);
3963 rna_feat.
SetDbxref().push_back(rna_ensembl_ref);
3964 if (cds_feat && !ensembl_match_cds.empty()) {
3966 cds_ensembl_ref->
SetDb(
"Ensembl");
3967 cds_ensembl_ref->
SetTag().SetStr(ensembl_match_cds);
3968 cds_feat->
SetDbxref().push_back(cds_ensembl_ref);
3982 +
" doesn't have expected fields");
4017 for (
const string &keyword : keywords) {
4020 qualifier->
SetVal(keyword);
4021 feat.
SetQual().push_back(qualifier);
4045 id->Assign(*loc1->
GetId());
4051 merged_loc = left_loc;
4052 merged_loc->
Add(*right_loc);
4053 merged_loc->
Add(*loc1);
4055 merged_loc->
Add(*loc2);
4065 x[1] += genomic_size;
4067 x[3] += genomic_size;
4070 x[0] += genomic_size;
4071 x[1] += genomic_size;
4072 }
else if (x[3] < x[0]) {
4073 x[2] += genomic_size;
4074 x[3] += genomic_size;
4078 x[0] =
min(x[0], x[2]);
4079 x[1] =
max(x[1], x[3]) - genomic_size;
4095 id->Assign(*loc.
GetId());
4103 left_loc->
Add(*it.GetRangeAsSeq_loc());
4105 right_loc->
Add(*it.GetRangeAsSeq_loc());
4115 swap(left_loc, right_loc);
4117 left_loc->
Add(*right_loc);
4119 if (no_gap_at_origin) {
4123 if (interval.
GetFrom() == 0) {
4126 if (interval.
GetTo() == genomic_size-1) {
4156 return genomic_ids.
size() > 1;
4166 const int k_gap_length,
4167 const int next_exon_start)
4169 if (insert->
SetMix().Set().size() > 1) {
4173 if (insert->
SetMix().Set().size() > 0) {
4174 int half_intron_length = (next_exon_start - region_end)/2;
4175 int copy_length =
min(k_gap_length, half_intron_length);
4176 region_end += copy_length;
4178 if (region_begin < region_end) {
4182 edited_sequence_seqloc->
SetMix().Set().push_back(genome_loc);
4184 if (copy_length < k_gap_length) {
4189 edited_sequence_seqloc->
SetMix().Set().push_back(gap_loc);
4193 edited_sequence_seqloc->
SetMix().Set().push_back(insert);
4196 if (copy_length < k_gap_length) {
4199 edited_sequence_seqloc->
SetMix().Set().push_back(gap_loc);
4203 region_begin = region_end;
4213 const CSeq_feat* cds_feat_on_query_mrna_ptr,
4214 bool call_on_align_list)
4218 align->
Assign(input_align);
4249 const int k_gap_length =
min(1000,
int(genomic_length));
4256 int region_begin = 0;
4267 if (!seqid.
Match(*genomic_seqid)) {
4274 insert->
SetMix().Set().push_back(loc);
4276 int exon_length = exon_stop - exon_start +1;
4277 exon_stop = region_end + k_gap_length -1;
4278 exon_start = region_end + k_gap_length - exon_length;
4284 if (!(region_end <= exon_start)) {
4297 region_end = exon_stop +1;
4316 if (region_begin < (
int)genomic_length) {
4319 genomic_length -1));
4320 edited_sequence_seqloc->
SetMix().Set().push_back(genome_loc);
4333 seqentry->
SetSeq(*bioseq);
4341 bioseq->
SetDescr().Set().push_back(seq_desc);
4349 bioseq->
SetDescr().Set().push_back(seq_desc);
4363 gene_feat = gene->second;
4371 call_on_align_list);
4373 m_scope->RemoveBioseq(bioseq_handle);
4374 annot_local.
SetData().SetFtable().clear();
4378 genes[gene_id] = gene_feat;
4386 TSeqPos cds_insert_length = 0;
4389 align->
Assign(input_align);
4396 if (!seqid.
Match(*genomic_seqid)) {
4400 if (cds_feat_on_query_mrna_ptr) {
4401 int cds_intersection_len =
4407 if (cds_intersection_len > 0) {
4408 cds_insert_length += cds_intersection_len;
4420 gene_id, cds_feat_on_query_mrna_ptr,
4421 call_on_align_list);
4425 align->
Assign(input_align);
4428 if (entry.
IsSeq() &&
4444 it != annot_local.
SetData().SetFtable().rend(); ++it) {
4446 if (
f.GetData().IsGene()) {
4450 if (
f.GetData().IsCdregion() && cds_insert_length==0) {
4462 annot.
SetData().SetFtable().splice(annot.
SetData().SetFtable().end(),
4463 annot_local.
SetData().SetFtable());
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
@ eExtreme_Biological
5' and 3'
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
size_t GetSize(void) const
const CSeq_id * GetFirstId() const
TFeatureGeneratorFlags GetFlags() const
CRef< objects::CSeq_feat > ConvertAlignToAnnot(const objects::CSeq_align &align, objects::CSeq_annot &annot, objects::CBioseq_set &seqs, Int8 gene_id=0, const objects::CSeq_feat *cdregion_on_mrna=NULL)
Convert an alignment to an annotation.
unique_ptr< SImplementation > m_impl
void RecomputePartialFlags(objects::CSeq_annot &annot)
Recompute the correct partial states for all features in this annotation.
void SetMinIntron(TSeqPos)
EIntronStitchThresholdFlags
void SetFeatureExceptions(objects::CSeq_feat &feat, const objects::CSeq_align *align=NULL)
Correctly mark exceptions on a feature.
void SetFlags(TFeatureGeneratorFlags)
void SetAllowedUnaligned(TSeqPos)
CFeatureGenerator(CRef< objects::CScope > scope)
CConstRef< objects::CSeq_align > CleanAlignment(const objects::CSeq_align &align)
Clean an alignment according to our best guess of its biological representation.
void SetIntronStitchThresholdFlags(EIntronStitchThresholdFlags)
@ fGenerateStableLocalIds
@ fAddTranslatedCDSAssembly
int TFeatureGeneratorFlags
void SetPartialFlags(CRef< objects::CSeq_feat > gene_feat, CRef< objects::CSeq_feat > mrna_feat, CRef< objects::CSeq_feat > cds_feat)
Mark the correct partial states for a set of features.
void ConvertLocToAnnot(const objects::CSeq_loc &loc, objects::CSeq_annot &annot, objects::CBioseq_set &seqs, objects::CCdregion::EFrame frame=objects::CCdregion::eFrame_one, CRef< objects::CSeq_id > prot_id=CRef< objects::CSeq_id >(), CRef< objects::CSeq_id > rna_id=CRef< objects::CSeq_id >())
Convert genomic location to an annotation.
@Gb_qual.hpp User-defined methods of the data storage class.
static const CTrans_table & GetTransTable(int id)
static void SetFeatureExceptions(objects::CSeq_feat &feat, objects::CScope &scope, const objects::CSeq_align *align=NULL)
Correctly mark exceptions on a feature.
static void CreateGeneModelFromAlign(const objects::CSeq_align &align, objects::CScope &scope, objects::CSeq_annot &annot, objects::CBioseq_set &seqs, TGeneModelCreateFlags flags=fDefaults, TSeqPos allowed_unaligned=10)
Create a gene model from an alignment; this will optionally promote all features through the alignment...
static void SetPartialFlags(objects::CScope &scope, CRef< objects::CSeq_feat > gene_feat, CRef< objects::CSeq_feat > mrna_feat, CRef< objects::CSeq_feat > cds_feat)
static void CreateGeneModelsFromAligns(const list< CRef< objects::CSeq_align > > &aligns, objects::CScope &scope, objects::CSeq_annot &annot, objects::CBioseq_set &seqs, TGeneModelCreateFlags flags=fDefaults, TSeqPos allowed_unaligned=10)
int TGeneModelCreateFlags
static void RecomputePartialFlags(objects::CScope &scope, objects::CSeq_annot &annot)
position_type GetTo() const
position_type GetFrom() const
position_type GetCoveredLength(void) const
Returns total length covered by ranges in this collection, i.e.
double GetPercentCoverage(CScope &scope, const CSeq_align &align, unsigned query=0)
Compute percent coverage of the query (sequence 0) (range 0-100)
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
ESubtype GetSubtype(void) const
CRange< TSeqPos > GetSeqRange(TDim row) const
GetSeqRange NB: On a Spliced-seg, in case the product-type is protein, these only return the amin par...
CRef< CSeq_loc > CreateRowSeq_loc(TDim row) const
TDim CheckNumRows(void) const
Validators.
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
ENa_strand GetSeqStrand(TDim row) const
Get strand (the first one if segments have different strands).
namespace ncbi::objects::
void AddExt(CRef< CUser_object > ext, TAddExt add_flags=0)
Add an extension by type in exts container.
void SetPartialStart(bool val, ESeqLocExtremes ext)
void SetPartialStop(bool val, ESeqLocExtremes ext)
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
static TSeqPos Pack(CSeq_data *in_seq, TSeqPos uLength=ncbi::numeric_limits< TSeqPos >::max())
CSeq_feat_Handle GetFeatureWithId(CSeqFeatData::E_Choice type, TFeatureIdInt id) const
char GetStartResidue(int state) const
char GetCodonResidue(int state) const
static int SetCodonState(unsigned char ch1, unsigned char ch2, unsigned char ch3)
Template class for iteration on objects of class C.
int GetInt(void) const
get value
const CUser_field & GetField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Access a named field in this user field.
const string & GetString(void) const
bool HasField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Verify that a named field exists.
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
container_type::iterator iterator
const_iterator end() const
iterator_bool insert(const value_type &val)
const_iterator find(const key_type &key) const
iterator_bool insert(const value_type &val)
CMappedFeat GetCdsOnMrna(const objects::CSeq_id &rna_id, CScope &scope)
static bool s_Contains(const TSeqRange &range1, const TSeqRange &range2)
Check whether range1 contains range2.
bool IsProteinAlign(const CSeq_align &align)
void AddInsertWithGaps(CRef< CSeq_loc > &edited_sequence_seqloc, CSeq_id &genomic_seqid, int &region_begin, int &region_end, int &offset, CRef< CSeq_loc > &insert, const int k_gap_length, const int next_exon_start)
const char * k_except_text_for_gap_filled_gnomon_model
const char * k_cds_comment
void AddCodeBreak(CSeq_feat &feat, CSeq_loc &loc, char ncbieaa)
static void s_TransformToNucpos(CProduct_pos &pos)
const char * k_rna_comment
void AddLiteral(CSeq_inst &inst, const string &seq, CSeq_inst::EMol mol_class)
string ExtractGnomonModelNum(const CSeq_id &seq_id)
void RenameGeneratedBioseqs(const CSeq_id &query_rna_id, CSeq_id &transcribed_rna_id, CRef< CSeq_feat > cds_feat_on_query_mrna, CRef< CSeq_feat > cds_feat_on_genome_with_translated_product)
bool IsContinuous(const CSeq_loc &loc)
static string s_Count(unsigned num, const string &item_name)
static CRef< CSeq_loc > s_MapSingleAA(TSeqPos pos, CRef< CSeq_id > mapped_protein_id, const CRangeCollection< TSeqPos > &product_ranges, CRef< CSeq_loc_Mapper > to_mrna, CRef< CSeq_loc_Mapper > to_genomic)
static const CMolInfo * s_GetMolInfo(const CBioseq_Handle &handle)
Return the mol-info object for a given sequence.
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
int TSignedSeqPos
Type for signed sequence position.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
TValue Add(int delta) THROWS_NONE
Atomically add value (=delta), and return new counter value.
#define NCBI_ASSERT(expr, mess)
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
void Warning(CExceptionArgs_Base &args)
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
const TPrim & Get(void) const
const string AsFastaString(void) const
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
CConstRef< CSeq_id > GetSeqId(void) const
EAccessionInfo
For IdentifyAccession (below)
CSeq_id::EAccessionInfo IdentifyAccession(void) const
CSeq_id & Set(const CTempString &the_id, TParseFlags flags=fParse_AnyRaw)
Reassign based on flat specifications; arguments interpreted as with constructors.
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
void SetPacked_int(TPacked_int &v)
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
ENa_strand GetStrand(void) const
Get the location's strand.
void ChangeToPackedInt(void)
Works only if location is currently an interval, point, packed-int (handled trivially),...
bool IsReverseStrand(void) const
Return true if all ranges have reverse strand.
void FlipStrand(void)
Flip the strand (e.g. plus to minus)
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
TRange GetTotalRange(void) const
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
CRef< CSeq_loc > Merge(TOpFlags flags, ISynonymMapper *syn_mapper) const
All functions create and return a new seq-loc object.
const_iterator end(void) const
const_iterator begin(void) const
int Compare(const CSeq_loc &loc) const
void Add(const CSeq_loc &other)
Simple adding of seq-locs.
const CSeq_id * GetId(void) const
Get the id of the location; returns NULL if the location has multiple ids or no id at all.
void SetPartialStart(bool val, ESeqLocExtremes ext)
set / remove e_Lim fuzz on start or stop (lt/gt - indicating partial interval)
CRef< CSeq_loc > Intersect(const CSeq_loc &other, TOpFlags flags, ISynonymMapper *syn_mapper) const
Find the intersection with the seq-loc, merge/sort resulting ranges depending on flags.
void SetStrand(ENa_strand strand)
Set the strand for all of the location's ranges.
void SetPartialStop(bool val, ESeqLocExtremes ext)
bool IsPartialStop(ESeqLocExtremes ext) const
TSeqPos GetStop(ESeqLocExtremes ext) const
@ eOrder_Biological
Iterate sub-locations in biological order.
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
bool IsSameBioseq(const CSeq_id &id1, const CSeq_id &id2, CScope *scope, CScope::EGetBioseqFlag get_flag=CScope::eGetBioseq_All)
Determines if two CSeq_ids represent the same CBioseq.
const CMolInfo * GetMolInfo(const CBioseq &bioseq)
Retrieve the MolInfo object for a given bioseq handle.
const COrg_ref * GetOrg_refOrNull(const CBioseq_Handle &handle)
Return the pointer to org-ref associated with a given sequence or null if there is no org-ref associa...
const COrg_ref & GetOrg_ref(const CBioseq_Handle &handle)
Return the org-ref associated with a given sequence.
static void Translate(const string &seq, string &prot, const CGenetic_code *code, bool include_stop=true, bool remove_trailing_X=false, bool *alt_start=NULL, bool is_5prime_complete=true, bool is_3prime_complete=true)
Translate a string using a specified genetic code.
@ fIs5PrimePartial
= 0x4 Translate first codon even if not start codon (because sequence is 5' partial)
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function)
@ eGetId_ForceAcc
return only an accession based seq-id
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
CSeq_annot_Handle AddSeq_annot(CSeq_annot &annot, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add Seq-annot, return its CSeq_annot_Handle.
CSeq_loc_Mapper_Base & SetMergeAll(void)
Merge any abutting or overlapping intervals.
@ eProductToLocation
Map from the feature's product to location.
@ eLocationToProduct
Map from the feature's location to product.
@ fAlign_Dense_seg_TotalRange
Ignore internal dense-seg structure - map each dense-seg according to the total ranges involved.
vector< CSeq_id_Handle > TId
const CTSE_Handle & GetTSE_Handle(void) const
Get CTSE_Handle of containing TSE.
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
bool IsSetDbxref(void) const
virtual CConstRef< CSeq_feat > GetSeq_feat(void) const
const CSeqFeatData & GetData(void) const
TSeqPos GetBioseqLength(void) const
CSeq_feat_EditHandle AddFeat(const CSeq_feat &new_obj) const
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle Throws an exception if none is available.
TInst_Topology GetInst_Topology(void) const