NCBI C++ ToolKit
create_defline.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Jonathan Kans, Aaron Ucko
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 
33 #include <ncbi_pch.hpp>
34 
36 
38 #include <util/text_joiner.hpp>
39 #include <serial/iterator.hpp>
40 
42 #include <objects/seq/Map_ext.hpp>
44 
45 #include <objmgr/annot_ci.hpp>
46 #include <objmgr/feat_ci.hpp>
47 #include <objmgr/seq_map_ci.hpp>
48 #include <objmgr/seqdesc_ci.hpp>
49 #include <objmgr/mapped_feat.hpp>
50 #include <objmgr/seq_entry_ci.hpp>
51 #include <objmgr/error_codes.hpp>
52 
53 #include <objmgr/util/feature.hpp>
54 #include <objmgr/util/sequence.hpp>
55 #include <objmgr/util/autodef.hpp>
56 
59 USING_SCOPE(sequence);
60 USING_SCOPE(feature);
61 
62 #define NCBI_USE_ERRCODE_X ObjMgr_SeqUtil
63 
66 {
67 public:
68  CDefLineJoiner(bool show_mods = false)
69  : m_ShowMods(show_mods)
70  {
71  }
72  void Add(const CTempString &name, const CTempString &value, EHidePart hide = eHideNone)
73  {
74  if (m_ShowMods)
75  {
76  if (name.empty() || value.empty()) {
77  return;
78  }
79  // The case of no quotes is much more common, so optimize for that
80  if (value.find_first_of("\"=") != string::npos) {
81  // rarer case: bad characters in value name, so
82  // we need surrounding double-quotes and we need to change
83  // double-quotes to single-quotes.
84  m_Joiner.Add(" [").Add(name).Add("=\"");
85  ReplaceAndAdd(value, "\"", "'");
86  m_Joiner.Add("\"]");
87  } else {
88  m_Joiner.Add(" [").Add(name).Add("=").Add(value).Add("]");
89  }
90  }
91  else
92  {
93  if (eHideNone == hide && !name.empty()) {
94  m_Joiner.Add(" ").Add(name);
95  }
96  if (!value.empty()) {
97  m_Joiner.Add(" ").Add(value);
98  }
99  }
100  }
101  void Join(std::string* result) const
102  {
104  }
105 private:
107  const CTempString &replace_what, const CTempString &replace_with)
108  {
109  // commented out: CTempString is immutable
110  //string fixed = NStr::Replace(value, "\"", "'");
111  CTempString::size_type p1 = 0, p2 = value.length();
112  for (; (p2 = value.find(replace_what, p1)) != string::npos;
113  p1 = p2 + 1, p2 = value.length()) {
114  m_Joiner.Add(value.substr(p1, p2 - p1)).Add(replace_with);
115  }
116  m_Joiner.Add(value.substr(p1, p2 - p1));
117  }
120 };
121 
122 // constructor
124 {
125  m_ConstructedFeatTree = false;
126  m_InitializedFeatTree = false;
127  x_Init();
128 }
129 
130 // constructor
132 {
133  // initialize common bits (FSA)
134  x_Init();
135 
136  // then store top SeqEntry Handle for building CFeatTree when first needed
137  m_TopSEH = tseh;
138  m_ConstructedFeatTree = true;
139  m_InitializedFeatTree = false;
140 }
141 
142 // destructor
144 
145 {
146 }
147 
149 {
150  // nothing here yet
151 }
152 
// Iteration helper macros.
//
// Each *_ITERATOR macro expands to the declaration of an iterator object;
// the matching FOR_EACH_* / FOR_SELECTED_* macro wraps that declaration in a
// for-loop header that runs while the iterator tests true and advances it
// with prefix ++.

// SEQENTRY_HANDLE_ON_SEQENTRY_HANDLE_ITERATOR
// FOR_EACH_SEQENTRY_HANDLE_ON_SEQENTRY_HANDLE
// Takes a CSeq_entry_Handle;
// inside the loop use CSeq_entry_Handle var = *Itr;

#define SEQENTRY_HANDLE_ON_SEQENTRY_HANDLE_ITERATOR(Iter, Seh) CSeq_entry_CI Iter(Seh)

#define FOR_EACH_SEQENTRY_HANDLE_ON_SEQENTRY_HANDLE(Iter, Seh) \
    for (SEQENTRY_HANDLE_ON_SEQENTRY_HANDLE_ITERATOR(Iter, Seh); Iter; ++Iter)

// FOR_EACH_SEQID_ON_BIOSEQ_HANDLE
// Takes a CBioseq_Handle& and forwards its GetId() container to ITERATE;
// inside the loop use CSeq_id_Handle sid = *Itr;

#define FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(Iter, Bsh) ITERATE (CBioseq_Handle::TId, Iter, Bsh.GetId())

// SEQFEAT_ON_BIOSEQ_HANDLE_ITERATOR
// FOR_EACH_SEQFEAT_ON_BIOSEQ_HANDLE
// Takes a CBioseq_Handle& plus the suffix of a CSeqFeatData::e_* choice;
// inside the loop use const CSeq_feat& sft = Itr->GetOriginalFeature();

#define SEQFEAT_ON_BIOSEQ_HANDLE_ITERATOR(Iter, Bsh, Choice) CFeat_CI Iter(Bsh, CSeqFeatData::e_##Choice)

#define FOR_EACH_SEQFEAT_ON_BIOSEQ_HANDLE(Iter, Bsh, Choice) \
    for (SEQFEAT_ON_BIOSEQ_HANDLE_ITERATOR(Iter, Bsh, Choice); Iter; ++Iter)

// SEQFEAT_ON_SCOPE_ITERATOR
// FOR_EACH_SEQFEAT_ON_SCOPE
// Takes a CScope&, a location and the suffix of a CSeqFeatData::e_* choice;
// inside the loop use const CSeq_feat& sft = Itr->GetOriginalFeature();

#define SEQFEAT_ON_SCOPE_ITERATOR(Iter, Scope, Loc, Choice) CFeat_CI Iter(Scope, Loc, CSeqFeatData::e_##Choice)

#define FOR_EACH_SEQFEAT_ON_SCOPE(Iter, Scope, Loc, Choice) \
    for (SEQFEAT_ON_SCOPE_ITERATOR(Iter, Scope, Loc, Choice); Iter; ++Iter)

// SELECTED_SEQFEAT_ON_BIOSEQ_HANDLE_ITERATOR
// FOR_SELECTED_SEQFEAT_ON_BIOSEQ_HANDLE
// Takes a CBioseq_Handle& and an SAnnotSelector;
// inside the loop use const CSeq_feat& sft = Itr->GetOriginalFeature();

#define SELECTED_SEQFEAT_ON_BIOSEQ_HANDLE_ITERATOR(Iter, Bsh, Selector) CFeat_CI Iter(Bsh, Selector)

#define FOR_SELECTED_SEQFEAT_ON_BIOSEQ_HANDLE(Iter, Bsh, Selector) \
    for (SELECTED_SEQFEAT_ON_BIOSEQ_HANDLE_ITERATOR(Iter, Bsh, Selector); Iter; ++Iter)
205 
206 // Copied from CleanAndCompress in objtools/format/utils.cpp
207 
// Two-character patterns matched by the switch in x_CleanAndCompress.
// twochars() packs its first argument into the high byte and its second
// argument into the low byte of a Uint2.
#define twochars(first, second)  Uint2(((first) << 8) | (second))

// doubled characters
#define twocommas        twochars(',', ',')
#define twospaces        twochars(' ', ' ')

// blank followed by punctuation
#define space_comma      twochars(' ', ',')
#define space_semicolon  twochars(' ', ';')
#define space_bracket    twochars(' ', ')')

// punctuation followed by blank
#define comma_space      twochars(',', ' ')
#define semicolon_space  twochars(';', ' ')
#define bracket_space    twochars('(', ' ')
218 
219 void x_CleanAndCompress(string& dest, const CTempString& instr, bool isProt)
220 {
221  size_t left = instr.size();
222  // this is the input stream
223  const char* in = instr.data();
224 
225  // skip front white spaces
226  while (left && *in == ' ')
227  {
228  in++;
229  left--;
230  }
231  // forget end white spaces
232  while (left && in[left - 1] == ' ')
233  {
234  left--;
235  }
236 
237  dest.resize(left);
238 
239  if (left < 1) return;
240 
241  // this is where we write result
242  char* out = (char*)dest.c_str();
243 
244  char curr = *in++; // initialize with first character
245  left--;
246 
247  char next = 0;
248  Uint2 two_chars = curr; // this is two bytes storage where we see current and previous symbols
249 
250  while (left > 0) {
251  next = *in++;
252 
253  two_chars = Uint2((two_chars << 8) | next);
254 
255  switch (two_chars)
256  {
257  case twocommas: // replace double commas with comma+space
258  *out++ = curr;
259  next = ' ';
260  break;
261  case twospaces: // skip multispaces (only print last one)
262  break;
263  case bracket_space: // skip space after bracket
264  next = curr;
265  two_chars = curr;
266  break;
267  case space_bracket: // skip space before bracket
268  break;
269  case space_comma:
270  case space_semicolon: // swap characters
271  *out++ = next;
272  next = curr;
273  two_chars = curr;
274  break;
275  case comma_space:
276  *out++ = curr;
277  *out++ = ' ';
278  while (next == ' ' || next == ',') {
279  next = *in;
280  in++;
281  left--;
282  }
283  two_chars = next;
284  break;
285  case semicolon_space:
286  *out++ = curr;
287  *out++ = ' ';
288  while (next == ' ' || next == ';') {
289  next = *in;
290  in++;
291  left--;
292  }
293  two_chars = next;
294  break;
295  default:
296  *out++ = curr;
297  break;
298  }
299 
300  curr = next;
301  left--;
302  }
303 
304  if (curr > 0 && curr != ' ') {
305  *out++ = curr;
306  }
307 
308  dest.resize(out - dest.c_str());
309 
310  if (isProt) {
311  NStr::ReplaceInPlace (dest, ". [", " [");
312  NStr::ReplaceInPlace (dest, ", [", " [");
313  }
314 }
315 
316 static bool s_IsVirusOrPhage(const CTempString& taxname)
317 {
318  return (NStr::FindNoCase(taxname, "virus") != NPOS ||
319  NStr::FindNoCase(taxname, "phage") != NPOS);
320 }
321 
322 
324  TBIOSOURCE_GENOME genome
325  ) const
326 {
327  const char* result = kEmptyCStr;
328 
329  const bool has_plasmid = !m_Plasmid.empty();
330 
331 
332  switch (genome) {
333  case NCBI_GENOME(chloroplast):
334  result = "chloroplast";
335  break;
336  case NCBI_GENOME(chromoplast):
337  result = "chromoplast";
338  break;
339  case NCBI_GENOME(kinetoplast):
340  result = "kinetoplast";
341  break;
342  case NCBI_GENOME(mitochondrion):
343  {
344  if (!m_FastaFormat && (has_plasmid || m_IsWGS)) {
345  result = "mitochondrial";
346  } else {
347  result = "mitochondrion";
348  }
349  break;
350  }
351  case NCBI_GENOME(plastid):
352  result = "plastid";
353  break;
354  case NCBI_GENOME(macronuclear):
355  {
356  result = "macronuclear";
357  break;
358  }
359  case NCBI_GENOME(extrachrom):
360  {
361  if (!m_IsWGS) {
362  result = "extrachromosomal";
363  }
364  break;
365  }
366  case NCBI_GENOME(plasmid):
367  {
368  if (!m_IsWGS) {
369  result = "plasmid";
370  }
371  break;
372  }
373  // transposon and insertion-seq are obsolete
374  case NCBI_GENOME(cyanelle):
375  result = "cyanelle";
376  break;
377  case NCBI_GENOME(proviral):
378  {
379  if (!s_IsVirusOrPhage(m_Taxname)) {
380  if (has_plasmid || m_IsWGS) {
381  result = "proviral";
382  } else {
383  result = "provirus";
384  }
385  }
386  break;
387  }
388  case NCBI_GENOME(virion):
389  {
390  if (!s_IsVirusOrPhage(m_Taxname)) {
391  result = "virus";
392  }
393  break;
394  }
395  case NCBI_GENOME(nucleomorph):
396  {
397  if (!m_IsWGS) {
398  result = "nucleomorph";
399  }
400  break;
401  }
402  case NCBI_GENOME(apicoplast):
403  result = "apicoplast";
404  break;
405  case NCBI_GENOME(leucoplast):
406  result = "leucoplast";
407  break;
408  case NCBI_GENOME(proplastid):
409  result = "proplastid";
410  break;
411  case NCBI_GENOME(endogenous_virus):
412  result = "endogenous virus";
413  break;
414  case NCBI_GENOME(hydrogenosome):
415  result = "hydrogenosome";
416  break;
417  case NCBI_GENOME(chromosome):
418  result = "chromosome";
419  break;
420  case NCBI_GENOME(chromatophore):
421  result = "chromatophore";
422  break;
423  }
424 
425  return result;
426 }
427 
428 // set instance variables from Seq-inst, Seq-ids, MolInfo, etc., but not
429 // BioSource
431  const CBioseq_Handle& bsh,
433 )
434 
435 {
437  if (! bsx) {
438  return;
439  }
440 
441  // set flags from record components
445  m_GpipeMode = (flags & fGpipeMode) != 0;
447  m_DevMode = (flags & fDevMode) != 0;
448  m_FastaFormat = (flags & fFastaFormat) != 0;
449 
450  // populate member variables from bsx
451  m_IsNA = bsx->IsNA();
452  m_IsAA = bsx->IsAA();
453  m_Topology = bsx->GetTopology();
454  m_Length = bsx->GetLength();
455 
456  m_IsSeg = false;
457  m_IsDelta = bsx->IsDelta();
458  m_IsVirtual = bsx->IsVirtual();
459  m_IsMap = bsx->IsMap();
460 
461  m_IsNC = bsx->IsNC();
462  m_IsNM = bsx->IsNM();
463  m_IsNR = bsx->IsNR();
464  m_IsNZ = bsx->IsNZ();
465  m_IsPatent = bsx->IsPatent();
466  m_IsPDB = bsx->IsPDB();
467  m_IsWP = bsx->IsWP();
468  m_ThirdParty = bsx->IsThirdParty();
469  m_WGSMaster = bsx->IsWGSMaster();
470  m_TSAMaster = bsx->IsTSAMaster();
471  m_TLSMaster = bsx->IsTLSMaster();
472 
473  m_GeneralStr = bsx->GetGeneralStr();
474  m_GeneralId = bsx->GetGeneralId();
475 
479 
480  m_PDBChain = bsx->GetPDBChain();
481  m_PDBChainID = bsx->GetPDBChainID();
482 
483  m_MIBiomol = bsx->GetBiomol();
484  m_MITech = bsx->GetTech();
486 
487  m_HTGTech = bsx->IsHTGTech();
489  m_IsTLS = bsx->IsTLS();
490  m_IsTSA = bsx->IsTSA();
491  m_IsWGS = bsx->IsWGS();
493 
494  m_MainTitle.clear();
495  if (! m_HTGSUnfinished && ! m_Reconstruct) {
496  m_MainTitle = bsx->GetTitle();
497  }
498 
499  m_UseBiosrc = bsx->IsUseBiosrc();
500 
502  m_HTGSDraft = bsx->IsHTGSDraft();
503  m_HTGSPooled = bsx->IsHTGSPooled();
504  m_TPAExp = bsx->IsTPAExp();
505  m_TPAInf = bsx->IsTPAInf();
506  m_TPAReasm = bsx->IsTPAReasm();
507  m_Unordered = bsx->IsUnordered();
508 
509  m_PDBCompound = bsx->GetPDBCompound();
510 
511  m_Source = bsx->GetBioSource();
512  m_Taxname = bsx->GetTaxname();
513  m_Genus = bsx->GetGenus();
514  m_Species = bsx->GetSpecies();
516  m_Genome = bsx->GetGenome();
517  m_IsPlasmid = bsx->IsPlasmid();
518  m_IsChromosome = bsx->IsChromosome();
519 
520  m_Organelle = bsx->GetOrganelle();
521 
525 
526  m_Chromosome = bsx->GetChromosome();
528  m_Clone = bsx->GetClone();
529  m_has_clone = bsx->HasClone();
530  m_Map = bsx->GetMap();
531  m_Plasmid = bsx->GetPlasmid();
532  m_Segment = bsx->GetSegment();
533 
534  m_Breed = bsx->GetBreed();
535  m_Cultivar = bsx->GetCultivar();
537  m_Isolate = bsx->GetIsolate();
538  m_Strain = bsx->GetStrain();
539  m_Substrain = bsx->GetSubstrain();
541 
542  m_IsUnverified = bsx->IsUnverified();
544  if (m_IsUnverified) {
545  int unverified_count = 0;
546  m_UnverifiedPrefix = "UNVERIFIED: ";
547  if (bsx->IsUnverifiedFeature()) {
548  m_UnverifiedPrefix = "UNVERIFIED: ";
549  unverified_count++;
550  }
551  if (bsx->IsUnverifiedMisassembled()) {
552  m_UnverifiedPrefix = "UNVERIFIED_ASMBLY: ";
553  unverified_count++;
554  }
555  if (bsx->IsUnverifiedContaminant()) {
556  m_UnverifiedPrefix = "UNVERIFIED_CONTAM: ";
557  unverified_count++;
558  }
559  if (bsx->IsUnverifiedOrganism()) {
560  m_UnverifiedPrefix = "UNVERIFIED_ORG: ";
561  unverified_count++;
562  }
563  if (unverified_count > 1) {
564  // m_UnverifiedPrefix = "UNVERIFIED: ";
565  }
566  }
567 
568  m_IsUnreviewed = bsx->IsUnreviewed();
570  if (m_IsUnreviewed) {
571  m_UnreviewedPrefix = "UNREVIEWED: ";
572  if (bsx->IsUnreviewedUnannotated()) {
573  m_UnreviewedPrefix = "UNREVIEWED_UNANNOT: ";
574  }
575  }
576 
577  m_Comment = bsx->GetComment();
578  m_IsPseudogene = bsx->IsPseudogene();
580 
581  m_rEnzyme = bsx->GetrEnzyme();
582 
584 
585  if (m_IsPDB) {
586  if (m_Comment.empty()) {
588  } else if (m_IsNA) {
589  if ( m_Length < 25 ) {
591  } else if (NStr::Find(m_Comment, "COMPLETE GENOME") != NPOS ||
592  NStr::Find(m_Comment, "CHROMOSOME XII") != NPOS) {
594  } else if (NStr::Find(m_Comment, "Dna (5'") != NPOS ||
595  NStr::Find(m_Comment, "SEQRES") != NPOS) {
597  }
598  } else {
599  if (NStr::Find(m_Comment, "hypothetical protein") != NPOS ||
600  NStr::Find(m_Comment, "uncharacterized protein") != NPOS ||
601  NStr::Find(m_Comment, "putative uncharacterized protein") != NPOS ||
602  NStr::Find(m_Comment, "putative protein") != NPOS ||
603  NStr::Find(m_Comment, "SEQRES") != NPOS) {
605  }
606  }
607  }
608 }
609 
610 // set instance variables from Seq-inst, Seq-ids, MolInfo, etc., but not
611 // BioSource
613  const CBioseq_Handle& bsh,
615 )
616 
617 {
618  // set flags from record components
622  m_GpipeMode = (flags & fGpipeMode) != 0;
624  m_DevMode = (flags & fDevMode) != 0;
625  m_FastaFormat = (flags & fFastaFormat) != 0;
626 
627  // reset member variables to cleared state
628  m_IsNA = false;
629  m_IsAA = false;
630  m_Topology = NCBI_SEQTOPOLOGY(not_set);
631  m_Length = 0;
632 
633  m_IsSeg = false;
634  m_IsDelta = false;
635  m_IsVirtual = false;
636  m_IsMap = false;
637 
638  m_IsNC = false;
639  m_IsNM = false;
640  m_IsNR = false;
641  m_IsNZ = false;
642  m_IsPatent = false;
643  m_IsPDB = false;
644  m_IsWP = false;
645  m_ThirdParty = false;
646  m_WGSMaster = false;
647  m_TSAMaster = false;
648  m_TLSMaster = false;
649 
650  m_MainTitle.clear();
651  m_GeneralStr.clear();
652  m_GeneralId = 0;
653  m_PatentCountry.clear();
654  m_PatentNumber.clear();
655 
656  m_PatentSequence = 0;
657 
658  m_PDBChain = 0;
659  m_PDBChainID.clear();
660 
661  m_MIBiomol = NCBI_BIOMOL(unknown);
662  m_MITech = NCBI_TECH(unknown);
664 
665  m_HTGTech = false;
666  m_HTGSUnfinished = false;
667  m_IsTLS = false;
668  m_IsTSA = false;
669  m_IsWGS = false;
670  m_IsEST_STS_GSS = false;
671 
672  m_UseBiosrc = false;
673 
674  m_HTGSCancelled = false;
675  m_HTGSDraft = false;
676  m_HTGSPooled = false;
677  m_TPAExp = false;
678  m_TPAInf = false;
679  m_TPAReasm = false;
680  m_Unordered = false;
681 
683 
684  m_Source.Reset();
685  m_Taxname.clear();
686  m_Genus.clear();
687  m_Species.clear();
688  m_Multispecies = false;
689  m_Genome = NCBI_GENOME(unknown);
690  m_IsPlasmid = false;
691  m_IsChromosome = false;
692 
693  m_Organelle.clear();
694 
695  m_FirstSuperKingdom.clear();
696  m_SecondSuperKingdom.clear();
697  m_IsCrossKingdom = false;
698 
701  m_Clone.clear();
702  m_has_clone = false;
703  m_Map.clear();
704  m_Plasmid.clear();
705  m_Segment.clear();
706 
707  m_Breed.clear();
708  m_Cultivar.clear();
710  m_Isolate.clear();
711  m_Strain.clear();
712  m_Substrain.clear();
714 
715  m_IsUnverified = false;
717  m_IsUnreviewed = false;
720 
721  m_Comment.clear();
722  m_IsPseudogene = false;
723 
724  m_rEnzyme.clear();
725 
727 
728  // now start setting member variables
729  m_IsNA = bsh.IsNa();
730  m_IsAA = bsh.IsAa();
732  m_Length = bsh.GetInst_Length();
733 
734  if (bsh.IsSetInst()) {
735  if (bsh.IsSetInst_Repr()) {
736  TSEQ_REPR repr = bsh.GetInst_Repr();
737  m_IsSeg = (repr == CSeq_inst::eRepr_seg);
738  m_IsDelta = (repr == CSeq_inst::eRepr_delta);
740  m_IsMap = (repr == CSeq_inst::eRepr_map);
741  }
742  }
743 
744  // process Seq-ids
745  FOR_EACH_SEQID_ON_BIOSEQ_HANDLE (sid_itr, bsh) {
746  CSeq_id_Handle sid = *sid_itr;
747  switch (sid.Which()) {
748  case NCBI_SEQID(Tpg):
749  case NCBI_SEQID(Tpe):
750  case NCBI_SEQID(Tpd):
751  m_ThirdParty = true;
752  // fall through
753  case NCBI_SEQID(Other):
754  case NCBI_SEQID(Genbank):
755  case NCBI_SEQID(Embl):
756  case NCBI_SEQID(Ddbj):
757  {
758  CConstRef<CSeq_id> id = sid.GetSeqId();
759  const CTextseq_id& tsid = *id->GetTextseq_Id ();
760  if (tsid.IsSetAccession()) {
761  const string& acc = tsid.GetAccession ();
763  TACCN_CHOICE div = (TACCN_CHOICE) (type & NCBI_ACCN(division_mask));
764  if ( div == NCBI_ACCN(wgs) )
765  {
766  if( (type & CSeq_id::fAcc_master) != 0 ) {
767  m_WGSMaster = true;
768  }
769  } else if ( div == NCBI_ACCN(tsa) )
770  {
771  if( (type & CSeq_id::fAcc_master) != 0 && m_IsVirtual ) {
772  m_TSAMaster = true;
773  }
774  } else if (type == NCBI_ACCN(refseq_chromosome)) {
775  m_IsNC = true;
776  } else if (type == NCBI_ACCN(refseq_mrna)) {
777  m_IsNM = true;
778  } else if (type == NCBI_ACCN(refseq_mrna_predicted)) {
779  m_IsNM = true;
780  } else if (type == NCBI_ACCN(refseq_ncrna)) {
781  m_IsNR = true;
782  } else if (type == NCBI_ACCN(refseq_contig)) {
783  m_IsNZ = true;
784  } else if (type == NCBI_ACCN(refseq_unique_prot)) {
785  m_IsWP = true;
786  }
787  }
788  break;
789  }
790  case NCBI_SEQID(General):
791  {
792  CConstRef<CSeq_id> id = sid.GetSeqId();
793  const CDbtag& gen_id = id->GetGeneral ();
794  if (! gen_id.IsSkippable ()) {
795  if (gen_id.IsSetTag ()) {
796  const CObject_id& oid = gen_id.GetTag();
797  if (oid.IsStr()) {
798  m_GeneralStr = oid.GetStr();
799  } else if (oid.IsId()) {
800  m_GeneralId = oid.GetId();
801  }
802  }
803  }
804  break;
805  }
806  case NCBI_SEQID(Pdb):
807  {
808  m_IsPDB = true;
809  CConstRef<CSeq_id> id = sid.GetSeqId();
810  const CPDB_seq_id& pdb_id = id->GetPdb ();
811  if (pdb_id.IsSetChain_id()) {
812  m_PDBChainID = pdb_id.GetChain_id();
813  } else if (pdb_id.IsSetChain()) {
814  m_PDBChain = pdb_id.GetChain();
815  }
816  break;
817  }
818  case NCBI_SEQID(Patent):
819  {
820  m_IsPatent = true;
821  CConstRef<CSeq_id> id = sid.GetSeqId();
822  const CPatent_seq_id& pat_id = id->GetPatent();
823  if (pat_id.IsSetSeqid()) {
824  m_PatentSequence = pat_id.GetSeqid();
825  }
826  if (pat_id.IsSetCit()) {
827  const CId_pat& cit = pat_id.GetCit();
828  m_PatentCountry = cit.GetCountry();
830  }
831  break;
832  }
833  case NCBI_SEQID(Gpipe):
834  break;
835  default:
836  break;
837  }
838  }
839 
840  enum ENeededDescChoices {
841  fMolinfo = 1 << 0,
842  fUser = 1 << 1,
843  fSource = 1 << 2,
844  fGenbank = 1 << 3,
845  fEmbl = 1 << 4,
846  fTitle = 1 << 5,
847  fPdb = 1 << 6,
848  fComment = 1 << 7
849  };
850  int needed_desc_choices = fMolinfo | fUser | fSource | fGenbank | fEmbl | fComment;
851 
852  CSeqdesc_CI::TDescChoices desc_choices;
853  desc_choices.reserve(7);
854  desc_choices.push_back(CSeqdesc::e_Molinfo);
855  desc_choices.push_back(CSeqdesc::e_User);
856  desc_choices.push_back(CSeqdesc::e_Source);
857  // Only truly needed if (m_HTGTech || m_ThirdParty), but
858  // determining m_HTGTech requires a descriptor scan.
859  desc_choices.push_back(CSeqdesc::e_Genbank);
860  desc_choices.push_back(CSeqdesc::e_Embl);
861  desc_choices.push_back(CSeqdesc::e_Comment);
862  if (! m_Reconstruct) {
863  needed_desc_choices |= fTitle;
864  desc_choices.push_back(CSeqdesc::e_Title);
865  }
866  if (m_IsPDB) {
867  needed_desc_choices |= fPdb;
868  desc_choices.push_back(CSeqdesc::e_Pdb);
869  }
870 
871  const list <string> *keywords = NULL;
872 
873  int num_super_kingdom = 0;
874  bool super_kingdoms_different = false;
875 
876  for (CSeqdesc_CI desc_it(bsh, desc_choices);
877  needed_desc_choices != 0 && desc_it; ++desc_it) {
878  switch (desc_it->Which()) {
879  case CSeqdesc::e_Molinfo:
880  {
881  // process MolInfo tech
882  if ((needed_desc_choices & fMolinfo) == 0) {
883  continue; // already covered
884  }
885 
886  const CMolInfo& molinf = desc_it->GetMolinfo();
887  m_MIBiomol = molinf.GetBiomol();
888  m_MITech = molinf.GetTech();
890  switch (m_MITech) {
891  case NCBI_TECH(htgs_0):
892  case NCBI_TECH(htgs_1):
893  case NCBI_TECH(htgs_2):
894  m_HTGSUnfinished = true;
895  // manufacture all titles for unfinished HTG sequences
896  m_Reconstruct = true;
897  needed_desc_choices &= ~fTitle;
898  m_MainTitle.clear();
899  // fall through
900  case NCBI_TECH(htgs_3):
901  m_HTGTech = true;
902  m_UseBiosrc = true;
903  break;
904  case NCBI_TECH(est):
905  case NCBI_TECH(sts):
906  case NCBI_TECH(survey):
907  m_IsEST_STS_GSS = true;
908  m_UseBiosrc = true;
909  break;
910  case NCBI_TECH(wgs):
911  m_IsWGS = true;
912  m_UseBiosrc = true;
913  break;
914  case NCBI_TECH(tsa):
915  m_IsTSA = true;
916  m_UseBiosrc = true;
917  if (m_IsVirtual) {
918  m_TSAMaster = true;
919  }
920  break;
921  case NCBI_TECH(targeted):
922  m_IsTLS = true;
923  m_UseBiosrc = true;
924  if (m_IsVirtual) {
925  m_TLSMaster = true;
926  }
927  break;
928  default:
929  break;
930  }
931 
932  // take first, then skip remainder
933  needed_desc_choices &= ~fMolinfo;
934  break;
935  }
936 
937  case CSeqdesc::e_User:
938  {
939  // process Unverified and Unreviewed user objects
940  if ((needed_desc_choices & fUser) == 0) {
941  continue; // already covered
942  }
943 
944  const CUser_object& user_obj = desc_it->GetUser();
945  if (FIELD_IS_SET_AND_IS(user_obj, Type, Str)) {
946  if (user_obj.IsUnverified()) {
947  m_IsUnverified = true;
948  int unverified_count = 0;
949  needed_desc_choices &= ~fUser;
950  m_UnverifiedPrefix = "UNVERIFIED: ";
951  if (user_obj.IsUnverifiedFeature()) {
952  m_UnverifiedPrefix = "UNVERIFIED: ";
953  unverified_count++;
954  }
955  if (user_obj.IsUnverifiedMisassembled()) {
956  m_UnverifiedPrefix = "UNVERIFIED_ASMBLY: ";
957  unverified_count++;
958  }
959  if (user_obj.IsUnverifiedContaminant()) {
960  m_UnverifiedPrefix = "UNVERIFIED_CONTAM: ";
961  unverified_count++;
962  }
963  if (user_obj.IsUnverifiedOrganism()) {
964  m_UnverifiedPrefix = "UNVERIFIED_ORG: ";
965  unverified_count++;
966  }
967  if (unverified_count > 1) {
968  // m_UnverifiedPrefix = "UNVERIFIED: ";
969  }
970  } else if (user_obj.IsUnreviewed()) {
971  m_IsUnreviewed = true;
972  m_UnreviewedPrefix = "UNREVIEWED: ";
973  if (user_obj.IsUnreviewedUnannotated()) {
974  m_UnreviewedPrefix = "UNREVIEWED_UNANNOT: ";
975  }
976  } else if (user_obj.GetType().GetStr() == "AutodefOptions" ) {
977  FOR_EACH_USERFIELD_ON_USEROBJECT (uitr, user_obj) {
978  const CUser_field& fld = **uitr;
979  if (! FIELD_IS_SET_AND_IS(fld, Label, Str)) continue;
980  const string &label_str = GET_FIELD(fld.GetLabel(), Str);
981  if (! NStr::EqualNocase(label_str, "Targeted Locus Name")) continue;
982  if (fld.IsSetData() && fld.GetData().IsStr()) {
983  m_TargetedLocus = fld.GetData().GetStr();
984  }
985  }
986  }
987  }
988  break;
989  }
990 
991  case CSeqdesc::e_Comment:
992  {
993  // process comment
994  if ((needed_desc_choices & fComment) == 0) {
995  continue; // already covered
996  }
997 
998  m_Comment = desc_it->GetComment();
999  if (NStr::Find (m_Comment, "[CAUTION] Could be the product of a pseudogene") != string::npos) {
1000  m_IsPseudogene = true;
1001  }
1002  break;
1003  }
1004 
1005  case CSeqdesc::e_Source:
1006  {
1007  if ((needed_desc_choices & fSource) != 0) {
1008  m_Source.Reset(&desc_it->GetSource());
1009  // take first, then skip remainder
1010  needed_desc_choices &= ~fSource;
1011  }
1012  const CBioSource &bsrc = desc_it->GetSource();
1013  if (! bsrc.IsSetOrgname()) break;
1014  const COrgName &onp = bsrc.GetOrgname();
1015  if (onp.IsSetMod()) {
1016  for (auto& omd : onp.GetMod()) {
1017  if (omd->IsSetSubname()) {
1018  const string& str = omd->GetSubname();
1019  COrgMod::TSubtype subtype = omd->GetSubtype();
1020  if (subtype == COrgMod::eSubtype_metagenome_source) {
1021  if (m_MetaGenomeSource.empty()) {
1023  }
1024  }
1025  }
1026  }
1027  }
1028  if (m_IsWP) {
1029  const COrgName::TName& nam = onp.GetName();
1030  if (! nam.IsPartial()) break;
1031  const CPartialOrgName& pon = nam.GetPartial();
1032  if (! pon.IsSet()) break;
1033  const CPartialOrgName::Tdata& tx = pon.Get();
1034  ITERATE (CPartialOrgName::Tdata, itr, tx) {
1035  const CTaxElement& te = **itr;
1036  if (! te.IsSetFixed_level()) continue;
1037  if (te.GetFixed_level() != 0) continue;
1038  if (! te.IsSetLevel()) continue;
1039  const string& lvl = te.GetLevel();
1040  if (! NStr::EqualNocase (lvl, "superkingdom")) continue;
1041  num_super_kingdom++;
1042  if (m_FirstSuperKingdom.empty() && te.IsSetName()) {
1044  } else if (te.IsSetName() && ! NStr::EqualNocase (m_FirstSuperKingdom, te.GetName())) {
1045  if (m_SecondSuperKingdom.empty()) {
1046  super_kingdoms_different = true;
1048  }
1049  }
1050  if (num_super_kingdom > 1 && super_kingdoms_different) {
1051  m_IsCrossKingdom = true;
1052  }
1053  }
1054  }
1055  }
1056  break;
1057 
1058  case CSeqdesc::e_Title:
1059  if ((needed_desc_choices & fTitle) != 0) {
1060  // for everything other than PDB proteins, title must be packaged on Bioseq - RW-2005
1061  if ( m_IsPDB || desc_it.GetSeq_entry_Handle().IsSeq() ) {
1062  m_MainTitle = desc_it->GetTitle();
1063  }
1064  // take first, then skip remainder
1065  needed_desc_choices &= ~fTitle;
1066  }
1067  break;
1068 
1069  case CSeqdesc::e_Genbank:
1070  {
1071  if ((needed_desc_choices & fGenbank) == 0) {
1072  continue; // already covered
1073  }
1074  const CGB_block& gbk = desc_it->GetGenbank();
1075  if (gbk.IsSetKeywords()) {
1076  keywords = &gbk.GetKeywords();
1077  }
1078 
1079  // take first, then skip remainder along with any EMBL blocks
1080  needed_desc_choices &= ~(fGenbank | fEmbl);
1081  break;
1082  }
1083 
1084  case CSeqdesc::e_Embl:
1085  {
1086  if ((needed_desc_choices & fEmbl) == 0) {
1087  continue; // already covered
1088  }
1089  const CEMBL_block& ebk = desc_it->GetEmbl();
1090  if (ebk.IsSetKeywords()) {
1091  keywords = &ebk.GetKeywords();
1092  }
1093 
1094  // take first, then skip remainder
1095  needed_desc_choices &= ~fEmbl;
1096  break;
1097  }
1098 
1099  case CSeqdesc::e_Pdb:
1100  {
1101  if ((needed_desc_choices & fPdb) == 0) {
1102  continue; // already covered
1103  }
1104  _ASSERT(m_IsPDB);
1105  const CPDB_block& pbk = desc_it->GetPdb();
1106  FOR_EACH_COMPOUND_ON_PDBBLOCK (cp_itr, pbk) {
1107  if (m_PDBCompound.empty()) {
1108  m_PDBCompound = *cp_itr;
1109 
1110  // take first, then skip remainder
1111  needed_desc_choices &= ~fPdb;
1112  }
1113  }
1114  break;
1115  }
1116 
1117  default:
1118  _TROUBLE;
1119  }
1120  }
1121 
1122  if (keywords != NULL) {
1123  FOR_EACH_STRING_IN_LIST (kw_itr, *keywords) {
1124  const string& clause = *kw_itr;
1125  list<string> kywds;
1126  NStr::Split( clause, ";", kywds, NStr::fSplit_Tokenize );
1127  FOR_EACH_STRING_IN_LIST ( k_itr, kywds ) {
1128  const string& str = *k_itr;
1129  if (NStr::EqualNocase (str, "UNORDERED")) {
1130  m_Unordered = true;
1131  }
1132  if ((! m_HTGTech) && (! m_ThirdParty)) continue;
1133  if (NStr::EqualNocase (str, "HTGS_DRAFT")) {
1134  m_HTGSDraft = true;
1135  } else if (NStr::EqualNocase (str, "HTGS_CANCELLED")) {
1136  m_HTGSCancelled = true;
1137  } else if (NStr::EqualNocase (str, "HTGS_POOLED_MULTICLONE")) {
1138  m_HTGSPooled = true;
1139  } else if (NStr::EqualNocase (str, "TPA:experimental")) {
1140  m_TPAExp = true;
1141  } else if (NStr::EqualNocase (str, "TPA:inferential")) {
1142  m_TPAInf = true;
1143  } else if (NStr::EqualNocase (str, "TPA:reassembly")) {
1144  m_TPAReasm = true;
1145  } else if (NStr::EqualNocase (str, "TPA:assembly")) {
1146  m_TPAReasm = true;
1147  }
1148  }
1149  }
1150  }
1151 
1152  if (m_IsMap) {
1153  if (bsh.IsSetInst_Ext() && bsh.GetInst_Ext().IsMap()) {
1154  const CMap_ext& mp = bsh.GetInst_Ext().GetMap();
1155  if (mp.IsSet()) {
1156  const CMap_ext::Tdata& ft = mp.Get();
1157  ITERATE (CMap_ext::Tdata, itr, ft) {
1158  const CSeq_feat& feat = **itr;
1159  const CSeqFeatData& data = feat.GetData();
1160  if (! data.IsRsite()) continue;
1161  const CRsite_ref& rsite = data.GetRsite();
1162  if (rsite.IsStr()) {
1163  m_rEnzyme = rsite.GetStr();
1164  }
1165  }
1166  }
1167  }
1168  }
1169 
1170  if (m_IsPDB) {
1171  if (m_Comment.empty()) {
1173  } else if (m_IsNA) {
1174  if ( m_Length < 25 ) {
1176  } else if (NStr::Find(m_Comment, "COMPLETE GENOME") != NPOS ||
1177  NStr::Find(m_Comment, "CHROMOSOME XII") != NPOS) {
1179  } else if (NStr::Find(m_Comment, "Dna (5'") != NPOS ||
1180  NStr::Find(m_Comment, "SEQRES") != NPOS) {
1182  }
1183  } else {
1184  if (NStr::Find(m_Comment, "hypothetical protein") != NPOS ||
1185  NStr::Find(m_Comment, "uncharacterized protein") != NPOS ||
1186  NStr::Find(m_Comment, "putative uncharacterized protein") != NPOS ||
1187  NStr::Find(m_Comment, "putative protein") != NPOS ||
1188  NStr::Find(m_Comment, "SEQRES") != NPOS) {
1190  }
1191  }
1192  }
1193 }
1194 
1196  const CBioseq_Handle& bsh
1197 )
1198 
1199 {
1201  if (! bsx) {
1202  return;
1203  }
1204 
1205  m_Source = bsx->GetBioSource();
1206  m_Taxname = bsx->GetTaxname();
1207 
1208  m_Genome = bsx->GetGenome();
1209  m_IsPlasmid = bsx->IsPlasmid();
1210  m_IsChromosome = bsx->IsChromosome();
1211 
1212  m_Chromosome = bsx->GetChromosome();
1214  m_Clone = bsx->GetClone();
1215  m_has_clone = bsx->HasClone();
1216  m_Map = bsx->GetMap();
1217  m_Plasmid = bsx->GetPlasmid();
1218  m_Segment = bsx->GetSegment();
1219 
1220  m_Genus = bsx->GetGenus();
1221  m_Species = bsx->GetSpecies();
1222  m_Multispecies = bsx->IsMultispecies();
1223 
1224  m_Strain = bsx->GetStrain();
1225  m_Substrain = bsx->GetSubstrain();
1227  m_Cultivar = bsx->GetCultivar();
1229  m_Isolate = bsx->GetIsolate();
1230  m_Breed = bsx->GetBreed();
1231 
1232  m_Organelle = bsx->GetOrganelle();
1233 
1234  if (m_has_clone) return;
1235 
1236  try {
1238  while (feat_it) {
1239  const CSeq_feat& feat = feat_it->GetOriginalFeature();
1240  if (! feat.IsSetData ()) continue;
1241  const CSeqFeatData& sfdata = feat.GetData ();
1242  const CBioSource& source = sfdata.GetBiosrc();
1243 
1244  // process SubSource
1246  const CSubSource& sbs = **sbs_itr;
1247  if (! sbs.IsSetName()) continue;
1249  case NCBI_SUBSOURCE(clone):
1250  m_has_clone = true;
1251  return;
1252  default:
1253  break;
1254  }
1255  }
1256  ++feat_it;
1257  }
1258  } catch ( const exception& ) {
1259  // ERR_POST(Error << "Unable to iterate source features while constructing default definition line");
1260  }
1261 }
1262 
1263 // set instance variables from BioSource
1265  const CBioseq_Handle& bsh
1266 )
1267 
1268 {
1269  if (m_Source.NotEmpty()) {
1270  // get organism name
1271  if (m_Source->IsSetTaxname()) {
1273  }
1274  if (m_Source->IsSetGenome()) {
1276  m_IsPlasmid = (m_Genome == NCBI_GENOME(plasmid));
1277  m_IsChromosome = (m_Genome == NCBI_GENOME(chromosome));
1278  }
1279 
1280  // process SubSource
1282  const CSubSource& sbs = **sbs_itr;
1283  if (! sbs.IsSetName()) continue;
1284  const string& str = sbs.GetName();
1286  case NCBI_SUBSOURCE(chromosome):
1287  m_Chromosome = str;
1288  break;
1289  case NCBI_SUBSOURCE(clone):
1290  m_Clone = str;
1291  m_has_clone = true;
1292  break;
1293  case NCBI_SUBSOURCE(map):
1294  m_Map = str;
1295  break;
1296  case NCBI_SUBSOURCE(plasmid_name):
1297  m_Plasmid = str;
1298  break;
1299  case NCBI_SUBSOURCE(segment):
1300  m_Segment = str;
1301  break;
1302  case NCBI_SUBSOURCE(linkage_group):
1303  m_LinkageGroup = str;
1304  break;
1305  default:
1306  break;
1307  }
1308  }
1309 
1310  if (m_Source->IsSetOrgname()) {
1311  const COrgName& onp = m_Source->GetOrgname();
1312  if (onp.IsSetName()) {
1313  const COrgName::TName& nam = onp.GetName();
1314  if (nam.IsBinomial()) {
1315  const CBinomialOrgName& bon = nam.GetBinomial();
1316  if (bon.IsSetGenus()) {
1317  m_Genus = bon.GetGenus();
1318  }
1319  if (bon.IsSetSpecies()) {
1320  m_Species = bon.GetSpecies();
1321  }
1322  } else if (nam.IsPartial()) {
1323  const CPartialOrgName& pon = nam.GetPartial();
1324  if (pon.IsSet()) {
1325  const CPartialOrgName::Tdata& tx = pon.Get();
1326  ITERATE (CPartialOrgName::Tdata, itr, tx) {
1327  const CTaxElement& te = **itr;
1328  if (te.IsSetFixed_level()) {
1329  int fl = te.GetFixed_level();
1330  if (fl > 0) {
1331  m_Multispecies = true;
1332  } else if (te.IsSetLevel()) {
1333  const string& lvl = te.GetLevel();
1334  if (! NStr::EqualNocase (lvl, "species")) {
1335  m_Multispecies = true;
1336  }
1337  }
1338  }
1339  }
1340  }
1341  }
1342  }
1343  }
1344 
1345  // process OrgMod
1347  const COrgMod& omd = **omd_itr;
1348  if (! omd.IsSetSubname()) continue;
1349  const string& str = omd.GetSubname();
1350  SWITCH_ON_ORGMOD_CHOICE (omd) {
1351  case NCBI_ORGMOD(strain):
1352  if (m_Strain.empty()) {
1353  m_Strain = str;
1354  }
1355  break;
1356  case NCBI_ORGMOD(substrain):
1357  if (m_Substrain.empty()) {
1358  m_Substrain = str;
1359  }
1360  break;
1361  case NCBI_ORGMOD(cultivar):
1362  if (m_Cultivar.empty()) {
1363  m_Cultivar = str;
1364  }
1365  break;
1366  case NCBI_ORGMOD(specimen_voucher):
1367  if (m_SpecimenVoucher.empty()) {
1369  }
1370  break;
1371  case NCBI_ORGMOD(isolate):
1372  if (m_Isolate.empty()) {
1373  m_Isolate = str;
1374  }
1375  break;
1376  case NCBI_ORGMOD(breed):
1377  if (m_Breed.empty()) {
1378  m_Breed = str;
1379  }
1380  break;
1381  case NCBI_ORGMOD(metagenome_source):
1382  if (m_MetaGenomeSource.empty()) {
1384  }
1385  break;
1386  default:
1387  break;
1388  }
1389  }
1390  }
1391 /*
1392  bool virus_or_phage = false;
1393  bool has_plasmid = false;
1394 
1395  if (NStr::FindNoCase(m_Taxname, "virus") != NPOS ||
1396  NStr::FindNoCase(m_Taxname, "phage") != NPOS) {
1397  virus_or_phage = true;
1398  }
1399 
1400  if (! m_Plasmid.empty()) {
1401  has_plasmid = true;
1402  }
1403 */
1404 
1406 
1407  if (m_has_clone) return;
1408 
1409  try {
1411  while (feat_it) {
1412  const CSeq_feat& feat = feat_it->GetOriginalFeature();
1413  if (! feat.IsSetData ()) continue;
1414  const CSeqFeatData& sfdata = feat.GetData ();
1415  const CBioSource& source = sfdata.GetBiosrc();
1416 
1417  // process SubSource
1419  const CSubSource& sbs = **sbs_itr;
1420  if (! sbs.IsSetName()) continue;
1422  case NCBI_SUBSOURCE(clone):
1423  m_has_clone = true;
1424  return;
1425  default:
1426  break;
1427  }
1428  }
1429  ++feat_it;
1430  }
1431  } catch ( const exception& ) {
1432  // ERR_POST(Error << "Unable to iterate source features while constructing default definition line");
1433  }
1434 }
1435 
1436 // generate title from BioSource fields
1438  vector<CTempString>& desc,
1439  string& buf
1440 )
1441 
1442 {
1444  desc.push_back(", pooled multiple clones");
1445  return;
1446  }
1447 
1448  if( m_Clone.empty() ) {
1449  return;
1450  }
1451 
1452  SIZE_TYPE count = 1;
1453  for (SIZE_TYPE pos = m_Clone.find(';'); pos != NPOS;
1454  pos = m_Clone.find(';', pos + 1)) {
1455  ++count;
1456  }
1457  if (count > 3) {
1458  buf = NStr::NumericToString(count);
1459  desc.reserve(3);
1460  desc.push_back(", ");
1461  desc.push_back(buf);
1462  desc.push_back(" clones");
1463  } else {
1464  desc.reserve(2);
1465  desc.push_back(" clone ");
1466  desc.push_back(m_Clone);
1467  }
1468 }
1469 
1470 static bool s_EndsWithStrain (
1471  const CTempString& taxname,
1472  const CTempString& strain
1473 )
1474 
1475 {
1476  // return NStr::EndsWith(taxname, strain, NStr::eNocase);
1477  if (strain.size() >= taxname.size()) {
1478  return false;
1479  }
1480  SIZE_TYPE pos = taxname.find(' ');
1481  if (pos == NPOS) {
1482  return false;
1483  }
1484  pos = taxname.find(' ', pos + 1);
1485  if (pos == NPOS) {
1486  return false;
1487  }
1488 
1489  pos = NStr::Find (taxname, strain, NStr::eNocase, NStr::eReverseSearch);
1490  if (pos == taxname.size() - strain.size()) {
1491  // check for space to avoid fortuitous match to end of taxname
1492  char ch = taxname[pos - 1];
1493  if (ispunct (ch) || isspace (ch)) {
1494  return true;
1495  }
1496  } else if (pos == taxname.size() - strain.size() - 1
1497  && taxname[pos - 1] == '\''
1498  && taxname[taxname.size() - 1] == '\'') {
1499  return true;
1500  }
1501  return false;
1502 }
1503 
1504 
1505 
// Return a copy of str with every ':' , space and tab character removed.
static string s_RemoveColonsAndWhiteSpace(string str)
{
    string kept;
    kept.reserve(str.size());
    for (const char ch : str) {
        const bool drop = (ch == ':' || ch == ' ' || ch == '\t');
        if (!drop) {
            kept.push_back(ch);
        }
    }
    return kept;
}
1513 
// Return a copy of str with every space and tab character removed.
// Other characters (including newlines and colons) are kept.
static string s_RemoveWhiteSpace(string str)
{
    string::size_type out = 0;
    for (string::size_type in = 0; in < str.size(); ++in) {
        const char ch = str[in];
        if (ch == ' ' || ch == '\t') {
            continue;
        }
        // compact surviving characters toward the front of the buffer
        str[out++] = ch;
    }
    str.resize(out);
    return str;
}
1521 
1522 
1523 static void s_AddVoucherAndIsolate(const CTempString& taxname,
1524  const CTempString& strain,
1525  const CTempString& specimen_voucher,
1526  const CTempString& isolate,
1527  CDefLineJoiner& joiner)
1528 {
1529  if (!specimen_voucher.empty()) {
1530  if (strain.empty() || (s_RemoveColonsAndWhiteSpace(strain) != s_RemoveColonsAndWhiteSpace(specimen_voucher))) {
1531  joiner.Add("voucher", specimen_voucher);
1532  }
1533  }
1534 
1535  if (!isolate.empty() && (isolate != specimen_voucher)) {
1536  // s_EndsWithStrain just checks for supplied pattern, using here for isolate
1537  if ((!s_EndsWithStrain(taxname, isolate)) &&
1538  (s_RemoveColonsAndWhiteSpace(specimen_voucher) != s_RemoveWhiteSpace(isolate))) {
1539  joiner.Add("isolate", isolate);
1540  }
1541  }
1542 }
1543 
1544 
1546 
1547 {
1548  CDefLineJoiner joiner;
1549 
1550  joiner.Add("organism", m_Taxname, eHideType);
1551 
1552  if (! m_Strain.empty()) {
1553  CTempString add(m_Strain, 0, m_Strain.find(';'));
1554  if (! s_EndsWithStrain (m_Taxname, add)) {
1555  joiner.Add("strain", add);
1556  }
1557  }
1558  if (! m_Substrain.empty()) {
1559  CTempString add(m_Substrain, 0, m_Substrain.find(';'));
1560  if (! s_EndsWithStrain (m_Taxname, add)) {
1561  joiner.Add("substr.", add);
1562  }
1563  }
1564  if (! m_Breed.empty()) {
1565  joiner.Add("breed", m_Breed.substr (0, m_Breed.find(';')));
1566  }
1567  if (! m_Cultivar.empty()) {
1568  joiner.Add("cultivar", m_Cultivar.substr (0, m_Cultivar.find(';')));
1569  }
1570 
1572 
1573  if (! m_Chromosome.empty()) {
1574  joiner.Add("location", "chromosome", eHideType);
1575  joiner.Add("chromosome", m_Chromosome, eHideType);
1576  } else if ( !m_LinkageGroup.empty()) {
1577  joiner.Add("location", "linkage group", eHideType);
1578  joiner.Add("linkage group", m_LinkageGroup, eHideType);
1579  } else if ( !m_Plasmid.empty()) {
1580  joiner.Add("location", m_Organelle, eHideType); //"plasmid"
1581  joiner.Add("plasmid name", m_Plasmid, eHideType);
1582  } else if (! m_Organelle.empty()) {
1583  joiner.Add("location", m_Organelle, eHideType);
1584  }
1585 
1586  string clnbuf;
1587  vector<CTempString> clnvec;
1588  if (m_has_clone) {
1589  x_DescribeClones (clnvec, clnbuf);
1590  ITERATE (vector<CTempString>, it, clnvec) {
1591  joiner.Add("clone", *it, eHideType);
1592  }
1593  }
1594  if (! m_Map.empty()) {
1595  joiner.Add("map", m_Map);
1596  }
1597 
1598  joiner.Join(&m_MainTitle);
1600 }
1601 
1602 // generate title for NC
1604 
1605 {
1606  if (m_MIBiomol != NCBI_BIOMOL(genomic) &&
1607  m_MIBiomol != NCBI_BIOMOL(other_genetic)) return;
1608 
1609  // require taxname to be set
1610  if (m_Taxname.empty()) return;
1611 
1612  CDefLineJoiner joiner;
1613 
1614  joiner.Add("organism", m_Taxname, eHideType);
1615 
1616  bool add_gen_tag = false;
1617  if (NStr::FindNoCase (m_Taxname, "plasmid") != NPOS) {
1618  //
1619  } else if (m_IsPlasmid || ! m_Plasmid.empty()) {
1620  if (m_Plasmid.empty()) {
1621  joiner.Add("", "unnamed plasmid", eHideType);
1622  } else {
1623  if ( !m_IsPlasmid) { // do we need this?
1624  joiner.Add("location", m_Organelle, eHideType);
1625  }
1626  if (NStr::FindNoCase(m_Plasmid, "plasmid") == NPOS &&
1627  NStr::FindNoCase(m_Plasmid, "element") == NPOS) {
1628  joiner.Add("plasmid", m_Plasmid);
1629  } else {
1630  joiner.Add("", m_Plasmid, eHideType);
1631  }
1632  }
1633  } else if ( ! m_Organelle.empty() ) {
1634  if ( m_Chromosome.empty() ) {
1635  switch (m_Genome) {
1636  case NCBI_GENOME(mitochondrion):
1637  case NCBI_GENOME(chloroplast):
1638  case NCBI_GENOME(kinetoplast):
1639  case NCBI_GENOME(plastid):
1640  case NCBI_GENOME(apicoplast):
1641  joiner.Add("location", m_Organelle, eHideType);
1642  break;
1643  }
1644  /*
1645  if ( m_LinkageGroup.empty() ) {
1646  add_gen_tag = true;
1647  }
1648  */
1649  } else {
1650  if (! m_IsChromosome) {
1651  joiner.Add("location", m_Organelle, eHideType);
1652  }
1653  joiner.Add("chromosome", m_Chromosome);
1654  }
1655  } else if (! m_Segment.empty()) {
1656  if (m_Segment.find ("DNA") == NPOS &&
1657  m_Segment.find ("RNA") == NPOS &&
1658  m_Segment.find ("segment") == NPOS &&
1659  m_Segment.find ("Segment") == NPOS) {
1660  joiner.Add("segment", m_Segment);
1661  } else {
1662  joiner.Add("", m_Segment, eHideType);
1663  }
1664  } else if (! m_Chromosome.empty()) {
1665  joiner.Add("chromosome", m_Chromosome);
1666  } else /* if ( m_LinkageGroup.empty() ) */ {
1667  add_gen_tag = true;
1668  }
1669 
1670  if (add_gen_tag) {
1671  joiner.Add("completeness", (x_IsComplete() ? ", complete genome" : ", partial genome"), eHideType);
1672  } else {
1673  joiner.Add("completeness", (x_IsComplete() ? ", complete sequence" : ", partial sequence"), eHideType);
1674  }
1675  joiner.Join(&m_MainTitle);
1676 
1677  NStr::ReplaceInPlace (m_MainTitle, "Plasmid", "plasmid");
1678  NStr::ReplaceInPlace (m_MainTitle, "Element", "element");
1679 }
1680 
1681 // generate title for NM
1682 static void x_FlyCG_PtoR (
1683  string& s
1684 )
1685 
1686 {
1687  // s =~ s/\b(CG\d*-)P([[:alpha:]])\b/$1R$2/g, more or less.
1688  SIZE_TYPE pos = 0, len = s.size();
1689  while (pos + 3 < len && (pos = NStr::FindCase (s, "CG", pos)) != NPOS) {
1690  if (pos > 0 && !isspace((unsigned char)s[pos - 1]) ) {
1691  pos += 2;
1692  continue;
1693  }
1694  pos += 2;
1695  while (pos + 3 < len && isdigit((unsigned char)s[pos])) {
1696  ++pos;
1697  }
1698  if (s[pos] == '-' && s[pos + 1] == 'P' &&
1699  isalpha((unsigned char)s[pos + 2]) &&
1700  (pos + 3 == len || strchr(" ,;", s[pos + 3])) ) {
1701  s[pos + 1] = 'R';
1702  }
1703  }
1704 }
1705 
1707  const CBioseq_Handle& bsh
1708 )
1709 
1710 {
1711  unsigned int genes = 0, cdregions = 0, prots = 0;
1712  CConstRef<CSeq_feat> gene(0);
1713  CConstRef<CSeq_feat> cdregion(0);
1714 
1715  // require taxname to be set
1716  if (m_Taxname.empty()) return;
1717 
1718  CScope& scope = bsh.GetScope();
1719 
1720  SAnnotSelector sel;
1724  sel.SetResolveTSE();
1725 
1726  FOR_SELECTED_SEQFEAT_ON_BIOSEQ_HANDLE (feat_it, bsh, sel) {
1727  const CSeq_feat& sft = feat_it->GetOriginalFeature();
1728  SWITCH_ON_FEATURE_CHOICE (sft) {
1729  case CSeqFeatData::e_Gene:
1730  ++genes;
1731  gene.Reset(&sft);
1732  break;
1734  ++cdregions;
1735  cdregion.Reset(&sft);
1736  break;
1737  case CSeqFeatData::e_Prot:
1738  ++prots;
1739  break;
1740  default:
1741  break;
1742  }
1743  }
1744 
1745  if (genes == 1 && cdregions == 1 && (! m_Taxname.empty())) {
1746  string cds_label, gene_label;
1748 
1749  feature::GetLabel(*cdregion, &cds_label, feature::fFGL_Content, &scope);
1750  if (NStr::EqualNocase (m_Taxname, "Drosophila melanogaster")) {
1751  x_FlyCG_PtoR (cds_label);
1752  }
1753  NStr::ReplaceInPlace (cds_label, "isoform ", "transcript variant ");
1754  feature::GetLabel(*gene, &gene_label, feature::fFGL_Content, &scope);
1755  joiner.Add(m_Taxname).Add(" ").Add(cds_label).Add(" (")
1756  .Add(gene_label).Add("), mRNA");
1757  joiner.Join(&m_MainTitle);
1758  }
1759 }
1760 
1761 // generate title for NR
1763  const CBioseq_Handle& bsh
1764 )
1765 
1766 {
1767  // require taxname to be set
1768  if (m_Taxname.empty()) return;
1769 
1770  FOR_EACH_SEQFEAT_ON_BIOSEQ_HANDLE (feat_it, bsh, Gene) {
1771  const CSeq_feat& sft = feat_it->GetOriginalFeature();
1772  m_MainTitle = string(m_Taxname) + " ";
1774  m_MainTitle += ", ";
1775  switch (m_MIBiomol) {
1776  case NCBI_BIOMOL(pre_RNA):
1777  m_MainTitle += "precursorRNA";
1778  break;
1779  case NCBI_BIOMOL(mRNA):
1780  m_MainTitle += "mRNA";
1781  break;
1782  case NCBI_BIOMOL(rRNA):
1783  m_MainTitle += "rRNA";
1784  break;
1785  case NCBI_BIOMOL(tRNA):
1786  m_MainTitle += "tRNA";
1787  break;
1788  case NCBI_BIOMOL(snRNA):
1789  m_MainTitle += "snRNA";
1790  break;
1791  case NCBI_BIOMOL(scRNA):
1792  m_MainTitle += "scRNA";
1793  break;
1794  case NCBI_BIOMOL(cRNA):
1795  m_MainTitle += "cRNA";
1796  break;
1797  case NCBI_BIOMOL(snoRNA):
1798  m_MainTitle += "snoRNA";
1799  break;
1800  case NCBI_BIOMOL(transcribed_RNA):
1801  m_MainTitle += "miscRNA";
1802  break;
1803  case NCBI_BIOMOL(ncRNA):
1804  m_MainTitle += "ncRNA";
1805  break;
1806  case NCBI_BIOMOL(tmRNA):
1807  m_MainTitle += "tmRNA";
1808  break;
1809  default:
1810  break;
1811  }
1812 
1813  // take first, then break to skip remainder
1814  break;
1815  }
1816 }
1817 
1818 // generate title for Patent
1820 
1821 {
1822  string seqno = NStr::IntToString(m_PatentSequence);
1824  joiner.Add("Sequence ").Add(seqno).Add(" from Patent ")
1826  joiner.Join(&m_MainTitle);
1827 }
1828 
1830 
1831 {
1832  if (! m_PDBChainID.empty()) {
1833  string chain(m_PDBChainID);
1836  joiner.Add("Chain ").Add(chain).Add(", ").Add(m_PDBCompound);
1837  } else {
1838  std::size_t found = m_Comment.find_first_not_of("0123456789");
1839  if (found != std::string::npos && found < m_Comment.length() && m_Comment[found] == ' ') {
1840  joiner.Add("Chain ").Add(chain).Add(", ").Add(m_Comment.substr (found));
1841  } else {
1842  joiner.Add("Chain ").Add(chain).Add(", ").Add(m_Comment);
1843  }
1844  }
1845  joiner.Join(&m_MainTitle);
1846  } else if (isprint ((unsigned char) m_PDBChain)) {
1847  string chain(1, (char) m_PDBChain);
1850  joiner.Add("Chain ").Add(chain).Add(", ").Add(m_PDBCompound);
1851  } else {
1852  std::size_t found = m_Comment.find_first_not_of("0123456789");
1853  if (found != std::string::npos && found < m_Comment.length() && m_Comment[found] == ' ') {
1854  joiner.Add("Chain ").Add(chain).Add(", ").Add(m_Comment.substr (found));
1855  } else {
1856  joiner.Add("Chain ").Add(chain).Add(", ").Add(m_Comment);
1857  }
1858  }
1859  joiner.Join(&m_MainTitle);
1860  } else {
1862  }
1863 }
1864 
1866 
1867 {
1868  CDefLineJoiner joiner;
1869 
1870  joiner.Add("organism", m_Taxname, eHideType);
1871 
1872  if ( ! m_Organelle.empty() && NStr::FindNoCase (m_Organelle, "plasmid") != NPOS) {
1873  joiner.Add("location", m_Organelle, eHideType);
1874  }
1875 
1876  if (! m_Strain.empty()) {
1877  CTempString add(m_Strain, 0, m_Strain.find(';'));
1878  if (! s_EndsWithStrain (m_Taxname, add)) {
1879  joiner.Add("strain", add);
1880  }
1881  }
1882  if (! m_Strain.empty()) {
1883  CTempString add(m_Substrain, 0, m_Substrain.find(';'));
1884  if (! s_EndsWithStrain (m_Taxname, add)) {
1885  joiner.Add("substr.", add);
1886  }
1887  }
1888  if (! m_Chromosome.empty()) {
1889  joiner.Add("chromosome", m_Chromosome);
1890  }
1891  if (m_has_clone) {
1892  string clnbuf;
1893  vector<CTempString> clnvec;
1894  x_DescribeClones (clnvec, clnbuf);
1895  ITERATE (vector<CTempString>, it, clnvec) {
1896  joiner.Add("clone", *it, eHideType);
1897  }
1898  }
1899  if (! m_Map.empty()) {
1900  joiner.Add("map", m_Map);
1901  }
1902  if (! m_Plasmid.empty()) {
1903  if (NStr::FindNoCase(m_Plasmid, "plasmid") == NPOS &&
1904  NStr::FindNoCase(m_Plasmid, "element") == NPOS) {
1905  joiner.Add("plasmid", m_Plasmid);
1906  } else {
1907  joiner.Add("", m_Plasmid);
1908  }
1909  }
1910 
1911  if (x_IsComplete()) {
1912  joiner.Add("completeness", ", complete sequence", eHideType);
1913  }
1914 
1915  joiner.Join(&m_MainTitle);
1917 }
1918 
1919 // generate title for protein
1921  const CBioseq_Handle& bsh
1922 )
1923 
1924 {
1925  TSeqPos longest = 0;
1927  CProt_ref::EProcessed processed;
1929  CConstRef<CSeq_feat> prot_feat;
1930  TSeqPos seq_len = UINT_MAX;
1931 
1932  CScope& scope = bsh.GetScope();
1933 
1934  if (bsh.IsSetInst ()) {
1935  if (bsh.IsSetInst_Length ()) {
1936  seq_len = bsh.GetInst_Length ();
1937  }
1938  }
1939 
1940  FOR_EACH_SEQFEAT_ON_BIOSEQ_HANDLE (feat_it, bsh, Prot) {
1941  const CSeq_feat& feat = feat_it->GetOriginalFeature();
1942  if (! feat.IsSetData ()) continue;
1943  const CSeqFeatData& sfdata = feat.GetData ();
1944  const CProt_ref& prp = sfdata.GetProt();
1945  processed = CProt_ref::eProcessed_not_set;
1946  if (prp.IsSetProcessed()) {
1947  processed = prp.GetProcessed();
1948  }
1949  if (! feat.IsSetLocation ()) continue;
1950  const CSeq_loc& loc = feat.GetLocation ();
1951  TSeqPos prot_length = GetLength (loc, &scope);
1952  if (prot_length > longest) {
1953  prot_feat = &feat;
1954  longest = prot_length;
1955  bestprocessed = processed;
1956  } else if (prot_length == longest) {
1957  // unprocessed 0 preferred over preprotein 1 preferred
1958  // over mat peptide 2
1959  if (processed < bestprocessed) {
1960  prot_feat = &feat;
1961  longest = prot_length;
1962  bestprocessed = processed;
1963  }
1964  }
1965  }
1966 
1967  if (longest == seq_len && prot_feat) {
1968  return prot_feat;
1969  }
1970 
1971  // confirm that this will automatically check features on
1972  // parts and segset in pathological segmented protein ???
1973 
1974  if (prot_feat) {
1975  return prot_feat;
1976  }
1977 
1978  CSeq_loc everywhere;
1979  everywhere.SetWhole().Assign(*bsh.GetSeqId());
1980 
1981  prot_feat = GetBestOverlappingFeat (everywhere, CSeqFeatData::e_Prot,
1982  eOverlap_Contained, scope);
1983 
1984  if (prot_feat) {
1985  return prot_feat;
1986  }
1987 
1988  return CConstRef<CSeq_feat> ();
1989 }
1990 
1991 // m_LocalAnnotsOnly test is unnecessary because feature iterator is built on local features only
1992 // sqd-4081: it appears that test still does matter. reinstated and even more rigorously applied.
1994  const CMappedFeat& mapped_cds)
1995 
1996 {
1997  CConstRef<CGene_ref> gene_ref;
1998 
1999  if (mapped_cds) {
2000  const CSeq_feat& cds_feat = mapped_cds.GetOriginalFeature();
2001  FOR_EACH_SEQFEATXREF_ON_FEATURE (xf_itr, cds_feat) {
2002  const CSeqFeatXref& sfx = **xf_itr;
2003  if (sfx.IsSetData()) {
2004  const CSeqFeatData& sfd = sfx.GetData();
2005  if (sfd.IsGene()) {
2006  gene_ref = &sfd.GetGene();
2007  }
2008  }
2009  }
2010 
2011  if (gene_ref) {
2012  return gene_ref;
2013  }
2014 
2015  if (m_ConstructedFeatTree) {
2016  if (! m_InitializedFeatTree) {
2017  CFeat_CI iter (m_TopSEH);
2018  m_Feat_Tree.Reset (new CFeatTree (iter));
2019  m_InitializedFeatTree = true;
2020  }
2021  }
2022  if (m_Feat_Tree.Empty ()) {
2023  m_Feat_Tree.Reset (new CFeatTree);
2024  }
2025  if (! m_ConstructedFeatTree) {
2026  m_Feat_Tree->AddGenesForCds (mapped_cds);
2027  }
2028 
2029  try {
2030  CMappedFeat mapped_gene = GetBestGeneForCds (mapped_cds, m_Feat_Tree);
2031  if (mapped_gene) {
2032  const CSeq_feat& gene_feat = mapped_gene.GetOriginalFeature();
2033  gene_ref = &gene_feat.GetData().GetGene();
2034  }
2035  } catch ( const exception& ) {
2036  // ERR_POST(Error << "x_GetGeneRefViaCDS GetBestGeneForCds failure");
2037  }
2038 
2039  // clearing m_InitializedFeatTree may remove artifact after first protein is indexed and second protein is requested
2040  if (m_ConstructedFeatTree) {
2041  m_InitializedFeatTree = false;
2042  }
2043  }
2044 
2045  return gene_ref;
2046 }
2047 
2049  const CBioseq_Handle& bsh
2050 )
2051 
2052 {
2053  CConstRef<CSeq_feat> cds_feat;
2054  CConstRef<CSeq_loc> cds_loc;
2055  CConstRef<CBioSource> src_ref;
2056 
2057  CScope& scope = bsh.GetScope();
2058 
2059  cds_feat = GetCDSForProduct (bsh);
2060 
2061  if (cds_feat) {
2062  /*
2063  const CSeq_feat& feat = *cds_feat;
2064  */
2065  cds_loc = &cds_feat->GetLocation();
2066  if (cds_loc) {
2067  CRef<CSeq_loc> cleaned_location( new CSeq_loc );
2068  cleaned_location->Assign( *cds_loc );
2070  if (src_feat) {
2071  const CSeq_feat& feat = *src_feat;
2072  if (feat.IsSetData()) {
2073  const CSeqFeatData& sfd = feat.GetData();
2074  if (sfd.IsBiosrc()) {
2075  src_ref = &sfd.GetBiosrc();
2076  }
2077  }
2078  } else {
2079  CRef<CSeq_loc> rev_loc(SeqLocRevCmpl(*cleaned_location, &scope));
2080  cleaned_location->Assign(*rev_loc);
2081  src_feat = GetBestOverlappingFeat (*cleaned_location, CSeqFeatData::eSubtype_biosrc, eOverlap_SubsetRev, scope);
2082  if (src_feat) {
2083  const CSeq_feat& feat = *src_feat;
2084  if (feat.IsSetData()) {
2085  const CSeqFeatData& sfd = feat.GetData();
2086  if (sfd.IsBiosrc()) {
2087  src_ref = &sfd.GetBiosrc();
2088  }
2089  }
2090  }
2091  }
2092  }
2093  }
2094 
2095  if (src_ref) {
2096  return src_ref;
2097  }
2098 
2099  return CConstRef<CBioSource> ();
2100 }
2101 
2103  const CSeq_feat& sft
2104 )
2105 
2106 {
2107  if (! FEATURE_CHOICE_IS (sft, NCBI_SEQFEAT(Cdregion))) return false;
2108  if (! sft.IsSetExcept()) return false;
2109  if (! sft.GetExcept()) return false;
2110  if (! sft.IsSetExcept_text()) return false;
2111 
2112  const string& str = sft.GetExcept_text();
2113  int current_state = 0;
2114  FOR_EACH_CHAR_IN_STRING (str_itr, str) {
2115  const char ch = *str_itr;
2116  int next_state = ms_p_Low_Quality_Fsa->GetNextState (current_state, ch);
2117  if (ms_p_Low_Quality_Fsa->IsMatchFound (next_state)) {
2118  return true;
2119  }
2120  current_state = next_state;
2121  }
2122 
2123 
2124  return false;
2125 }
2126 
// Organelle text appended in parentheses to protein titles.  The table is
// indexed by the numeric m_Genome value; an empty string means that no
// organelle text is shown for that value.  The trailing comment on each row
// gives the index and the label carried over from the original table.
static const char* s_proteinOrganellePrefix [] = {
    /*  0 */ "",                 // ""
    /*  1 */ "",                 // ""
    /*  2 */ "chloroplast",      // "chloroplast"
    /*  3 */ "chromoplast",      // "chromoplast"
    /*  4 */ "kinetoplast",      // "kinetoplast"
    /*  5 */ "mitochondrion",    // "mitochondrion"
    /*  6 */ "plastid",          // "plastid"
    /*  7 */ "macronuclear",     // "macronuclear"
    /*  8 */ "",                 // "extrachromosomal"
    /*  9 */ "plasmid",          // "plasmid"
    /* 10 */ "",                 // ""
    /* 11 */ "",                 // ""
    /* 12 */ "cyanelle",         // "cyanelle"
    /* 13 */ "",                 // "proviral"
    /* 14 */ "",                 // "virus"
    /* 15 */ "nucleomorph",      // "nucleomorph"
    /* 16 */ "apicoplast",       // "apicoplast"
    /* 17 */ "leucoplast",       // "leucoplast"
    /* 18 */ "protoplast",       // "protoplast"
    /* 19 */ "endogenous virus", // "endogenous virus"
    /* 20 */ "hydrogenosome",    // "hydrogenosome"
    /* 21 */ "",                 // "chromosome"
    /* 22 */ "chromatophore"     // "chromatophore"
};
2152 
2153 static string s_RemoveBracketedOrgFromEnd (string str, string taxname)
2154 
2155 {
2156  string final;
2157  if (str.empty()) return str;
2158  if (taxname.empty()) return str;
2159  SIZE_TYPE taxlen = taxname.length();
2160  int len = (int) str.length();
2161  if (len < 5) return str;
2162  if (str [len - 1] != ']') return str;
2164  if (cp == NPOS) return str;
2165  string suffix = str.substr(cp+1);
2166  if (NStr::StartsWith(suffix, "NAD")) return str;
2167  if (suffix.length() != taxlen + 1) return str;
2168  if (NStr::StartsWith(suffix, taxname)) {
2169  str.erase (cp);
2170  x_CleanAndCompress(final, str, true);
2171  return final;
2172 
2173  }
2174  return str;
2175 }
2176 
2178  const CBioseq_Handle& bsh
2179 )
2180 
2181 {
2182  CConstRef<CSeq_feat> cds_feat;
2184  CConstRef<CSeq_feat> prot_feat;
2185  CConstRef<CGene_ref> gene;
2187  CTempString locus_tag;
2188 
2190  if (! bsx) {
2191  return;
2192  }
2193 
2195 
2197 
2198  if (prtx) {
2199  const CMappedFeat mf = prtx->GetMappedFeat();
2200  const CProt_ref& prp = mf.GetData().GetProt();
2201 
2202  const char* prefix = "";
2203  FOR_EACH_NAME_ON_PROT (prp_itr, prp) {
2204  const string& str = *prp_itr;
2205  string trimmed = s_RemoveBracketedOrgFromEnd (str, m_Taxname);
2206  m_MainTitle += prefix;
2207  m_MainTitle += trimmed;
2208  if (! m_AllProtNames) {
2209  break;
2210  }
2211  prefix = "; ";
2212  }
2213 
2214  if (! m_MainTitle.empty()) {
2215  // strip trailing periods, commas, and spaces
2216  SIZE_TYPE pos = m_MainTitle.find_last_not_of (".,;~ ");
2217  if (pos != NPOS) {
2218  m_MainTitle.erase (pos + 1);
2219  }
2220 
2221  size_t offset = 0;
2222  size_t delta = 0;
2223  string comma;
2224  string isoform;
2225  if (NStr::StartsWith (m_MainTitle, "hypothetical protein")) {
2226  offset = 20;
2227  } else if (NStr::StartsWith (m_MainTitle, "uncharacterized protein")) {
2228  offset = 23;
2229  }
2230  if (offset > 0 && offset < m_MainTitle.length()) {
2231  if (m_MainTitle [offset] == ',' && m_MainTitle [offset + 1] == ' ') {
2232  comma = ", isoform ";
2233  delta = 2;
2234  }
2235  if (m_MainTitle [offset] == ' ') {
2236  comma = " isoform ";
2237  delta = 1;
2238  }
2239  if (NStr::StartsWith (m_MainTitle.substr (offset + delta), "isoform ")) {
2240  isoform = m_MainTitle.substr (offset + delta + 8);
2241  // !!! check for single alphanumeric string
2242  m_MainTitle.erase (offset);
2243  }
2244  }
2245  if ((NStr::EqualNocase (m_MainTitle, "hypothetical protein") ||
2246  NStr::EqualNocase (m_MainTitle, "uncharacterized protein"))
2247  /* && !m_LocalAnnotsOnly */ ) {
2248  if (sfxp) {
2249  CRef<CFeatureIndex> fsx = sfxp->GetBestGene();
2250  if (fsx) {
2251  const CGene_ref& grp = fsx->GetMappedFeat().GetData().GetGene();
2252  if (grp.IsSetLocus_tag()) {
2253  locus_tag = grp.GetLocus_tag();
2254  }
2255  }
2256  }
2257  if (! locus_tag.empty()) {
2258  m_MainTitle += " " + string(locus_tag) + string(comma) + string(isoform);
2259  }
2260  }
2261  }
2262  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2263  if (prp.IsSetDesc()) {
2264  m_MainTitle = prp.GetDesc();
2265  }
2266  }
2267  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2268  FOR_EACH_ACTIVITY_ON_PROT (act_itr, prp) {
2269  const string& str = *act_itr;
2270  m_MainTitle = str;
2271  break;
2272  }
2273  }
2274  }
2275 
2276  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2277  if (sfxp) {
2278  CRef<CFeatureIndex> fsx = sfxp->GetBestGene();
2279  if (fsx) {
2280  const CGene_ref& grp = fsx->GetMappedFeat().GetData().GetGene();
2281  if (grp.IsSetLocus()) {
2282  m_MainTitle = grp.GetLocus();
2283  }
2284  if (m_MainTitle.empty()) {
2285  FOR_EACH_SYNONYM_ON_GENE (syn_itr, grp) {
2286  const string& str = *syn_itr;
2287  m_MainTitle = str;
2288  break;
2289  }
2290  }
2291  if (m_MainTitle.empty()) {
2292  if (grp.IsSetDesc()) {
2293  m_MainTitle = grp.GetDesc();
2294  }
2295  }
2296  }
2297  }
2298  if (! m_MainTitle.empty()) {
2299  m_MainTitle += " gene product";
2300  }
2301  }
2302 
2303  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2304  m_MainTitle = "unnamed protein product";
2305  if (sfxp) {
2306  CRef<CFeatureIndex> fsx = sfxp->GetBestGene();
2307  if (fsx) {
2308  const CGene_ref& grp = fsx->GetMappedFeat().GetData().GetGene();
2309  if (grp.IsSetLocus_tag()) {
2310  locus_tag = grp.GetLocus_tag();
2311  }
2312  }
2313  }
2314  if (! locus_tag.empty()) {
2315  m_MainTitle += " " + string(locus_tag);
2316  }
2317  }
2318 
2319  if (sfxp) {
2320  const CMappedFeat mf = sfxp->GetMappedFeat();
2321  const CSeq_feat& cds = mf.GetOriginalFeature();
2322  if (x_CDShasLowQualityException (cds)) {
2323  const string& low_qual = "LOW QUALITY PROTEIN: ";
2324  if (NStr::FindNoCase (m_MainTitle, low_qual, 0) == NPOS) {
2325  string tmp = m_MainTitle;
2326  m_MainTitle = low_qual + tmp;
2327  }
2328  }
2329  }
2330 
2331  // strip trailing periods, commas, and spaces
2332  SIZE_TYPE pos = m_MainTitle.find_last_not_of (".,;~ ");
2333  if (pos != NPOS) {
2334  m_MainTitle.erase (pos + 1);
2335  }
2336 
2337  if (! x_IsComplete() /* && m_MainTitle.find(", partial") == NPOS */) {
2338  m_MainTitle += ", partial";
2339  }
2340 
2341  if (m_OmitTaxonomicName) return;
2342 
2343  CTempString taxname = m_Taxname;
2344 
2345  if (m_Genome >= NCBI_GENOME(chloroplast) && m_Genome <= NCBI_GENOME(chromatophore)) {
2346  const char * organelle = s_proteinOrganellePrefix [m_Genome];
2347  if ( organelle[0] != '\0' && ! taxname.empty()
2348  /* && NStr::Find (taxname, organelle) == NPOS */) {
2349  m_MainTitle += " (";
2350  m_MainTitle += organelle;
2351  m_MainTitle += ")";
2352  }
2353  }
2354 
2355  // check for special taxname, go to overlapping source feature
2356  if ((taxname.empty() ||
2357  (!NStr::EqualNocase (taxname, "synthetic construct") &&
2358  !NStr::EqualNocase (taxname, "artificial sequence") &&
2359  taxname.find ("vector") == NPOS &&
2360  taxname.find ("Vector") == NPOS)) &&
2361  !m_LocalAnnotsOnly) {
2362  /*
2363  CWeakRef<CBioseqIndex> bsxp = bsx->GetBioseqForProduct();
2364  auto nucx = bsxp.Lock();
2365  if (nucx) {
2366  if (nucx->HasSource()) {
2367  src = x_GetSourceFeatViaCDS (bsh);
2368  if (src.NotEmpty() && src->IsSetTaxname()) {
2369  taxname = src->GetTaxname();
2370  }
2371  }
2372  }
2373  */
2374  src = x_GetSourceFeatViaCDS (bsh);
2375  if (src.NotEmpty() && src->IsSetTaxname()) {
2376  taxname = src->GetTaxname();
2377  }
2378  }
2379 
2380  if (m_IsCrossKingdom && ! m_FirstSuperKingdom.empty() && ! m_SecondSuperKingdom.empty()) {
2382  } else if (! taxname.empty() /* && m_MainTitle.find(taxname) == NPOS */) {
2383  m_MainTitle += " [" + string(taxname) + "]";
2384  }
2385 }
2386 
// Builds m_MainTitle for the protein Bioseq `bsh`.
// NOTE(review): the line carrying this function's return type and name, and
// the declarations of `prot` and `src` used below, are not present in this
// listing.
//
// Title source, in order of preference (each later step runs only while
// m_MainTitle is still empty):
//   1. name(s) on the longest protein feature,
//   2. the protein description,           (skipped when m_LocalAnnotsOnly)
//   3. the first protein activity,        (skipped when m_LocalAnnotsOnly)
//   4. gene locus / first synonym / description + " gene product",
//   5. the literal "unnamed protein product" (+ locus_tag if available).
// Afterwards a "LOW QUALITY PROTEIN: " prefix, ", partial", "(organelle)" and
// "[taxname]" are added as applicable.
2388  const CBioseq_Handle& bsh
2389 )
2390 
2391 {
2392  CConstRef<CSeq_feat> cds_feat;
2394  CConstRef<CSeq_feat> prot_feat;
2395  CConstRef<CGene_ref> gene;
2397  CTempString locus_tag;
2398 
2399  // gets longest protein on Bioseq, parts set, or seg set, even if not
2400  // full-length
2401 
2402  prot_feat = x_GetLongestProtein (bsh);
2403 
2404  if (prot_feat) {
2405  prot = &prot_feat->GetData().GetProt();
2406  }
2407 
      // result is reused below for gene lookups and for the low-quality check
2408  const CMappedFeat& mapped_cds = GetMappedCDSForProduct (bsh);
2409 
2410  if (prot) {
2411  const CProt_ref& prp = *prot;
2412  const char* prefix = "";
      // only the first name is used unless m_AllProtNames is set, in which
      // case all names are joined with "; "
2413  FOR_EACH_NAME_ON_PROT (prp_itr, prp) {
2414  const string& str = *prp_itr;
2415  string trimmed = s_RemoveBracketedOrgFromEnd (str, m_Taxname);
2416  m_MainTitle += prefix;
2417  m_MainTitle += trimmed;
2418  if (! m_AllProtNames) {
2419  break;
2420  }
2421  prefix = "; ";
2422  }
2423 
2424  if (! m_MainTitle.empty()) {
2425  // strip trailing periods, commas, and spaces
2426  SIZE_TYPE pos = m_MainTitle.find_last_not_of (".,;~ ");
2427  if (pos != NPOS) {
2428  m_MainTitle.erase (pos + 1);
2429  }
2430 
      // If the title is "hypothetical protein" or "uncharacterized protein"
      // followed by "[,] isoform X", cut the isoform part off and remember it,
      // so that a locus_tag can be inserted in front of it below.
      // 20 and 23 are the lengths of those two phrases.
2431  int offset = 0;
2432  int delta = 0;
2433  string comma;
2434  string isoform;
2435  if (NStr::StartsWith (m_MainTitle, "hypothetical protein")) {
2436  offset = 20;
2437  } else if (NStr::StartsWith (m_MainTitle, "uncharacterized protein")) {
2438  offset = 23;
2439  }
2440  if (offset > 0 && offset < m_MainTitle.length()) {
2441  if (m_MainTitle [offset] == ',' && m_MainTitle [offset + 1] == ' ') {
2442  comma = ", isoform ";
2443  delta = 2;
2444  }
2445  if (m_MainTitle [offset] == ' ') {
2446  comma = " isoform ";
2447  delta = 1;
2448  }
2449  if (NStr::StartsWith (m_MainTitle.substr (offset + delta), "isoform ")) {
      // 8 == length of "isoform "
2450  isoform = m_MainTitle.substr (offset + delta + 8);
2451  // !!! check for single alphanumeric string
2452  m_MainTitle.erase (offset);
2453  }
2454  }
      // NOTE(review): when the isoform text was cut off above but no locus_tag
      // is found here, the isoform text is not appended back to the title.
2455  if ((NStr::EqualNocase (m_MainTitle, "hypothetical protein") ||
2456  NStr::EqualNocase (m_MainTitle, "uncharacterized protein"))
2457  /* && !m_LocalAnnotsOnly */ ) {
2458  gene = x_GetGeneRefViaCDS (mapped_cds);
2459  if (gene) {
2460  const CGene_ref& grp = *gene;
2461  if (grp.IsSetLocus_tag()) {
2462  locus_tag = grp.GetLocus_tag();
2463  }
2464  }
2465  if (! locus_tag.empty()) {
2466  m_MainTitle += " " + string(locus_tag) + string(comma) + string(isoform);
2467  }
2468  }
2469  }
      // fallback 2: protein description
2470  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2471  if (prp.IsSetDesc()) {
2472  m_MainTitle = prp.GetDesc();
2473  }
2474  }
      // fallback 3: first activity string on the protein
2475  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2476  FOR_EACH_ACTIVITY_ON_PROT (act_itr, prp) {
2477  const string& str = *act_itr;
2478  m_MainTitle = str;
2479  break;
2480  }
2481  }
2482  }
2483 
      // fallback 4: gene locus, else first gene synonym, else gene description,
      // followed by " gene product"
2484  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2485  gene = x_GetGeneRefViaCDS (mapped_cds);
2486  if (gene) {
2487  const CGene_ref& grp = *gene;
2488  if (grp.IsSetLocus()) {
2489  m_MainTitle = grp.GetLocus();
2490  }
2491  if (m_MainTitle.empty()) {
2492  FOR_EACH_SYNONYM_ON_GENE (syn_itr, grp) {
2493  const string& str = *syn_itr;
2494  m_MainTitle = str;
2495  break;
2496  }
2497  }
2498  if (m_MainTitle.empty()) {
2499  if (grp.IsSetDesc()) {
2500  m_MainTitle = grp.GetDesc();
2501  }
2502  }
2503  }
2504  if (! m_MainTitle.empty()) {
2505  m_MainTitle += " gene product";
2506  }
2507  }
2508 
      // fallback 5: fixed placeholder, optionally followed by the locus_tag
2509  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2510  m_MainTitle = "unnamed protein product";
2511  gene = x_GetGeneRefViaCDS (mapped_cds);
2512  if (gene) {
2513  const CGene_ref& grp = *gene;
2514  if (grp.IsSetLocus_tag()) {
2515  locus_tag = grp.GetLocus_tag();
2516  }
2517  }
2518  if (! locus_tag.empty()) {
2519  m_MainTitle += " " + string(locus_tag);
2520  }
2521  }
2522 
      // prepend "LOW QUALITY PROTEIN: " unless the title already contains it
      // (case-insensitive search)
2523  if (mapped_cds) {
2524  const CSeq_feat& cds = mapped_cds.GetOriginalFeature();
2525  if (x_CDShasLowQualityException (cds)) {
2526  const string& low_qual = "LOW QUALITY PROTEIN: ";
2527  if (NStr::FindNoCase (m_MainTitle, low_qual, 0) == NPOS) {
2528  string tmp = m_MainTitle;
2529  m_MainTitle = low_qual + tmp;
2530  }
2531  }
2532  }
2533 
2534  // strip trailing periods, commas, and spaces
2535  SIZE_TYPE pos = m_MainTitle.find_last_not_of (".,;~ ");
2536  if (pos != NPOS) {
2537  m_MainTitle.erase (pos + 1);
2538  }
2539 
2540  if (! x_IsComplete() /* && m_MainTitle.find(", partial") == NPOS */) {
2541  m_MainTitle += ", partial";
2542  }
2543 
      // nothing below is appended when the taxonomic name is to be omitted
2544  if (m_OmitTaxonomicName) return;
2545 
2546  CTempString taxname = m_Taxname;
2547 
      // append " (organelle)" when m_Genome is within the range covered by
      // s_proteinOrganellePrefix and that table entry is non-empty
2548  if (m_Genome >= NCBI_GENOME(chloroplast) && m_Genome <= NCBI_GENOME(chromatophore)) {
2549  const char * organelle = s_proteinOrganellePrefix [m_Genome];
2550  if ( organelle[0] != '\0' && ! taxname.empty()
2551  /* && NStr::Find (taxname, organelle) == NPOS */) {
2552  m_MainTitle += " (";
2553  m_MainTitle += organelle;
2554  m_MainTitle += ")";
2555  }
2556  }
2557 
2558  // check for special taxname, go to overlapping source feature
2559  if ((taxname.empty() ||
2560  (!NStr::EqualNocase (taxname, "synthetic construct") &&
2561  !NStr::EqualNocase (taxname, "artificial sequence") &&
2562  taxname.find ("vector") == NPOS &&
2563  taxname.find ("Vector") == NPOS)) &&
2564  !m_LocalAnnotsOnly) {
2565  src = x_GetSourceFeatViaCDS (bsh);
2566  if (src.NotEmpty() && src->IsSetTaxname()) {
2567  taxname = src->GetTaxname();
2568  }
2569  }
2570 
      // NOTE(review): the body of the cross-kingdom branch is missing from
      // this listing.
2571  if (m_IsCrossKingdom && ! m_FirstSuperKingdom.empty() && ! m_SecondSuperKingdom.empty()) {
2573  } else if (! taxname.empty() /* && m_MainTitle.find(taxname) == NPOS */) {
2574  m_MainTitle += " [" + string(taxname) + "]";
2575  }
2576 }
2577 
2578 // generate title for segmented sequence
// NOTE(review): the line carrying this function's return type and name is not
// present in this listing.
//
// Looks at Cdregion features over the segments of `bsh` and, for the first one
// that has a location, fills the output arguments:
//   locus        - from a gene xref on the CDS (Locus, else first synonym),
//                  else the label of the best overlapping feature
//   product      - label of the CDS (appended to by GetLabel)
//   completeness - set to "partial" if the CDS has its partial field set;
//                  otherwise left as passed in
// Returns true once such a CDS has been processed, false if none was found.
2580  string& locus,
2581  string& product,
2582  const char*& completeness,
2583  const CBioseq_Handle& bsh
2584 )
2585 
2586 {
2587  CScope& scope = bsh.GetScope();
2588 
2589  // check C toolkit code to understand what is happening here ???
2590 
      // a Seq-loc mix built from the segment list of the Bioseq
2591  CSeq_loc everywhere;
2592  everywhere.SetMix().Set() = bsh.GetInst_Ext().GetSeg();
2593 
2594  FOR_EACH_SEQFEAT_ON_SCOPE (it, scope, everywhere, Cdregion) {
2595  const CSeq_feat& cds = it->GetOriginalFeature();
2596  if (! cds.IsSetLocation ()) continue;
2597  const CSeq_loc& cds_loc = cds.GetLocation();
2598 
2599  GetLabel (cds, &product, feature::fFGL_Content, &scope);
2600 
2601  if (cds.IsSetPartial()) {
2602  completeness = "partial";
2603  }
2604 
      // every gene xref assigns `locus`, so the last one with a value wins
2605  FOR_EACH_SEQFEATXREF_ON_SEQFEAT (xr_itr, cds) {
2606  const CSeqFeatXref& sfx = **xr_itr;
2607  if (! FIELD_IS_SET (sfx, Data)) continue;
2608  const CSeqFeatData& sfd = GET_FIELD (sfx, Data);
2609  if (! FIELD_IS (sfd, Gene)) continue;
2610  const CGene_ref& gr = GET_FIELD (sfd, Gene);
2611  if (FIELD_IS_SET (gr, Locus)) {
2612  locus = GET_FIELD (gr, Locus);
2613  } else {
2614  FOR_EACH_SYNONYM_ON_GENEREF (syn_itr, gr) {
2615  locus = *syn_itr;
2616  // take first, then break to skip remainder
2617  break;
2618  }
2619  }
2620  }
2621 
      // no xref gave a locus: fall back to an overlapping feature
      // NOTE(review): two argument lines of this call are missing from the
      // listing.
2622  if (locus.empty()) {
2623  CConstRef<CSeq_feat> gene_feat
2624  = GetBestOverlappingFeat(cds_loc,
2627  scope);
2628  if (gene_feat.NotEmpty()) {
2629  const CSeq_feat& gene = *gene_feat;
2630  GetLabel (gene, &locus, feature::fFGL_Content, &scope);
2631  /*
2632  if (gene_feat->GetData().GetGene().IsSetLocus()) {
2633  locus = gene_feat->GetData().GetGene().GetLocus();
2634  } else if (gene_feat->GetData().GetGene().IsSetSyn()) {
2635  locus = *gene_feat->GetData().GetGene().GetSyn().begin();
2636  }
2637  */
2638  }
2639  }
2640 
      // only the first CDS that has a location is considered
2641  return true;
2642  }
2643 
2644  return false;
2645 }
2646 
// Builds m_MainTitle for a segmented sequence: organism, then either CDS-derived
// product/locus information or a strain / clone / isolate qualifier.
// NOTE(review): the line carrying this function's return type and name, and
// one statement just before the closing brace, are not present in this listing.
// Side effect: an empty m_Taxname is replaced with "Unknown".
2648  const CBioseq_Handle& bsh
2649 )
2650 
2651 {
2652  const char * completeness = "complete";
2653  bool cds_found = false;
2654  string locus, product;
2655  CDefLineJoiner joiner;
2656 
2657  if (m_Taxname.empty()) {
2658  m_Taxname = "Unknown";
2659  }
2660  joiner.Add("organism", m_Taxname, eHideType);
2661 
      // the CDS lookup is skipped when restricted to local annotations
2662  if ( !m_LocalAnnotsOnly ) {
2663  cds_found = x_GetSegSeqInfoViaCDS(locus, product, completeness, bsh);
2664  }
      // without a CDS, add at most one of strain / clone(s) / isolate
2665  if ( !cds_found) {
2666  if (! m_Strain.empty()
2667  && ! s_EndsWithStrain (m_Taxname, m_Strain) ) {
2668  joiner.Add("strain", m_Strain);
2669  } else if (! m_Clone.empty()
2670  /* && m_Clone.find(" clone ") != NPOS */) {
2671  string clnbuf;
2672  vector<CTempString> clnvec;
2673  x_DescribeClones (clnvec, clnbuf);
2674  ITERATE (vector<CTempString>, it, clnvec) {
2675  joiner.Add("clone", *it, eHideType);
2676  }
2677  } else if (! m_Isolate.empty() ) {
2678  joiner.Add("isolate", m_Isolate);
2679  }
2680  }
2681  if (! product.empty()) {
2682  joiner.Add("product", product, eHideType);
2683  }
2684  joiner.Join(&m_MainTitle);
      // result shape: "<joined parts> (<locus>) gene, <completeness> cds"
2685  if (! locus.empty()) {
2686  m_MainTitle += " (" + locus + ")";
2687  }
2688  if ((! product.empty()) || (! locus.empty())) {
2689  m_MainTitle += " gene, " + string(completeness) + " cds";
2690  }
2692 }
2693 
2694 // generate title for TSA or non-master WGS
// NOTE(review): the line carrying this function's return type and name is not
// present in this listing, and several interior lines are missing as well
// (they are pointed out below).
//
// Collects organism and source qualifiers in a CDefLineJoiner and joins them
// into m_MainTitle. For strain / substrain / breed / cultivar only the text
// before the first ';' is used.
2696 
2697 {
2698  CDefLineJoiner joiner;
2699 
2700  joiner.Add("organism", m_Taxname, eHideType);
2701 
2702  if (! m_Strain.empty()) {
2703  if (! s_EndsWithStrain (m_Taxname, m_Strain)) {
2704  joiner.Add("strain", m_Strain.substr (0, m_Strain.find(';')));
2705  }
      // NOTE(review): the condition opening the substrain block is missing
      // from this listing.
2707  joiner.Add("substr.", m_Substrain.substr (0, m_Substrain.find(';')));
2708  }
2709  } else if (! m_Breed.empty()) {
2710  joiner.Add("breed", m_Breed.substr (0, m_Breed.find(';')));
2711  } else if (! m_Cultivar.empty()) {
2712  joiner.Add("cultivar", m_Cultivar.substr (0, m_Cultivar.find(';')));
2713  }
2714 
2716 
      // chromosome takes precedence over linkage group
2717  if (! m_Chromosome.empty()) {
2718  joiner.Add("chromosome", m_Chromosome);
2719  } else if ( !m_LinkageGroup.empty()) {
2720  joiner.Add("linkage group", m_LinkageGroup);
2721  }
2722  if (! m_Clone.empty()) {
2723  string clnbuf;
2724  vector<CTempString> clnvec;
2725  x_DescribeClones (clnvec, clnbuf);
2726  ITERATE (vector<CTempString>, it, clnvec) {
2727  joiner.Add("clone", *it, eHideType);
2728  }
2729  }
2730  if (! m_Map.empty()) {
2731  joiner.Add("map", m_Map);
2732  }
      // the plasmid name is added only for WGS records
2733  if (! m_Plasmid.empty()) {
2734  if (m_IsWGS) {
2735  joiner.Add("plasmid", m_Plasmid);
2736  }
2737  }
2738  // string tmp needs to be in scope for final joiner.Join statement
2739  string tmp;
      // The general ID (string or numeric form) is added unless the sequence
      // is a circular plasmid or a chromosome, and unless it merely repeats
      // the chromosome name or (for WGS) the plasmid name.
2740  if (m_Genome == NCBI_GENOME(plasmid) && m_Topology == NCBI_SEQTOPOLOGY(circular)) {
2741  } else if (m_Genome == NCBI_GENOME(chromosome)) {
2742  } else if (! m_GeneralStr.empty()) {
2743  if (m_GeneralStr != m_Chromosome && (! m_IsWGS || m_GeneralStr != m_Plasmid)) {
2744  joiner.Add("", m_GeneralStr, eHideType);
2745  }
2746  } else if (m_GeneralId > 0) {
      // NOTE(review): the line that fills `tmp` is missing from this listing.
2748  if (! tmp.empty()) {
2749  if (tmp != m_Chromosome && (! m_IsWGS || tmp != m_Plasmid)) {
2750  joiner.Add("", tmp, eHideType);
2751  }
2752  }
2753  }
2754 
2755  joiner.Join(&m_MainTitle);
2757 }
2758 
2759 // generate title for optical map
// NOTE(review): the line carrying this function's return type and name, the
// condition opening the inner substrain block, and one statement before the
// closing brace are not present in this listing.
//
// Joins organism, strain, substrain, chromosome, plasmid and isolate into
// m_MainTitle, then appends ", <enzyme> whole genome map" when m_rEnzyme is set.
2761 
2762 {
2763  CDefLineJoiner joiner;
2764 
2765  joiner.Add("organism", m_Taxname, eHideType);
2766 
      // strain / substrain: only the text before the first ';' is used
2767  if (! m_Strain.empty()) {
2768  if (! s_EndsWithStrain (m_Taxname, m_Strain)) {
2769  joiner.Add("strain", m_Strain.substr (0, m_Strain.find(';')));
2770  }
2771  }
2772  if (! m_Substrain.empty()) {
2774  joiner.Add("substr.", m_Substrain.substr (0, m_Substrain.find(';')));
2775  }
2776  }
      // a named chromosome / plasmid is preferred; otherwise the bare word is
      // added when the corresponding flag is set
2777  if (! m_Chromosome.empty()) {
2778  joiner.Add("chromosome", m_Chromosome);
2779  } else if (m_IsChromosome) {
2780  joiner.Add("location", "chromosome", eHideType);
2781  }
2782  if (! m_Plasmid.empty()) {
2783  joiner.Add("plasmid", m_Plasmid);
2784  } else if (m_IsPlasmid) {
2785  joiner.Add("location", "plasmid", eHideType);
2786  }
2787  if (! m_Isolate.empty()) {
2788  joiner.Add("isolate", m_Isolate);
2789  }
2790  joiner.Join(&m_MainTitle);
2791 
2792  if (! m_rEnzyme.empty()) {
2793  m_MainTitle += ", " + m_rEnzyme + " whole genome map";
2794  }
2795 
2797 }
2798 
2799 // generate TPA or TSA prefix
// NOTE(review): the line carrying this function's return type and name is not
// present in this listing; neither are the bodies of the UNVERIFIED and
// UNREVIEWED branches nor the declarations of `bsx`, `bsxp` and `sfxp`.
//
// Resets `prefix` to empty and then sets it from the first matching case in
// this order: unverified, unreviewed, third-party (TPA variants), TSA, TLS,
// multispecies WP, pseudogene, low-quality protein. Several cases leave the
// prefix empty when m_MainTitle already contains the corresponding text.
2801  string& prefix,
2802  const CBioseq_Handle& bsh
2803 )
2804 
2805 {
2806  prefix = kEmptyCStr;
2807 
2808  if (m_IsUnverified) {
2809  if (m_MainTitle.find ("UNVERIFIED") == NPOS) {
2811  }
2812  } else if (m_IsUnreviewed) {
2813  if (m_MainTitle.find ("UNREVIEWED") == NPOS) {
2815  }
2816  } else if (m_ThirdParty) {
      // TPA flavour from this record's own flags; for an indexed protein with
      // none of them set, from the flags of `nucx`
2817  if (m_TPAExp) {
2818  prefix = "TPA_exp: ";
2819  } else if (m_TPAInf) {
2820  prefix = "TPA_inf: ";
2821  } else if (m_TPAReasm) {
2822  prefix = "TPA_asm: ";
2823  } else if (m_Idx && m_IsAA) {
2825  if (bsx) {
2827  auto nucx = bsxp.Lock();
2828  if (nucx) {
2829  if (nucx->IsTPAExp()) {
2830  prefix = "TPA_exp: ";
2831  } else if (nucx->IsTPAInf()) {
2832  prefix = "TPA_inf: ";
2833  } else if (nucx->IsTPAReasm()) {
2834  prefix = "TPA_asm: ";
2835  }
2836  }
2837  }
2838  }
      // generic third-party prefix when no specific flavour applied
2839  if (prefix.empty()) {
2840  prefix = "TPA: ";
2841  }
2842  } else if (m_IsTSA) {
2843  prefix = "TSA: ";
2844  } else if (m_IsTLS) {
2845  prefix = "TLS: ";
2846  } else if (m_Multispecies && m_IsWP) {
2847  prefix = "MULTISPECIES: ";
2848  } else if (m_IsPseudogene) {
2849  if (m_MainTitle.find ("PUTATIVE PSEUDOGENE") == NPOS) {
2850  prefix = "PUTATIVE PSEUDOGENE: ";
2851  }
2852  } else if (m_Idx && m_IsAA) {
      // indexed protein: prefix from the low-quality exception on the CDS
      // obtained through `sfxp`
2854  if (bsx) {
2856  if (sfxp) {
2857  const CMappedFeat mf = sfxp->GetMappedFeat();
2858  const CSeq_feat& cds = mf.GetOriginalFeature();
2859  if (x_CDShasLowQualityException (cds)) {
2860  if (m_MainTitle.find ("LOW QUALITY PROTEIN") == NPOS) {
2861  prefix = "LOW QUALITY PROTEIN: ";
2862  }
2863  }
2864  }
2865  }
2866  }
2867 }
2868 
2869 // generate suffix if not already present
// NOTE(review): the line carrying this function's return type and name is not
// present in this listing.
//
// Computes the text to be appended after m_MainTitle and stores it in `suffix`.
// The suffix is assembled from three local pieces:
//   type  - phrase selected by m_MITech (plus piece counts for delta sequences)
//   study - m_TargetedLocus, for the "targeted" technique only
//   comp  - ", complete sequence" / ", complete genome", only when
//           appendComplete is true
// Most phrases are added only if m_MainTitle does not already contain a
// characteristic substring of that phrase.
// `bsh` is used only to count gaps via CSeqMap_CI.
2871  string& suffix,
2872  const CBioseq_Handle& bsh,
2873  bool appendComplete
2874 )
2875 
2876 {
2877  string type;
2878  string study;
2879  string comp;
2880 
2881  switch (m_MITech) {
2882  case NCBI_TECH(htgs_0):
2883  if (m_MainTitle.find ("LOW-PASS") == NPOS) {
2884  type = ", LOW-PASS SEQUENCE SAMPLING";
2885  }
2886  break;
2887  case NCBI_TECH(htgs_1):
2888  case NCBI_TECH(htgs_2):
2889  {
2890  if (m_HTGSDraft) {
2891  if (m_MainTitle.find ("WORKING DRAFT") == NPOS) {
2892  type = ", WORKING DRAFT SEQUENCE";
2893  }
2894  } else if (!m_HTGSCancelled) {
2895  if (m_MainTitle.find ("SEQUENCING IN") == NPOS) {
2896  type = ", *** SEQUENCING IN PROGRESS ***";
2897  }
2898  }
2899 
      // htgs_1 reports "unordered pieces", htgs_2 "ordered pieces"
2900  string un;
2901  if (m_MITech == NCBI_TECH(htgs_1)) {
2902  un = "un";
2903  }
2904  if (m_IsDelta) {
      // pieces = 1 + number of items the gap iterator visits
2905  unsigned int pieces = 1;
2906  for (CSeqMap_CI it (bsh, CSeqMap::fFindGap); it; ++it) {
2907  ++pieces;
2908  }
2909  if (pieces == 1) {
2910  // type += (", 1 " + un + "ordered piece");
2911  } else {
2912  type += (", " + NStr::IntToString (pieces)
2913  + " " + un + "ordered pieces");
2914  }
2915  } else {
2916  // type += ", in " + un + "ordered pieces";
2917  }
2918  break;
2919  }
2920  case NCBI_TECH(htgs_3):
2921  if (m_MainTitle.find ("complete sequence") == NPOS) {
2922  type = ", complete sequence";
2923  }
2924  break;
2925  case NCBI_TECH(est):
2926  if (m_MainTitle.find ("mRNA sequence") == NPOS) {
2927  type = ", mRNA sequence";
2928  }
2929  break;
2930  case NCBI_TECH(sts):
2931  if (m_MainTitle.find ("sequence tagged site") == NPOS) {
2932  type = ", sequence tagged site";
2933  }
2934  break;
2935  case NCBI_TECH(survey):
2936  if (m_MainTitle.find ("genomic survey sequence") == NPOS) {
2937  type = ", genomic survey sequence";
2938  }
2939  break;
2940  case NCBI_TECH(wgs):
2941  if (m_WGSMaster) {
2942  if (m_MainTitle.find ("whole genome shotgun sequencing project") == NPOS){
2943  type = ", whole genome shotgun sequencing project";
2944  }
2945  } else if (m_MainTitle.find ("whole genome shotgun sequence") == NPOS) {
      // Prepend the organelle name unless the title already mentions it;
      // mitochondrial/mitochondrion are treated as equivalent, and a
      // "chromosome" organelle is suppressed when the title mentions a
      // linkage group or chromosome.
2946  if (! m_Organelle.empty() && m_MainTitle.find(m_Organelle) == NPOS) {
2947  if ((NStr::EqualNocase (m_Organelle, "mitochondrial") || NStr::EqualNocase (m_Organelle, "mitochondrion")) &&
2948  (m_MainTitle.find("mitochondrial") != NPOS || m_MainTitle.find("mitochondrion") != NPOS)) {
2949  } else if (NStr::EqualNocase (m_Organelle, "chromosome") &&
2950  (m_MainTitle.find("linkage group") != NPOS || m_MainTitle.find("chromosome") != NPOS)) {
2951  } else {
2952  type = " ";
2953  type += m_Organelle;
2954  }
2955  }
2956  type += ", whole genome shotgun sequence";
2957  }
2958  break;
2959  case NCBI_TECH(tsa):
2960  if (m_TSAMaster) {
2961  if (m_MainTitle.find("transcriptome shotgun assembly") == NPOS) {
2962  type = ", transcriptome shotgun assembly";
2963  }
2964  } else {
      // non-master TSA: the phrase depends on the molecule type; nothing
      // is added if the title already contains "RNA sequence"
2965  if (m_MainTitle.find ("RNA sequence") == NPOS) {
2966  switch (m_MIBiomol) {
2967  case NCBI_BIOMOL(mRNA):
2968  type = ", mRNA sequence";
2969  break;
2970  case NCBI_BIOMOL(rRNA):
2971  type = ", rRNA sequence";
2972  break;
2973  case NCBI_BIOMOL(ncRNA):
2974  type = ", ncRNA sequence";
2975  break;
2976  case NCBI_BIOMOL(pre_RNA):
2977  case NCBI_BIOMOL(snRNA):
2978  case NCBI_BIOMOL(scRNA):
2979  case NCBI_BIOMOL(cRNA):
2980  case NCBI_BIOMOL(snoRNA):
2981  case NCBI_BIOMOL(transcribed_RNA):
2982  type = ", transcribed RNA sequence";
2983  break;
2984  default:
2985  break;
2986  }
2987  }
2988  }
2989  break;
2990  case NCBI_TECH(targeted):
2991  if (m_TLSMaster) {
2992  if (m_MainTitle.find ("targeted locus study") == NPOS) {
2993  type = ", targeted locus study";
2994  }
2995  } else {
2996  if (m_MainTitle.find ("sequence") == NPOS) {
2997  type += ", sequence";
2998  }
2999  }
3000  if (! m_TargetedLocus.empty() && m_MainTitle.find (m_TargetedLocus) == NPOS) {
3001  study = m_TargetedLocus;
3002  }
3003  break;
3004  default:
3005  break;
3006  }
3007 
      // Completeness phrase: only when requested, only when the title mentions
      // neither "complete" nor "partial", and only for complete molecules.
      // Plasmids and named chromosomes / linkage groups get "complete sequence";
      // the listed organelles and unnamed chromosomes get "complete genome".
3008  if (appendComplete) {
3009  if (m_MainTitle.find ("complete") == NPOS && m_MainTitle.find ("partial") == NPOS) {
3010  if (m_MICompleteness == NCBI_COMPLETENESS(complete)) {
3011  if (m_IsPlasmid) {
3012  comp = ", complete sequence";
3013  } else if (m_Genome == NCBI_GENOME(mitochondrion) ||
3014  m_Genome == NCBI_GENOME(chloroplast) ||
3015  m_Genome == NCBI_GENOME(kinetoplast) ||
3016  m_Genome == NCBI_GENOME(plastid) ||
3017  m_Genome == NCBI_GENOME(apicoplast)) {
3018  comp = ", complete genome";
3019  } else if (m_IsChromosome) {
3020  if (! m_Chromosome.empty()) {
3021  comp = ", complete sequence";
3022  } else if (! m_LinkageGroup.empty()) {
3023  comp = ", complete sequence";
3024  } else {
3025  comp = ", complete genome";
3026  }
3027  }
3028  }
3029  }
3030  }
3031 
      // unordered delta sequence: report gaps + 1 pieces, if there is any gap
3032  if (m_Unordered && m_IsDelta) {
3033  unsigned int num_gaps = 0;
3034  for (CSeqMap_CI it (bsh, CSeqMap::fFindGap); it; ++it) {
3035  ++num_gaps;
3036  }
3037  if (num_gaps > 0) {
3038  type += (", " + NStr::IntToString (num_gaps + 1)
3039  + " unordered pieces");
3040  }
3041  }
3042 
      // the targeted-locus name, when present, goes in front of the other parts
3043  if (! study.empty()) {
3044  suffix = " " + study + " " + type + comp;
3045  } else {
3046  suffix = type + comp;
3047  }
3048 }
3049 
// Removes trailing '.', ',', ';', '~' and ' ' characters from str in place.
// A string that is empty or consists solely of those characters is left
// unchanged.
static inline void s_TrimMainTitle (string& str)
{
    const size_t last_kept = str.find_last_not_of (".,;~ ");
    if (last_kept == NPOS) {
        // nothing to keep was found: leave the string as it is
        return;
    }
    str.resize (last_kept + 1);
}
3057 
3058 /*
3059 // Compresses whitespace in a string as follows: a run of consecutive
3060 // spaces and tabs is collapsed to a single whitespace character, and all
3061 // spaces after '(' and before ')' or ',' are removed.
3062 // (Code adapted from BasicCleanup.)
3063 static void x_CompressRunsOfSpaces (string& str)
3064 {
3065  if (str.empty()) {
3066  return;
3067  }
3068 
3069  string::iterator end = str.end();
3070  string::iterator it = str.begin();
3071  string::iterator new_str = it;
3072  while (it != end) {
3073  *new_str++ = *it;
3074  if ( (*it == ' ') || (*it == '\t') || (*it == '(') ) {
3075  for (++it; (it != end) && (*it == ' ' || *it == '\t'); ++it) continue;
3076  if ((it != end) && (*it == ')' || *it == ',') ) {
3077  // this "if" protects against the case "(...bunch of spaces and tabs...)".
3078  // Otherwise, the first '(' is unintentionally erased
3079  if( *(new_str - 1) != '(' ) {
3080  --new_str;
3081  }
3082  }
3083  } else {
3084  ++it;
3085  }
3086  }
3087  str.erase(new_str, str.end());
3088 }
3089 */
3090 
3091 static size_t s_TitleEndsInOrganism (
3092  string& title,
3093  CTempString taxname
3094 )
3095 
3096 {
3097  size_t pos, idx;
3098  size_t len1, len2;
3099 
3100  len1 = title.length();
3101  len2 = taxname.length();
3102 
3103  idx = len1 - len2 - 3;
3104  if (len1 > len2 + 4 && title [idx] == ' ' && title [idx + 1] == '[' && title [len1 - 1] == ']') {
3105  pos = NStr::Find(title, taxname, NStr::eNocase, NStr::eReverseSearch);
3106  if (pos == idx + 2) {
3107  return pos - 1;
3108  }
3109  }
3110 
3111  return NPOS;
3112 }
3113 
// Rewrites the trailing part of an existing protein title in m_MainTitle,
// taking source information from the index object `bsx`.
// NOTE(review): the line carrying this function's return type and name is not
// present in this listing; neither are the declarations of `bsx`, `bsxp` and
// `src`, nor several statements (pointed out below).
//
// Visible steps:
//   1. copy source fields from `bsx` into members,
//   2. locate a trailing "[organism]" (taxname, then descriptor taxname, then
//      "genus species"); return without changes if none is found,
//   3. remove the "[organism]", a trailing "(organelle)" and ", partial",
//   4. re-append ", partial", "(organelle)" and "[taxname]" from current data.
3115  const CBioseq_Handle& bsh
3116 )
3117 
3118 {
3119  CBioSource::TGenome genome;
3120  size_t pos, tpos = NPOS, opos = NPOS;
3121  int len1, len2;
3123 
3125  if (! bsx) {
3126  return;
3127  }
3128 
3129  m_Source = bsx->GetBioSource();
3130  m_Taxname = bsx->GetTaxname();
3131 
3132  m_Genome = bsx->GetGenome();
3133 
3134  m_Genus = bsx->GetGenus();
3135  m_Species = bsx->GetSpecies();
3136 
3137  m_Organelle = bsx->GetOrganelle();
3138 
3139  if (m_Source.Empty()) return;
3140 
3142 
3143  len1 = (int) m_MainTitle.length();
3144  len2 = (int) m_Taxname.length();
3145 
3146  // find [taxname]
3147 
      // NOTE(review): the first lookup that assigns `tpos` is missing from
      // this listing.
3148  if (len1 > len2 + 4) {
3150  if (tpos == NPOS) {
3151  string descTaxname = bsx->GetDescTaxname();
3152  tpos = s_TitleEndsInOrganism(m_MainTitle, descTaxname);
3153  }
3154  if (tpos == NPOS) {
3155  string binomial = m_Genus;
3156  binomial += " ";
3157  binomial += m_Species;
3158  tpos = s_TitleEndsInOrganism(m_MainTitle, binomial);
3159  if (tpos == NPOS) {
      // NOTE(review): the statement assigning `pos` and the statements
      // following the erase are missing from this listing.
3160  if (m_IsCrossKingdom) {
3162  if (pos != NPOS) {
3163  m_MainTitle.erase (pos + 1);
3166  }
3167  }
3168  }
3169  }
3170  }
3171 
3172  /* do not change unless [genus species] was at the end */
3173  if (tpos == NPOS) return;
3174 
      // drop everything from the '[' of the trailing organism onwards
3175  m_MainTitle.erase (tpos);
3177  len1 = (int) m_MainTitle.length();
3178 
3179  // find (organelle)
3180 
      // opos is set only if the title ends with one of the known
      // "(organelle)" strings from s_proteinOrganellePrefix
3181  if (len1 > 2 && m_MainTitle [len1 - 1] == ')') {
3182  pos = m_MainTitle.find_last_of ("(");
3183  if (pos != NPOS) {
3184  for ( genome = NCBI_GENOME(chloroplast); genome <= NCBI_GENOME(chromatophore); genome++ ) {
3185  string str = s_proteinOrganellePrefix [genome];
3186  if ( ! str.empty() ) {
3187  string paren = "(" + str + ")";
3188  if (NStr::EndsWith (m_MainTitle, paren )) {
3189  opos = pos;
3190  break;
3191  }
3192  }
3193  }
3194  }
3196  len1 = (int) m_MainTitle.length();
3197  }
3198 
3199  if (opos != NPOS) {
3200  m_MainTitle.erase (opos);
3202  len1 = (int) m_MainTitle.length();
3203  }
3204 
      // 9 == length of ", partial"
3205  if ( NStr::EndsWith (m_MainTitle, ", partial")) {
3206  m_MainTitle.erase(m_MainTitle.length() - 9);
3208  }
3209 
3210  // then reconstruct partial (organelle) [taxname] suffix
3211 
3212  if ( !x_IsComplete()) {
3213  m_MainTitle += ", partial";
3214  }
3215 
3216  if (m_OmitTaxonomicName) return;
3217 
3218  CTempString taxname = m_Taxname;
3219 
3220  if (m_Genome >= NCBI_GENOME(chloroplast) && m_Genome <= NCBI_GENOME(chromatophore)) {
3221  const char * organelle = s_proteinOrganellePrefix [m_Genome];
3222  if ( organelle[0] != '\0' && ! taxname.empty()
3223  /* && NStr::Find (taxname, organelle) == NPOS */) {
3224  m_MainTitle += " (";
3225  m_MainTitle += organelle;
3226  m_MainTitle += ")";
3227  }
3228  }
3229 
3230  // check for special taxname, go to overlapping source feature
3231  if ((taxname.empty() ||
3232  (!NStr::EqualNocase (taxname, "synthetic construct") &&
3233  !NStr::EqualNocase (taxname, "artificial sequence") &&
3234  taxname.find ("vector") == NPOS &&
3235  taxname.find ("Vector") == NPOS)) &&
3236  !m_LocalAnnotsOnly) {
      // with an index, the source-feature lookup is attempted only when
      // `nucx` could be locked; without an index it is always attempted
3237  if (m_Idx) {
3239  if (bsx) {
3241  auto nucx = bsxp.Lock();
3242  if (nucx) {
3243  src = x_GetSourceFeatViaCDS (bsh);
3244  if (src.NotEmpty() && src->IsSetTaxname()) {
3245  taxname = src->GetTaxname();
3246  }
3247  }
3248  }
3249  } else {
3250  src = x_GetSourceFeatViaCDS (bsh);
3251  if (src.NotEmpty() && src->IsSetTaxname()) {
3252  taxname = src->GetTaxname();
3253  }
3254  }
3255  }
3256 
      // NOTE(review): the body of the cross-kingdom branch is missing from
      // this listing.
3257  if (m_IsCrossKingdom && ! m_FirstSuperKingdom.empty() && ! m_SecondSuperKingdom.empty()) {
3259  } else if (! taxname.empty() /* && m_MainTitle.find(taxname) == NPOS */) {
3260  m_MainTitle += " [" + string(taxname) + "]";
3261  }
3262 }
3263 
// Rewrites the trailing part of an existing protein title in m_MainTitle,
// taking source information from m_Source.
// NOTE(review): the line carrying this function's return type and name is not
// present in this listing; neither are the declarations of `src`, `bsx` and
// `bsxp`, the bodies of the taxname / genome blocks, nor several other
// statements (pointed out below).
//
// Visible steps:
//   1. refresh genus and species from the binomial name in m_Source,
//   2. locate a trailing "[organism]"; return without changes if none is found,
//   3. remove the "[organism]", a trailing "(organelle)" and ", partial",
//   4. re-append ", partial", "(organelle)" and "[taxname]" from current data.
3265  const CBioseq_Handle& bsh
3266 )
3267 
3268 {
3269  CBioSource::TGenome genome;
3270  size_t pos, tpos = NPOS, opos = NPOS;
3271  int len1, len2;
3273 
3274  if (m_Source.Empty()) return;
3275 
      // NOTE(review): the statements inside the next two blocks are missing
      // from this listing.
3276  if (m_Source->IsSetTaxname()) {
3278  }
3279  if (m_Source->IsSetGenome()) {
3281  }
3282  if (m_Source->IsSetOrgname()) {
3283  const COrgName& onp = m_Source->GetOrgname();
3284  if (onp.IsSetName()) {
3285  const COrgName::TName& nam = onp.GetName();
3286  if (nam.IsBinomial()) {
3287  const CBinomialOrgName& bon = nam.GetBinomial();
3288  if (bon.IsSetGenus()) {
3289  m_Genus = bon.GetGenus();
3290  }
3291  if (bon.IsSetSpecies()) {
3292  m_Species = bon.GetSpecies();
3293  }
3294  }
3295  }
3296  }
3297 
3299 
3300  len1 = (int) m_MainTitle.length();
3301  len2 = (int) m_Taxname.length();
3302 
3303  // find [taxname]
3304 
      // NOTE(review): the first lookup that assigns `tpos` is missing from
      // this listing.
3305  if (len1 > len2 + 4) {
3307  if (tpos == NPOS) {
3308  string binomial = m_Genus;
3309  binomial += " ";
3310  binomial += m_Species;
3311  tpos = s_TitleEndsInOrganism(m_MainTitle, binomial);
3312  if (tpos == NPOS) {
      // NOTE(review): the statement assigning `pos` and the statements
      // following the erase are missing from this listing.
3313  if (m_IsCrossKingdom) {
3315  if (pos != NPOS) {
3316  m_MainTitle.erase (pos + 1);
3319  }
3320  }
3321  }
3322  }
3323  }
3324 
3325  /* do not change unless [genus species] was at the end */
3326  if (tpos == NPOS) return;
3327 
      // drop everything from the '[' of the trailing organism onwards
3328  m_MainTitle.erase (tpos);
3330  len1 = (int) m_MainTitle.length();
3331 
3332  // find (organelle)
3333 
      // opos is set only if the title ends with one of the known
      // "(organelle)" strings from s_proteinOrganellePrefix
3334  if (len1 > 2 && m_MainTitle [len1 - 1] == ')') {
3335  pos = m_MainTitle.find_last_of ("(");
3336  if (pos != NPOS) {
3337  for ( genome = NCBI_GENOME(chloroplast); genome <= NCBI_GENOME(chromatophore); genome++ ) {
3338  string str = s_proteinOrganellePrefix [genome];
3339  if ( ! str.empty() ) {
3340  string paren = "(" + str + ")";
3341  if (NStr::EndsWith (m_MainTitle, paren )) {
3342  opos = pos;
3343  break;
3344  }
3345  }
3346  }
3347  }
3349  len1 = (int) m_MainTitle.length();
3350  }
3351 
3352  if (opos != NPOS) {
3353  m_MainTitle.erase (opos);
3355  len1 = (int) m_MainTitle.length();
3356  }
3357 
      // 9 == length of ", partial"
3358  if ( NStr::EndsWith (m_MainTitle, ", partial")) {
3359  m_MainTitle.erase(m_MainTitle.length() - 9);
3361  }
3362 
3363  // then reconstruct partial (organelle) [taxname] suffix
3364 
3365  if ( !x_IsComplete()) {
3366  m_MainTitle += ", partial";
3367  }
3368 
3369  if (m_OmitTaxonomicName) return;
3370 
3371  CTempString taxname = m_Taxname;
3372 
3373  if (m_Genome >= NCBI_GENOME(chloroplast) && m_Genome <= NCBI_GENOME(chromatophore)) {
3374  const char * organelle = s_proteinOrganellePrefix [m_Genome];
3375  if ( organelle[0] != '\0' && ! taxname.empty()
3376  /* && NStr::Find (taxname, organelle) == NPOS */) {
3377  m_MainTitle += " (";
3378  m_MainTitle += organelle;
3379  m_MainTitle += ")";
3380  }
3381  }
3382 
3383  // check for special taxname, go to overlapping source feature
3384  if ((taxname.empty() ||
3385  (!NStr::EqualNocase (taxname, "synthetic construct") &&
3386  !NStr::EqualNocase (taxname, "artificial sequence") &&
3387  taxname.find ("vector") == NPOS &&
3388  taxname.find ("Vector") == NPOS)) &&
3389  !m_LocalAnnotsOnly) {
      // with an index, the source-feature lookup is attempted only when
      // `nucx` could be locked; without an index it is always attempted
3390  if (m_Idx) {
3392  if (bsx) {
3394  auto nucx = bsxp.Lock();
3395  if (nucx) {
3396  src = x_GetSourceFeatViaCDS (bsh);
3397  if (src.NotEmpty() && src->IsSetTaxname()) {
3398  taxname = src->GetTaxname();
3399  }
3400  }
3401  }
3402  } else {
3403  src = x_GetSourceFeatViaCDS (bsh);
3404  if (src.NotEmpty() && src->IsSetTaxname()) {
3405  taxname = src->GetTaxname();
3406  }
3407  }
3408  }
3409 
      // NOTE(review): the body of the cross-kingdom branch is missing from
      // this listing.
3410  if (m_IsCrossKingdom && ! m_FirstSuperKingdom.empty() && ! m_SecondSuperKingdom.empty()) {
3412  } else if (! taxname.empty() /* && m_MainTitle.find(taxname) == NPOS */) {
3413  m_MainTitle += " [" + string(taxname) + "]";
3414  }
3415 }
3416 
// Reports whether the sequence counts as complete, based on m_MICompleteness.
// NOTE(review): the line carrying this function's return type and name is not
// present in this listing.
// Returns false for partial / no_left / no_right / no_ends, and true for
// complete as well as for every value not listed in the switch.
3418 {
3419  switch (m_MICompleteness) {
3420  case NCBI_COMPLETENESS(complete):
3421  return true;
3422  case NCBI_COMPLETENESS(partial):
3423  case NCBI_COMPLETENESS(no_left):
3424  case NCBI_COMPLETENESS(no_right):
3425  case NCBI_COMPLETENESS(no_ends):
3426  return false;
3427  }
      // any other value is treated as complete
3428  return true;
3429 }
3430 
3431 
// Table of title prefix strings (TPA variants, TSA, TLS, MAG, MULTISPECIES and
// the UNVERIFIED variants).
// NOTE(review): the code that consumes this table is not visible in this
// listing; presumably it is used to recognise a prefix already present at the
// start of a title - confirm against the caller.
3432 static const char* s_tpaPrefixList [] = {
3433  "MAG ",
3434  "MAG:",
3435  "MULTISPECIES:",
3436  "TLS:",
3437  "TPA:",
3438  "TPA_exp:",
3439  "TPA_inf:",
3440  "TPA_reasm:",
3441  "TPA_asm:",
3442  "TSA:",
3443  "UNVERIFIED_ORG:",
3444  "UNVERIFIED_ASMBLY:",
3445  "UNVERIFIED_CONTAM:",
3446  "UNVERIFIED:"
3447 };
3449 {
3450  CDefLineJoiner joiner(true);
3451 
3452  x_SetBioSrc (bsh);
3453 
3454  joiner.Add("location", m_Organelle);
3455  if (m_IsChromosome || !m_Chromosome.empty()) {
3456  joiner.Add("chromosome", m_Chromosome);
3457  }
3458  if (m_IsPlasmid || !m_Plasmid.empty()) {
3459  joiner.Add("plasmid name", m_Plasmid);
3460  }
3461  if (m_MICompleteness == NCBI_COMPLETENESS(complete)) {
3462  joiner.Add("completeness", CTempString("complete"));
3463  }
3464 
3465  // print [topology=...], if necessary
3466  if (bsh.CanGetInst_Topology()) {
3467  CSeq_inst::ETopology topology = bsh.GetInst_Topology();
3468  if (topology == CSeq_inst::eTopology_circular) {
3469  joiner.Add("topology", CTempString("circular"));
3470  }
3471  }
3472 
3473  // bsh modifiers retrieved from Biosource.Org-ref
3474  // [organism=...], etc.
3475 
3476  bool strain_seen = false;
3477  string gcode; // should be in the same scope as joiner.Join() because joiner stores CTempString
3478 
3479  try {
3480  const CBioSource* bios = sequence::GetBioSource(bsh);
3481  if (bios && bios->IsSetOrg()) {
3482  const COrg_ref & org = bios->GetOrg();
3483  if (org.IsSetTaxname()) {
3484  joiner.Add("organism", org.GetTaxname());
3485  }
3486  if (org.IsSetOrgname()) {
3487  const COrg_ref::TOrgname & orgname = org.GetOrgname();
3488  if (orgname.IsSetMod()) {
3489  ITERATE(COrgName::TMod, mod_iter, orgname.GetMod()) {
3490  const COrgMod & mod = **mod_iter;
3491  if (mod.IsSetSubtype() && mod.IsSetSubname()) {
3492  const string& subname = mod.GetSubname();
3493  switch (mod.GetSubtype()) {
3495  if (strain_seen) {
3496  ERR_POST_X(9, Warning << __FUNCTION__ << ": "
3497  << "key 'strain' would appear multiple times, but only using the first.");
3498  }
3499  else {
3500  strain_seen = true;
3501  joiner.Add("strain", subname);
3502  }
3503  break;
3505  joiner.Add("substrain", subname);
3506  break;
3508  joiner.Add("type", subname);
3509  break;
3511  joiner.Add("subtype", subname);
3512  break;
3514  joiner.Add("variety", subname);
3515  break;
3517  joiner.Add("serotype", subname);
3518  break;
3520  joiner.Add("serogroup", subname);
3521  break;
3523  joiner.Add("serovar", subname);
3524  break;
3526  joiner.Add("cultivar", subname);
3527  break;
3529  joiner.Add("pathovar", subname);
3530  break;
3532  joiner.Add("chemovar", subname);
3533  break;
3535  joiner.Add("biovar", subname);
3536  break;
3538  joiner.Add("biotype", subname);
3539  break;
3541  joiner.Add("group", subname);
3542  break;
3544  joiner.Add("subgroup", subname);
3545  break;
3547  joiner.Add("isolate", subname);
3548  break;
3550  joiner.Add("common", subname);
3551  break;
3553  joiner.Add("acronym", subname);
3554  break;
3556  joiner.Add("dosage", subname);
3557  break;
3559  joiner.Add("nat_host", subname);
3560  break;
3562  joiner.Add("sub_species", subname);
3563  break;
3565  joiner.Add("specimen_voucher", subname);
3566  break;
3568  joiner.Add("authority", subname);
3569  break;
3571  joiner.Add("forma", subname);
3572  break;
3574  joiner.Add("forma_specialis", subname);
3575  break;
3577  joiner.Add("ecotype", subname);
3578  break;
3580  joiner.Add("synonym", subname);
3581  break;
3583  joiner.Add("anamorph", subname);
3584  break;
3586  joiner.Add("teleomorph", subname);
3587  break;
3589  joiner.Add("breed", subname);
3590  break;
3592  joiner.Add("gb_acronym", subname);
3593  break;
3595  joiner.Add("gb_anamorph", subname);
3596  break;
3598  joiner.Add("gb_synonym", subname);
3599  break;
3601  joiner.Add("culture_collection", subname);
3602  break;
3604  joiner.Add("bio_material", subname);
3605  break;
3607  joiner.Add("metagenome_source", subname);
3608  break;
3610  joiner.Add("type_material", subname);
3611  break;
3613  joiner.Add("nomenclature", subname);
3614  break;
3616  joiner.Add("note", subname);
3617  break;
3618  default:
3619  // ignore; do nothing
3620  break;
3621  }
3622  }
3623  }
3624  }
3626  if (bios->CanGetGenome()) {
3627  genome = bios->GetGenome();
3628  }
3629 
3630  switch ( genome ) {
3635  {
3636  // mitochondrial code
3637  if (orgname.IsSetMgcode()) {
3638  int icode = orgname.GetMgcode();
3639  gcode = std::to_string(icode);
3640  joiner.Add("gcode", gcode);
3641  }
3642  }
3643  break;
3653  {
3654  // specific plant plastid code
3655  if (orgname.IsSetPgcode()) {
3656  int icode = orgname.GetPgcode();
3657  if (icode > 0) {
3658  gcode = std::to_string(icode);
3659  joiner.Add("gcode", gcode);
3660  }
3661  } else {
3662  // bacteria and plant plastids default to code 11.
3663  joiner.Add("gcode", "11");
3664  }
3665  break;
3666  }
3667  default:
3668  {
3669  if (orgname.IsSetGcode()) {
3670  int icode = orgname.GetGcode();
3671  if (icode > 0) {
3672  gcode = std::to_string(icode);
3673  joiner.Add("gcode", gcode);
3674  }
3675  }
3676  break;
3677  }
3678  }
3679  }
3680  }
3681  if ( bios && bios->IsSetSubtype() ) {
3682  ITERATE ( CBioSource::TSubtype, sub_iter, bios->GetSubtype() ) {
3683  const CSubSource& sub = **sub_iter;
3684  if (sub.IsSetSubtype()) {
3685  if (sub.IsSetName()) {
3686  const string& subname = sub.GetName();
3687  switch (sub.GetSubtype()) {
3689  if (! m_IsChromosome && m_Chromosome.empty()) {
3690  joiner.Add("chromosome", subname);
3691  }
3692  break;
3694  joiner.Add("map", subname);
3695  break;
3697  joiner.Add("clone", subname);
3698  break;
3700  joiner.Add("subclone", subname);
3701  break;
3703  joiner.Add("haplotype", subname);
3704  break;
3706  joiner.Add("genotype", subname);
3707  break;
3709  joiner.Add("sex", subname);
3710  break;
3712  joiner.Add("cell_line", subname);
3713  break;
3715  joiner.Add("cell_type", subname);
3716  break;
3718  joiner.Add("tissue_type", subname);
3719  break;
3721  joiner.Add("clone_lib", subname);
3722  break;
3724  joiner.Add("dev_stage", subname);
3725  break;
3727  joiner.Add("frequency", subname);
3728  break;
3730  joiner.Add("lab_host", subname);
3731  break;
3733  joiner.Add("pop_variant", subname);
3734  break;
3736  joiner.Add("tissue_lib", subname);
3737  break;
3739  if (! m_IsPlasmid && m_Plasmid.empty()) {
3740  joiner.Add("plasmid_name", subname);
3741  }
3742  break;
3744  joiner.Add("transposon_name", subname);
3745  break;
3747  joiner.Add("insertion_seq_name", subname);
3748  break;
3750  joiner.Add("plastid_name", subname);
3751  break;
3754  joiner.Add("geo_loc_name", subname);
3755  } else {
3756  joiner.Add("country", subname);
3757  }
3758  break;
3760  joiner.Add("segment", subname);
3761  break;
3763  joiner.Add("endogenous_virus_name", subname);
3764  break;
3766  joiner.Add("isolation_source", subname);
3767  break;
3769  joiner.Add("lat_lon", subname);
3770  break;
3772  joiner.Add("collection_date", subname);
3773  break;
3775  joiner.Add("collected_by", subname);
3776  break;
3778  joiner.Add("identified_by", subname);
3779  break;
3781  joiner.Add("metagenomic", subname);
3782  break;
3784  joiner.Add("mating_type", subname);
3785  break;
3787  joiner.Add("linkage_group", subname);
3788  break;
3790  joiner.Add("haplogroup", subname);
3791  break;
3793  joiner.Add("whole_replicon", subname);
3794  break;
3796  joiner.Add("phenotype", subname);
3797  break;
3799  joiner.Add("altitude", subname);
3800  break;
3802  joiner.Add("note", subname);
3803  break;
3804  default:
3805  break;
3806  }
3807  } else {
3808  switch (sub.GetSubtype()) {
3810  joiner.Add("germline", "true");
3811  break;
3813  joiner.Add("rearranged", "true");
3814  break;
3816  joiner.Add("transgenic", "true");
3817  break;
3819  joiner.Add("environmental_sample", "true");
3820  break;
3821  default:
3822  break;
3823  }
3824  }
3825  }
3826  }
3827  }
3828  if ( bios && bios->IsSetPcr_primers() ) {
3829  const CBioSource_Base::TPcr_primers & primers = bios->GetPcr_primers();
3830  if ( primers.CanGet() ) {
3832 
3833  // bool has_fwd_seq = false;
3834  // bool has_rev_seq = false;
3835 
3836  if( (*it)->IsSetForward() ) {
3837  const CPCRReaction_Base::TForward &forward = (*it)->GetForward();
3838  if( forward.CanGet() ) {
3839  ITERATE( CPCRReaction_Base::TForward::Tdata, it2, forward.Get() ) {
3840  const string &fwd_name = ( (*it2)->CanGetName() ? (*it2)->GetName().Get() : kEmptyStr );
3841  if( ! fwd_name.empty() ) {
3842  joiner.Add("fwd-primer-name", fwd_name);
3843  }
3844  const string &fwd_seq = ( (*it2)->CanGetSeq() ? (*it2)->GetSeq().Get() : kEmptyStr );
3845  // NStr::ToLower( fwd_seq );
3846  if( ! fwd_seq.empty() ) {
3847  joiner.Add("fwd-primer-seq", fwd_seq);
3848  // has_fwd_seq = true;
3849  }
3850  }
3851  }
3852  }
3853  if( (*it)->IsSetReverse() ) {
3854  const CPCRReaction_Base::TReverse &reverse = (*it)->GetReverse();
3855  if( reverse.CanGet() ) {
3856  ITERATE( CPCRReaction_Base::TReverse::Tdata, it2, reverse.Get() ) {
3857  const string &rev_name = ((*it2)->CanGetName() ? (*it2)->GetName().Get() : kEmptyStr );
3858  if( ! rev_name.empty() ) {
3859  joiner.Add("rev-primer-name", rev_name);
3860  }
3861  const string &rev_seq = ( (*it2)->CanGetSeq() ? (*it2)->GetSeq().Get() : kEmptyStr );
3862  // NStr::ToLower( rev_seq ); // do we need this?
3863  if( ! rev_seq.empty() ) {
3864  joiner.Add("rev-primer-seq", rev_seq);
3865  // has_rev_seq = true;
3866  }
3867  }
3868  }
3869  }
3870  }
3871  }
3872  }
3873  }
3874  catch (CException &) {
3875  // ignore exception; it probably just means there's no org-ref
3876  }
3877 
3879  static const TTechMapEntry sc_TechArray[] = {
3880  // note that the text values do *NOT* precisely correspond with
3881  // the names in the ASN.1 schema files
3882  { CMolInfo::eTech_unknown, "?" },
3883  { CMolInfo::eTech_standard, "standard" },
3884  { CMolInfo::eTech_est, "EST" },
3885  { CMolInfo::eTech_sts, "STS" },
3886  { CMolInfo::eTech_survey, "survey" },
3887  { CMolInfo::eTech_genemap, "genetic map" },
3888  { CMolInfo::eTech_physmap, "physical map" },
3889  { CMolInfo::eTech_derived, "derived" },
3890  { CMolInfo::eTech_concept_trans, "concept-trans" },
3891  { CMolInfo::eTech_seq_pept, "seq-pept" },
3892  { CMolInfo::eTech_both, "both" },
3893  { CMolInfo::eTech_seq_pept_overlap, "seq-pept-overlap" },
3894  { CMolInfo::eTech_seq_pept_homol, "seq-pept-homol" },
3895  { CMolInfo::eTech_concept_trans_a, "concept-trans-a" },
3896  { CMolInfo::eTech_htgs_1, "htgs 1" },
3897  { CMolInfo::eTech_htgs_2, "htgs 2" },
3898  { CMolInfo::eTech_htgs_3, "htgs 3" },
3899  { CMolInfo::eTech_fli_cdna, "fli cDNA" },
3900  { CMolInfo::eTech_htgs_0, "htgs 0" },
3901  { CMolInfo::eTech_htc, "htc" },
3902  { CMolInfo::eTech_wgs, "wgs" },
3903  { CMolInfo::eTech_barcode, "barcode" },
3904  { CMolInfo::eTech_composite_wgs_htgs, "composite-wgs-htgs" },
3905  { CMolInfo::eTech_tsa, "tsa" }
3906  };
3909 
3910  // print some key-value pairs
3911  const CMolInfo * pMolInfo = sequence::GetMolInfo(bsh);
3912  if (pMolInfo != NULL) {
3913  const CMolInfo & molinfo = *pMolInfo;
3914  if (molinfo.IsSetTech()) {
3915  TTechMap::const_iterator find_iter = sc_TechMap.find(molinfo.GetTech());
3916  if (find_iter != sc_TechMap.end()) {
3917  joiner.Add("tech", CTempString(find_iter->second));
3918  }
3919  }
3920  }
3921  string modifiers;
3922  joiner.Join(&modifiers);
3923  m_MainTitle = (m_MainTitle.empty()) ? modifiers : modifiers + " " + m_MainTitle;
3924  return m_MainTitle;
3925 }
3926 
3927 
3928 // main method
     // Builds and returns the definition line (title) string for the Bioseq
     // referenced by bsh.
     // NOTE(review): the embedded line numbers in this listing skip (e.g. 3929,
     // 3931, 3968, 3970, 4001, 4020, 4032, 4037, 4055, 4065), so the signature
     // and several statements are not visible here. The comments added below
     // describe only what the visible lines show.
     // Visible flow:
     //   1. set flags from the record (indexed or non-indexed variant);
     //   2. if fShowModifiers is requested, return x_GetModifiers(bsh) at once;
     //   3. otherwise use the title already in m_MainTitle, or regenerate it
     //      from an AutodefOptions user object, or from a record-type-specific
     //      x_SetTitleFrom*() method;
     //   4. strip old TPA/TSA prefixes and leading/trailing clutter;
     //   5. assemble "MAG" marker + prefix + title + suffix, clean it up and
     //      optionally capitalize the first letter.
3930  const CBioseq_Handle& bsh,
3932 )
3933 
3934 {
3935  bool capitalize = true;
3936  bool appendComplete = false;
3937 
3938  string prefix; // from a small set of compile-time constants
3939  string suffix;
3940  string final;
3941 
3942  // set flags from record components
3943  if (m_Idx) {
3944  x_SetFlagsIdx (bsh, flags);
3945  } else {
3946  x_SetFlags (bsh, flags);
3947  }
3948 
     // Modifier mode: return the key-value modifier string instead of a defline.
     // NOTE(review): this early return is taken before the m_Idx.Reset (NULL)
     // near the end of the function - confirm that leaving m_Idx set is intended.
3949  if (flags & fShowModifiers) {
3950  return x_GetModifiers(bsh);
3951  }
3952 
3953  if (! m_Reconstruct) {
3954  // x_SetFlags set m_MainTitle from a suitable descriptor, if any;
3955  // now strip trailing periods, commas, semicolons, and spaces.
     // The character set below also contains '~'.
     // When pos is NPOS nothing is erased; presumably that covers an empty
     // title or one made up only of these characters - TODO confirm
     // m_MainTitle is a std::string.
3956  size_t pos = m_MainTitle.find_last_not_of (".,;~ ");
3957  if (pos != NPOS) {
3958  m_MainTitle.erase (pos + 1);
3959  }
     // A non-empty pre-existing title turns off the first-letter
     // capitalization applied at the end of the function.
3960  if (! m_MainTitle.empty()) {
3961  capitalize = false;
3962  }
3963  }
3964 
3965  // adjust protein partial/organelle/taxname suffix, if necessary
     // NOTE(review): both branch bodies (3968, 3970) are missing from this
     // listing; presumably the indexed and non-indexed suffix adjusters.
3966  if ( m_IsAA && ! m_MainTitle.empty() ) {
3967  if (m_Idx) {
3969  } else {
3971  }
3972  }
3973 
3974  // use autodef user object, if present, to regenerate title
     // Only attempted for nucleotide records with no title yet, when the caller
     // has not set fDoNotUseAutoDef, and when neither m_IsTLS nor m_IsNZ is set.
3975  if (m_MainTitle.empty() && m_IsNA && (! (flags & fDoNotUseAutoDef)) && ! m_IsTLS && ! m_IsNZ) {
3976 
     // Advance to the first user descriptor whose object type is
     // AutodefOptions; desc tests false if there is none.
3977  CSeqdesc_CI desc(bsh, CSeqdesc::e_User);
3978  while (desc && desc->GetUser().GetObjectType() != CUser_object::eObjectType_AutodefOptions) {
3979  ++desc;
3980  }
3981 
3982  if (desc) {
     // The same user object is passed to the CAutoDef instance and used to
     // initialise the options for the modifier combo.
3983  CAutoDef autodef;
3984  autodef.SetOptionsObject(desc->GetUser());
3985  CAutoDefModifierCombo mod_combo;
3986  CAutoDefOptions options;
3987  options.InitFromUserObject(desc->GetUser());
3988  mod_combo.SetOptions(options);
3989  m_MainTitle = autodef.GetOneDefLine(&mod_combo, bsh);
3991  }
3992  }
3993 
3994  // use appropriate algorithm if title needs to be generated
3995  if (m_MainTitle.empty()) {
3996 
3997  // PDB and patent records do not normally need source data
3998  if (m_IsPDB) {
3999  x_SetTitleFromPDB ();
4000  } else if (m_IsPatent) {
4002  }
4003 
4004  if (m_MainTitle.empty()) {
4005  // set fields from source information
4006  if (m_Idx) {
4007  x_SetBioSrcIdx (bsh);
4008  } else {
4009  x_SetBioSrc (bsh);
4010  }
4011 
4012  // several record types have specific methods
     // The chain is ordered: the first matching record type wins.
4013  if (m_IsNC) {
4014  x_SetTitleFromNC ();
4015  } else if (m_IsNM && !m_LocalAnnotsOnly) {
4016  x_SetTitleFromNM (bsh);
4017  } else if (m_IsNR) {
4018  x_SetTitleFromNR (bsh);
4019  } else if (m_IsAA && m_Idx) {
4021  } else if (m_IsAA) {
4022  x_SetTitleFromProtein (bsh);
4023  } else if (m_IsSeg && (! m_IsEST_STS_GSS)) {
4024  x_SetTitleFromSegSeq (bsh);
4025  } else if (m_IsTSA || (m_IsWGS && (! m_WGSMaster)) || (m_IsTLS && (! m_TLSMaster))) {
4026  x_SetTitleFromWGS ();
4027  } else if (m_IsMap) {
4028  x_SetTitleFromMap ();
4029  }
4030 
4031  if (m_MainTitle.empty() && m_GpipeMode) {
4033  }
4034 
4035  if (m_MainTitle.empty()) {
4036  // default title using source fields
     // NOTE(review): the statement that sets the default title (4037) is
     // missing from this listing. appendComplete is later handed to
     // x_SetSuffix().
4038  if (m_MICompleteness == NCBI_COMPLETENESS(complete) && !m_MainTitle.empty()) {
4039  appendComplete = true;
4040  }
4041  }
4042  }
4043 
4044  /*
4045  if (m_MainTitle.empty()) {
4046  // last resort title created here
4047  m_MainTitle = "No definition line found";
4048  }
4049  */
4050  }
4051 
4052  // remove TPA or TSA prefix, will rely on other data in record to set
     // sizeof(array) / sizeof(element) is the number of entries in
     // s_tpaPrefixList. The condition guarding the erase (4055) is missing
     // from this listing.
4053  for (size_t i = 0; i < sizeof (s_tpaPrefixList) / sizeof (const char*); i++) {
4054  string str = s_tpaPrefixList [i];
4056  m_MainTitle.erase (0, str.length());
4057  // strip leading spaces remaining after removal of old MAG before TPA or TSA prefixes
4058  m_MainTitle.erase (0, m_MainTitle.find_first_not_of (' '));
4059  }
4060  }
4061 
4062  // strip leading spaces remaining after removal of old TPA or TSA prefixes
4063  m_MainTitle.erase (0, m_MainTitle.find_first_not_of (' '));
4064 
     // NOTE(review): the declaration of 'decoded' (4065) is missing from this
     // listing; presumably a decoded copy of m_MainTitle - TODO confirm.
4066 
4067  // strip trailing commas, semicolons, and spaces (period may be an sp.
4068  // species)
4069  size_t pos = decoded.find_last_not_of (",;~ ");
4070  if (pos != NPOS) {
4071  decoded.erase (pos + 1);
4072  }
4073 
4074  // calculate prefix
4075  x_SetPrefix(prefix, bsh);
4076 
4077  // calculate suffix
4078  x_SetSuffix (suffix, bsh, appendComplete);
4079 
     // With a metagenome source, lead with "MAG: " when there is no other
     // prefix, or "MAG " when a prefix follows it.
4080  string mag;
4081  if (! m_MetaGenomeSource.empty()) {
4082  if ( prefix.empty() ) {
4083  mag = "MAG: ";
4084  } else {
4085  mag = "MAG ";
4086  }
4087  }
4088 
4089  // produce final result
4090  string penult = mag + prefix + decoded + suffix;
4091 
4092  x_CleanAndCompress (final, penult, m_IsAA);
4093 
     // Upper-case the first letter only when 'capitalize' is still true and
     // the record is not PDB, patent, protein or segmented.
4094  if (! m_IsPDB && ! m_IsPatent && ! m_IsAA && ! m_IsSeg) {
4095  if (!final.empty() && islower ((unsigned char) final[0]) && capitalize) {
4096  final [0] = toupper ((unsigned char) final [0]);
4097  }
4098  }
4099 
     // Drop the reference to the caller-supplied index before returning.
4101  m_Idx.Reset (NULL);
4102 
4103  return final;
4104 }
4105 
     // Overload taking a caller-supplied CSeqEntryIndex: stores its address in
     // m_Idx, then delegates to GenerateDefline(bsh, flags).
     // NOTE(review): the function header (4106) and the 'flags' parameter line
     // (4109) are missing from this listing.
4107  const CBioseq_Handle& bsh,
4108  CSeqEntryIndex& idx,
4110 )
4111 
4112 {
4113  m_Idx = &idx;
4114 
4115  return GenerateDefline(bsh, flags);
4116 }
4117 
     // Overload taking a raw CBioseq, its scope and a caller-supplied
     // CSeqEntryIndex: stores the index address in m_Idx, then delegates to
     // GenerateDefline(bioseq, scope, flags).
     // NOTE(review): the function header (4118) and the 'flags' parameter line
     // (4122) are missing from this listing.
4119  const CBioseq& bioseq,
4120  CScope& scope,
4121  CSeqEntryIndex& idx,
4123 )
4124 
4125 {
4126  m_Idx = &idx;
4127 
4128  return GenerateDefline(bioseq, scope, flags);
4129 }
4130 
     // Overload taking a caller-supplied feature tree: marks the feature tree as
     // both constructed and initialized, stores its address in m_Feat_Tree, then
     // delegates to GenerateDefline(bsh, flags).
     // NOTE(review): the function header (4131) and the 'flags' parameter line
     // (4134) are missing from this listing.
4132  const CBioseq_Handle& bsh,
4133  feature::CFeatTree& ftree,
4135 )
4136 
4137 {
4138  m_ConstructedFeatTree = true;
4139  m_InitializedFeatTree = true;
4140  m_Feat_Tree = &ftree;
4141 
4142  return GenerateDefline(bsh, flags);
4143 }
4144 
     // Overload taking a raw CBioseq: obtains a handle for it from the scope via
     // AddBioseq(), then delegates to GenerateDefline(bsh, flags).
     // NOTE(review): the function header (4145), the 'flags' parameter line
     // (4148) and the remaining AddBioseq() arguments (4153-4154) are missing
     // from this listing, so the priority / existing-entry policy passed to
     // AddBioseq() cannot be confirmed from here.
4146  const CBioseq& bioseq,
4147  CScope& scope,
4149 )
4150 
4151 {
4152  CBioseq_Handle bsh = scope.AddBioseq(bioseq,
4155  return GenerateDefline(bsh, flags);
4156 }
4157 
     // Overload taking a raw CBioseq, its scope and a caller-supplied feature
     // tree: marks the feature tree as both constructed and initialized, stores
     // its address in m_Feat_Tree, then delegates to
     // GenerateDefline(bioseq, scope, flags).
     // NOTE(review): the function header (4158) and the 'flags' parameter line
     // (4162) are missing from this listing.
4159  const CBioseq& bioseq,
4160  CScope& scope,
4161  feature::CFeatTree& ftree,
4163 )
4164 
4165 {
4166  m_ConstructedFeatTree = true;
4167  m_InitializedFeatTree = true;
4168  m_Feat_Tree = &ftree;
4169 
4170  return GenerateDefline(bioseq, scope, flags);
4171 }
4172 
     // Registers three phrases with match values 1, 2 and 3, then calls Prime().
     // NOTE(review): the header line (4173) is missing from this listing;
     // presumably this is the CLowQualityTextFsm constructor - TODO confirm.
4174  AddWord ("heterogeneous population sequenced", 1);
4175  AddWord ("low-quality sequence region", 2);
4176  AddWord ("unextendable partial coding region", 3);
4177  Prime ();
4178 }
4179 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
void SetOptions(const CAutoDefOptions &options)
void InitFromUserObject(const CUser_object &obj)
void SetOptionsObject(const CUser_object &user)
Definition: autodef.cpp:1196
string GetOneDefLine(CAutoDefModifierCombo *mod_combo, const CBioseq_Handle &bh, CRef< feature::CFeatTree > featTree=null)
Definition: autodef.cpp:1095
CBinomialOrgName –.
const string & GetTaxname(void) const
Definition: BioSource.cpp:340
const COrgName & GetOrgname(void) const
Definition: BioSource.cpp:410
bool IsSetOrgname(void) const
Definition: BioSource.cpp:405
bool IsSetTaxname(void) const
Definition: BioSource.cpp:335
CTempString GetSubstrain(void)
Definition: indexer.cpp:2778
int GetPatentSequence(void) const
Definition: indexer.hpp:502
bool IsHTGTech(void)
Definition: indexer.cpp:2416
CSeq_inst::TLength GetLength(void) const
Definition: indexer.hpp:474
CTempString GetCultivar(void)
Definition: indexer.cpp:2736
bool HasClone(void)
Definition: indexer.cpp:2686
CTempString GetMetaGenomeSource(void)
Definition: indexer.cpp:2788
bool IsTPAReasm(void)
Definition: indexer.cpp:2848
bool IsTPAInf(void)
Definition: indexer.cpp:2838
CTempString GetBreed(void)
Definition: indexer.cpp:2726
bool IsChromosome(void)
Definition: indexer.cpp:2606
bool IsWGS(void)
Definition: indexer.cpp:2456
CTempString GetGenus(void)
Definition: indexer.cpp:2556
bool IsNR(void) const
Definition: indexer.hpp:487
CMolInfo::TTech GetTech(void)
Definition: indexer.cpp:2396
CRef< CFeatureIndex > GetFeatureForProduct(void)
Definition: indexer.cpp:2299
bool IsEST_STS_GSS(void)
Definition: indexer.cpp:2466
bool IsWP(void) const
Definition: indexer.hpp:491
CTempString GetStrain(void)
Definition: indexer.cpp:2768
bool IsTSAMaster(void) const
Definition: indexer.hpp:494
bool IsUnreviewed(void)
Definition: indexer.cpp:2938
bool IsNM(void) const
Definition: indexer.hpp:486
bool IsTPAExp(void)
Definition: indexer.cpp:2828
bool IsHTGSPooled(void)
Definition: indexer.cpp:2818
bool IsHTGSCancelled(void)
Definition: indexer.cpp:2798
bool IsDelta(void) const
Definition: indexer.hpp:476
bool IsThirdParty(void) const
Definition: indexer.hpp:492
bool IsMap(void) const
Definition: indexer.hpp:479
bool IsAA(void) const
Definition: indexer.hpp:472
bool IsPatent(void) const
Definition: indexer.hpp:489
bool IsUnverifiedFeature(void)
Definition: indexer.cpp:2898
CTempString GetSpecies(void)
Definition: indexer.cpp:2566
CConstRef< CBioSource > GetBioSource(void)
Definition: indexer.cpp:2486
string GetrEnzyme(void)
Definition: indexer.cpp:3034
bool IsWGSMaster(void) const
Definition: indexer.hpp:493
bool IsTLSMaster(void) const
Definition: indexer.hpp:495
string GetSecondSuperKingdom(void)
Definition: indexer.cpp:2636
CMolInfo::TCompleteness GetCompleteness(void)
Definition: indexer.cpp:2406
int GetPDBChain(void) const
Definition: indexer.hpp:504
string GetPatentCountry(void) const
Definition: indexer.hpp:500
bool IsPlasmid(void)
Definition: indexer.cpp:2596
bool IsCrossKingdom(void)
Definition: indexer.cpp:2646
string GetPDBChainID(void) const
Definition: indexer.hpp:505
bool IsUnordered(void)
Definition: indexer.cpp:2858
bool IsNA(void) const
Definition: indexer.hpp:471
CTempString GetIsolate(void)
Definition: indexer.cpp:2758
CTempString GetPDBCompound(void)
Definition: indexer.cpp:2868
const string & GetTitle(void)
Definition: indexer.cpp:2366
CTempString GetMap(void)
Definition: indexer.cpp:2696
string GetPatentNumber(void) const
Definition: indexer.hpp:501
bool IsUnreviewedUnannotated(void)
Definition: indexer.cpp:2948
bool IsPseudogene(void)
Definition: indexer.cpp:2978
CTempString GetSegment(void)
Definition: indexer.cpp:2716
CTempString GetLinkageGroup(void)
Definition: indexer.cpp:2666
bool IsHTGSDraft(void)
Definition: indexer.cpp:2808
const string & GetTaxname(void)
Definition: indexer.cpp:2496
CSeq_inst::TTopology GetTopology(void) const
Definition: indexer.hpp:473
bool IsNZ(void) const
Definition: indexer.hpp:488
CWeakRef< CBioseqIndex > GetBioseqForProduct(void)
Definition: indexer.cpp:2341
int GetGeneralId(void) const
Definition: indexer.hpp:498
const string & GetOrganelle(void)
Definition: indexer.cpp:2616
CRef< CFeatureIndex > GetBestProteinFeature(void)
Definition: indexer.cpp:2353
string GetGeneralStr(void) const
Definition: indexer.hpp:497
bool IsVirtual(void) const
Definition: indexer.hpp:478
CTempString GetSpecimenVoucher(void)
Definition: indexer.cpp:2747
CTempString GetClone(void)
Definition: indexer.cpp:2676
CBioSource::TGenome GetGenome(void)
Definition: indexer.cpp:2586
CTempString GetChromosome(void)
Definition: indexer.cpp:2656
bool IsUnverifiedMisassembled(void)
Definition: indexer.cpp:2918
CMolInfo::TBiomol GetBiomol(void)
Definition: indexer.cpp:2386
bool IsHTGSUnfinished(void)
Definition: indexer.cpp:2426
CTempString GetPlasmid(void)
Definition: indexer.cpp:2706
string GetFirstSuperKingdom(void)
Definition: indexer.cpp:2626
bool IsUseBiosrc(void)
Definition: indexer.cpp:2476
bool IsTSA(void)
Definition: indexer.cpp:2446
bool IsUnverifiedContaminant(void)
Definition: indexer.cpp:2928
bool IsNC(void) const
Definition: indexer.hpp:485
bool IsMultispecies(void)
Definition: indexer.cpp:2576
bool IsTLS(void)
Definition: indexer.cpp:2436
const string & GetComment(void)
Definition: indexer.cpp:2968
bool IsUnverifiedOrganism(void)
Definition: indexer.cpp:2908
CTempString GetTargetedLocus(void)
Definition: indexer.cpp:2958
bool IsPDB(void) const
Definition: indexer.hpp:490
const string & GetDescTaxname(void)
Definition: indexer.cpp:2506
bool IsUnverified(void)
Definition: indexer.cpp:2888
CBioseq_Handle –.
Definition: Dbtag.hpp:53
bool IsSkippable(void) const
Definition: Dbtag.cpp:281
void ReplaceAndAdd(const CTempString &value, const CTempString &replace_what, const CTempString &replace_with)
void Add(const CTempString &name, const CTempString &value, EHidePart hide=eHideNone)
CDefLineJoiner(bool show_mods=false)
CTextJoiner< 64, CTempString > m_Joiner
void Join(std::string *result) const
CEMBL_block –.
Definition: EMBL_block.hpp:66
CFeatTree.
Definition: feature.hpp:173
CFeat_CI –.
Definition: feat_ci.hpp:64
CRef< CFeatureIndex > GetBestGene(void)
Definition: indexer.cpp:3204
const CMappedFeat GetMappedFeat(void) const
Definition: indexer.hpp:897
const string & GetSomeNumber(void) const
Definition: Id_pat.cpp:96
CMap_ext –.
Definition: Map_ext.hpp:66
CMappedFeat –.
Definition: mapped_feat.hpp:59
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
CPCRPrimerSet –.
CPCRReactionSet –.
CPDB_block –.
Definition: PDB_block.hpp:66
CPartialOrgName –.
CRsite_ref –.
Definition: Rsite_ref.hpp:66
CScope –.
Definition: scope.hpp:92
CRef< CBioseqIndex > GetBioseqIndex(void)
Definition: indexer.cpp:114
CSeqFeatXref –.
Definition: SeqFeatXref.hpp:66
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
CSeq_entry_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
TBase::const_iterator const_iterator
Definition: static_map.hpp:109
static bool NCBI_UseGeoLocNameForCountry(void)
Definition: SubSource.cpp:94
CTaxElement –.
Definition: TaxElement.hpp:66
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
@ eObjectType_AutodefOptions
bool IsUnverifiedMisassembled() const
bool IsUnverifiedOrganism() const
bool IsUnverifiedContaminant() const
bool IsUnverifiedFeature() const
bool IsUnverified() const
bool IsUnreviewedUnannotated() const
bool IsUnreviewed() const
EObjectType GetObjectType() const
Definition: map.hpp:338
USING_SCOPE(objects)
static size_t s_TitleEndsInOrganism(string &title, CTempString taxname)
static bool x_GetSegSeqInfoViaCDS(string &locus, string &product, const char *&completeness, const CBioseq_Handle &bsh)
static void s_AddVoucherAndIsolate(const CTempString &taxname, const CTempString &strain, const CTempString &specimen_voucher, const CTempString &isolate, CDefLineJoiner &joiner)
#define FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(Itr, Var)
void x_CleanAndCompress(string &dest, const CTempString &instr, bool isProt)
#define comma_space
#define FOR_SELECTED_SEQFEAT_ON_BIOSEQ_HANDLE(Itr, Var, Sel)
#define space_comma
static string s_RemoveWhiteSpace(string str)
#define space_semicolon
static string s_RemoveBracketedOrgFromEnd(string str, string taxname)
#define FOR_EACH_SEQFEAT_ON_BIOSEQ_HANDLE(Itr, Var, Chs)
static void x_FlyCG_PtoR(string &s)
#define semicolon_space
#define bracket_space
#define FOR_EACH_SEQFEAT_ON_SCOPE(Itr, Var, Loc, Chs)
static void s_TrimMainTitle(string &str)
#define twocommas
static bool s_EndsWithStrain(const CTempString &taxname, const CTempString &strain)
#define space_bracket
static const char * s_proteinOrganellePrefix[]
static string s_RemoveColonsAndWhiteSpace(string str)
#define twospaces
static CConstRef< CBioSource > x_GetSourceFeatViaCDS(const CBioseq_Handle &bsh)
USING_NCBI_SCOPE
EHidePart
@ eHideValue
@ eHideNone
@ eHideType
static const char * s_tpaPrefixList[]
static bool s_IsVirusOrPhage(const CTempString &taxname)
API (CDeflineGenerator) for computing sequences' titles ("definitions").
static uch flags
std::ofstream out("events_result.xml")
main entry point for tests
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static int type
Definition: getdata.c:31
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
int offset
Definition: replacements.h:160
char data[12]
Definition: iconv.c:80
#define FOR_EACH_USERFIELD_ON_USEROBJECT(Itr, Var)
FOR_EACH_USERFIELD_ON_USEROBJECT EDIT_EACH_USERFIELD_ON_USEROBJECT.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
CConstRef< CSeq_id > GetSeqId(void) const
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
CSeq_id::E_Choice Which(void) const
string GetLabel(const CSeq_id &id)
@ fAcc_master
Definition: Seq_id.hpp:256
void SetMix(TMix &v)
Definition: Seq_loc.hpp:987
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
CMappedFeat GetBestGeneForCds(const CMappedFeat &cds_feat, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0, CFeatTree::EBestGeneType lookup_type=CFeatTree::eBestGene_TreeOnly)
Definition: feature.cpp:3321
CMappedFeat GetBestOverlappingFeat(const CMappedFeat &feat, CSeqFeatData::ESubtype need_subtype, sequence::EOverlapType overlap_type, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0)
Definition: feature.cpp:3653
@ fFGL_Content
Include its content if there is any.
Definition: feature.hpp:73
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
CSeq_loc * SeqLocRevCmpl(const CSeq_loc &loc, CScope *scope)
Get reverse complement of the seq-loc (?)
@ eOverlap_SubsetRev
1st is a subset of 2nd ranges
@ eOverlap_Contained
2nd contained within 1st extremes
void x_AdjustProteinTitleSuffixIdx(const CBioseq_Handle &bsh)
CRef< feature::CFeatTree > m_Feat_Tree
CTempString m_Chromosome
subsource fields
CTempString m_SpecimenVoucher
CDeflineGenerator(void)
Constructor.
void x_SetTitleFromNR(const CBioseq_Handle &bsh)
bool m_Reconstruct
ignoring the existing title is forced for certain types
const CSeq_feat * GetCDSForProduct(const CBioseq &product, CScope *scope)
Get the encoding CDS feature of a given protein sequence.
Definition: sequence.cpp:2549
void x_SetTitleFromGPipe(void)
int TUserFlags
Binary "OR" of EUserFlags.
CTempString m_PDBCompound
pdb block fields
bool m_HTGSCancelled
genbank or embl block keyword fields
void x_DescribeClones(vector< CTempString > &desc, string &buf)
CTempString m_UnreviewedPrefix
void x_SetTitleFromMap(void)
string GenerateDefline(const CBioseq_Handle &bsh, TUserFlags flags=0)
Main method.
CTempString m_LinkageGroup
void x_SetTitleFromWGS(void)
bool m_IsNA
seq-inst fields
void x_SetTitleFromProtein(const CBioseq_Handle &bsh)
CMolInfo::TTech m_MITech
void x_SetBioSrcIdx(const CBioseq_Handle &bsh)
CTempString m_Substrain
void x_SetFlagsIdx(const CBioseq_Handle &bsh, TUserFlags flags)
CSeq_inst::TLength m_Length
string m_rEnzyme
map fields
CTempString m_Organelle
const char * x_OrganelleName(CBioSource::TGenome genome) const
CConstRef< CGene_ref > x_GetGeneRefViaCDS(const CMappedFeat &mapped_cds)
CConstRef< CSeq_feat > x_GetLongestProtein(const CBioseq_Handle &bsh)
CTempString m_Cultivar
CTempString m_UnverifiedPrefix
void x_AdjustProteinTitleSuffix(const CBioseq_Handle &bsh)
void x_SetTitleFromProteinIdx(const CBioseq_Handle &bsh)
const CBioSource * GetBioSource(const CBioseq &bioseq)
Retrieve the BioSource object for a given bioseq handle.
Definition: sequence.cpp:104
CBioSource::TGenome m_Genome
string x_GetModifiers(const CBioseq_Handle &handle)
void x_SetTitleFromPatent(void)
const CMolInfo * GetMolInfo(const CBioseq &bioseq)
Retrieve the MolInfo object for a given bioseq handle.
Definition: sequence.cpp:284
CTempString m_Comment
comment fields
bool x_IsComplete() const
void x_SetTitleFromSegSeq(const CBioseq_Handle &bsh)
void x_SetTitleFromNC(void)
~CDeflineGenerator(void)
Destructor.
bool m_IsNC
seq-id fields
CMappedFeat GetMappedCDSForProduct(const CBioseq_Handle &product)
Definition: sequence.cpp:2568
CConstRef< CBioSource > m_Source
biosource fields
CTempString m_TargetedLocus
static CSafeStatic< CLowQualityTextFsm > ms_p_Low_Quality_Fsa
void x_SetBioSrc(const CBioseq_Handle &bsh)
bool m_IsUnverified
user object fields
void x_SetTitleFromBioSrc(void)
void x_Init(void)
internal methods
void x_SetTitleFromPDB(void)
CSeq_entry_Handle m_TopSEH
internal feature tree for parent mapping
void x_SetSuffix(string &suffix, const CBioseq_Handle &bsh, bool appendComplete)
CMolInfo::TCompleteness m_MICompleteness
void x_SetTitleFromNM(const CBioseq_Handle &bsh)
bool x_CDShasLowQualityException(const CSeq_feat &sft)
void x_SetFlags(const CBioseq_Handle &bsh, TUserFlags flags)
CTempString m_Breed
orgmod fields
CRef< CSeqEntryIndex > m_Idx
index with feature tree for each Bioseq
CTempString m_MetaGenomeSource
CSeq_inst::TTopology m_Topology
void x_SetPrefix(string &prefix, const CBioseq_Handle &bsh)
CMolInfo::TBiomol m_MIBiomol
molinfo fields
@ fLocalAnnotsOnly
Never use related sequences' annotations.
@ fDevMode
Development mode for testing new features.
@ fDoNotUseAutoDef
Disable internal call to auto-def.
@ fGpipeMode
Use GPipe defaults.
@ fOmitTaxonomicName
Do not add organism suffix to proteins.
@ fFastaFormat
Generate FASTA defline.
@ fShowModifiers
Show key-value pair modifiers (e.g. "[organism=Homo sapiens]")
@ fIgnoreExisting
Generate fresh titles unconditionally.
@ fAllProteinNames
List all relevant proteins, not just one.
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
@ eExist_Get
Definition: scope.hpp:260
@ kPriority_Default
Use default priority for added data.
Definition: scope.hpp:100
const TInst_Ext & GetInst_Ext(void) const
bool IsSetInst_Ext(void) const
const CSeqFeatData & GetData(void) const
bool IsAa(void) const
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle. Throws an exception if none is available.
bool IsSetInst_Length(void) const
TInst_Topology GetInst_Topology(void) const
TInst_Length GetInst_Length(void) const
bool IsSetInst(void) const
bool IsSetInst_Repr(void) const
bool CanGetInst_Topology(void) const
TInst_Repr GetInst_Repr(void) const
CScope & GetScope(void) const
Get scope this handle belongs to.
bool IsNa(void) const
SAnnotSelector & SetResolveTSE(void)
SetResolveTSE() is equivalent to SetResolveMethod(eResolve_TSE).
SAnnotSelector & SetFeatType(TFeatType type)
Set feature type (also set annotation type to feat)
const CSeq_feat & GetOriginalFeature(void) const
Get original feature with unmapped location/product.
SAnnotSelector & IncludeFeatType(TFeatType type)
Include feature type in the search.
vector< CSeqdesc::E_Choice > TDescChoices
Definition: seqdesc_ci.hpp:67
@ fFindGap
Definition: seq_map.hpp:130
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
Definition: ncbiobj.hpp:1385
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
TRefType Lock(void) const
Lock the object and return reference to it.
Definition: ncbiobj.hpp:2713
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
std::string CStringUTF8
Definition: ncbistl.hpp:254
void AddWord(const string &word, const int &match)
Definition: strsearch.hpp:343
void Prime(void)
Definition: strsearch.hpp:376
static string HtmlDecode(const CTempString str, EEncoding encoding=eEncoding_Unknown, THtmlDecode *result_flags=NULL)
Decode HTML entities and character references.
Definition: ncbistr.cpp:4527
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define kEmptyStr
Definition: ncbistr.hpp:123
CTextJoiner & Add(const TIn &s)
Definition: text_joiner.hpp:79
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
void Join(TOut *result) const
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
static SIZE_TYPE FindCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case sensitive search.
Definition: ncbistr.hpp:5490
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
void clear(void)
Clears the string.
Definition: tempstr.hpp:351
size_t size_type
Definition: tempstr.hpp:70
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:776
size_type find_first_not_of(const CTempString match, size_type pos=0) const
Find the first occurrence of any character not in the matching string within the current string,...
Definition: tempstr.hpp:553
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
Definition: tempstr.hpp:655
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3405
const char *const kEmptyCStr
Empty "C" string (points to a '\0').
Definition: ncbistr.cpp:68
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
@ eReverseSearch
Search in a backward direction.
Definition: ncbistr.hpp:1947
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
const TKeywords & GetKeywords(void) const
Get the Keywords member data.
bool IsSetKeywords(void) const
Check if a value has been assigned to Keywords data member.
const TKeywords & GetKeywords(void) const
Get the Keywords member data.
Definition: GB_block_.hpp:526
bool IsSetKeywords(void) const
Check if a value has been assigned to Keywords data member.
Definition: GB_block_.hpp:514
const TCountry & GetCountry(void) const
Get the Country member data.
Definition: Id_pat_.hpp:478
const Tdata & Get(void) const
Get the member data.
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
const TPcr_primers & GetPcr_primers(void) const
Get the Pcr_primers member data.
Definition: BioSource_.hpp:588
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
const Tdata & Get(void) const
Get the member data.
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
list< CRef< CSubSource > > TSubtype
Definition: BioSource_.hpp:145
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
bool IsSetPcr_primers(void) const
Check if a value has been assigned to Pcr_primers data member.
Definition: BioSource_.hpp:576
bool CanGetGenome(void) const
Check if it is safe to call GetGenome method.
Definition: BioSource_.hpp:403
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
TSubtype GetSubtype(void) const
Get the Subtype member data.
Definition: SubSource_.hpp:310
bool CanGet(void) const
Check if it is safe to call Get method.
bool IsSetGenome(void) const
Check if a value has been assigned to Genome data member.
Definition: BioSource_.hpp:397
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: SubSource_.hpp:291
bool CanGet(void) const
Check if it is safe to call Get method.
const TName & GetName(void) const
Get the Name member data.
Definition: SubSource_.hpp:350
list< CRef< CPCRReaction > > Tdata
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
Definition: SubSource_.hpp:338
list< CRef< CPCRPrimer > > Tdata
@ eSubtype_collection_date
DD-MMM-YYYY format.
Definition: SubSource_.hpp:114
@ eSubtype_lat_lon
+/- decimal degrees
Definition: SubSource_.hpp:113
@ eSubtype_collected_by
name of person who collected the sample
Definition: SubSource_.hpp:115
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
@ eSubtype_endogenous_virus_name
Definition: SubSource_.hpp:109
@ eSubtype_identified_by
name of person who identified the sample
Definition: SubSource_.hpp:116
@ eGenome_plasmid_in_mitochondrion
Definition: BioSource_.hpp:121
const TDesc & GetDesc(void) const
Get the Desc member data.
Definition: Gene_ref_.hpp:599
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
Definition: Gene_ref_.hpp:781
bool IsSetLocus(void) const
Official gene symbol. Check if a value has been assigned to Locus data member.
Definition: Gene_ref_.hpp:493
bool IsSetDesc(void) const
descriptive name. Check if a value has been assigned to Desc data member.
Definition: Gene_ref_.hpp:587
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
Definition: Gene_ref_.hpp:793
const TLocus & GetLocus(void) const
Get the Locus member data.
Definition: Gene_ref_.hpp:505
const TStr & GetStr(void) const
Get the variant data.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
const TTag & GetTag(void) const
Get the Tag member data.
Definition: Dbtag_.hpp:267
bool IsId(void) const
Check if variant Id is selected.
Definition: Object_id_.hpp:264
const TData & GetData(void) const
Get the Data member data.
bool IsSetTag(void) const
appropriate tag. Check if a value has been assigned to Tag data member.
Definition: Dbtag_.hpp:255
bool IsStr(void) const
Check if variant Str is selected.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TLabel & GetLabel(void) const
Get the Label member data.
const TType & GetType(void) const
Get the Type member data.
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
bool IsSetPgcode(void) const
plastid genetic code. Check if a value has been assigned to Pgcode data member.
Definition: OrgName_.hpp:1040
TFixed_level GetFixed_level(void) const
Get the Fixed_level member data.
TMgcode GetMgcode(void) const
Get the Mgcode member data.
Definition: OrgName_.hpp:965
TGcode GetGcode(void) const
Get the Gcode member data.
Definition: OrgName_.hpp:918
const TSubname & GetSubname(void) const
Get the Subname member data.
Definition: OrgMod_.hpp:347
bool IsSetFixed_level(void) const
Check if a value has been assigned to Fixed_level data member.
bool IsPartial(void) const
Check if variant Partial is selected.
Definition: OrgName_.hpp:753
const TName & GetName(void) const
Get the Name member data.
Definition: OrgName_.hpp:771
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
bool IsSetMgcode(void) const
mitochondrial genetic code. Check if a value has been assigned to Mgcode data member.
Definition: OrgName_.hpp:946
const TLevel & GetLevel(void) const
Get the Level member data.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
const TBinomial & GetBinomial(void) const
Get the variant data.
Definition: OrgName_.cpp:121
bool IsSetMod(void) const
Check if a value has been assigned to Mod data member.
Definition: OrgName_.hpp:827
const Tdata & Get(void) const
Get the member data.
const TName & GetName(void) const
Get the Name member data.
list< CRef< COrgMod > > TMod
Definition: OrgName_.hpp:332
bool IsSetGenus(void) const
required. Check if a value has been assigned to Genus data member.
const TSpecies & GetSpecies(void) const
Get the Species member data.
list< CRef< CTaxElement > > Tdata
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
bool IsSetSubname(void) const
Check if a value has been assigned to Subname data member.
Definition: OrgMod_.hpp:335
bool IsSetTaxname(void) const
preferred formal name. Check if a value has been assigned to Taxname data member.
Definition: Org_ref_.hpp:360
bool IsSetLevel(void) const
Check if a value has been assigned to Level data member.
bool IsSetGcode(void) const
genetic code (see CdRegion). Check if a value has been assigned to Gcode data member.
Definition: OrgName_.hpp:899
const TGenus & GetGenus(void) const
Get the Genus member data.
const TPartial & GetPartial(void) const
Get the variant data.
Definition: OrgName_.cpp:193
bool IsSet(void) const
Check if a value has been assigned to data member.
bool IsSetSpecies(void) const
species required if subspecies used. Check if a value has been assigned to Species data member.
TPgcode GetPgcode(void) const
Get the Pgcode member data.
Definition: OrgName_.hpp:1059
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
Definition: OrgName_.hpp:759
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
bool IsBinomial(void) const
Check if variant Binomial is selected.
Definition: OrgName_.hpp:715
@ eSubtype_biotype
Definition: OrgMod_.hpp:97
@ eSubtype_subgroup
Definition: OrgMod_.hpp:99
@ eSubtype_gb_acronym
used by taxonomy database
Definition: OrgMod_.hpp:115
@ eSubtype_gb_synonym
used by taxonomy database
Definition: OrgMod_.hpp:117
@ eSubtype_substrain
Definition: OrgMod_.hpp:86
@ eSubtype_anamorph
Definition: OrgMod_.hpp:112
@ eSubtype_pathovar
Definition: OrgMod_.hpp:94
@ eSubtype_other
ASN5: old-name (254) will be added to next spec.
Definition: OrgMod_.hpp:125
@ eSubtype_dosage
chromosome dosage of hybrid
Definition: OrgMod_.hpp:103
@ eSubtype_authority
Definition: OrgMod_.hpp:107
@ eSubtype_sub_species
Definition: OrgMod_.hpp:105
@ eSubtype_nat_host
natural host of this specimen
Definition: OrgMod_.hpp:104
@ eSubtype_cultivar
Definition: OrgMod_.hpp:93
@ eSubtype_variety
Definition: OrgMod_.hpp:89
@ eSubtype_strain
Definition: OrgMod_.hpp:85
@ eSubtype_metagenome_source
Definition: OrgMod_.hpp:120
@ eSubtype_biovar
Definition: OrgMod_.hpp:96
@ eSubtype_subtype
Definition: OrgMod_.hpp:88
@ eSubtype_teleomorph
Definition: OrgMod_.hpp:113
@ eSubtype_serogroup
Definition: OrgMod_.hpp:91
@ eSubtype_synonym
Definition: OrgMod_.hpp:111
@ eSubtype_group
Definition: OrgMod_.hpp:98
@ eSubtype_type_material
Definition: OrgMod_.hpp:121
@ eSubtype_acronym
Definition: OrgMod_.hpp:102
@ eSubtype_specimen_voucher
Definition: OrgMod_.hpp:106
@ eSubtype_serotype
Definition: OrgMod_.hpp:90
@ eSubtype_chemovar
Definition: OrgMod_.hpp:95
@ eSubtype_nomenclature
code of nomenclature in subname (B,P,V,Z or combination)
Definition: OrgMod_.hpp:122
@ eSubtype_serovar
Definition: OrgMod_.hpp:92
@ eSubtype_bio_material
Definition: OrgMod_.hpp:119
@ eSubtype_gb_anamorph
used by taxonomy database
Definition: OrgMod_.hpp:116
@ eSubtype_culture_collection
Definition: OrgMod_.hpp:118
@ eSubtype_ecotype
Definition: OrgMod_.hpp:110
@ eSubtype_forma_specialis
Definition: OrgMod_.hpp:109
@ eSubtype_isolate
Definition: OrgMod_.hpp:100
bool IsSetDesc(void) const
description (instead of name). Check if a value has been assigned to Desc data member.
Definition: Prot_ref_.hpp:391
EProcessed
processing status
Definition: Prot_ref_.hpp:95
TProcessed GetProcessed(void) const
Get the Processed member data.
Definition: Prot_ref_.hpp:538
bool IsSetProcessed(void) const
Check if a value has been assigned to Processed data member.
Definition: Prot_ref_.hpp:513
const TDesc & GetDesc(void) const
Get the Desc member data.
Definition: Prot_ref_.hpp:403
bool IsStr(void) const
Check if variant Str is selected.
Definition: Rsite_ref_.hpp:264
const TStr & GetStr(void) const
Get the variant data.
Definition: Rsite_ref_.hpp:270
const TData & GetData(void) const
Get the Data member data.
bool IsSetData(void) const
the specific data. Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
bool IsSetPartial(void) const
incomplete in some way? Check if a value has been assigned to Partial data member.
Definition: Seq_feat_.hpp:943
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
bool IsGene(void) const
Check if variant Gene is selected.
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
bool IsSetData(void) const
the specific data. Check if a value has been assigned to Data data member.
bool IsSetExcept(void) const
something funny about this? Check if a value has been assigned to Except data member.
Definition: Seq_feat_.hpp:990
const TExcept_text & GetExcept_text(void) const
Get the Except_text member data.
Definition: Seq_feat_.hpp:1405
bool IsSetExcept_text(void) const
explain if except=TRUE. Check if a value has been assigned to Except_text data member.
Definition: Seq_feat_.hpp:1393
const TBiosrc & GetBiosrc(void) const
Get the variant data.
bool IsBiosrc(void) const
Check if variant Biosrc is selected.
const TGene & GetGene(void) const
Get the variant data.
const TProt & GetProt(void) const
Get the variant data.
TExcept GetExcept(void) const
Get the Except member data.
Definition: Seq_feat_.hpp:1009
bool IsSetLocation(void) const
feature made from. Check if a value has been assigned to Location data member.
Definition: Seq_feat_.hpp:1105
TChain GetChain(void) const
Get the Chain member data.
bool IsSetChain_id(void) const
chain identifier; length-independent generalization of 'chain' Check if a value has been assigned to ...
bool IsSetChain(void) const
Deprecated: 'chain' can't support multiple character PDB chain identifiers (introduced in 2015).
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
bool IsSetCit(void) const
patent citation. Check if a value has been assigned to Cit data member.
TSeqid GetSeqid(void) const
Get the Seqid member data.
const TChain_id & GetChain_id(void) const
Get the Chain_id member data.
bool IsSetSeqid(void) const
number of sequence in patent. Check if a value has been assigned to Seqid data member.
const TCit & GetCit(void) const
Get the Cit member data.
const TAccession & GetAccession(void) const
Get the Accession member data.
bool IsMap(void) const
Check if variant Map is selected.
Definition: Seq_ext_.hpp:330
const TSeg & GetSeg(void) const
Get the variant data.
Definition: Seq_ext_.cpp:114
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
ERepr
representation class
Definition: Seq_inst_.hpp:91
const Tdata & Get(void) const
Get the member data.
Definition: Map_ext_.hpp:164
const TMap & GetMap(void) const
Get the variant data.
Definition: Seq_ext_.cpp:158
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
list< CRef< CSeq_feat > > Tdata
Definition: Map_ext_.hpp:89
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
ETopology
topology of molecule
Definition: Seq_inst_.hpp:121
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
TCompleteness GetCompleteness(void) const
Get the Completeness member data.
Definition: MolInfo_.hpp:594
bool IsSet(void) const
Check if a value has been assigned to data member.
Definition: Map_ext_.hpp:152
@ eRepr_seg
segmented sequence
Definition: Seq_inst_.hpp:95
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_map
ordered map of any kind
Definition: Seq_inst_.hpp:99
@ eRepr_virtual
no seq data
Definition: Seq_inst_.hpp:93
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_physmap
from physical mapping techniques
Definition: MolInfo_.hpp:129
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_both
concept transl. w/ partial pept. seq.
Definition: MolInfo_.hpp:133
@ eTech_seq_pept_homol
sequenced peptide, ordered by homology
Definition: MolInfo_.hpp:135
@ eTech_composite_wgs_htgs
composite of WGS and HTGS
Definition: MolInfo_.hpp:145
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_seq_pept_overlap
sequenced peptide, ordered by overlap
Definition: MolInfo_.hpp:134
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_concept_trans
conceptual translation
Definition: MolInfo_.hpp:131
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_standard
standard sequencing
Definition: MolInfo_.hpp:124
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_seq_pept
peptide was sequenced
Definition: MolInfo_.hpp:132
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_barcode
barcode of life project
Definition: MolInfo_.hpp:144
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_derived
derived from other data, not a primary entity
Definition: MolInfo_.hpp:130
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ eTech_concept_trans_a
conceptual transl. supplied by author
Definition: MolInfo_.hpp:136
@ eTech_genemap
from genetic mapping techniques
Definition: MolInfo_.hpp:128
@ e_Embl
EMBL specific information.
Definition: Seqdesc_.hpp:127
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Genbank
GenBank specific info.
Definition: Seqdesc_.hpp:121
@ e_Comment
a more extensive comment
Definition: Seqdesc_.hpp:117
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Title
a title for this sequence
Definition: Seqdesc_.hpp:115
@ e_Pdb
PDB specific information.
Definition: Seqdesc_.hpp:131
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
Definition of all error codes used in objmgr libraries (xobjmgr.lib, xobjutil.lib and others).
char * buf
int i
int len
static char * subname
Definition: mdb_load.c:26
constexpr bool empty(list< Ts... >) noexcept
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
const CharType(& source)[N]
Definition: pointer.h:1149
Static variables safety - create on demand, destroy on application termination.
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
int toupper(Uchar c)
Definition: ncbictype.hpp:73
int isprint(Uchar c)
Definition: ncbictype.hpp:67
int ispunct(Uchar c)
Definition: ncbictype.hpp:68
int islower(Uchar c)
Definition: ncbictype.hpp:66
Int mod(Int i, Int j)
Definition: njn_integer.hpp:67
std::istream & in(std::istream &in_, double &x_)
Int4 delta(size_t dimension_, const Int4 *score_)
static const char * suffix[]
Definition: pcregrep.c:408
static const char * prefix[]
Definition: pcregrep.c:405
#define NCBI_BIOMOL(Type)
CMolInfo definitions.
Definition: seq_macros.hpp:110
#define NCBI_TECH(Type)
Definition: seq_macros.hpp:118
#define NCBI_COMPLETENESS(Type)
Definition: seq_macros.hpp:130
#define NCBI_SEQTOPOLOGY(Type)
Definition: seq_macros.hpp:66
#define FOR_EACH_COMPOUND_ON_PDBBLOCK(Itr, Var)
FOR_EACH_COMPOUND_ON_PDBBLOCK EDIT_EACH_COMPOUND_ON_PDBBLOCK.
#define FOR_EACH_NAME_ON_PROT
#define FOR_EACH_ACTIVITY_ON_PROT
#define NCBI_GENOME(Type)
@NAME Convenience macros for NCBI objects
#define FOR_EACH_SYNONYM_ON_GENEREF(Itr, Var)
FOR_EACH_SYNONYM_ON_GENEREF EDIT_EACH_SYNONYM_ON_GENEREF.
#define SWITCH_ON_SUBSOURCE_CHOICE(Var)
SWITCH_ON_SUBSOURCE_CHOICE.
#define NCBI_ORGMOD(Type)
COrgMod definitions.
#define FOR_EACH_ORGMOD_ON_BIOSOURCE(Itr, Var)
FOR_EACH_ORGMOD_ON_BIOSOURCE EDIT_EACH_ORGMOD_ON_BIOSOURCE.
#define FOR_EACH_SEQFEATXREF_ON_FEATURE
#define SWITCH_ON_ORGMOD_CHOICE(Var)
SWITCH_ON_ORGMOD_CHOICE.
#define NCBI_SEQFEAT(Type)
CSeq_feat definitions.
#define FOR_EACH_SUBSOURCE_ON_BIOSOURCE(Itr, Var)
FOR_EACH_SUBSOURCE_ON_BIOSOURCE EDIT_EACH_SUBSOURCE_ON_BIOSOURCE.
#define FEATURE_CHOICE_IS
FEATURE_CHOICE_IS SWITCH_ON_FEATURE_CHOICE.
#define NCBI_SUBSOURCE(Type)
CSubSource definitions.
CBioSource::TGenome TBIOSOURCE_GENOME
#define SWITCH_ON_FEATURE_CHOICE
#define FOR_EACH_SEQFEATXREF_ON_SEQFEAT(Itr, Var)
FOR_EACH_SEQFEATXREF_ON_SEQFEAT EDIT_EACH_SEQFEATXREF_ON_SEQFEAT.
#define FOR_EACH_SYNONYM_ON_GENE
CSeq_id::EAccessionInfo TACCN_CHOICE
#define NCBI_SEQID(Type)
@NAME Convenience macros for NCBI objects
#define NCBI_ACCN(Type)
#define FIELD_IS_SET_AND_IS(Var, Fld, Chs)
FIELD_IS_SET_AND_IS base macro.
#define FIELD_IS(Var, Fld)
Generic FIELD macros.
#define FOR_EACH_STRING_IN_LIST(Itr, Var)
FOR_EACH_STRING_IN_LIST EDIT_EACH_STRING_IN_LIST.
#define FIELD_IS_SET(Var, Fld)
FIELD_IS_SET base macro.
#define GET_FIELD(Var, Fld)
GET_FIELD base macro.
#define FOR_EACH_CHAR_IN_STRING(Itr, Var)
FOR_EACH_CHAR_IN_STRING EDIT_EACH_CHAR_IN_STRING.
static const TTechMapEntry sc_TechArray[]
SStaticPair< const char *, CMolInfo::TTech > TTechMapEntry
CStaticPairArrayMap< const char *, CMolInfo::TTech, CSourceModParser::PKeyCompare > TTechMap
#define DEFINE_STATIC_ARRAY_MAP(Type, Var, Array)
Definition: static_set.hpp:888
SAnnotSelector –.
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
Definition: static_set.hpp:60
Definition: type.c:6
#define _TROUBLE
#define _ASSERT
#define Type
Template for collecting and joining strings with a minimum of heap churn.
else result
Definition: token2.c:20
Modified on Wed Apr 17 13:09:51 2024 by modify_doxy.py rev. 669887