NCBI C++ ToolKit
create_defline.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Jonathan Kans, Aaron Ucko
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 
33 #include <ncbi_pch.hpp>
34 
36 
38 #include <util/text_joiner.hpp>
39 #include <serial/iterator.hpp>
40 
42 #include <objects/seq/Map_ext.hpp>
44 
45 #include <objmgr/annot_ci.hpp>
46 #include <objmgr/feat_ci.hpp>
47 #include <objmgr/seq_map_ci.hpp>
48 #include <objmgr/seqdesc_ci.hpp>
49 #include <objmgr/mapped_feat.hpp>
50 #include <objmgr/seq_entry_ci.hpp>
51 #include <objmgr/error_codes.hpp>
52 
53 #include <objmgr/util/feature.hpp>
54 #include <objmgr/util/sequence.hpp>
55 #include <objmgr/util/autodef.hpp>
56 
59 USING_SCOPE(sequence);
60 USING_SCOPE(feature);
61 
62 #define NCBI_USE_ERRCODE_X ObjMgr_SeqUtil
63 
66 {
// Accumulates the pieces of a definition line in m_Joiner. Depending on
// m_ShowMods, each Add() call contributes either a bracketed " [name=value]"
// modifier or plain " name value" text.
// NOTE(review): the class-head line, the body of Join(), the first line of
// the ReplaceAndAdd() signature and the data-member declarations are not
// present in this listing.
67 public:
 // show_mods selects the bracketed "[name=value]" output style used by Add().
68  CDefLineJoiner(bool show_mods = false)
69  : m_ShowMods(show_mods)
70  {
71  }
 // Appends one name/value pair.
 //  - m_ShowMods set: nothing is added when either string is empty; otherwise
 //    " [name=value]" is added, and a value containing '"' or '=' is wrapped
 //    in double quotes with its own double quotes turned into single quotes.
 //  - m_ShowMods clear: " name" is added only when hide == eHideNone and name
 //    is non-empty, and " value" is added whenever value is non-empty.
72  void Add(const CTempString &name, const CTempString &value, EHidePart hide = eHideNone)
73  {
74  if (m_ShowMods)
75  {
76  if (name.empty() || value.empty()) {
77  return;
78  }
79  // The case of no quotes is much more common, so optimize for that
80  if (value.find_first_of("\"=") != string::npos) {
81  // rarer case: bad characters in value name, so
82  // we need surrounding double-quotes and we need to change
83  // double-quotes to single-quotes.
84  m_Joiner.Add(" [").Add(name).Add("=\"");
85  ReplaceAndAdd(value, "\"", "'");
86  m_Joiner.Add("\"]");
87  } else {
88  m_Joiner.Add(" [").Add(name).Add("=").Add(value).Add("]");
89  }
90  }
91  else
92  {
93  if (eHideNone == hide && !name.empty()) {
94  m_Joiner.Add(" ").Add(name);
95  }
96  if (!value.empty()) {
97  m_Joiner.Add(" ").Add(value);
98  }
99  }
100  }
 // NOTE(review): the statement inside Join() is missing from this listing.
101  void Join(std::string* result) const
102  {
104  }
105 private:
 // Adds `value` to m_Joiner, emitting replace_with in place of every
 // occurrence of replace_what.
 // NOTE(review): the search resumes at p2 + 1, i.e. one character past the
 // start of the match, which fits the single-character replace_what that
 // Add() passes.
107  const CTempString &replace_what, const CTempString &replace_with)
108  {
109  // commented out: CTempString is immutable
110  //string fixed = NStr::Replace(value, "\"", "'");
 // Both assignments of value.length() to p2 (here and in the loop increment)
 // are overwritten by the find() in the loop condition before being read.
111  CTempString::size_type p1 = 0, p2 = value.length();
112  for (; (p2 = value.find(replace_what, p1)) != string::npos;
113  p1 = p2 + 1, p2 = value.length()) {
114  m_Joiner.Add(value.substr(p1, p2 - p1)).Add(replace_with);
115  }
 // p2 holds the failed find() result here, so the requested length runs past
 // the end of value; presumably substr() clamps it to the remainder - TODO confirm.
116  m_Joiner.Add(value.substr(p1, p2 - p1));
117  }
120 };
121 
122 // constructor: clears both feature-tree flags, then runs the shared x_Init() setup
// NOTE(review): the signature line is not present in this listing.
124 {
125  m_ConstructedFeatTree = false;
126  m_InitializedFeatTree = false;
127  x_Init();
128 }
129 
130 // constructor: runs x_Init(), then records the top-level Seq-entry handle `tseh`
// NOTE(review): the signature line (which declares `tseh`) is not present in this listing.
132 {
133  // initialize common bits (FSA)
134  x_Init();
135 
136  // then store top SeqEntry Handle for building CFeatTree when first needed
137  m_TopSEH = tseh;
 // the "constructed" flag is raised immediately while "initialized" stays false
138  m_ConstructedFeatTree = true;
139  m_InitializedFeatTree = false;
140 }
141 
142 // destructor: empty body, no explicit cleanup is performed here
// NOTE(review): the signature line is not present in this listing.
144 
145 {
146 }
147 
149 {
 // NOTE(review): the signature line is not present in this listing; presumably
 // this is the x_Init() called by both constructors - confirm.
150  // nothing here yet
151 }
152 
153 // macros
// Each *_ITERATOR macro declares an iterator object named Itr; each FOR_*
// macro wraps that declaration in a for loop that runs while Itr tests true
// and advances it with ++Itr. The Chs argument is token-pasted onto
// CSeqFeatData::e_ to select the feature choice.
154 
155 // SEQENTRY_HANDLE_ON_SEQENTRY_HANDLE_ITERATOR
156 // FOR_EACH_SEQENTRY_HANDLE_ON_SEQENTRY_HANDLE
157 // CSeq_entry_Handle as input,
158 // dereference with CSeq_entry_Handle var = *Itr;
159 
160 #define SEQENTRY_HANDLE_ON_SEQENTRY_HANDLE_ITERATOR(Itr, Var) \
161 CSeq_entry_CI Itr(Var)
162 
163 #define FOR_EACH_SEQENTRY_HANDLE_ON_SEQENTRY_HANDLE(Itr, Var) \
164 for (SEQENTRY_HANDLE_ON_SEQENTRY_HANDLE_ITERATOR(Itr, Var); Itr; ++Itr)
165 
166 // FOR_EACH_SEQID_ON_BIOSEQ_HANDLE
167 // CBioseq_Handle& as input,
168 // dereference with CSeq_id_Handle sid = *Itr;
// Var is expanded as Var.GetId() without parentheses, so pass a plain object
// expression rather than, for example, a dereferenced pointer.
169 
170 #define FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(Itr, Var) \
171 ITERATE (CBioseq_Handle::TId, Itr, Var.GetId())
172 
173 // SEQFEAT_ON_BIOSEQ_HANDLE_ITERATOR
174 // FOR_EACH_SEQFEAT_ON_BIOSEQ_HANDLE
175 // CBioseq_Handle& as input,
176 // dereference with const CSeq_feat& sft = Itr->GetOriginalFeature();
177 
178 #define SEQFEAT_ON_BIOSEQ_HANDLE_ITERATOR(Itr, Var, Chs) \
179 CFeat_CI Itr(Var, CSeqFeatData::e_##Chs)
180 
181 #define FOR_EACH_SEQFEAT_ON_BIOSEQ_HANDLE(Itr, Var, Chs) \
182 for (SEQFEAT_ON_BIOSEQ_HANDLE_ITERATOR(Itr, Var, Chs); Itr; ++Itr)
183 
184 // SEQFEAT_ON_SCOPE_ITERATOR
185 // FOR_EACH_SEQFEAT_ON_SCOPE
186 // CScope& as input,
187 // dereference with const CSeq_feat& sft = Itr->GetOriginalFeature();
188 
189 #define SEQFEAT_ON_SCOPE_ITERATOR(Itr, Var, Loc, Chs) \
190 CFeat_CI Itr(Var, Loc, CSeqFeatData::e_##Chs)
191 
192 #define FOR_EACH_SEQFEAT_ON_SCOPE(Itr, Var, Loc, Chs) \
193 for (SEQFEAT_ON_SCOPE_ITERATOR(Itr, Var, Loc, Chs); Itr; ++Itr)
194 
195 // SELECTED_SEQFEAT_ON_BIOSEQ_HANDLE_ITERATOR
196 // FOR_SELECTED_SEQFEAT_ON_BIOSEQ_HANDLE
197 // CBioseq_Handle& and SAnnotSelector as input,
198 // dereference with const CSeq_feat& sft = Itr->GetOriginalFeature();
199 
200 #define SELECTED_SEQFEAT_ON_BIOSEQ_HANDLE_ITERATOR(Itr, Var, Sel) \
201 CFeat_CI Itr(Var, Sel)
202 
203 #define FOR_SELECTED_SEQFEAT_ON_BIOSEQ_HANDLE(Itr, Var, Sel) \
204 for (SELECTED_SEQFEAT_ON_BIOSEQ_HANDLE_ITERATOR(Itr, Var, Sel); Itr; ++Itr)
205 
206 // Copied from CleanAndCompress in objtools/format/utils.cpp
207 
208 // two-bytes combinations we're looking to clean
209 #define twochars(a,b) Uint2((a) << 8 | (b))
210 #define twocommas twochars(',',',')
211 #define twospaces twochars(' ',' ')
212 #define space_comma twochars(' ',',')
213 #define space_bracket twochars(' ',')')
214 #define bracket_space twochars('(',' ')
215 #define space_semicolon twochars(' ',';')
216 #define comma_space twochars(',',' ')
217 #define semicolon_space twochars(';',' ')
218 
219 void x_CleanAndCompress(string& dest, const CTempString& instr, bool isProt)
220 {
221  size_t left = instr.size();
222  // this is the input stream
223  const char* in = instr.data();
224 
225  // skip front white spaces
226  while (left && *in == ' ')
227  {
228  in++;
229  left--;
230  }
231  // forget end white spaces
232  while (left && in[left - 1] == ' ')
233  {
234  left--;
235  }
236 
237  dest.resize(left);
238 
239  if (left < 1) return;
240 
241  // this is where we write result
242  char* out = (char*)dest.c_str();
243 
244  char curr = *in++; // initialize with first character
245  left--;
246 
247  char next = 0;
248  Uint2 two_chars = curr; // this is two bytes storage where we see current and previous symbols
249 
250  while (left > 0) {
251  next = *in++;
252 
253  two_chars = Uint2((two_chars << 8) | next);
254 
255  switch (two_chars)
256  {
257  case twocommas: // replace double commas with comma+space
258  *out++ = curr;
259  next = ' ';
260  break;
261  case twospaces: // skip multispaces (only print last one)
262  break;
263  case bracket_space: // skip space after bracket
264  next = curr;
265  two_chars = curr;
266  break;
267  case space_bracket: // skip space before bracket
268  break;
269  case space_comma:
270  case space_semicolon: // swap characters
271  *out++ = next;
272  next = curr;
273  two_chars = curr;
274  break;
275  case comma_space:
276  *out++ = curr;
277  *out++ = ' ';
278  while (next == ' ' || next == ',') {
279  next = *in;
280  in++;
281  left--;
282  }
283  two_chars = next;
284  break;
285  case semicolon_space:
286  *out++ = curr;
287  *out++ = ' ';
288  while (next == ' ' || next == ';') {
289  next = *in;
290  in++;
291  left--;
292  }
293  two_chars = next;
294  break;
295  default:
296  *out++ = curr;
297  break;
298  }
299 
300  curr = next;
301  left--;
302  }
303 
304  if (curr > 0 && curr != ' ') {
305  *out++ = curr;
306  }
307 
308  dest.resize(out - dest.c_str());
309 
310  if (isProt) {
311  NStr::ReplaceInPlace (dest, ". [", " [");
312  NStr::ReplaceInPlace (dest, ", [", " [");
313  }
314 }
315 
316 static bool s_IsVirusOrPhage(const CTempString& taxname)
317 {
318  return (NStr::FindNoCase(taxname, "virus") != NPOS ||
319  NStr::FindNoCase(taxname, "phage") != NPOS);
320 }
321 
322 
324  TBIOSOURCE_GENOME genome
325  ) const
326 {
 // Maps a BioSource genome code to the location word used in the definition
 // line. Returns kEmptyCStr for codes without a case below and for the cases
 // that are deliberately suppressed (see the individual branches).
 // NOTE(review): the line carrying the return type and function name is not
 // present in this listing.
327  const char* result = kEmptyCStr;
328 
329  const bool has_plasmid = !m_Plasmid.empty();
330 
331 
332  switch (genome) {
333  case NCBI_GENOME(chloroplast):
334  result = "chloroplast";
335  break;
336  case NCBI_GENOME(chromoplast):
337  result = "chromoplast";
338  break;
339  case NCBI_GENOME(kinetoplast):
340  result = "kinetoplast";
341  break;
342  case NCBI_GENOME(mitochondrion):
343  {
 // adjective form only outside FASTA format, and only when a plasmid name
 // is present or the record is WGS
344  if (!m_FastaFormat && (has_plasmid || m_IsWGS)) {
345  result = "mitochondrial";
346  } else {
347  result = "mitochondrion";
348  }
349  break;
350  }
351  case NCBI_GENOME(plastid):
352  result = "plastid";
353  break;
354  case NCBI_GENOME(macronuclear):
355  {
356  result = "macronuclear";
357  break;
358  }
359  case NCBI_GENOME(extrachrom):
360  {
 // left empty for WGS records
361  if (!m_IsWGS) {
362  result = "extrachromosomal";
363  }
364  break;
365  }
366  case NCBI_GENOME(plasmid):
367  {
 // left empty for WGS records
368  if (!m_IsWGS) {
369  result = "plasmid";
370  }
371  break;
372  }
373  // transposon and insertion-seq are obsolete
374  case NCBI_GENOME(cyanelle):
375  result = "cyanelle";
376  break;
377  case NCBI_GENOME(proviral):
378  {
 // left empty when the taxname already contains "virus" or "phage";
 // unlike the mitochondrion case, m_FastaFormat is not consulted here
379  if (!s_IsVirusOrPhage(m_Taxname)) {
380  if (has_plasmid || m_IsWGS) {
381  result = "proviral";
382  } else {
383  result = "provirus";
384  }
385  }
386  break;
387  }
388  case NCBI_GENOME(virion):
389  {
 // left empty when the taxname already contains "virus" or "phage"
390  if (!s_IsVirusOrPhage(m_Taxname)) {
391  result = "virus";
392  }
393  break;
394  }
395  case NCBI_GENOME(nucleomorph):
396  {
 // left empty for WGS records
397  if (!m_IsWGS) {
398  result = "nucleomorph";
399  }
400  break;
401  }
402  case NCBI_GENOME(apicoplast):
403  result = "apicoplast";
404  break;
405  case NCBI_GENOME(leucoplast):
406  result = "leucoplast";
407  break;
408  case NCBI_GENOME(proplastid):
409  result = "proplastid";
410  break;
411  case NCBI_GENOME(endogenous_virus):
412  result = "endogenous virus";
413  break;
414  case NCBI_GENOME(hydrogenosome):
415  result = "hydrogenosome";
416  break;
417  case NCBI_GENOME(chromosome):
418  result = "chromosome";
419  break;
420  case NCBI_GENOME(chromatophore):
421  result = "chromatophore";
422  break;
423  }
424 
425  return result;
426 }
427 
428 // set instance variables from Seq-inst, Seq-ids, MolInfo, etc., but not
429 // BioSource
// NOTE(review): despite the heading, this variant also copies the
// BioSource-derived fields (m_Source, m_Taxname, m_Genome, m_Strain, ...)
// from bsx further down.
// NOTE(review): the signature line, the declarations of `bsx` and `flags`,
// and several assignments are not present in this listing.
431  const CBioseq_Handle& bsh,
433 )
434 
435 {
 // without an index entry (bsx) nothing is set
437  if (! bsx) {
438  return;
439  }
440 
441  // set flags from record components
445  m_GpipeMode = (flags & fGpipeMode) != 0;
447  m_DevMode = (flags & fDevMode) != 0;
448  m_FastaFormat = (flags & fFastaFormat) != 0;
449 
450  // copy sequence properties from bsx (only m_IsSeg is simply cleared)
451  m_IsNA = bsx->IsNA();
452  m_IsAA = bsx->IsAA();
453  m_Topology = bsx->GetTopology();
454  m_Length = bsx->GetLength();
455 
456  m_IsSeg = false;
457  m_IsDelta = bsx->IsDelta();
458  m_IsVirtual = bsx->IsVirtual();
459  m_IsMap = bsx->IsMap();
460 
461  m_IsNC = bsx->IsNC();
462  m_IsNM = bsx->IsNM();
463  m_IsNR = bsx->IsNR();
464  m_IsNZ = bsx->IsNZ();
465  m_IsPatent = bsx->IsPatent();
466  m_IsPDB = bsx->IsPDB();
467  m_IsWP = bsx->IsWP();
468  m_ThirdParty = bsx->IsThirdParty();
469  m_WGSMaster = bsx->IsWGSMaster();
470  m_TSAMaster = bsx->IsTSAMaster();
471  m_TLSMaster = bsx->IsTLSMaster();
472 
473  m_GeneralStr = bsx->GetGeneralStr();
474  m_GeneralId = bsx->GetGeneralId();
475 
479 
480  m_PDBChain = bsx->GetPDBChain();
481  m_PDBChainID = bsx->GetPDBChainID();
482 
483  m_MIBiomol = bsx->GetBiomol();
484  m_MITech = bsx->GetTech();
486 
487  m_HTGTech = bsx->IsHTGTech();
489  m_IsTLS = bsx->IsTLS();
490  m_IsTSA = bsx->IsTSA();
491  m_IsWGS = bsx->IsWGS();
493 
 // the stored title is used only when the title is not being manufactured
 // (neither unfinished HTGS nor reconstruct mode)
494  m_MainTitle.clear();
495  if (! m_HTGSUnfinished && ! m_Reconstruct) {
496  m_MainTitle = bsx->GetTitle();
497  }
498 
499  m_UseBiosrc = bsx->IsUseBiosrc();
500 
502  m_HTGSDraft = bsx->IsHTGSDraft();
503  m_HTGSPooled = bsx->IsHTGSPooled();
504  m_TPAExp = bsx->IsTPAExp();
505  m_TPAInf = bsx->IsTPAInf();
506  m_TPAReasm = bsx->IsTPAReasm();
507  m_Unordered = bsx->IsUnordered();
508 
509  m_PDBCompound = bsx->GetPDBCompound();
510 
511  m_Source = bsx->GetBioSource();
512  m_Taxname = bsx->GetTaxname();
513  m_Genus = bsx->GetGenus();
514  m_Species = bsx->GetSpecies();
516  m_Genome = bsx->GetGenome();
517  m_IsPlasmid = bsx->IsPlasmid();
518  m_IsChromosome = bsx->IsChromosome();
519 
520  m_Organelle = bsx->GetOrganelle();
521 
525 
526  m_Chromosome = bsx->GetChromosome();
528  m_Clone = bsx->GetClone();
529  m_has_clone = bsx->HasClone();
530  m_Map = bsx->GetMap();
531  m_Plasmid = bsx->GetPlasmid();
532  m_Segment = bsx->GetSegment();
533 
534  m_Breed = bsx->GetBreed();
535  m_Cultivar = bsx->GetCultivar();
537  m_Isolate = bsx->GetIsolate();
538  m_Strain = bsx->GetStrain();
539  m_Substrain = bsx->GetSubstrain();
541 
542  m_IsUnverified = bsx->IsUnverified();
 // Choose the UNVERIFIED prefix. The checks run in a fixed order and each
 // match overwrites the previous one, so with several kinds set the last
 // matching prefix below wins. unverified_count is only incremented; its
 // single use is commented out.
544  if (m_IsUnverified) {
545  int unverified_count = 0;
546  m_UnverifiedPrefix = "UNVERIFIED: ";
547  if (bsx->IsUnverifiedFeature()) {
548  m_UnverifiedPrefix = "UNVERIFIED: ";
549  unverified_count++;
550  }
551  if (bsx->IsUnverifiedMisassembled()) {
552  m_UnverifiedPrefix = "UNVERIFIED_ASMBLY: ";
553  unverified_count++;
554  }
555  if (bsx->IsUnverifiedContaminant()) {
556  m_UnverifiedPrefix = "UNVERIFIED_CONTAM: ";
557  unverified_count++;
558  }
559  if (bsx->IsUnverifiedOrganism()) {
560  m_UnverifiedPrefix = "UNVERIFIED_ORG: ";
561  unverified_count++;
562  }
563  if (unverified_count > 1) {
564  // m_UnverifiedPrefix = "UNVERIFIED: ";
565  }
566  }
567 
568  m_IsUnreviewed = bsx->IsUnreviewed();
570  if (m_IsUnreviewed) {
571  m_UnreviewedPrefix = "UNREVIEWED: ";
572  if (bsx->IsUnreviewedUnannotated()) {
573  m_UnreviewedPrefix = "UNREVIEWED_UNANNOT: ";
574  }
575  }
576 
577  m_Comment = bsx->GetComment();
578  m_IsPseudogene = bsx->IsPseudogene();
580 
581  m_rEnzyme = bsx->GetrEnzyme();
582 
584 
 // PDB records: classify by comment text and, for nucleotides, by length.
 // NOTE(review): the statements inside these branches are not present in
 // this listing, so what each branch assigns cannot be stated here.
585  if (m_IsPDB) {
586  if (m_Comment.empty()) {
588  } else if (m_IsNA) {
589  if ( m_Length < 25 ) {
591  } else if (NStr::Find(m_Comment, "COMPLETE GENOME") != NPOS ||
592  NStr::Find(m_Comment, "CHROMOSOME XII") != NPOS) {
594  } else if (NStr::Find(m_Comment, "Dna (5'") != NPOS ||
595  NStr::Find(m_Comment, "SEQRES") != NPOS) {
597  }
598  } else {
599  if (NStr::Find(m_Comment, "hypothetical protein") != NPOS ||
600  NStr::Find(m_Comment, "uncharacterized protein") != NPOS ||
601  NStr::Find(m_Comment, "putative uncharacterized protein") != NPOS ||
602  NStr::Find(m_Comment, "putative protein") != NPOS ||
603  NStr::Find(m_Comment, "SEQRES") != NPOS) {
605  }
606  }
607  }
608 }
609 
610 // set instance variables from Seq-inst, Seq-ids, MolInfo, etc., but not
611 // BioSource
// Works in four passes: clear every member, read the Seq-inst and Seq-ids,
// scan the descriptors, then post-process keywords, map features and PDB
// comments. The Source descriptor is stored in m_Source and only its OrgName
// is examined here.
// NOTE(review): the signature line (declaring `flags`) and a number of
// statements are not present in this listing; m_Reconstruct and m_IsVirtual
// are read below but the lines assigning them from the input are not shown.
613  const CBioseq_Handle& bsh,
615 )
616 
617 {
618  // set flags from record components
622  m_GpipeMode = (flags & fGpipeMode) != 0;
624  m_DevMode = (flags & fDevMode) != 0;
625  m_FastaFormat = (flags & fFastaFormat) != 0;
626 
627  // reset member variables to cleared state
628  m_IsNA = false;
629  m_IsAA = false;
630  m_Topology = NCBI_SEQTOPOLOGY(not_set);
631  m_Length = 0;
632 
633  m_IsSeg = false;
634  m_IsDelta = false;
635  m_IsVirtual = false;
636  m_IsMap = false;
637 
638  m_IsNC = false;
639  m_IsNM = false;
640  m_IsNR = false;
641  m_IsNZ = false;
642  m_IsPatent = false;
643  m_IsPDB = false;
644  m_IsWP = false;
645  m_ThirdParty = false;
646  m_WGSMaster = false;
647  m_TSAMaster = false;
648  m_TLSMaster = false;
649 
650  m_MainTitle.clear();
651  m_GeneralStr.clear();
652  m_GeneralId = 0;
653  m_PatentCountry.clear();
654  m_PatentNumber.clear();
655 
656  m_PatentSequence = 0;
657 
658  m_PDBChain = 0;
659  m_PDBChainID.clear();
660 
661  m_MIBiomol = NCBI_BIOMOL(unknown);
662  m_MITech = NCBI_TECH(unknown);
664 
665  m_HTGTech = false;
666  m_HTGSUnfinished = false;
667  m_IsTLS = false;
668  m_IsTSA = false;
669  m_IsWGS = false;
670  m_IsEST_STS_GSS = false;
671 
672  m_UseBiosrc = false;
673 
674  m_HTGSCancelled = false;
675  m_HTGSDraft = false;
676  m_HTGSPooled = false;
677  m_TPAExp = false;
678  m_TPAInf = false;
679  m_TPAReasm = false;
680  m_Unordered = false;
681 
683 
684  m_Source.Reset();
685  m_Taxname.clear();
686  m_Genus.clear();
687  m_Species.clear();
688  m_Multispecies = false;
689  m_Genome = NCBI_GENOME(unknown);
690  m_IsPlasmid = false;
691  m_IsChromosome = false;
692 
693  m_Organelle.clear();
694 
695  m_FirstSuperKingdom.clear();
696  m_SecondSuperKingdom.clear();
697  m_IsCrossKingdom = false;
698 
701  m_Clone.clear();
702  m_has_clone = false;
703  m_Map.clear();
704  m_Plasmid.clear();
705  m_Segment.clear();
706 
707  m_Breed.clear();
708  m_Cultivar.clear();
710  m_Isolate.clear();
711  m_Strain.clear();
712  m_Substrain.clear();
714 
715  m_IsUnverified = false;
717  m_IsUnreviewed = false;
720 
721  m_Comment.clear();
722  m_IsPseudogene = false;
723 
724  m_rEnzyme.clear();
725 
727 
728  // now start setting member variables
729  m_IsNA = bsh.IsNa();
730  m_IsAA = bsh.IsAa();
732  m_Length = bsh.GetInst_Length();
733 
734  if (bsh.IsSetInst()) {
735  if (bsh.IsSetInst_Repr()) {
736  TSEQ_REPR repr = bsh.GetInst_Repr();
737  m_IsSeg = (repr == CSeq_inst::eRepr_seg);
738  m_IsDelta = (repr == CSeq_inst::eRepr_delta);
740  m_IsMap = (repr == CSeq_inst::eRepr_map);
741  }
742  }
743 
744  // process Seq-ids
745  FOR_EACH_SEQID_ON_BIOSEQ_HANDLE (sid_itr, bsh) {
746  CSeq_id_Handle sid = *sid_itr;
747  switch (sid.Which()) {
 // third-party ids set m_ThirdParty and then share the accession
 // handling of the other text Seq-ids
748  case NCBI_SEQID(Tpg):
749  case NCBI_SEQID(Tpe):
750  case NCBI_SEQID(Tpd):
751  m_ThirdParty = true;
752  // fall through
753  case NCBI_SEQID(Other):
754  case NCBI_SEQID(Genbank):
755  case NCBI_SEQID(Embl):
756  case NCBI_SEQID(Ddbj):
757  {
758  CConstRef<CSeq_id> id = sid.GetSeqId();
 // the result of GetTextseq_Id() is dereferenced without a null check
759  const CTextseq_id& tsid = *id->GetTextseq_Id ();
760  if (tsid.IsSetAccession()) {
761  const string& acc = tsid.GetAccession ();
 // NOTE(review): the declaration of `type` is not present in this listing.
763  TACCN_CHOICE div = (TACCN_CHOICE) (type & NCBI_ACCN(division_mask));
764  if ( div == NCBI_ACCN(wgs) )
765  {
766  if( (type & CSeq_id::fAcc_master) != 0 ) {
767  m_WGSMaster = true;
768  }
769  } else if ( div == NCBI_ACCN(tsa) )
770  {
 // a TSA master additionally requires m_IsVirtual
771  if( (type & CSeq_id::fAcc_master) != 0 && m_IsVirtual ) {
772  m_TSAMaster = true;
773  }
774  } else if (type == NCBI_ACCN(refseq_chromosome)) {
775  m_IsNC = true;
776  } else if (type == NCBI_ACCN(refseq_mrna)) {
777  m_IsNM = true;
778  } else if (type == NCBI_ACCN(refseq_mrna_predicted)) {
779  m_IsNM = true;
780  } else if (type == NCBI_ACCN(refseq_ncrna)) {
781  m_IsNR = true;
782  } else if (type == NCBI_ACCN(refseq_contig)) {
783  m_IsNZ = true;
784  } else if (type == NCBI_ACCN(refseq_unique_prot)) {
785  m_IsWP = true;
786  }
787  }
788  break;
789  }
790  case NCBI_SEQID(General):
791  {
 // keep the tag of a non-skippable general id, as string or as integer
792  CConstRef<CSeq_id> id = sid.GetSeqId();
793  const CDbtag& gen_id = id->GetGeneral ();
794  if (! gen_id.IsSkippable ()) {
795  if (gen_id.IsSetTag ()) {
796  const CObject_id& oid = gen_id.GetTag();
797  if (oid.IsStr()) {
798  m_GeneralStr = oid.GetStr();
799  } else if (oid.IsId()) {
800  m_GeneralId = oid.GetId();
801  }
802  }
803  }
804  break;
805  }
806  case NCBI_SEQID(Pdb):
807  {
808  m_IsPDB = true;
809  CConstRef<CSeq_id> id = sid.GetSeqId();
810  const CPDB_seq_id& pdb_id = id->GetPdb ();
 // the string chain id takes precedence over the single-character chain
811  if (pdb_id.IsSetChain_id()) {
812  m_PDBChainID = pdb_id.GetChain_id();
813  } else if (pdb_id.IsSetChain()) {
814  m_PDBChain = pdb_id.GetChain();
815  }
816  break;
817  }
818  case NCBI_SEQID(Patent):
819  {
820  m_IsPatent = true;
821  CConstRef<CSeq_id> id = sid.GetSeqId();
822  const CPatent_seq_id& pat_id = id->GetPatent();
823  if (pat_id.IsSetSeqid()) {
824  m_PatentSequence = pat_id.GetSeqid();
825  }
826  if (pat_id.IsSetCit()) {
827  const CId_pat& cit = pat_id.GetCit();
828  m_PatentCountry = cit.GetCountry();
830  }
831  break;
832  }
833  case NCBI_SEQID(Gpipe):
834  break;
835  default:
836  break;
837  }
838  }
839 
 // Bit flags tracking which descriptor kinds still need to be looked at
 // during the scan below.
840  enum ENeededDescChoices {
841  fMolinfo = 1 << 0,
842  fUser = 1 << 1,
843  fSource = 1 << 2,
844  fGenbank = 1 << 3,
845  fEmbl = 1 << 4,
846  fTitle = 1 << 5,
847  fPdb = 1 << 6,
848  fComment = 1 << 7
849  };
850  int needed_desc_choices = fMolinfo | fUser | fSource | fGenbank | fEmbl | fComment;
851 
 // NOTE(review): up to eight choices can be pushed (six unconditional plus
 // Title and Pdb), so reserve(7) may leave room for one reallocation;
 // this only concerns capacity, not correctness.
852  CSeqdesc_CI::TDescChoices desc_choices;
853  desc_choices.reserve(7);
854  desc_choices.push_back(CSeqdesc::e_Molinfo);
855  desc_choices.push_back(CSeqdesc::e_User);
856  desc_choices.push_back(CSeqdesc::e_Source);
857  // Only truly needed if (m_HTGTech || m_ThirdParty), but
858  // determining m_HTGTech requires a descriptor scan.
859  desc_choices.push_back(CSeqdesc::e_Genbank);
860  desc_choices.push_back(CSeqdesc::e_Embl);
861  desc_choices.push_back(CSeqdesc::e_Comment);
862  if (! m_Reconstruct) {
863  needed_desc_choices |= fTitle;
864  desc_choices.push_back(CSeqdesc::e_Title);
865  }
866  if (m_IsPDB) {
867  needed_desc_choices |= fPdb;
868  desc_choices.push_back(CSeqdesc::e_Pdb);
869  }
870 
 // points into the Genbank/EMBL block of a descriptor visited below
871  const list <string> *keywords = NULL;
872 
873  int num_super_kingdom = 0;
874  bool super_kingdoms_different = false;
875 
 // Descriptor scan. The loop is written to stop once every needed bit is
 // cleared, but the visible code never clears fComment, so in practice all
 // selected descriptors are visited. The `continue` statements inside the
 // switch advance this for loop.
876  for (CSeqdesc_CI desc_it(bsh, desc_choices);
877  needed_desc_choices != 0 && desc_it; ++desc_it) {
878  switch (desc_it->Which()) {
879  case CSeqdesc::e_Molinfo:
880  {
881  // process MolInfo tech
882  if ((needed_desc_choices & fMolinfo) == 0) {
883  continue; // already covered
884  }
885 
886  const CMolInfo& molinf = desc_it->GetMolinfo();
887  m_MIBiomol = molinf.GetBiomol();
888  m_MITech = molinf.GetTech();
890  switch (m_MITech) {
891  case NCBI_TECH(htgs_0):
892  case NCBI_TECH(htgs_1):
893  case NCBI_TECH(htgs_2):
894  m_HTGSUnfinished = true;
895  // manufacture all titles for unfinished HTG sequences
896  m_Reconstruct = true;
897  needed_desc_choices &= ~fTitle;
898  m_MainTitle.clear();
899  // fall through
900  case NCBI_TECH(htgs_3):
901  m_HTGTech = true;
902  m_UseBiosrc = true;
903  break;
904  case NCBI_TECH(est):
905  case NCBI_TECH(sts):
906  case NCBI_TECH(survey):
907  m_IsEST_STS_GSS = true;
908  m_UseBiosrc = true;
909  break;
910  case NCBI_TECH(wgs):
911  m_IsWGS = true;
912  m_UseBiosrc = true;
913  break;
914  case NCBI_TECH(tsa):
915  m_IsTSA = true;
916  m_UseBiosrc = true;
917  if (m_IsVirtual) {
918  m_TSAMaster = true;
919  }
920  break;
921  case NCBI_TECH(targeted):
922  m_IsTLS = true;
923  m_UseBiosrc = true;
924  if (m_IsVirtual) {
925  m_TLSMaster = true;
926  }
927  break;
928  default:
929  break;
930  }
931 
932  // take first, then skip remainder
933  needed_desc_choices &= ~fMolinfo;
934  break;
935  }
936 
937  case CSeqdesc::e_User:
938  {
939  // process Unverified and Unreviewed user objects
 // fUser is cleared only by an Unverified object, so Unreviewed and
 // AutodefOptions objects keep being examined until one is seen
940  if ((needed_desc_choices & fUser) == 0) {
941  continue; // already covered
942  }
943 
944  const CUser_object& user_obj = desc_it->GetUser();
945  if (FIELD_IS_SET_AND_IS(user_obj, Type, Str)) {
946  if (user_obj.IsUnverified()) {
947  m_IsUnverified = true;
 // unverified_count is only incremented; its single use is
 // commented out. Each match below overwrites the prefix, so
 // the last matching kind wins.
948  int unverified_count = 0;
949  needed_desc_choices &= ~fUser;
950  m_UnverifiedPrefix = "UNVERIFIED: ";
951  if (user_obj.IsUnverifiedFeature()) {
952  m_UnverifiedPrefix = "UNVERIFIED: ";
953  unverified_count++;
954  }
955  if (user_obj.IsUnverifiedMisassembled()) {
956  m_UnverifiedPrefix = "UNVERIFIED_ASMBLY: ";
957  unverified_count++;
958  }
959  if (user_obj.IsUnverifiedContaminant()) {
960  m_UnverifiedPrefix = "UNVERIFIED_CONTAM: ";
961  unverified_count++;
962  }
963  if (user_obj.IsUnverifiedOrganism()) {
964  m_UnverifiedPrefix = "UNVERIFIED_ORG: ";
965  unverified_count++;
966  }
967  if (unverified_count > 1) {
968  // m_UnverifiedPrefix = "UNVERIFIED: ";
969  }
970  } else if (user_obj.IsUnreviewed()) {
971  m_IsUnreviewed = true;
972  m_UnreviewedPrefix = "UNREVIEWED: ";
973  if (user_obj.IsUnreviewedUnannotated()) {
974  m_UnreviewedPrefix = "UNREVIEWED_UNANNOT: ";
975  }
976  } else if (user_obj.GetType().GetStr() == "AutodefOptions" ) {
 // pick up the string value of the "Targeted Locus Name" field
977  FOR_EACH_USERFIELD_ON_USEROBJECT (uitr, user_obj) {
978  const CUser_field& fld = **uitr;
979  if (! FIELD_IS_SET_AND_IS(fld, Label, Str)) continue;
980  const string &label_str = GET_FIELD(fld.GetLabel(), Str);
981  if (! NStr::EqualNocase(label_str, "Targeted Locus Name")) continue;
982  if (fld.IsSetData() && fld.GetData().IsStr()) {
983  m_TargetedLocus = fld.GetData().GetStr();
984  }
985  }
986  }
987  }
988  break;
989  }
990 
991  case CSeqdesc::e_Comment:
992  {
993  // process comment
 // fComment is never cleared, so each later comment descriptor
 // overwrites m_Comment, while m_IsPseudogene stays set once any
 // comment carried the marker text
994  if ((needed_desc_choices & fComment) == 0) {
995  continue; // already covered
996  }
997 
998  m_Comment = desc_it->GetComment();
999  if (NStr::Find (m_Comment, "[CAUTION] Could be the product of a pseudogene") != string::npos) {
1000  m_IsPseudogene = true;
1001  }
1002  break;
1003  }
1004 
1005  case CSeqdesc::e_Source:
1006  {
 // only the first Source descriptor is stored in m_Source, but the
 // OrgName of every Source descriptor visited is examined below
1007  if ((needed_desc_choices & fSource) != 0) {
1008  m_Source.Reset(&desc_it->GetSource());
1009  // take first, then skip remainder
1010  needed_desc_choices &= ~fSource;
1011  }
1012  const CBioSource &bsrc = desc_it->GetSource();
1013  if (! bsrc.IsSetOrgname()) break;
1014  const COrgName &onp = bsrc.GetOrgname();
1015  if (onp.IsSetMod()) {
1016  for (auto& omd : onp.GetMod()) {
1017  if (omd->IsSetSubname()) {
1018  const string& str = omd->GetSubname();
1019  COrgMod::TSubtype subtype = omd->GetSubtype();
1020  if (subtype == COrgMod::eSubtype_metagenome_source) {
 // NOTE(review): the statement inside this branch is
 // not present in this listing.
1021  if (m_MetaGenomeSource.empty()) {
1023  }
1024  }
1025  }
1026  }
1027  }
 // for WP records, look at the partial org name for taxonomy
 // elements with fixed level 0 whose level text is "superkingdom"
1028  if (m_IsWP) {
1029  const COrgName::TName& nam = onp.GetName();
1030  if (! nam.IsPartial()) break;
1031  const CPartialOrgName& pon = nam.GetPartial();
1032  if (! pon.IsSet()) break;
1033  const CPartialOrgName::Tdata& tx = pon.Get();
1034  ITERATE (CPartialOrgName::Tdata, itr, tx) {
1035  const CTaxElement& te = **itr;
1036  if (! te.IsSetFixed_level()) continue;
1037  if (te.GetFixed_level() != 0) continue;
1038  if (! te.IsSetLevel()) continue;
1039  const string& lvl = te.GetLevel();
1040  if (! NStr::EqualNocase (lvl, "superkingdom")) continue;
1041  num_super_kingdom++;
 // NOTE(review): the assignments to m_FirstSuperKingdom and
 // m_SecondSuperKingdom are not present in this listing.
1042  if (m_FirstSuperKingdom.empty() && te.IsSetName()) {
1044  } else if (te.IsSetName() && ! NStr::EqualNocase (m_FirstSuperKingdom, te.GetName())) {
1045  if (m_SecondSuperKingdom.empty()) {
1046  super_kingdoms_different = true;
1048  }
1049  }
 // cross-kingdom once more than one superkingdom element was
 // seen and a differing name was recorded
1050  if (num_super_kingdom > 1 && super_kingdoms_different) {
1051  m_IsCrossKingdom = true;
1052  }
1053  }
1054  }
1055  }
1056  break;
1057 
1058  case CSeqdesc::e_Title:
1059  if ((needed_desc_choices & fTitle) != 0) {
1060  // for everything other than PDB proteins, title must be packaged on Bioseq - RW-2005
1061  if ( m_IsPDB || desc_it.GetSeq_entry_Handle().IsSeq() ) {
1062  m_MainTitle = desc_it->GetTitle();
1063  }
1064  // take first, then skip remainder
1065  needed_desc_choices &= ~fTitle;
1066  }
1067  break;
1068 
1069  case CSeqdesc::e_Genbank:
1070  {
1071  if ((needed_desc_choices & fGenbank) == 0) {
1072  continue; // already covered
1073  }
1074  const CGB_block& gbk = desc_it->GetGenbank();
1075  if (gbk.IsSetKeywords()) {
1076  keywords = &gbk.GetKeywords();
1077  }
1078 
1079  // take first, then skip remainder along with any EMBL blocks
1080  needed_desc_choices &= ~(fGenbank | fEmbl);
1081  break;
1082  }
1083 
1084  case CSeqdesc::e_Embl:
1085  {
1086  if ((needed_desc_choices & fEmbl) == 0) {
1087  continue; // already covered
1088  }
1089  const CEMBL_block& ebk = desc_it->GetEmbl();
1090  if (ebk.IsSetKeywords()) {
1091  keywords = &ebk.GetKeywords();
1092  }
1093 
1094  // take first, then skip remainder
1095  needed_desc_choices &= ~fEmbl;
1096  break;
1097  }
1098 
1099  case CSeqdesc::e_Pdb:
1100  {
1101  if ((needed_desc_choices & fPdb) == 0) {
1102  continue; // already covered
1103  }
1104  _ASSERT(m_IsPDB);
1105  const CPDB_block& pbk = desc_it->GetPdb();
 // only the first compound is kept; fPdb stays set when the block
 // lists no compounds
1106  FOR_EACH_COMPOUND_ON_PDBBLOCK (cp_itr, pbk) {
1107  if (m_PDBCompound.empty()) {
1108  m_PDBCompound = *cp_itr;
1109 
1110  // take first, then skip remainder
1111  needed_desc_choices &= ~fPdb;
1112  }
1113  }
1114  break;
1115  }
1116 
1117  default:
1118  _TROUBLE;
1119  }
1120  }
1121 
 // Keyword pass: each keyword entry is split on ';'. UNORDERED is honoured
 // for every record, the HTGS_* and TPA:* keywords only when m_HTGTech or
 // m_ThirdParty is set. "TPA:assembly" sets the same flag as
 // "TPA:reassembly".
1122  if (keywords != NULL) {
1123  FOR_EACH_STRING_IN_LIST (kw_itr, *keywords) {
1124  const string& clause = *kw_itr;
1125  list<string> kywds;
1126  NStr::Split( clause, ";", kywds, NStr::fSplit_Tokenize );
1127  FOR_EACH_STRING_IN_LIST ( k_itr, kywds ) {
1128  const string& str = *k_itr;
1129  if (NStr::EqualNocase (str, "UNORDERED")) {
1130  m_Unordered = true;
1131  }
1132  if ((! m_HTGTech) && (! m_ThirdParty)) continue;
1133  if (NStr::EqualNocase (str, "HTGS_DRAFT")) {
1134  m_HTGSDraft = true;
1135  } else if (NStr::EqualNocase (str, "HTGS_CANCELLED")) {
1136  m_HTGSCancelled = true;
1137  } else if (NStr::EqualNocase (str, "HTGS_POOLED_MULTICLONE")) {
1138  m_HTGSPooled = true;
1139  } else if (NStr::EqualNocase (str, "TPA:experimental")) {
1140  m_TPAExp = true;
1141  } else if (NStr::EqualNocase (str, "TPA:inferential")) {
1142  m_TPAInf = true;
1143  } else if (NStr::EqualNocase (str, "TPA:reassembly")) {
1144  m_TPAReasm = true;
1145  } else if (NStr::EqualNocase (str, "TPA:assembly")) {
1146  m_TPAReasm = true;
1147  }
1148  }
1149  }
1150  }
1151 
 // Map sequences: take the restriction-enzyme name from the Rsite features
 // of the map extension; with several string Rsites the last one wins.
1152  if (m_IsMap) {
1153  if (bsh.IsSetInst_Ext() && bsh.GetInst_Ext().IsMap()) {
1154  const CMap_ext& mp = bsh.GetInst_Ext().GetMap();
1155  if (mp.IsSet()) {
1156  const CMap_ext::Tdata& ft = mp.Get();
1157  ITERATE (CMap_ext::Tdata, itr, ft) {
1158  const CSeq_feat& feat = **itr;
1159  const CSeqFeatData& data = feat.GetData();
1160  if (! data.IsRsite()) continue;
1161  const CRsite_ref& rsite = data.GetRsite();
1162  if (rsite.IsStr()) {
1163  m_rEnzyme = rsite.GetStr();
1164  }
1165  }
1166  }
1167  }
1168  }
1169 
 // PDB records: classify by comment text and, for nucleotides, by length.
 // NOTE(review): the statements inside these branches are not present in
 // this listing, so what each branch assigns cannot be stated here.
1170  if (m_IsPDB) {
1171  if (m_Comment.empty()) {
1173  } else if (m_IsNA) {
1174  if ( m_Length < 25 ) {
1176  } else if (NStr::Find(m_Comment, "COMPLETE GENOME") != NPOS ||
1177  NStr::Find(m_Comment, "CHROMOSOME XII") != NPOS) {
1179  } else if (NStr::Find(m_Comment, "Dna (5'") != NPOS ||
1180  NStr::Find(m_Comment, "SEQRES") != NPOS) {
1182  }
1183  } else {
1184  if (NStr::Find(m_Comment, "hypothetical protein") != NPOS ||
1185  NStr::Find(m_Comment, "uncharacterized protein") != NPOS ||
1186  NStr::Find(m_Comment, "putative uncharacterized protein") != NPOS ||
1187  NStr::Find(m_Comment, "putative protein") != NPOS ||
1188  NStr::Find(m_Comment, "SEQRES") != NPOS) {
1190  }
1191  }
1192  }
1193 }
1194 
1196  const CBioseq_Handle& bsh
1197 )
1198 
1199 {
 // Copies the BioSource-derived members from the index entry `bsx`; when no
 // clone was recorded there, additionally scans features for a clone
 // subsource and sets m_has_clone if one is found.
 // NOTE(review): the signature line, the declarations of `bsx` and
 // `feat_it`, and the subsource loop/switch headers are not present in this
 // listing.
1201  if (! bsx) {
1202  return;
1203  }
1204 
1205  m_Source = bsx->GetBioSource();
1206  m_Taxname = bsx->GetTaxname();
1207 
1208  m_Genome = bsx->GetGenome();
1209  m_IsPlasmid = bsx->IsPlasmid();
1210  m_IsChromosome = bsx->IsChromosome();
1211 
1212  m_Chromosome = bsx->GetChromosome();
1214  m_Clone = bsx->GetClone();
1215  m_has_clone = bsx->HasClone();
1216  m_Map = bsx->GetMap();
1217  m_Plasmid = bsx->GetPlasmid();
1218  m_Segment = bsx->GetSegment();
1219 
1220  m_Genus = bsx->GetGenus();
1221  m_Species = bsx->GetSpecies();
1222  m_Multispecies = bsx->IsMultispecies();
1223 
1224  m_Strain = bsx->GetStrain();
1225  m_Substrain = bsx->GetSubstrain();
1227  m_Cultivar = bsx->GetCultivar();
1229  m_Isolate = bsx->GetIsolate();
1230  m_Breed = bsx->GetBreed();
1231 
1232  m_Organelle = bsx->GetOrganelle();
1233 
 // a clone is already known, so the feature scan is unnecessary
1234  if (m_has_clone) return;
1235 
1236  try {
1238  while (feat_it) {
1239  const CSeq_feat& feat = feat_it->GetOriginalFeature();
 // NOTE(review): this `continue` jumps back to the while condition without
 // reaching the ++feat_it at the bottom of the loop, so a feature without
 // data would be revisited forever; the iterator should be advanced before
 // continuing.
1240  if (! feat.IsSetData ()) continue;
1241  const CSeqFeatData& sfdata = feat.GetData ();
 // GetBiosrc() is called without checking the data choice; presumably
 // feat_it is restricted to source features - confirm on the elided line
1242  const CBioSource& source = sfdata.GetBiosrc();
1243 
1244  // process SubSource
1246  const CSubSource& sbs = **sbs_itr;
1247  if (! sbs.IsSetName()) continue;
1249  case NCBI_SUBSOURCE(clone):
 // the first named clone subsource ends the whole scan
1250  m_has_clone = true;
1251  return;
1252  default:
1253  break;
1254  }
1255  }
1256  ++feat_it;
1257  }
1258  } catch ( const exception& ) {
 // any exception from the scan is swallowed silently; the diagnostic below
 // is commented out
1259  // ERR_POST(Error << "Unable to iterate source features while constructing default definition line");
1260  }
1261 }
1262 
1263 // set instance variables from BioSource
1265  const CBioseq_Handle& bsh
1266 )
1267 
1268 {
1269  if (m_Source.NotEmpty()) {
1270  // get organism name
1271  if (m_Source->IsSetTaxname()) {
1273  }
1274  if (m_Source->IsSetGenome()) {
1276  m_IsPlasmid = (m_Genome == NCBI_GENOME(plasmid));
1277  m_IsChromosome = (m_Genome == NCBI_GENOME(chromosome));
1278  }
1279 
1280  // process SubSource
1282  const CSubSource& sbs = **sbs_itr;
1283  if (! sbs.IsSetName()) continue;
1284  const string& str = sbs.GetName();
1286  case NCBI_SUBSOURCE(chromosome):
1287  m_Chromosome = str;
1288  break;
1289  case NCBI_SUBSOURCE(clone):
1290  m_Clone = str;
1291  m_has_clone = true;
1292  break;
1293  case NCBI_SUBSOURCE(map):
1294  m_Map = str;
1295  break;
1296  case NCBI_SUBSOURCE(plasmid_name):
1297  m_Plasmid = str;
1298  break;
1299  case NCBI_SUBSOURCE(segment):
1300  m_Segment = str;
1301  break;
1302  case NCBI_SUBSOURCE(linkage_group):
1303  m_LinkageGroup = str;
1304  break;
1305  default:
1306  break;
1307  }
1308  }
1309 
1310  if (m_Source->IsSetOrgname()) {
1311  const COrgName& onp = m_Source->GetOrgname();
1312  if (onp.IsSetName()) {
1313  const COrgName::TName& nam = onp.GetName();
1314  if (nam.IsBinomial()) {
1315  const CBinomialOrgName& bon = nam.GetBinomial();
1316  if (bon.IsSetGenus()) {
1317  m_Genus = bon.GetGenus();
1318  }
1319  if (bon.IsSetSpecies()) {
1320  m_Species = bon.GetSpecies();
1321  }
1322  } else if (nam.IsPartial()) {
1323  const CPartialOrgName& pon = nam.GetPartial();
1324  if (pon.IsSet()) {
1325  const CPartialOrgName::Tdata& tx = pon.Get();
1326  ITERATE (CPartialOrgName::Tdata, itr, tx) {
1327  const CTaxElement& te = **itr;
1328  if (te.IsSetFixed_level()) {
1329  int fl = te.GetFixed_level();
1330  if (fl > 0) {
1331  m_Multispecies = true;
1332  } else if (te.IsSetLevel()) {
1333  const string& lvl = te.GetLevel();
1334  if (! NStr::EqualNocase (lvl, "species")) {
1335  m_Multispecies = true;
1336  }
1337  }
1338  }
1339  }
1340  }
1341  }
1342  }
1343  }
1344 
1345  // process OrgMod
1347  const COrgMod& omd = **omd_itr;
1348  if (! omd.IsSetSubname()) continue;
1349  const string& str = omd.GetSubname();
1350  SWITCH_ON_ORGMOD_CHOICE (omd) {
1351  case NCBI_ORGMOD(strain):
1352  if (m_Strain.empty()) {
1353  m_Strain = str;
1354  }
1355  break;
1356  case NCBI_ORGMOD(substrain):
1357  if (m_Substrain.empty()) {
1358  m_Substrain = str;
1359  }
1360  break;
1361  case NCBI_ORGMOD(cultivar):
1362  if (m_Cultivar.empty()) {
1363  m_Cultivar = str;
1364  }
1365  break;
1366  case NCBI_ORGMOD(specimen_voucher):
1367  if (m_SpecimenVoucher.empty()) {
1369  }
1370  break;
1371  case NCBI_ORGMOD(isolate):
1372  if (m_Isolate.empty()) {
1373  m_Isolate = str;
1374  }
1375  break;
1376  case NCBI_ORGMOD(breed):
1377  if (m_Breed.empty()) {
1378  m_Breed = str;
1379  }
1380  break;
1381  case NCBI_ORGMOD(metagenome_source):
1382  if (m_MetaGenomeSource.empty()) {
1384  }
1385  break;
1386  default:
1387  break;
1388  }
1389  }
1390  }
1391 /*
1392  bool virus_or_phage = false;
1393  bool has_plasmid = false;
1394 
1395  if (NStr::FindNoCase(m_Taxname, "virus") != NPOS ||
1396  NStr::FindNoCase(m_Taxname, "phage") != NPOS) {
1397  virus_or_phage = true;
1398  }
1399 
1400  if (! m_Plasmid.empty()) {
1401  has_plasmid = true;
1402  }
1403 */
1404 
1406 
1407  if (m_has_clone) return;
1408 
1409  try {
1411  while (feat_it) {
1412  const CSeq_feat& feat = feat_it->GetOriginalFeature();
1413  if (! feat.IsSetData ()) continue;
1414  const CSeqFeatData& sfdata = feat.GetData ();
1415  const CBioSource& source = sfdata.GetBiosrc();
1416 
1417  // process SubSource
1419  const CSubSource& sbs = **sbs_itr;
1420  if (! sbs.IsSetName()) continue;
1422  case NCBI_SUBSOURCE(clone):
1423  m_has_clone = true;
1424  return;
1425  default:
1426  break;
1427  }
1428  }
1429  ++feat_it;
1430  }
1431  } catch ( const exception& ) {
1432  // ERR_POST(Error << "Unable to iterate source features while constructing default definition line");
1433  }
1434 }
1435 
1436 // generate title from BioSource fields
1438  vector<CTempString>& desc,
1439  string& buf
1440 )
1441 
1442 {
1444  desc.push_back(", pooled multiple clones");
1445  return;
1446  }
1447 
1448  if( m_Clone.empty() ) {
1449  return;
1450  }
1451 
1452  SIZE_TYPE count = 1;
1453  for (SIZE_TYPE pos = m_Clone.find(';'); pos != NPOS;
1454  pos = m_Clone.find(';', pos + 1)) {
1455  ++count;
1456  }
1457  if (count > 3) {
1458  buf = NStr::NumericToString(count);
1459  desc.reserve(3);
1460  desc.push_back(", ");
1461  desc.push_back(buf);
1462  desc.push_back(" clones");
1463  } else {
1464  desc.reserve(2);
1465  desc.push_back(" clone ");
1466  desc.push_back(m_Clone);
1467  }
1468 }
1469 
1470 static bool s_EndsWithStrain (
1471  const CTempString& taxname,
1472  const CTempString& strain
1473 )
1474 
1475 {
1476  // return NStr::EndsWith(taxname, strain, NStr::eNocase);
1477  if (strain.size() >= taxname.size()) {
1478  return false;
1479  }
1480  SIZE_TYPE pos = taxname.find(' ');
1481  if (pos == NPOS) {
1482  return false;
1483  }
1484  pos = taxname.find(' ', pos + 1);
1485  if (pos == NPOS) {
1486  return false;
1487  }
1488 
1489  pos = NStr::Find (taxname, strain, NStr::eNocase, NStr::eReverseSearch);
1490  if (pos == taxname.size() - strain.size()) {
1491  // check for space to avoid fortuitous match to end of taxname
1492  char ch = taxname[pos - 1];
1493  if (ispunct (ch) || isspace (ch)) {
1494  return true;
1495  }
1496  } else if (pos == taxname.size() - strain.size() - 1
1497  && taxname[pos - 1] == '\''
1498  && taxname[taxname.size() - 1] == '\'') {
1499  return true;
1500  }
1501  return false;
1502 }
1503 
1504 
1505 
// Returns a copy of str from which every ':' , space and tab character has
// been dropped; all other characters keep their relative order.
static string s_RemoveColonsAndWhiteSpace(string str)
{
    string::size_type kept = 0;
    for (string::size_type i = 0; i < str.size(); ++i) {
        const char ch = str[i];
        if (ch != ':' && ch != ' ' && ch != '\t') {
            // compact surviving characters towards the front
            str[kept++] = ch;
        }
    }
    str.resize(kept);
    return str;
}
1513 
// Returns a copy of str without any space or tab characters; every other
// character (colons included) is kept in its original order.
static string s_RemoveWhiteSpace(string str)
{
    string compact;
    compact.reserve(str.size());
    for (char ch : str) {
        if (ch == ' ' || ch == '\t') {
            continue;
        }
        compact += ch;
    }
    return compact;
}
1521 
1522 
1523 static void s_AddVoucherAndIsolate(const CTempString& taxname,
1524  const CTempString& strain,
1525  const CTempString& specimen_voucher,
1526  const CTempString& isolate,
1527  CDefLineJoiner& joiner)
1528 {
1529  if (!specimen_voucher.empty()) {
1530  if (strain.empty() || (s_RemoveColonsAndWhiteSpace(strain) != s_RemoveColonsAndWhiteSpace(specimen_voucher))) {
1531  joiner.Add("voucher", specimen_voucher);
1532  }
1533  }
1534 
1535  if (!isolate.empty() && (isolate != specimen_voucher)) {
1536  // s_EndsWithStrain just checks for supplied pattern, using here for isolate
1537  if ((!s_EndsWithStrain(taxname, isolate)) &&
1538  (s_RemoveColonsAndWhiteSpace(specimen_voucher) != s_RemoveWhiteSpace(isolate))) {
1539  joiner.Add("isolate", isolate);
1540  }
1541  }
1542 }
1543 
1544 
1546 
1547 {
1548  CDefLineJoiner joiner;
1549 
1550  joiner.Add("organism", m_Taxname, eHideType);
1551 
1552  if (! m_Strain.empty()) {
1553  CTempString add(m_Strain, 0, m_Strain.find(';'));
1554  if (! s_EndsWithStrain (m_Taxname, add)) {
1555  joiner.Add("strain", add);
1556  }
1557  }
1558  if (! m_Substrain.empty()) {
1559  CTempString add(m_Substrain, 0, m_Substrain.find(';'));
1560  if (! s_EndsWithStrain (m_Taxname, add)) {
1561  joiner.Add("substr.", add);
1562  }
1563  }
1564  if (! m_Breed.empty()) {
1565  joiner.Add("breed", m_Breed.substr (0, m_Breed.find(';')));
1566  }
1567  if (! m_Cultivar.empty()) {
1568  joiner.Add("cultivar", m_Cultivar.substr (0, m_Cultivar.find(';')));
1569  }
1570 
1572 
1573  if (! m_Chromosome.empty()) {
1574  joiner.Add("location", "chromosome", eHideType);
1575  joiner.Add("chromosome", m_Chromosome, eHideType);
1576  } else if ( !m_LinkageGroup.empty()) {
1577  joiner.Add("location", "linkage group", eHideType);
1578  joiner.Add("linkage group", m_LinkageGroup, eHideType);
1579  } else if ( !m_Plasmid.empty()) {
1580  joiner.Add("location", m_Organelle, eHideType); //"plasmid"
1581  joiner.Add("plasmid name", m_Plasmid, eHideType);
1582  } else if (! m_Organelle.empty()) {
1583  joiner.Add("location", m_Organelle, eHideType);
1584  }
1585 
1586  string clnbuf;
1587  vector<CTempString> clnvec;
1588  if (m_has_clone) {
1589  x_DescribeClones (clnvec, clnbuf);
1590  ITERATE (vector<CTempString>, it, clnvec) {
1591  joiner.Add("clone", *it, eHideType);
1592  }
1593  }
1594  if (! m_Map.empty()) {
1595  joiner.Add("map", m_Map);
1596  }
1597 
1598  joiner.Join(&m_MainTitle);
1600 }
1601 
1602 // generate title for NC
1604 
1605 {
1606  if (m_MIBiomol != NCBI_BIOMOL(genomic) &&
1607  m_MIBiomol != NCBI_BIOMOL(other_genetic)) return;
1608 
1609  // require taxname to be set
1610  if (m_Taxname.empty()) return;
1611 
1612  CDefLineJoiner joiner;
1613 
1614  joiner.Add("organism", m_Taxname, eHideType);
1615 
1616  bool add_gen_tag = false;
1617  if (NStr::FindNoCase (m_Taxname, "plasmid") != NPOS) {
1618  //
1619  } else if (m_IsPlasmid || ! m_Plasmid.empty()) {
1620  if (m_Plasmid.empty()) {
1621  joiner.Add("", "unnamed plasmid", eHideType);
1622  } else {
1623  if ( !m_IsPlasmid) { // do we need this?
1624  joiner.Add("location", m_Organelle, eHideType);
1625  }
1626  if (NStr::FindNoCase(m_Plasmid, "plasmid") == NPOS &&
1627  NStr::FindNoCase(m_Plasmid, "element") == NPOS) {
1628  joiner.Add("plasmid", m_Plasmid);
1629  } else {
1630  joiner.Add("", m_Plasmid, eHideType);
1631  }
1632  }
1633  } else if ( ! m_Organelle.empty() ) {
1634  if ( m_Chromosome.empty() ) {
1635  switch (m_Genome) {
1636  case NCBI_GENOME(mitochondrion):
1637  case NCBI_GENOME(chloroplast):
1638  case NCBI_GENOME(kinetoplast):
1639  case NCBI_GENOME(plastid):
1640  case NCBI_GENOME(apicoplast):
1641  joiner.Add("location", m_Organelle, eHideType);
1642  break;
1643  }
1644  /*
1645  if ( m_LinkageGroup.empty() ) {
1646  add_gen_tag = true;
1647  }
1648  */
1649  } else {
1650  if (! m_IsChromosome) {
1651  joiner.Add("location", m_Organelle, eHideType);
1652  }
1653  joiner.Add("chromosome", m_Chromosome);
1654  }
1655  } else if (! m_Segment.empty()) {
1656  if (m_Segment.find ("DNA") == NPOS &&
1657  m_Segment.find ("RNA") == NPOS &&
1658  m_Segment.find ("segment") == NPOS &&
1659  m_Segment.find ("Segment") == NPOS) {
1660  joiner.Add("segment", m_Segment);
1661  } else {
1662  joiner.Add("", m_Segment, eHideType);
1663  }
1664  } else if (! m_Chromosome.empty()) {
1665  joiner.Add("chromosome", m_Chromosome);
1666  } else /* if ( m_LinkageGroup.empty() ) */ {
1667  add_gen_tag = true;
1668  }
1669 
1670  if (add_gen_tag) {
1671  joiner.Add("completeness", (x_IsComplete() ? ", complete genome" : ", partial genome"), eHideType);
1672  } else {
1673  joiner.Add("completeness", (x_IsComplete() ? ", complete sequence" : ", partial sequence"), eHideType);
1674  }
1675  joiner.Join(&m_MainTitle);
1676 
1677  NStr::ReplaceInPlace (m_MainTitle, "Plasmid", "plasmid");
1678  NStr::ReplaceInPlace (m_MainTitle, "Element", "element");
1679 }
1680 
1681 // generate title for NM
1682 static void x_FlyCG_PtoR (
1683  string& s
1684 )
1685 
1686 {
1687  // s =~ s/\b(CG\d*-)P([[:alpha:]])\b/$1R$2/g, more or less.
1688  SIZE_TYPE pos = 0, len = s.size();
1689  while (pos + 3 < len && (pos = NStr::FindCase (s, "CG", pos)) != NPOS) {
1690  if (pos > 0 && !isspace((unsigned char)s[pos - 1]) ) {
1691  pos += 2;
1692  continue;
1693  }
1694  pos += 2;
1695  while (pos + 3 < len && isdigit((unsigned char)s[pos])) {
1696  ++pos;
1697  }
1698  if (s[pos] == '-' && s[pos + 1] == 'P' &&
1699  isalpha((unsigned char)s[pos + 2]) &&
1700  (pos + 3 == len || strchr(" ,;", s[pos + 3])) ) {
1701  s[pos + 1] = 'R';
1702  }
1703  }
1704 }
1705 
1707  const CBioseq_Handle& bsh
1708 )
1709 
1710 {
1711  unsigned int genes = 0, cdregions = 0;
1712  CConstRef<CSeq_feat> gene(0);
1713  CConstRef<CSeq_feat> cdregion(0);
1714 
1715  // require taxname to be set
1716  if (m_Taxname.empty()) return;
1717 
1718  CScope& scope = bsh.GetScope();
1719 
1720  SAnnotSelector sel;
1723  sel.SetResolveTSE();
1724 
1725  FOR_SELECTED_SEQFEAT_ON_BIOSEQ_HANDLE (feat_it, bsh, sel) {
1726  const CSeq_feat& sft = feat_it->GetOriginalFeature();
1727  SWITCH_ON_FEATURE_CHOICE (sft) {
1728  case CSeqFeatData::e_Gene:
1729  ++genes;
1730  gene.Reset(&sft);
1731  break;
1733  ++cdregions;
1734  cdregion.Reset(&sft);
1735  break;
1736  default:
1737  break;
1738  }
1739  }
1740 
1741  if (genes == 1 && cdregions == 1 && (! m_Taxname.empty())) {
1742  string cds_label, gene_label;
1744 
1745  feature::GetLabel(*cdregion, &cds_label, feature::fFGL_Content, &scope);
1746  if (NStr::EqualNocase (m_Taxname, "Drosophila melanogaster")) {
1747  x_FlyCG_PtoR (cds_label);
1748  }
1749  NStr::ReplaceInPlace (cds_label, "isoform ", "transcript variant ");
1750  feature::GetLabel(*gene, &gene_label, feature::fFGL_Content, &scope);
1751  joiner.Add(m_Taxname).Add(" ").Add(cds_label).Add(" (")
1752  .Add(gene_label).Add("), mRNA");
1753  joiner.Join(&m_MainTitle);
1754  }
1755 }
1756 
1757 // generate title for NR
1759  const CBioseq_Handle& bsh
1760 )
1761 
1762 {
1763  // require taxname to be set
1764  if (m_Taxname.empty()) return;
1765 
1766  FOR_EACH_SEQFEAT_ON_BIOSEQ_HANDLE (feat_it, bsh, Gene) {
1767  const CSeq_feat& sft = feat_it->GetOriginalFeature();
1768  m_MainTitle = string(m_Taxname) + " ";
1770  m_MainTitle += ", ";
1771  switch (m_MIBiomol) {
1772  case NCBI_BIOMOL(pre_RNA):
1773  m_MainTitle += "precursorRNA";
1774  break;
1775  case NCBI_BIOMOL(mRNA):
1776  m_MainTitle += "mRNA";
1777  break;
1778  case NCBI_BIOMOL(rRNA):
1779  m_MainTitle += "rRNA";
1780  break;
1781  case NCBI_BIOMOL(tRNA):
1782  m_MainTitle += "tRNA";
1783  break;
1784  case NCBI_BIOMOL(snRNA):
1785  m_MainTitle += "snRNA";
1786  break;
1787  case NCBI_BIOMOL(scRNA):
1788  m_MainTitle += "scRNA";
1789  break;
1790  case NCBI_BIOMOL(cRNA):
1791  m_MainTitle += "cRNA";
1792  break;
1793  case NCBI_BIOMOL(snoRNA):
1794  m_MainTitle += "snoRNA";
1795  break;
1796  case NCBI_BIOMOL(transcribed_RNA):
1797  m_MainTitle += "miscRNA";
1798  break;
1799  case NCBI_BIOMOL(ncRNA):
1800  m_MainTitle += "ncRNA";
1801  break;
1802  case NCBI_BIOMOL(tmRNA):
1803  m_MainTitle += "tmRNA";
1804  break;
1805  default:
1806  break;
1807  }
1808 
1809  // take first, then break to skip remainder
1810  break;
1811  }
1812 }
1813 
1814 // generate title for Patent
1816 
1817 {
1818  string seqno = NStr::IntToString(m_PatentSequence);
1820  joiner.Add("Sequence ").Add(seqno).Add(" from Patent ")
1822  joiner.Join(&m_MainTitle);
1823 }
1824 
1826 
1827 {
1828  if (! m_PDBChainID.empty()) {
1829  string chain(m_PDBChainID);
1832  joiner.Add("Chain ").Add(chain).Add(", ").Add(m_PDBCompound);
1833  } else {
1834  std::size_t found = m_Comment.find_first_not_of("0123456789");
1835  if (found != std::string::npos && found < m_Comment.length() && m_Comment[found] == ' ') {
1836  joiner.Add("Chain ").Add(chain).Add(", ").Add(m_Comment.substr (found));
1837  } else {
1838  joiner.Add("Chain ").Add(chain).Add(", ").Add(m_Comment);
1839  }
1840  }
1841  joiner.Join(&m_MainTitle);
1842  } else if (isprint ((unsigned char) m_PDBChain)) {
1843  string chain(1, (char) m_PDBChain);
1846  joiner.Add("Chain ").Add(chain).Add(", ").Add(m_PDBCompound);
1847  } else {
1848  std::size_t found = m_Comment.find_first_not_of("0123456789");
1849  if (found != std::string::npos && found < m_Comment.length() && m_Comment[found] == ' ') {
1850  joiner.Add("Chain ").Add(chain).Add(", ").Add(m_Comment.substr (found));
1851  } else {
1852  joiner.Add("Chain ").Add(chain).Add(", ").Add(m_Comment);
1853  }
1854  }
1855  joiner.Join(&m_MainTitle);
1856  } else {
1858  }
1859 }
1860 
1862 
1863 {
1864  CDefLineJoiner joiner;
1865 
1866  joiner.Add("organism", m_Taxname, eHideType);
1867 
1868  if ( ! m_Organelle.empty() && NStr::FindNoCase (m_Organelle, "plasmid") != NPOS) {
1869  joiner.Add("location", m_Organelle, eHideType);
1870  }
1871 
1872  if (! m_Strain.empty()) {
1873  CTempString add(m_Strain, 0, m_Strain.find(';'));
1874  if (! s_EndsWithStrain (m_Taxname, add)) {
1875  joiner.Add("strain", add);
1876  }
1877  }
1878  if (! m_Strain.empty()) {
1879  CTempString add(m_Substrain, 0, m_Substrain.find(';'));
1880  if (! s_EndsWithStrain (m_Taxname, add)) {
1881  joiner.Add("substr.", add);
1882  }
1883  }
1884  if (! m_Chromosome.empty()) {
1885  joiner.Add("chromosome", m_Chromosome);
1886  }
1887  if (m_has_clone) {
1888  string clnbuf;
1889  vector<CTempString> clnvec;
1890  x_DescribeClones (clnvec, clnbuf);
1891  ITERATE (vector<CTempString>, it, clnvec) {
1892  joiner.Add("clone", *it, eHideType);
1893  }
1894  }
1895  if (! m_Map.empty()) {
1896  joiner.Add("map", m_Map);
1897  }
1898  if (! m_Plasmid.empty()) {
1899  if (NStr::FindNoCase(m_Plasmid, "plasmid") == NPOS &&
1900  NStr::FindNoCase(m_Plasmid, "element") == NPOS) {
1901  joiner.Add("plasmid", m_Plasmid);
1902  } else {
1903  joiner.Add("", m_Plasmid);
1904  }
1905  }
1906 
1907  if (x_IsComplete()) {
1908  joiner.Add("completeness", ", complete sequence", eHideType);
1909  }
1910 
1911  joiner.Join(&m_MainTitle);
1913 }
1914 
1915 // generate title for protein
1917  const CBioseq_Handle& bsh
1918 )
1919 
1920 {
1921  TSeqPos longest = 0;
1923  CProt_ref::EProcessed processed;
1925  CConstRef<CSeq_feat> prot_feat;
1926  TSeqPos seq_len = UINT_MAX;
1927 
1928  CScope& scope = bsh.GetScope();
1929 
1930  if (bsh.IsSetInst ()) {
1931  if (bsh.IsSetInst_Length ()) {
1932  seq_len = bsh.GetInst_Length ();
1933  }
1934  }
1935 
1936  FOR_EACH_SEQFEAT_ON_BIOSEQ_HANDLE (feat_it, bsh, Prot) {
1937  const CSeq_feat& feat = feat_it->GetOriginalFeature();
1938  if (! feat.IsSetData ()) continue;
1939  const CSeqFeatData& sfdata = feat.GetData ();
1940  const CProt_ref& prp = sfdata.GetProt();
1941  processed = CProt_ref::eProcessed_not_set;
1942  if (prp.IsSetProcessed()) {
1943  processed = prp.GetProcessed();
1944  }
1945  if (! feat.IsSetLocation ()) continue;
1946  const CSeq_loc& loc = feat.GetLocation ();
1947  TSeqPos prot_length = GetLength (loc, &scope);
1948  if (prot_length > longest) {
1949  prot_feat = &feat;
1950  longest = prot_length;
1951  bestprocessed = processed;
1952  } else if (prot_length == longest) {
1953  // unprocessed 0 preferred over preprotein 1 preferred
1954  // over mat peptide 2
1955  if (processed < bestprocessed) {
1956  prot_feat = &feat;
1957  longest = prot_length;
1958  bestprocessed = processed;
1959  }
1960  }
1961  }
1962 
1963  if (longest == seq_len && prot_feat) {
1964  return prot_feat;
1965  }
1966 
1967  // confirm that this will automatically check features on
1968  // parts and segset in pathological segmented protein ???
1969 
1970  if (prot_feat) {
1971  return prot_feat;
1972  }
1973 
1974  CSeq_loc everywhere;
1975  everywhere.SetWhole().Assign(*bsh.GetSeqId());
1976 
1977  prot_feat = GetBestOverlappingFeat (everywhere, CSeqFeatData::e_Prot,
1978  eOverlap_Contained, scope);
1979 
1980  if (prot_feat) {
1981  return prot_feat;
1982  }
1983 
1984  return CConstRef<CSeq_feat> ();
1985 }
1986 
1987 // m_LocalAnnotsOnly test is unnecessary because feature iterator is built on local features only
1988 // sqd-4081: it appears that test still does matter. reinstated and even more rigorously applied.
1990  const CMappedFeat& mapped_cds)
1991 
1992 {
1993  CConstRef<CGene_ref> gene_ref;
1994 
1995  if (mapped_cds) {
1996  const CSeq_feat& cds_feat = mapped_cds.GetOriginalFeature();
1997  FOR_EACH_SEQFEATXREF_ON_FEATURE (xf_itr, cds_feat) {
1998  const CSeqFeatXref& sfx = **xf_itr;
1999  if (sfx.IsSetData()) {
2000  const CSeqFeatData& sfd = sfx.GetData();
2001  if (sfd.IsGene()) {
2002  gene_ref = &sfd.GetGene();
2003  }
2004  }
2005  }
2006 
2007  if (gene_ref) {
2008  return gene_ref;
2009  }
2010 
2011  if (m_ConstructedFeatTree) {
2012  if (! m_InitializedFeatTree) {
2013  CFeat_CI iter (m_TopSEH);
2014  m_Feat_Tree.Reset (new CFeatTree (iter));
2015  m_InitializedFeatTree = true;
2016  }
2017  }
2018  if (m_Feat_Tree.Empty ()) {
2019  m_Feat_Tree.Reset (new CFeatTree);
2020  }
2021  if (! m_ConstructedFeatTree) {
2022  m_Feat_Tree->AddGenesForCds (mapped_cds);
2023  }
2024 
2025  try {
2026  CMappedFeat mapped_gene = GetBestGeneForCds (mapped_cds, m_Feat_Tree);
2027  if (mapped_gene) {
2028  const CSeq_feat& gene_feat = mapped_gene.GetOriginalFeature();
2029  gene_ref = &gene_feat.GetData().GetGene();
2030  }
2031  } catch ( const exception& ) {
2032  // ERR_POST(Error << "x_GetGeneRefViaCDS GetBestGeneForCds failure");
2033  }
2034 
2035  // clearing m_InitializedFeatTree may remove artifact after first protein is indexed and second protein is requested
2036  if (m_ConstructedFeatTree) {
2037  m_InitializedFeatTree = false;
2038  }
2039  }
2040 
2041  return gene_ref;
2042 }
2043 
2045  const CBioseq_Handle& bsh
2046 )
2047 
2048 {
2049  CConstRef<CSeq_feat> cds_feat;
2050  CConstRef<CSeq_loc> cds_loc;
2051  CConstRef<CBioSource> src_ref;
2052 
2053  CScope& scope = bsh.GetScope();
2054 
2055  cds_feat = GetCDSForProduct (bsh);
2056 
2057  if (cds_feat) {
2058  /*
2059  const CSeq_feat& feat = *cds_feat;
2060  */
2061  cds_loc = &cds_feat->GetLocation();
2062  if (cds_loc) {
2063  CRef<CSeq_loc> cleaned_location( new CSeq_loc );
2064  cleaned_location->Assign( *cds_loc );
2066  if (src_feat) {
2067  const CSeq_feat& feat = *src_feat;
2068  if (feat.IsSetData()) {
2069  const CSeqFeatData& sfd = feat.GetData();
2070  if (sfd.IsBiosrc()) {
2071  src_ref = &sfd.GetBiosrc();
2072  }
2073  }
2074  } else {
2075  CRef<CSeq_loc> rev_loc(SeqLocRevCmpl(*cleaned_location, &scope));
2076  cleaned_location->Assign(*rev_loc);
2077  src_feat = GetBestOverlappingFeat (*cleaned_location, CSeqFeatData::eSubtype_biosrc, eOverlap_SubsetRev, scope);
2078  if (src_feat) {
2079  const CSeq_feat& feat = *src_feat;
2080  if (feat.IsSetData()) {
2081  const CSeqFeatData& sfd = feat.GetData();
2082  if (sfd.IsBiosrc()) {
2083  src_ref = &sfd.GetBiosrc();
2084  }
2085  }
2086  }
2087  }
2088  }
2089  }
2090 
2091  if (src_ref) {
2092  return src_ref;
2093  }
2094 
2095  return CConstRef<CBioSource> ();
2096 }
2097 
2099  const CSeq_feat& sft
2100 )
2101 
2102 {
2103  if (! FEATURE_CHOICE_IS (sft, NCBI_SEQFEAT(Cdregion))) return false;
2104  if (! sft.IsSetExcept()) return false;
2105  if (! sft.GetExcept()) return false;
2106  if (! sft.IsSetExcept_text()) return false;
2107 
2108  const string& str = sft.GetExcept_text();
2109  int current_state = 0;
2110  FOR_EACH_CHAR_IN_STRING (str_itr, str) {
2111  const char ch = *str_itr;
2112  int next_state = ms_p_Low_Quality_Fsa->GetNextState (current_state, ch);
2113  if (ms_p_Low_Quality_Fsa->IsMatchFound (next_state)) {
2114  return true;
2115  }
2116  current_state = next_state;
2117  }
2118 
2119 
2120  return false;
2121 }
2122 
// Organelle label per genome slot.  An empty string means that no label is
// shown for that slot.  Each entry is annotated with its position in the table
// and, where one exists, the genome name that occupies the slot.
static const char* s_proteinOrganellePrefix [] = {
    /*  0  (unnamed)          */ "",
    /*  1  (unnamed)          */ "",
    /*  2  chloroplast        */ "chloroplast",
    /*  3  chromoplast        */ "chromoplast",
    /*  4  kinetoplast        */ "kinetoplast",
    /*  5  mitochondrion      */ "mitochondrion",
    /*  6  plastid            */ "plastid",
    /*  7  macronuclear       */ "macronuclear",
    /*  8  extrachromosomal   */ "",
    /*  9  plasmid            */ "plasmid",
    /* 10  (unnamed)          */ "",
    /* 11  (unnamed)          */ "",
    /* 12  cyanelle           */ "cyanelle",
    /* 13  proviral           */ "",
    /* 14  virus              */ "",
    /* 15  nucleomorph        */ "nucleomorph",
    /* 16  apicoplast         */ "apicoplast",
    /* 17  leucoplast         */ "leucoplast",
    /* 18  protoplast         */ "protoplast",
    /* 19  endogenous virus   */ "endogenous virus",
    /* 20  hydrogenosome      */ "hydrogenosome",
    /* 21  chromosome         */ "",
    /* 22  chromatophore      */ "chromatophore"
};
2148 
2149 static string s_RemoveBracketedOrgFromEnd (string str, string taxname)
2150 
2151 {
2152  string final;
2153  if (str.empty()) return str;
2154  if (taxname.empty()) return str;
2155  SIZE_TYPE taxlen = taxname.length();
2156  int len = (int) str.length();
2157  if (len < 5) return str;
2158  if (str [len - 1] != ']') return str;
2160  if (cp == NPOS) return str;
2161  string suffix = str.substr(cp+1);
2162  if (NStr::StartsWith(suffix, "NAD")) return str;
2163  if (suffix.length() != taxlen + 1) return str;
2164  if (NStr::StartsWith(suffix, taxname)) {
2165  str.erase (cp);
2166  x_CleanAndCompress(final, str, true);
2167  return final;
2168 
2169  }
2170  return str;
2171 }
2172 
2174  const CBioseq_Handle& bsh
2175 )
2176 
2177 {
2178  CConstRef<CSeq_feat> cds_feat;
2180  CConstRef<CSeq_feat> prot_feat;
2181  CConstRef<CGene_ref> gene;
2183  CTempString locus_tag;
2184 
2186  if (! bsx) {
2187  return;
2188  }
2189 
2191 
2193 
2194  if (prtx) {
2195  const CMappedFeat mf = prtx->GetMappedFeat();
2196  const CProt_ref& prp = mf.GetData().GetProt();
2197 
2198  const char* prefix = "";
2199  FOR_EACH_NAME_ON_PROT (prp_itr, prp) {
2200  const string& str = *prp_itr;
2201  string trimmed = s_RemoveBracketedOrgFromEnd (str, m_Taxname);
2202  m_MainTitle += prefix;
2203  m_MainTitle += trimmed;
2204  if (! m_AllProtNames) {
2205  break;
2206  }
2207  prefix = "; ";
2208  }
2209 
2210  if (! m_MainTitle.empty()) {
2211  // strip trailing periods, commas, and spaces
2212  SIZE_TYPE pos = m_MainTitle.find_last_not_of (".,;~ ");
2213  if (pos != NPOS) {
2214  m_MainTitle.erase (pos + 1);
2215  }
2216 
2217  size_t offset = 0;
2218  size_t delta = 0;
2219  string comma;
2220  string isoform;
2221  if (NStr::StartsWith (m_MainTitle, "hypothetical protein")) {
2222  offset = 20;
2223  } else if (NStr::StartsWith (m_MainTitle, "uncharacterized protein")) {
2224  offset = 23;
2225  }
2226  if (offset > 0 && offset < m_MainTitle.length()) {
2227  if (m_MainTitle [offset] == ',' && m_MainTitle [offset + 1] == ' ') {
2228  comma = ", isoform ";
2229  delta = 2;
2230  }
2231  if (m_MainTitle [offset] == ' ') {
2232  comma = " isoform ";
2233  delta = 1;
2234  }
2235  if (NStr::StartsWith (m_MainTitle.substr (offset + delta), "isoform ")) {
2236  isoform = m_MainTitle.substr (offset + delta + 8);
2237  // !!! check for single alphanumeric string
2238  m_MainTitle.erase (offset);
2239  }
2240  }
2241  if ((NStr::EqualNocase (m_MainTitle, "hypothetical protein") ||
2242  NStr::EqualNocase (m_MainTitle, "uncharacterized protein"))
2243  /* && !m_LocalAnnotsOnly */ ) {
2244  if (sfxp) {
2245  CRef<CFeatureIndex> fsx = sfxp->GetBestGene();
2246  if (fsx) {
2247  const CGene_ref& grp = fsx->GetMappedFeat().GetData().GetGene();
2248  if (grp.IsSetLocus_tag()) {
2249  locus_tag = grp.GetLocus_tag();
2250  }
2251  }
2252  }
2253  if (! locus_tag.empty()) {
2254  m_MainTitle += " " + string(locus_tag) + string(comma) + string(isoform);
2255  }
2256  }
2257  }
2258  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2259  if (prp.IsSetDesc()) {
2260  m_MainTitle = prp.GetDesc();
2261  }
2262  }
2263  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2264  FOR_EACH_ACTIVITY_ON_PROT (act_itr, prp) {
2265  const string& str = *act_itr;
2266  m_MainTitle = str;
2267  break;
2268  }
2269  }
2270  }
2271 
2272  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2273  if (sfxp) {
2274  CRef<CFeatureIndex> fsx = sfxp->GetBestGene();
2275  if (fsx) {
2276  const CGene_ref& grp = fsx->GetMappedFeat().GetData().GetGene();
2277  if (grp.IsSetLocus()) {
2278  m_MainTitle = grp.GetLocus();
2279  }
2280  if (m_MainTitle.empty()) {
2281  FOR_EACH_SYNONYM_ON_GENE (syn_itr, grp) {
2282  const string& str = *syn_itr;
2283  m_MainTitle = str;
2284  break;
2285  }
2286  }
2287  if (m_MainTitle.empty()) {
2288  if (grp.IsSetDesc()) {
2289  m_MainTitle = grp.GetDesc();
2290  }
2291  }
2292  }
2293  }
2294  if (! m_MainTitle.empty()) {
2295  m_MainTitle += " gene product";
2296  }
2297  }
2298 
2299  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2300  m_MainTitle = "unnamed protein product";
2301  if (sfxp) {
2302  CRef<CFeatureIndex> fsx = sfxp->GetBestGene();
2303  if (fsx) {
2304  const CGene_ref& grp = fsx->GetMappedFeat().GetData().GetGene();
2305  if (grp.IsSetLocus_tag()) {
2306  locus_tag = grp.GetLocus_tag();
2307  }
2308  }
2309  }
2310  if (! locus_tag.empty()) {
2311  m_MainTitle += " " + string(locus_tag);
2312  }
2313  }
2314 
2315  if (sfxp) {
2316  const CMappedFeat mf = sfxp->GetMappedFeat();
2317  const CSeq_feat& cds = mf.GetOriginalFeature();
2318  if (x_CDShasLowQualityException (cds)) {
2319  const string& low_qual = "LOW QUALITY PROTEIN: ";
2320  if (NStr::FindNoCase (m_MainTitle, low_qual, 0) == NPOS) {
2321  string tmp = m_MainTitle;
2322  m_MainTitle = low_qual + tmp;
2323  }
2324  }
2325  }
2326 
2327  // strip trailing periods, commas, and spaces
2328  SIZE_TYPE pos = m_MainTitle.find_last_not_of (".,;~ ");
2329  if (pos != NPOS) {
2330  m_MainTitle.erase (pos + 1);
2331  }
2332 
2333  if (! x_IsComplete() /* && m_MainTitle.find(", partial") == NPOS */) {
2334  m_MainTitle += ", partial";
2335  }
2336 
2337  if (m_OmitTaxonomicName) return;
2338 
2339  CTempString taxname = m_Taxname;
2340 
2341  if (m_Genome >= NCBI_GENOME(chloroplast) && m_Genome <= NCBI_GENOME(chromatophore)) {
2342  const char * organelle = s_proteinOrganellePrefix [m_Genome];
2343  if ( organelle[0] != '\0' && ! taxname.empty()
2344  /* && NStr::Find (taxname, organelle) == NPOS */) {
2345  m_MainTitle += " (";
2346  m_MainTitle += organelle;
2347  m_MainTitle += ")";
2348  }
2349  }
2350 
2351  // check for special taxname, go to overlapping source feature
2352  if ((taxname.empty() ||
2353  (!NStr::EqualNocase (taxname, "synthetic construct") &&
2354  !NStr::EqualNocase (taxname, "artificial sequence") &&
2355  taxname.find ("vector") == NPOS &&
2356  taxname.find ("Vector") == NPOS)) &&
2357  !m_LocalAnnotsOnly) {
2358  /*
2359  CWeakRef<CBioseqIndex> bsxp = bsx->GetBioseqForProduct();
2360  auto nucx = bsxp.Lock();
2361  if (nucx) {
2362  if (nucx->HasSource()) {
2363  src = x_GetSourceFeatViaCDS (bsh);
2364  if (src.NotEmpty() && src->IsSetTaxname()) {
2365  taxname = src->GetTaxname();
2366  }
2367  }
2368  }
2369  */
2370  src = x_GetSourceFeatViaCDS (bsh);
2371  if (src.NotEmpty() && src->IsSetTaxname()) {
2372  taxname = src->GetTaxname();
2373  }
2374  }
2375 
2376  if (m_IsCrossKingdom && ! m_FirstSuperKingdom.empty() && ! m_SecondSuperKingdom.empty()) {
2378  } else if (! taxname.empty() /* && m_MainTitle.find(taxname) == NPOS */) {
2379  m_MainTitle += " [" + string(taxname) + "]";
2380  }
2381 }
2382 
// Builds m_MainTitle for the protein Bioseq `bsh`.
//
// Sources are tried in this order until the title is non-empty:
//   1. name(s) from the protein feature returned by x_GetLongestProtein
//      (only the first name unless m_AllProtNames is set),
//   2. the Prot-ref description, then its first activity,
//   3. the gene locus / first synonym / description, with " gene product",
//   4. the literal "unnamed protein product" (plus locus tag if available).
// Afterwards the title may get a "LOW QUALITY PROTEIN: " lead-in, has
// trailing ".,;~ " characters removed, gets ", partial" when x_IsComplete()
// is false, and - unless m_OmitTaxonomicName is set - gets " (organelle)"
// and " [taxname]" appended.
//
// NOTE(review): this listing omits the line that names the function
// (embedded line 2383) and embedded lines 2389 / 2392, which presumably
// declare `prot` and `src` (both are used below but never declared in the
// visible text) - confirm against the repository copy.
2384  const CBioseq_Handle& bsh
2385 )
2386 
2387 {
2388  CConstRef<CSeq_feat> cds_feat;
2390  CConstRef<CSeq_feat> prot_feat;
2391  CConstRef<CGene_ref> gene;
2393  CTempString locus_tag;
2394 
2395  // gets longest protein on Bioseq, parts set, or seg set, even if not
2396  // full-length
2397 
2398  prot_feat = x_GetLongestProtein (bsh);
2399 
2400  if (prot_feat) {
2401  prot = &prot_feat->GetData().GetProt();
2402  }
2403 
2404  const CMappedFeat& mapped_cds = GetMappedCDSForProduct (bsh);
2405 
2406  if (prot) {
2407  const CProt_ref& prp = *prot;
2408  const char* prefix = "";
// Join protein names with "; ", dropping a trailing bracketed organism
// from each name via s_RemoveBracketedOrgFromEnd.
2409  FOR_EACH_NAME_ON_PROT (prp_itr, prp) {
2410  const string& str = *prp_itr;
2411  string trimmed = s_RemoveBracketedOrgFromEnd (str, m_Taxname);
2412  m_MainTitle += prefix;
2413  m_MainTitle += trimmed;
2414  if (! m_AllProtNames) {
2415  break;
2416  }
2417  prefix = "; ";
2418  }
2419 
2420  if (! m_MainTitle.empty()) {
2421  // strip trailing periods, commas, and spaces
2422  SIZE_TYPE pos = m_MainTitle.find_last_not_of (".,;~ ");
2423  if (pos != NPOS) {
2424  m_MainTitle.erase (pos + 1);
2425  }
2426 
// offset is the length of a leading "hypothetical protein" (20) or
// "uncharacterized protein" (23); it stays 0 when neither phrase leads.
// delta is the length of the separator found right after that phrase.
2427  int offset = 0;
2428  int delta = 0;
2429  string comma;
2430  string isoform;
2431  if (NStr::StartsWith (m_MainTitle, "hypothetical protein")) {
2432  offset = 20;
2433  } else if (NStr::StartsWith (m_MainTitle, "uncharacterized protein")) {
2434  offset = 23;
2435  }
// NOTE(review): `offset` is int while length() is unsigned; the comparison
// is safe only because offset > 0 is tested first.
2436  if (offset > 0 && offset < m_MainTitle.length()) {
2437  if (m_MainTitle [offset] == ',' && m_MainTitle [offset + 1] == ' ') {
2438  comma = ", isoform ";
2439  delta = 2;
2440  }
2441  if (m_MainTitle [offset] == ' ') {
2442  comma = " isoform ";
2443  delta = 1;
2444  }
// "isoform " is 8 characters long, hence the + 8 when saving what follows.
// The title is then cut back to the bare leading phrase.
2445  if (NStr::StartsWith (m_MainTitle.substr (offset + delta), "isoform ")) {
2446  isoform = m_MainTitle.substr (offset + delta + 8);
2447  // !!! check for single alphanumeric string
2448  m_MainTitle.erase (offset);
2449  }
2450  }
// For a bare "hypothetical protein" / "uncharacterized protein", append the
// gene locus tag followed by the saved separator and isoform text.
// NOTE(review): when no locus tag is found, isoform text erased above is
// not put back - confirm this is intended.
2451  if ((NStr::EqualNocase (m_MainTitle, "hypothetical protein") ||
2452  NStr::EqualNocase (m_MainTitle, "uncharacterized protein"))
2453  /* && !m_LocalAnnotsOnly */ ) {
2454  gene = x_GetGeneRefViaCDS (mapped_cds);
2455  if (gene) {
2456  const CGene_ref& grp = *gene;
2457  if (grp.IsSetLocus_tag()) {
2458  locus_tag = grp.GetLocus_tag();
2459  }
2460  }
2461  if (! locus_tag.empty()) {
2462  m_MainTitle += " " + string(locus_tag) + string(comma) + string(isoform);
2463  }
2464  }
2465  }
// Fallback: Prot-ref description.
2466  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2467  if (prp.IsSetDesc()) {
2468  m_MainTitle = prp.GetDesc();
2469  }
2470  }
// Fallback: first Prot-ref activity.
2471  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2472  FOR_EACH_ACTIVITY_ON_PROT (act_itr, prp) {
2473  const string& str = *act_itr;
2474  m_MainTitle = str;
2475  break;
2476  }
2477  }
2478  }
2479 
// Fallback: gene locus, else first gene synonym, else gene description;
// whichever is found is followed by " gene product".
2480  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2481  gene = x_GetGeneRefViaCDS (mapped_cds);
2482  if (gene) {
2483  const CGene_ref& grp = *gene;
2484  if (grp.IsSetLocus()) {
2485  m_MainTitle = grp.GetLocus();
2486  }
2487  if (m_MainTitle.empty()) {
2488  FOR_EACH_SYNONYM_ON_GENE (syn_itr, grp) {
2489  const string& str = *syn_itr;
2490  m_MainTitle = str;
2491  break;
2492  }
2493  }
2494  if (m_MainTitle.empty()) {
2495  if (grp.IsSetDesc()) {
2496  m_MainTitle = grp.GetDesc();
2497  }
2498  }
2499  }
2500  if (! m_MainTitle.empty()) {
2501  m_MainTitle += " gene product";
2502  }
2503  }
2504 
// Last resort: fixed placeholder text, plus the gene locus tag if present.
2505  if (m_MainTitle.empty() && !m_LocalAnnotsOnly) {
2506  m_MainTitle = "unnamed protein product";
2507  gene = x_GetGeneRefViaCDS (mapped_cds);
2508  if (gene) {
2509  const CGene_ref& grp = *gene;
2510  if (grp.IsSetLocus_tag()) {
2511  locus_tag = grp.GetLocus_tag();
2512  }
2513  }
2514  if (! locus_tag.empty()) {
2515  m_MainTitle += " " + string(locus_tag);
2516  }
2517  }
2518 
// Prepend the low-quality marker unless the title already contains it
// (case-insensitive search).
2519  if (mapped_cds) {
2520  const CSeq_feat& cds = mapped_cds.GetOriginalFeature();
2521  if (x_CDShasLowQualityException (cds)) {
2522  const string& low_qual = "LOW QUALITY PROTEIN: ";
2523  if (NStr::FindNoCase (m_MainTitle, low_qual, 0) == NPOS) {
2524  string tmp = m_MainTitle;
2525  m_MainTitle = low_qual + tmp;
2526  }
2527  }
2528  }
2529 
2530  // strip trailing periods, commas, and spaces
2531  SIZE_TYPE pos = m_MainTitle.find_last_not_of (".,;~ ");
2532  if (pos != NPOS) {
2533  m_MainTitle.erase (pos + 1);
2534  }
2535 
2536  if (! x_IsComplete() /* && m_MainTitle.find(", partial") == NPOS */) {
2537  m_MainTitle += ", partial";
2538  }
2539 
2540  if (m_OmitTaxonomicName) return;
2541 
2542  CTempString taxname = m_Taxname;
2543 
// Append " (organelle)" when m_Genome lies in the chloroplast..chromatophore
// range, its table entry is non-empty, and a taxname is known.
2544  if (m_Genome >= NCBI_GENOME(chloroplast) && m_Genome <= NCBI_GENOME(chromatophore)) {
2545  const char * organelle = s_proteinOrganellePrefix [m_Genome];
2546  if ( organelle[0] != '\0' && ! taxname.empty()
2547  /* && NStr::Find (taxname, organelle) == NPOS */) {
2548  m_MainTitle += " (";
2549  m_MainTitle += organelle;
2550  m_MainTitle += ")";
2551  }
2552  }
2553 
2554  // check for special taxname, go to overlapping source feature
2555  if ((taxname.empty() ||
2556  (!NStr::EqualNocase (taxname, "synthetic construct") &&
2557  !NStr::EqualNocase (taxname, "artificial sequence") &&
2558  taxname.find ("vector") == NPOS &&
2559  taxname.find ("Vector") == NPOS)) &&
2560  !m_LocalAnnotsOnly) {
2561  src = x_GetSourceFeatViaCDS (bsh);
2562  if (src.NotEmpty() && src->IsSetTaxname()) {
2563  taxname = src->GetTaxname();
2564  }
2565  }
2566 
// NOTE(review): the body of the cross-kingdom branch (embedded line 2568)
// is missing from this listing; only the else-branch is visible.
2567  if (m_IsCrossKingdom && ! m_FirstSuperKingdom.empty() && ! m_SecondSuperKingdom.empty()) {
2569  } else if (! taxname.empty() /* && m_MainTitle.find(taxname) == NPOS */) {
2570  m_MainTitle += " [" + string(taxname) + "]";
2571  }
2572 }
2573 
2574 // generate title for segmented sequence
//
// Helper that collects title parts for a segmented sequence from the first
// coding-region feature found on its segments.
//
// Outputs (all by reference):
//   locus        - gene locus or first synonym from a gene xref on the CDS;
//                  if still empty, the label of the best overlapping feature
//   product      - label of the CDS (GetLabel with fFGL_Content)
//   completeness - set to "partial" when the CDS has its partial field set;
//                  otherwise left as the caller initialised it
// Returns true after processing the first CDS that has a location, and
// false when the loop finds none.
//
// NOTE(review): the line naming this function (embedded line 2575) and two
// arguments of the GetBestOverlappingFeat call (embedded lines 2621-2622)
// are missing from this listing.
2576  string& locus,
2577  string& product,
2578  const char*& completeness,
2579  const CBioseq_Handle& bsh
2580 )
2581 
2582 {
2583  CScope& scope = bsh.GetScope();
2584 
2585  // check C toolkit code to understand what is happening here ???
2586 
// Build one mixed location out of the segment locations of the Bioseq.
2587  CSeq_loc everywhere;
2588  everywhere.SetMix().Set() = bsh.GetInst_Ext().GetSeg();
2589 
2590  FOR_EACH_SEQFEAT_ON_SCOPE (it, scope, everywhere, Cdregion) {
2591  const CSeq_feat& cds = it->GetOriginalFeature();
2592  if (! cds.IsSetLocation ()) continue;
2593  const CSeq_loc& cds_loc = cds.GetLocation();
2594 
2595  GetLabel (cds, &product, feature::fFGL_Content, &scope);
2596 
// NOTE(review): this tests only that the partial field is set, not its
// value - confirm that is the intended check.
2597  if (cds.IsSetPartial()) {
2598  completeness = "partial";
2599  }
2600 
// Scan the CDS cross-references for gene data. The loop does not stop at
// the first gene xref, so a later one overwrites `locus`.
2601  FOR_EACH_SEQFEATXREF_ON_SEQFEAT (xr_itr, cds) {
2602  const CSeqFeatXref& sfx = **xr_itr;
2603  if (! FIELD_IS_SET (sfx, Data)) continue;
2604  const CSeqFeatData& sfd = GET_FIELD (sfx, Data);
2605  if (! FIELD_IS (sfd, Gene)) continue;
2606  const CGene_ref& gr = GET_FIELD (sfd, Gene);
2607  if (FIELD_IS_SET (gr, Locus)) {
2608  locus = GET_FIELD (gr, Locus);
2609  } else {
2610  FOR_EACH_SYNONYM_ON_GENEREF (syn_itr, gr) {
2611  locus = *syn_itr;
2612  // take first, then break to skip remainder
2613  break;
2614  }
2615  }
2616  }
2617 
// No locus from xrefs: fall back to the label of an overlapping feature.
2618  if (locus.empty()) {
2619  CConstRef<CSeq_feat> gene_feat
2620  = GetBestOverlappingFeat(cds_loc,
2623  scope);
2624  if (gene_feat.NotEmpty()) {
2625  const CSeq_feat& gene = *gene_feat;
2626  GetLabel (gene, &locus, feature::fFGL_Content, &scope);
2627  /*
2628  if (gene_feat->GetData().GetGene().IsSetLocus()) {
2629  locus = gene_feat->GetData().GetGene().GetLocus();
2630  } else if (gene_feat->GetData().GetGene().IsSetSyn()) {
2631  locus = *gene_feat->GetData().GetGene().GetSyn().begin();
2632  }
2633  */
2634  }
2635  }
2636 
// Only the first CDS with a location is ever examined.
2637  return true;
2638  }
2639 
2640  return false;
2641 }
2642 
// Builds m_MainTitle for a segmented sequence.
//
// The title starts with the organism (m_Taxname, replaced by "Unknown" when
// empty - note this overwrites the member). If no CDS information is found
// (or m_LocalAnnotsOnly is set), exactly one of strain / clone / isolate is
// added, in that priority. A CDS product, a parenthesised locus and a
// " gene, <completeness> cds" tail follow when available.
//
// NOTE(review): the line naming this function (embedded line 2643) and the
// final statement (embedded line 2687) are missing from this listing.
2644  const CBioseq_Handle& bsh
2645 )
2646 
2647 {
2648  const char * completeness = "complete";
2649  bool cds_found = false;
2650  string locus, product;
2651  CDefLineJoiner joiner;
2652 
2653  if (m_Taxname.empty()) {
2654  m_Taxname = "Unknown";
2655  }
2656  joiner.Add("organism", m_Taxname, eHideType);
2657 
2658  if ( !m_LocalAnnotsOnly ) {
2659  cds_found = x_GetSegSeqInfoViaCDS(locus, product, completeness, bsh);
2660  }
2661  if ( !cds_found) {
2662  if (! m_Strain.empty()
2663  && ! s_EndsWithStrain (m_Taxname, m_Strain) ) {
2664  joiner.Add("strain", m_Strain);
2665  } else if (! m_Clone.empty()
2666  /* && m_Clone.find(" clone ") != NPOS */) {
// NOTE(review): clnbuf and clnvec are destroyed at the end of this branch,
// before joiner.Join() runs below. Comments elsewhere in this file say the
// joiner stores CTempString and that buffers must outlive Join(); if
// x_DescribeClones puts views of clnbuf into clnvec, those would dangle -
// confirm, and hoist clnbuf to function scope if so.
2667  string clnbuf;
2668  vector<CTempString> clnvec;
2669  x_DescribeClones (clnvec, clnbuf);
2670  ITERATE (vector<CTempString>, it, clnvec) {
2671  joiner.Add("clone", *it, eHideType);
2672  }
2673  } else if (! m_Isolate.empty() ) {
2674  joiner.Add("isolate", m_Isolate);
2675  }
2676  }
2677  if (! product.empty()) {
2678  joiner.Add("product", product, eHideType);
2679  }
2680  joiner.Join(&m_MainTitle);
// The locus and the cds tail are appended after joining, so they are not
// subject to whatever formatting the joiner applies.
2681  if (! locus.empty()) {
2682  m_MainTitle += " (" + locus + ")";
2683  }
2684  if ((! product.empty()) || (! locus.empty())) {
2685  m_MainTitle += " gene, " + string(completeness) + " cds";
2686  }
2688 }
2689 
2690 // generate title for TSA or non-master WGS
//
// Feeds the joiner with, in order: organism; one of strain (+ substrain) /
// breed / cultivar, each cut at the first ';'; chromosome or linkage group;
// clone description; map; plasmid (WGS only); and finally a general
// identifier string or number when it does not repeat the chromosome or
// (for WGS) the plasmid. The joined text is written to m_MainTitle.
//
// NOTE(review): this listing omits the line naming the function (embedded
// line 2691) and embedded lines 2702, 2711, 2743 and 2752. Line 2702
// presumably opens the substrain condition closed at 2704, and 2743
// presumably fills `tmp` from m_GeneralId - confirm against the repository.
2692 
2693 {
2694  CDefLineJoiner joiner;
2695 
2696  joiner.Add("organism", m_Taxname, eHideType);
2697 
2698  if (! m_Strain.empty()) {
2699  if (! s_EndsWithStrain (m_Taxname, m_Strain)) {
2700  joiner.Add("strain", m_Strain.substr (0, m_Strain.find(';')));
2701  }
2703  joiner.Add("substr.", m_Substrain.substr (0, m_Substrain.find(';')));
2704  }
2705  } else if (! m_Breed.empty()) {
2706  joiner.Add("breed", m_Breed.substr (0, m_Breed.find(';')));
2707  } else if (! m_Cultivar.empty()) {
2708  joiner.Add("cultivar", m_Cultivar.substr (0, m_Cultivar.find(';')));
2709  }
2710 
2712 
2713  if (! m_Chromosome.empty()) {
2714  joiner.Add("chromosome", m_Chromosome);
2715  } else if ( !m_LinkageGroup.empty()) {
2716  joiner.Add("linkage group", m_LinkageGroup);
2717  }
2718  if (! m_Clone.empty()) {
// NOTE(review): clnbuf is scoped to this block but joiner.Join() runs at
// the end of the function; see the comment on `tmp` below about buffers
// needing to outlive Join(). If clnvec can hold views of clnbuf this would
// dangle - confirm.
2719  string clnbuf;
2720  vector<CTempString> clnvec;
2721  x_DescribeClones (clnvec, clnbuf);
2722  ITERATE (vector<CTempString>, it, clnvec) {
2723  joiner.Add("clone", *it, eHideType);
2724  }
2725  }
2726  if (! m_Map.empty()) {
2727  joiner.Add("map", m_Map);
2728  }
2729  if (! m_Plasmid.empty()) {
2730  if (m_IsWGS) {
2731  joiner.Add("plasmid", m_Plasmid);
2732  }
2733  }
2734  // string tmp needs to be in scope for final joiner.Join statement
2735  string tmp;
// The first two branches are deliberately empty: circular plasmids and
// chromosome-genome records never get the general identifier appended.
2736  if (m_Genome == NCBI_GENOME(plasmid) && m_Topology == NCBI_SEQTOPOLOGY(circular)) {
2737  } else if (m_Genome == NCBI_GENOME(chromosome)) {
2738  } else if (! m_GeneralStr.empty()) {
2739  if (m_GeneralStr != m_Chromosome && (! m_IsWGS || m_GeneralStr != m_Plasmid)) {
2740  joiner.Add("", m_GeneralStr, eHideType);
2741  }
2742  } else if (m_GeneralId > 0) {
2744  if (! tmp.empty()) {
2745  if (tmp != m_Chromosome && (! m_IsWGS || tmp != m_Plasmid)) {
2746  joiner.Add("", tmp, eHideType);
2747  }
2748  }
2749  }
2750 
2751  joiner.Join(&m_MainTitle);
2753 }
2754 
2755 // generate title for optical map
//
// Joins organism, strain, substrain, chromosome, plasmid and isolate into
// m_MainTitle, then appends ", <enzyme> whole genome map" when m_rEnzyme is
// set. Strain and substrain are cut at the first ';'. When the chromosome
// or plasmid name is empty but the matching m_IsChromosome / m_IsPlasmid
// flag is set, the bare word "chromosome" / "plasmid" is added instead.
//
// NOTE(review): this listing omits the line naming the function (embedded
// line 2756), embedded line 2769 (presumably the condition closed at 2771)
// and embedded line 2792 - confirm against the repository.
2757 
2758 {
2759  CDefLineJoiner joiner;
2760 
2761  joiner.Add("organism", m_Taxname, eHideType);
2762 
2763  if (! m_Strain.empty()) {
2764  if (! s_EndsWithStrain (m_Taxname, m_Strain)) {
2765  joiner.Add("strain", m_Strain.substr (0, m_Strain.find(';')));
2766  }
2767  }
2768  if (! m_Substrain.empty()) {
2770  joiner.Add("substr.", m_Substrain.substr (0, m_Substrain.find(';')));
2771  }
2772  }
2773  if (! m_Chromosome.empty()) {
2774  joiner.Add("chromosome", m_Chromosome);
2775  } else if (m_IsChromosome) {
2776  joiner.Add("location", "chromosome", eHideType);
2777  }
2778  if (! m_Plasmid.empty()) {
2779  joiner.Add("plasmid", m_Plasmid);
2780  } else if (m_IsPlasmid) {
2781  joiner.Add("location", "plasmid", eHideType);
2782  }
2783  if (! m_Isolate.empty()) {
2784  joiner.Add("isolate", m_Isolate);
2785  }
2786  joiner.Join(&m_MainTitle);
2787 
2788  if (! m_rEnzyme.empty()) {
2789  m_MainTitle += ", " + m_rEnzyme + " whole genome map";
2790  }
2791 
2793 }
2794 
2795 // generate TPA or TSA prefix
//
// Chooses at most one prefix for the title and returns it in `prefix`
// (cleared first). The branches are mutually exclusive and are tested in
// this priority: unverified, unreviewed, third-party (TPA variants), TSA,
// TLS, multispecies WP, pseudogene, low-quality protein. Several branches
// leave `prefix` empty when m_MainTitle already contains the marker text.
//
// NOTE(review): this listing omits the line naming the function (embedded
// line 2796) and embedded lines 2806, 2810, 2820, 2822, 2849 and 2851.
// Lines 2806 / 2810 presumably assign the unverified / unreviewed prefix,
// and the others presumably declare `bsx`, `bsxp` and `sfxp`, which are
// used below without a visible declaration - confirm against the repository.
2797  string& prefix,
2798  const CBioseq_Handle& bsh
2799 )
2800 
2801 {
2802  prefix = kEmptyCStr;
2803 
2804  if (m_IsUnverified) {
2805  if (m_MainTitle.find ("UNVERIFIED") == NPOS) {
2807  }
2808  } else if (m_IsUnreviewed) {
2809  if (m_MainTitle.find ("UNREVIEWED") == NPOS) {
2811  }
2812  } else if (m_ThirdParty) {
// The record's own TPA flags win; for an indexed protein with none of them
// set, the TPA flavour is taken from `nucx` instead.
2813  if (m_TPAExp) {
2814  prefix = "TPA_exp: ";
2815  } else if (m_TPAInf) {
2816  prefix = "TPA_inf: ";
2817  } else if (m_TPAReasm) {
2818  prefix = "TPA_asm: ";
2819  } else if (m_Idx && m_IsAA) {
2821  if (bsx) {
2823  auto nucx = bsxp.Lock();
2824  if (nucx) {
2825  if (nucx->IsTPAExp()) {
2826  prefix = "TPA_exp: ";
2827  } else if (nucx->IsTPAInf()) {
2828  prefix = "TPA_inf: ";
2829  } else if (nucx->IsTPAReasm()) {
2830  prefix = "TPA_asm: ";
2831  }
2832  }
2833  }
2834  }
// Generic third-party prefix when no specific flavour was determined.
2835  if (prefix.empty()) {
2836  prefix = "TPA: ";
2837  }
2838  } else if (m_IsTSA) {
2839  prefix = "TSA: ";
2840  } else if (m_IsTLS) {
2841  prefix = "TLS: ";
2842  } else if (m_Multispecies && m_IsWP) {
2843  prefix = "MULTISPECIES: ";
2844  } else if (m_IsPseudogene) {
2845  if (m_MainTitle.find ("PUTATIVE PSEUDOGENE") == NPOS) {
2846  prefix = "PUTATIVE PSEUDOGENE: ";
2847  }
2848  } else if (m_Idx && m_IsAA) {
// Indexed protein: flag a low-quality CDS unless the title already says so.
2850  if (bsx) {
2852  if (sfxp) {
2853  const CMappedFeat mf = sfxp->GetMappedFeat();
2854  const CSeq_feat& cds = mf.GetOriginalFeature();
2855  if (x_CDShasLowQualityException (cds)) {
2856  if (m_MainTitle.find ("LOW QUALITY PROTEIN") == NPOS) {
2857  prefix = "LOW QUALITY PROTEIN: ";
2858  }
2859  }
2860  }
2861  }
2862  }
2863 }
2864 
2865 // generate suffix if not already present
//
// Computes the text to append to m_MainTitle and returns it in `suffix`.
// Three pieces are assembled:
//   type  - technique-dependent wording chosen by m_MITech (HTGS phases,
//           EST, STS, survey, WGS, TSA, targeted), usually skipped when
//           m_MainTitle already contains the key phrase
//   study - m_TargetedLocus for the targeted technique, when not already in
//           the title
//   comp  - ", complete sequence" / ", complete genome", only when
//           appendComplete is true and the title mentions neither
//           "complete" nor "partial"
// Result: " " + study + " " + type + comp when study is set, otherwise
// type + comp. m_MainTitle itself is only read here.
//
// NOTE(review): the line naming this function (embedded line 2866) is
// missing from this listing.
2867  string& suffix,
2868  const CBioseq_Handle& bsh,
2869  bool appendComplete
2870 )
2871 
2872 {
2873  string type;
2874  string study;
2875  string comp;
2876 
2877  switch (m_MITech) {
2878  case NCBI_TECH(htgs_0):
2879  if (m_MainTitle.find ("LOW-PASS") == NPOS) {
2880  type = ", LOW-PASS SEQUENCE SAMPLING";
2881  }
2882  break;
2883  case NCBI_TECH(htgs_1):
2884  case NCBI_TECH(htgs_2):
2885  {
2886  if (m_HTGSDraft) {
2887  if (m_MainTitle.find ("WORKING DRAFT") == NPOS) {
2888  type = ", WORKING DRAFT SEQUENCE";
2889  }
2890  } else if (!m_HTGSCancelled) {
2891  if (m_MainTitle.find ("SEQUENCING IN") == NPOS) {
2892  type = ", *** SEQUENCING IN PROGRESS ***";
2893  }
2894  }
2895 
// htgs_1 pieces are reported as "unordered", htgs_2 pieces as "ordered".
2896  string un;
2897  if (m_MITech == NCBI_TECH(htgs_1)) {
2898  un = "un";
2899  }
// For delta sequences the piece count is one more than the number of
// positions the CSeqMap_CI iterator visits with the fFindGap flag; a single
// piece adds nothing to the suffix.
2900  if (m_IsDelta) {
2901  unsigned int pieces = 1;
2902  for (CSeqMap_CI it (bsh, CSeqMap::fFindGap); it; ++it) {
2903  ++pieces;
2904  }
2905  if (pieces == 1) {
2906  // type += (", 1 " + un + "ordered piece");
2907  } else {
2908  type += (", " + NStr::IntToString (pieces)
2909  + " " + un + "ordered pieces");
2910  }
2911  } else {
2912  // type += ", in " + un + "ordered pieces";
2913  }
2914  break;
2915  }
2916  case NCBI_TECH(htgs_3):
2917  if (m_MainTitle.find ("complete sequence") == NPOS) {
2918  type = ", complete sequence";
2919  }
2920  break;
2921  case NCBI_TECH(est):
2922  if (m_MainTitle.find ("mRNA sequence") == NPOS) {
2923  type = ", mRNA sequence";
2924  }
2925  break;
2926  case NCBI_TECH(sts):
2927  if (m_MainTitle.find ("sequence tagged site") == NPOS) {
2928  type = ", sequence tagged site";
2929  }
2930  break;
2931  case NCBI_TECH(survey):
2932  if (m_MainTitle.find ("genomic survey sequence") == NPOS) {
2933  type = ", genomic survey sequence";
2934  }
2935  break;
2936  case NCBI_TECH(wgs):
2937  if (m_WGSMaster) {
2938  if (m_MainTitle.find ("whole genome shotgun sequencing project") == NPOS){
2939  type = ", whole genome shotgun sequencing project";
2940  }
2941  } else if (m_MainTitle.find ("whole genome shotgun sequence") == NPOS) {
// Put the organelle name in front of the WGS phrase unless the title
// already names it. The two empty branches suppress the organelle when the
// title already carries an equivalent word (mitochondrial/mitochondrion,
// or chromosome/linkage group).
2942  if (! m_Organelle.empty() && m_MainTitle.find(m_Organelle) == NPOS) {
2943  if ((NStr::EqualNocase (m_Organelle, "mitochondrial") || NStr::EqualNocase (m_Organelle, "mitochondrion")) &&
2944  (m_MainTitle.find("mitochondrial") != NPOS || m_MainTitle.find("mitochondrion") != NPOS)) {
2945  } else if (NStr::EqualNocase (m_Organelle, "chromosome") &&
2946  (m_MainTitle.find("linkage group") != NPOS || m_MainTitle.find("chromosome") != NPOS)) {
2947  } else {
2948  type = " ";
2949  type += m_Organelle;
2950  }
2951  }
2952  type += ", whole genome shotgun sequence";
2953  }
2954  break;
2955  case NCBI_TECH(tsa):
2956  if (m_TSAMaster) {
2957  if (m_MainTitle.find("transcriptome shotgun assembly") == NPOS) {
2958  type = ", transcriptome shotgun assembly";
2959  }
2960  } else {
// Non-master TSA: wording depends on the molecule type; biomol values not
// listed below add nothing.
2961  if (m_MainTitle.find ("RNA sequence") == NPOS) {
2962  switch (m_MIBiomol) {
2963  case NCBI_BIOMOL(mRNA):
2964  type = ", mRNA sequence";
2965  break;
2966  case NCBI_BIOMOL(rRNA):
2967  type = ", rRNA sequence";
2968  break;
2969  case NCBI_BIOMOL(ncRNA):
2970  type = ", ncRNA sequence";
2971  break;
2972  case NCBI_BIOMOL(pre_RNA):
2973  case NCBI_BIOMOL(snRNA):
2974  case NCBI_BIOMOL(scRNA):
2975  case NCBI_BIOMOL(cRNA):
2976  case NCBI_BIOMOL(snoRNA):
2977  case NCBI_BIOMOL(transcribed_RNA):
2978  type = ", transcribed RNA sequence";
2979  break;
2980  default:
2981  break;
2982  }
2983  }
2984  }
2985  break;
2986  case NCBI_TECH(targeted):
2987  if (m_TLSMaster) {
2988  if (m_MainTitle.find ("targeted locus study") == NPOS) {
2989  type = ", targeted locus study";
2990  }
2991  } else {
2992  if (m_MainTitle.find ("sequence") == NPOS) {
2993  type += ", sequence";
2994  }
2995  }
2996  if (! m_TargetedLocus.empty() && m_MainTitle.find (m_TargetedLocus) == NPOS) {
2997  study = m_TargetedLocus;
2998  }
2999  break;
3000  default:
3001  break;
3002  }
3003 
// Completeness wording: plasmids and named chromosomes / linkage groups are
// a "complete sequence"; the listed organelles and unnamed chromosomes are
// a "complete genome". Any other case leaves comp empty.
3004  if (appendComplete) {
3005  if (m_MainTitle.find ("complete") == NPOS && m_MainTitle.find ("partial") == NPOS) {
3006  if (m_MICompleteness == NCBI_COMPLETENESS(complete)) {
3007  if (m_IsPlasmid) {
3008  comp = ", complete sequence";
3009  } else if (m_Genome == NCBI_GENOME(mitochondrion) ||
3010  m_Genome == NCBI_GENOME(chloroplast) ||
3011  m_Genome == NCBI_GENOME(kinetoplast) ||
3012  m_Genome == NCBI_GENOME(plastid) ||
3013  m_Genome == NCBI_GENOME(apicoplast)) {
3014  comp = ", complete genome";
3015  } else if (m_IsChromosome) {
3016  if (! m_Chromosome.empty()) {
3017  comp = ", complete sequence";
3018  } else if (! m_LinkageGroup.empty()) {
3019  comp = ", complete sequence";
3020  } else {
3021  comp = ", complete genome";
3022  }
3023  }
3024  }
3025  }
3026  }
3027 
// Independent of the technique switch above: unordered delta sequences with
// at least one counted gap get an "N unordered pieces" note.
3028  if (m_Unordered && m_IsDelta) {
3029  unsigned int num_gaps = 0;
3030  for (CSeqMap_CI it (bsh, CSeqMap::fFindGap); it; ++it) {
3031  ++num_gaps;
3032  }
3033  if (num_gaps > 0) {
3034  type += (", " + NStr::IntToString (num_gaps + 1)
3035  + " unordered pieces");
3036  }
3037  }
3038 
// NOTE(review): `type` normally begins with ", ", so the study form yields
// "<study> , ..." with a blank before the comma - presumably tidied by a
// later cleanup step; confirm.
3039  if (! study.empty()) {
3040  suffix = " " + study + " " + type + comp;
3041  } else {
3042  suffix = type + comp;
3043  }
3044 }
3045 
// Removes trailing periods, commas, semicolons, tildes and blanks from str.
// A string made up solely of those characters (or an empty string) is left
// untouched.
static inline void s_TrimMainTitle (string& str)
{
    const size_t last_kept = str.find_last_not_of (".,;~ ");
    if (last_kept == NPOS) {
        // nothing but trim characters (or empty): keep as is
        return;
    }
    str.resize (last_kept + 1);
}
3053 
3054 /*
3055 // Compresses whitespace in the string as follows: each run of consecutive
3056 // spaces and tabs is collapsed to a single whitespace character, and all
3057 // whitespace immediately after '(' or immediately before ')' or ',' is removed.
3058 // (Code adapted from BasicCleanup.)
3059 static void x_CompressRunsOfSpaces (string& str)
3060 {
3061  if (str.empty()) {
3062  return;
3063  }
3064 
3065  string::iterator end = str.end();
3066  string::iterator it = str.begin();
3067  string::iterator new_str = it;
3068  while (it != end) {
3069  *new_str++ = *it;
3070  if ( (*it == ' ') || (*it == '\t') || (*it == '(') ) {
3071  for (++it; (it != end) && (*it == ' ' || *it == '\t'); ++it) continue;
3072  if ((it != end) && (*it == ')' || *it == ',') ) {
3073  // this "if" protects against the case "(...bunch of spaces and tabs...)".
3074  // Otherwise, the first '(' is unintentionally erased
3075  if( *(new_str - 1) != '(' ) {
3076  --new_str;
3077  }
3078  }
3079  } else {
3080  ++it;
3081  }
3082  }
3083  str.erase(new_str, str.end());
3084 }
3085 */
3086 
3087 static size_t s_TitleEndsInOrganism (
3088  string& title,
3089  CTempString taxname
3090 )
3091 
3092 {
3093  size_t pos, idx;
3094  size_t len1, len2;
3095 
3096  len1 = title.length();
3097  len2 = taxname.length();
3098 
3099  idx = len1 - len2 - 3;
3100  if (len1 > len2 + 4 && title [idx] == ' ' && title [idx + 1] == '[' && title [len1 - 1] == ']') {
3101  pos = NStr::Find(title, taxname, NStr::eNocase, NStr::eReverseSearch);
3102  if (pos == idx + 2) {
3103  return pos - 1;
3104  }
3105  }
3106 
3107  return NPOS;
3108 }
3109 
// Rewrites the tail of a protein title using values taken from `bsx`.
//
// Refreshes m_Source, m_Taxname, m_Genome, m_Genus, m_Species and
// m_Organelle from `bsx`, then - only if the title ends in a bracketed
// organism (as detected by s_TitleEndsInOrganism) - removes that bracketed
// organism, a trailing "(organelle)" and a trailing ", partial", and
// re-appends ", partial", " (organelle)" and " [taxname]" from the current
// data. Titles without a recognised bracketed organism are left unchanged,
// apart from the cross-kingdom erase in the innermost branch.
//
// NOTE(review): many lines are missing from this listing: the function
// name (embedded line 3110), the declarations of `src` and `bsx`
// (presumably 3118 / 3120), the first assignment of `tpos` (3145), and
// 3137, 3157, 3160-3161, 3172, 3191, 3197, 3203, 3234, 3236 and 3254. The
// comments below describe only what is visible - confirm against the
// repository.
3111  const CBioseq_Handle& bsh
3112 )
3113 
3114 {
3115  CBioSource::TGenome genome;
3116  size_t pos, tpos = NPOS, opos = NPOS;
3117  int len1, len2;
3119 
3121  if (! bsx) {
3122  return;
3123  }
3124 
3125  m_Source = bsx->GetBioSource();
3126  m_Taxname = bsx->GetTaxname();
3127 
3128  m_Genome = bsx->GetGenome();
3129 
3130  m_Genus = bsx->GetGenus();
3131  m_Species = bsx->GetSpecies();
3132 
3133  m_Organelle = bsx->GetOrganelle();
3134 
3135  if (m_Source.Empty()) return;
3136 
3138 
3139  len1 = (int) m_MainTitle.length();
3140  len2 = (int) m_Taxname.length();
3141 
3142  // find [taxname]
3143 
// Successive attempts to locate the bracketed organism: the descriptor
// taxname, then "<genus> <species>". As visible here, tpos is still NPOS
// at the first test; the missing line 3145 presumably tries m_Taxname.
3144  if (len1 > len2 + 4) {
3146  if (tpos == NPOS) {
3147  string descTaxname = bsx->GetDescTaxname();
3148  tpos = s_TitleEndsInOrganism(m_MainTitle, descTaxname);
3149  }
3150  if (tpos == NPOS) {
3151  string binomial = m_Genus;
3152  binomial += " ";
3153  binomial += m_Species;
3154  tpos = s_TitleEndsInOrganism(m_MainTitle, binomial);
3155  if (tpos == NPOS) {
3156  if (m_IsCrossKingdom) {
3158  if (pos != NPOS) {
3159  m_MainTitle.erase (pos + 1);
3162  }
3163  }
3164  }
3165  }
3166  }
3167 
3168  /* do not change unless [genus species] was at the end */
3169  if (tpos == NPOS) return;
3170 
3171  m_MainTitle.erase (tpos);
3173  len1 = (int) m_MainTitle.length();
3174 
3175  // find (organelle)
3176 
// Remember where the last '(' is if the title ends in "(<name>)" for any
// non-empty entry of s_proteinOrganellePrefix in the scanned genome range.
3177  if (len1 > 2 && m_MainTitle [len1 - 1] == ')') {
3178  pos = m_MainTitle.find_last_of ("(");
3179  if (pos != NPOS) {
3180  for ( genome = NCBI_GENOME(chloroplast); genome <= NCBI_GENOME(chromatophore); genome++ ) {
3181  string str = s_proteinOrganellePrefix [genome];
3182  if ( ! str.empty() ) {
3183  string paren = "(" + str + ")";
3184  if (NStr::EndsWith (m_MainTitle, paren )) {
3185  opos = pos;
3186  break;
3187  }
3188  }
3189  }
3190  }
3192  len1 = (int) m_MainTitle.length();
3193  }
3194 
3195  if (opos != NPOS) {
3196  m_MainTitle.erase (opos);
3198  len1 = (int) m_MainTitle.length();
3199  }
3200 
// ", partial" is 9 characters long.
3201  if ( NStr::EndsWith (m_MainTitle, ", partial")) {
3202  m_MainTitle.erase(m_MainTitle.length() - 9);
3204  }
3205 
3206  // then reconstruct partial (organelle) [taxname] suffix
3207 
3208  if ( !x_IsComplete()) {
3209  m_MainTitle += ", partial";
3210  }
3211 
3212  if (m_OmitTaxonomicName) return;
3213 
3214  CTempString taxname = m_Taxname;
3215 
3216  if (m_Genome >= NCBI_GENOME(chloroplast) && m_Genome <= NCBI_GENOME(chromatophore)) {
3217  const char * organelle = s_proteinOrganellePrefix [m_Genome];
3218  if ( organelle[0] != '\0' && ! taxname.empty()
3219  /* && NStr::Find (taxname, organelle) == NPOS */) {
3220  m_MainTitle += " (";
3221  m_MainTitle += organelle;
3222  m_MainTitle += ")";
3223  }
3224  }
3225 
3226  // check for special taxname, go to overlapping source feature
3227  if ((taxname.empty() ||
3228  (!NStr::EqualNocase (taxname, "synthetic construct") &&
3229  !NStr::EqualNocase (taxname, "artificial sequence") &&
3230  taxname.find ("vector") == NPOS &&
3231  taxname.find ("Vector") == NPOS)) &&
3232  !m_LocalAnnotsOnly) {
// With an index (m_Idx) the source-feature lookup runs only when `nucx`
// can be locked; without an index it always runs.
3233  if (m_Idx) {
3235  if (bsx) {
3237  auto nucx = bsxp.Lock();
3238  if (nucx) {
3239  src = x_GetSourceFeatViaCDS (bsh);
3240  if (src.NotEmpty() && src->IsSetTaxname()) {
3241  taxname = src->GetTaxname();
3242  }
3243  }
3244  }
3245  } else {
3246  src = x_GetSourceFeatViaCDS (bsh);
3247  if (src.NotEmpty() && src->IsSetTaxname()) {
3248  taxname = src->GetTaxname();
3249  }
3250  }
3251  }
3252 
3253  if (m_IsCrossKingdom && ! m_FirstSuperKingdom.empty() && ! m_SecondSuperKingdom.empty()) {
3255  } else if (! taxname.empty() /* && m_MainTitle.find(taxname) == NPOS */) {
3256  m_MainTitle += " [" + string(taxname) + "]";
3257  }
3258 }
3259 
// Rewrites the tail of a protein title using values read from m_Source.
//
// Refreshes m_Genus and m_Species from a binomial org name when present,
// then - only if the title ends in a bracketed organism (as detected by
// s_TitleEndsInOrganism) - removes that bracketed organism, a trailing
// "(organelle)" and a trailing ", partial", and re-appends ", partial",
// " (organelle)" and " [taxname]" from the current data. Returns early,
// leaving the title alone, when m_Source is empty.
//
// NOTE(review): many lines are missing from this listing: the function
// name (embedded line 3260), the declaration of `src` (presumably 3268),
// the bodies of the IsSetTaxname / IsSetGenome branches (3273, 3276), the
// first assignment of `tpos` (3302), and 3294, 3310, 3313-3314, 3325,
// 3344, 3350, 3356, 3387, 3389 and 3407. The comments below describe only
// what is visible - confirm against the repository.
3261  const CBioseq_Handle& bsh
3262 )
3263 
3264 {
3265  CBioSource::TGenome genome;
3266  size_t pos, tpos = NPOS, opos = NPOS;
3267  int len1, len2;
3269 
3270  if (m_Source.Empty()) return;
3271 
3272  if (m_Source->IsSetTaxname()) {
3274  }
3275  if (m_Source->IsSetGenome()) {
3277  }
3278  if (m_Source->IsSetOrgname()) {
3279  const COrgName& onp = m_Source->GetOrgname();
3280  if (onp.IsSetName()) {
3281  const COrgName::TName& nam = onp.GetName();
3282  if (nam.IsBinomial()) {
3283  const CBinomialOrgName& bon = nam.GetBinomial();
3284  if (bon.IsSetGenus()) {
3285  m_Genus = bon.GetGenus();
3286  }
3287  if (bon.IsSetSpecies()) {
3288  m_Species = bon.GetSpecies();
3289  }
3290  }
3291  }
3292  }
3293 
3295 
3296  len1 = (int) m_MainTitle.length();
3297  len2 = (int) m_Taxname.length();
3298 
3299  // find [taxname]
3300 
// Fallback attempt with "<genus> <species>". As visible here, tpos is
// still NPOS at the first test; the missing line 3302 presumably tries
// m_Taxname.
3301  if (len1 > len2 + 4) {
3303  if (tpos == NPOS) {
3304  string binomial = m_Genus;
3305  binomial += " ";
3306  binomial += m_Species;
3307  tpos = s_TitleEndsInOrganism(m_MainTitle, binomial);
3308  if (tpos == NPOS) {
3309  if (m_IsCrossKingdom) {
3311  if (pos != NPOS) {
3312  m_MainTitle.erase (pos + 1);
3315  }
3316  }
3317  }
3318  }
3319  }
3320 
3321  /* do not change unless [genus species] was at the end */
3322  if (tpos == NPOS) return;
3323 
3324  m_MainTitle.erase (tpos);
3326  len1 = (int) m_MainTitle.length();
3327 
3328  // find (organelle)
3329 
// Remember where the last '(' is if the title ends in "(<name>)" for any
// non-empty entry of s_proteinOrganellePrefix in the scanned genome range.
3330  if (len1 > 2 && m_MainTitle [len1 - 1] == ')') {
3331  pos = m_MainTitle.find_last_of ("(");
3332  if (pos != NPOS) {
3333  for ( genome = NCBI_GENOME(chloroplast); genome <= NCBI_GENOME(chromatophore); genome++ ) {
3334  string str = s_proteinOrganellePrefix [genome];
3335  if ( ! str.empty() ) {
3336  string paren = "(" + str + ")";
3337  if (NStr::EndsWith (m_MainTitle, paren )) {
3338  opos = pos;
3339  break;
3340  }
3341  }
3342  }
3343  }
3345  len1 = (int) m_MainTitle.length();
3346  }
3347 
3348  if (opos != NPOS) {
3349  m_MainTitle.erase (opos);
3351  len1 = (int) m_MainTitle.length();
3352  }
3353 
// ", partial" is 9 characters long.
3354  if ( NStr::EndsWith (m_MainTitle, ", partial")) {
3355  m_MainTitle.erase(m_MainTitle.length() - 9);
3357  }
3358 
3359  // then reconstruct partial (organelle) [taxname] suffix
3360 
3361  if ( !x_IsComplete()) {
3362  m_MainTitle += ", partial";
3363  }
3364 
3365  if (m_OmitTaxonomicName) return;
3366 
3367  CTempString taxname = m_Taxname;
3368 
3369  if (m_Genome >= NCBI_GENOME(chloroplast) && m_Genome <= NCBI_GENOME(chromatophore)) {
3370  const char * organelle = s_proteinOrganellePrefix [m_Genome];
3371  if ( organelle[0] != '\0' && ! taxname.empty()
3372  /* && NStr::Find (taxname, organelle) == NPOS */) {
3373  m_MainTitle += " (";
3374  m_MainTitle += organelle;
3375  m_MainTitle += ")";
3376  }
3377  }
3378 
3379  // check for special taxname, go to overlapping source feature
3380  if ((taxname.empty() ||
3381  (!NStr::EqualNocase (taxname, "synthetic construct") &&
3382  !NStr::EqualNocase (taxname, "artificial sequence") &&
3383  taxname.find ("vector") == NPOS &&
3384  taxname.find ("Vector") == NPOS)) &&
3385  !m_LocalAnnotsOnly) {
// With an index (m_Idx) the source-feature lookup runs only when `nucx`
// can be locked; without an index it always runs.
3386  if (m_Idx) {
3388  if (bsx) {
3390  auto nucx = bsxp.Lock();
3391  if (nucx) {
3392  src = x_GetSourceFeatViaCDS (bsh);
3393  if (src.NotEmpty() && src->IsSetTaxname()) {
3394  taxname = src->GetTaxname();
3395  }
3396  }
3397  }
3398  } else {
3399  src = x_GetSourceFeatViaCDS (bsh);
3400  if (src.NotEmpty() && src->IsSetTaxname()) {
3401  taxname = src->GetTaxname();
3402  }
3403  }
3404  }
3405 
3406  if (m_IsCrossKingdom && ! m_FirstSuperKingdom.empty() && ! m_SecondSuperKingdom.empty()) {
3408  } else if (! taxname.empty() /* && m_MainTitle.find(taxname) == NPOS */) {
3409  m_MainTitle += " [" + string(taxname) + "]";
3410  }
3411 }
3412 
// Reports completeness based on m_MICompleteness: true for `complete`,
// false for partial / no_left / no_right / no_ends, and true for every
// other value (the fall-through after the switch).
//
// NOTE(review): the line carrying this function's name and return type
// (embedded line 3413) is missing from this listing; callers in this file
// use it as x_IsComplete().
3414 {
3415  switch (m_MICompleteness) {
3416  case NCBI_COMPLETENESS(complete):
3417  return true;
3418  case NCBI_COMPLETENESS(partial):
3419  case NCBI_COMPLETENESS(no_left):
3420  case NCBI_COMPLETENESS(no_right):
3421  case NCBI_COMPLETENESS(no_ends):
3422  return false;
3423  }
// Values not listed above are treated as complete.
3424  return true;
3425 }
3426 
3427 
// Table of title prefixes (record-class markers such as TPA, TSA, TLS,
// MAG, MULTISPECIES and the UNVERIFIED family).
// NOTE(review): the code consuming this table is not visible here;
// presumably it matches these strings against the start of a title, in
// which case the order (e.g. "UNVERIFIED_ORG:" ahead of "UNVERIFIED:")
// may matter - confirm before reordering or adding entries.
// "MAG " is the only entry that ends in a blank instead of a colon.
3428 static const char* s_tpaPrefixList [] = {
3429  "MAG ",
3430  "MAG:",
3431  "MULTISPECIES:",
3432  "TLS:",
3433  "TPA:",
3434  "TPA_exp:",
3435  "TPA_inf:",
3436  "TPA_reasm:",
3437  "TPA_asm:",
3438  "TSA:",
3439  "UNVERIFIED_ORG:",
3440  "UNVERIFIED_ASMBLY:",
3441  "UNVERIFIED_CONTAM:",
3442  "UNVERIFIED:"
3443 };
3445 {
3446  CDefLineJoiner joiner(true);
3447 
3448  x_SetBioSrc (bsh);
3449 
3450  joiner.Add("location", m_Organelle);
3451  if (m_IsChromosome || !m_Chromosome.empty()) {
3452  joiner.Add("chromosome", m_Chromosome);
3453  }
3454  if (m_IsPlasmid || !m_Plasmid.empty()) {
3455  joiner.Add("plasmid name", m_Plasmid);
3456  }
3457  if (m_MICompleteness == NCBI_COMPLETENESS(complete)) {
3458  joiner.Add("completeness", CTempString("complete"));
3459  }
3460 
3461  // print [topology=...], if necessary
3462  if (bsh.CanGetInst_Topology()) {
3463  CSeq_inst::ETopology topology = bsh.GetInst_Topology();
3464  if (topology == CSeq_inst::eTopology_circular) {
3465  joiner.Add("topology", CTempString("circular"));
3466  }
3467  }
3468 
3469  // bsh modifiers retrieved from Biosource.Org-ref
3470  // [organism=...], etc.
3471 
3472  bool strain_seen = false;
3473  string gcode; // should be in the same scope as joiner.Join() because joiner stores CTempString
3474 
3475  try {
3476  const CBioSource* bios = sequence::GetBioSource(bsh);
3477  if (bios && bios->IsSetOrg()) {
3478  const COrg_ref & org = bios->GetOrg();
3479  if (org.IsSetTaxname()) {
3480  joiner.Add("organism", org.GetTaxname());
3481  }
3482  if (org.IsSetOrgname()) {
3483  const COrg_ref::TOrgname & orgname = org.GetOrgname();
3484  if (orgname.IsSetMod()) {
3485  ITERATE(COrgName::TMod, mod_iter, orgname.GetMod()) {
3486  const COrgMod & mod = **mod_iter;
3487  if (mod.IsSetSubtype() && mod.IsSetSubname()) {
3488  const string& subname = mod.GetSubname();
3489  switch (mod.GetSubtype()) {
3491  if (strain_seen) {
3492  ERR_POST_X(9, Warning << __FUNCTION__ << ": "
3493  << "key 'strain' would appear multiple times, but only using the first.");
3494  }
3495  else {
3496  strain_seen = true;
3497  joiner.Add("strain", subname);
3498  }
3499  break;
3501  joiner.Add("substrain", subname);
3502  break;
3504  joiner.Add("type", subname);
3505  break;
3507  joiner.Add("subtype", subname);
3508  break;
3510  joiner.Add("variety", subname);
3511  break;
3513  joiner.Add("serotype", subname);
3514  break;
3516  joiner.Add("serogroup", subname);
3517  break;
3519  joiner.Add("serovar", subname);
3520  break;
3522  joiner.Add("cultivar", subname);
3523  break;
3525  joiner.Add("pathovar", subname);
3526  break;
3528  joiner.Add("chemovar", subname);
3529  break;
3531  joiner.Add("biovar", subname);
3532  break;
3534  joiner.Add("biotype", subname);
3535  break;
3537  joiner.Add("group", subname);
3538  break;
3540  joiner.Add("subgroup", subname);
3541  break;
3543  joiner.Add("isolate", subname);
3544  break;
3546  joiner.Add("common", subname);
3547  break;
3549  joiner.Add("acronym", subname);
3550  break;
3552  joiner.Add("dosage", subname);
3553  break;
3555  joiner.Add("nat_host", subname);
3556  break;
3558  joiner.Add("sub_species", subname);
3559  break;
3561  joiner.Add("specimen_voucher", subname);
3562  break;
3564  joiner.Add("authority", subname);
3565  break;
3567  joiner.Add("forma", subname);
3568  break;
3570  joiner.Add("forma_specialis", subname);
3571  break;
3573  joiner.Add("ecotype", subname);
3574  break;
3576  joiner.Add("synonym", subname);
3577  break;
3579  joiner.Add("anamorph", subname);
3580  break;
3582  joiner.Add("teleomorph", subname);
3583  break;
3585  joiner.Add("breed", subname);
3586  break;
3588  joiner.Add("gb_acronym", subname);
3589  break;
3591  joiner.Add("gb_anamorph", subname);
3592  break;
3594  joiner.Add("gb_synonym", subname);
3595  break;
3597  joiner.Add("culture_collection", subname);
3598  break;
3600  joiner.Add("bio_material", subname);
3601  break;
3603  joiner.Add("metagenome_source", subname);
3604  break;
3606  joiner.Add("type_material", subname);
3607  break;
3609  joiner.Add("nomenclature", subname);
3610  break;
3612  joiner.Add("note", subname);
3613  break;
3614  default:
3615  // ignore; do nothing
3616  break;
3617  }
3618  }
3619  }
3620  }
3622  if (bios->CanGetGenome()) {
3623  genome = bios->GetGenome();
3624  }
3625 
3626  switch ( genome ) {
3631  {
3632  // mitochondrial code
3633  if (orgname.IsSetMgcode()) {
3634  int icode = orgname.GetMgcode();
3635  gcode = std::to_string(icode);
3636  joiner.Add("gcode", gcode);
3637  }
3638  }
3639  break;
3649  {
3650  // specific plant plastid code
3651  if (orgname.IsSetPgcode()) {
3652  int icode = orgname.GetPgcode();
3653  if (icode > 0) {
3654  gcode = std::to_string(icode);
3655  joiner.Add("gcode", gcode);
3656  }
3657  } else {
3658  // bacteria and plant plastids default to code 11.
3659  joiner.Add("gcode", "11");
3660  }
3661  break;
3662  }
3663  default:
3664  {
3665  if (orgname.IsSetGcode()) {
3666  int icode = orgname.GetGcode();
3667  if (icode > 0) {
3668  gcode = std::to_string(icode);
3669  joiner.Add("gcode", gcode);
3670  }
3671  }
3672  break;
3673  }
3674  }
3675  }
3676  }
3677  if ( bios && bios->IsSetSubtype() ) {
3678  ITERATE ( CBioSource::TSubtype, sub_iter, bios->GetSubtype() ) {
3679  const CSubSource& sub = **sub_iter;
3680  if (sub.IsSetSubtype()) {
3681  if (sub.IsSetName()) {
3682  const string& subname = sub.GetName();
3683  switch (sub.GetSubtype()) {
3685  if (! m_IsChromosome && m_Chromosome.empty()) {
3686  joiner.Add("chromosome", subname);
3687  }
3688  break;
3690  joiner.Add("map", subname);
3691  break;
3693  joiner.Add("clone", subname);
3694  break;
3696  joiner.Add("subclone", subname);
3697  break;
3699  joiner.Add("haplotype", subname);
3700  break;
3702  joiner.Add("genotype", subname);
3703  break;
3705  joiner.Add("sex", subname);
3706  break;
3708  joiner.Add("cell_line", subname);
3709  break;
3711  joiner.Add("cell_type", subname);
3712  break;
3714  joiner.Add("tissue_type", subname);
3715  break;
3717  joiner.Add("clone_lib", subname);
3718  break;
3720  joiner.Add("dev_stage", subname);
3721  break;
3723  joiner.Add("frequency", subname);
3724  break;
3726  joiner.Add("lab_host", subname);
3727  break;
3729  joiner.Add("pop_variant", subname);
3730  break;
3732  joiner.Add("tissue_lib", subname);
3733  break;
3735  if (! m_IsPlasmid && m_Plasmid.empty()) {
3736  joiner.Add("plasmid_name", subname);
3737  }
3738  break;
3740  joiner.Add("transposon_name", subname);
3741  break;
3743  joiner.Add("insertion_seq_name", subname);
3744  break;
3746  joiner.Add("plastid_name", subname);
3747  break;
3750  joiner.Add("geo_loc_name", subname);
3751  } else {
3752  joiner.Add("country", subname);
3753  }
3754  break;
3756  joiner.Add("segment", subname);
3757  break;
3759  joiner.Add("endogenous_virus_name", subname);
3760  break;
3762  joiner.Add("isolation_source", subname);
3763  break;
3765  joiner.Add("lat_lon", subname);
3766  break;
3768  joiner.Add("collection_date", subname);
3769  break;
3771  joiner.Add("collected_by", subname);
3772  break;
3774  joiner.Add("identified_by", subname);
3775  break;
3777  joiner.Add("metagenomic", subname);
3778  break;
3780  joiner.Add("mating_type", subname);
3781  break;
3783  joiner.Add("linkage_group", subname);
3784  break;
3786  joiner.Add("haplogroup", subname);
3787  break;
3789  joiner.Add("whole_replicon", subname);
3790  break;
3792  joiner.Add("phenotype", subname);
3793  break;
3795  joiner.Add("altitude", subname);
3796  break;
3798  joiner.Add("note", subname);
3799  break;
3800  default:
3801  break;
3802  }
3803  } else {
3804  switch (sub.GetSubtype()) {
3806  joiner.Add("germline", "true");
3807  break;
3809  joiner.Add("rearranged", "true");
3810  break;
3812  joiner.Add("transgenic", "true");
3813  break;
3815  joiner.Add("environmental_sample", "true");
3816  break;
3817  default:
3818  break;
3819  }
3820  }
3821  }
3822  }
3823  }
3824  if ( bios && bios->IsSetPcr_primers() ) {
3825  const CBioSource_Base::TPcr_primers & primers = bios->GetPcr_primers();
3826  if ( primers.CanGet() ) {
3828 
3829  // bool has_fwd_seq = false;
3830  // bool has_rev_seq = false;
3831 
3832  if( (*it)->IsSetForward() ) {
3833  const CPCRReaction_Base::TForward &forward = (*it)->GetForward();
3834  if( forward.CanGet() ) {
3835  ITERATE( CPCRReaction_Base::TForward::Tdata, it2, forward.Get() ) {
3836  const string &fwd_name = ( (*it2)->CanGetName() ? (*it2)->GetName().Get() : kEmptyStr );
3837  if( ! fwd_name.empty() ) {
3838  joiner.Add("fwd-primer-name", fwd_name);
3839  }
3840  const string &fwd_seq = ( (*it2)->CanGetSeq() ? (*it2)->GetSeq().Get() : kEmptyStr );
3841  // NStr::ToLower( fwd_seq );
3842  if( ! fwd_seq.empty() ) {
3843  joiner.Add("fwd-primer-seq", fwd_seq);
3844  // has_fwd_seq = true;
3845  }
3846  }
3847  }
3848  }
3849  if( (*it)->IsSetReverse() ) {
3850  const CPCRReaction_Base::TReverse &reverse = (*it)->GetReverse();
3851  if( reverse.CanGet() ) {
3852  ITERATE( CPCRReaction_Base::TReverse::Tdata, it2, reverse.Get() ) {
3853  const string &rev_name = ((*it2)->CanGetName() ? (*it2)->GetName().Get() : kEmptyStr );
3854  if( ! rev_name.empty() ) {
3855  joiner.Add("rev-primer-name", rev_name);
3856  }
3857  const string &rev_seq = ( (*it2)->CanGetSeq() ? (*it2)->GetSeq().Get() : kEmptyStr );
3858  // NStr::ToLower( rev_seq ); // do we need this?
3859  if( ! rev_seq.empty() ) {
3860  joiner.Add("rev-primer-seq", rev_seq);
3861  // has_rev_seq = true;
3862  }
3863  }
3864  }
3865  }
3866  }
3867  }
3868  }
3869  }
3870  catch (CException &) {
3871  // ignore exception; it probably just means there's no org-ref
3872  }
3873 
3875  static const TTechMapEntry sc_TechArray[] = {
3876  // note that the text values do *NOT* precisely correspond with
3877  // the names in the ASN.1 schema files
3878  { CMolInfo::eTech_unknown, "?" },
3879  { CMolInfo::eTech_standard, "standard" },
3880  { CMolInfo::eTech_est, "EST" },
3881  { CMolInfo::eTech_sts, "STS" },
3882  { CMolInfo::eTech_survey, "survey" },
3883  { CMolInfo::eTech_genemap, "genetic map" },
3884  { CMolInfo::eTech_physmap, "physical map" },
3885  { CMolInfo::eTech_derived, "derived" },
3886  { CMolInfo::eTech_concept_trans, "concept-trans" },
3887  { CMolInfo::eTech_seq_pept, "seq-pept" },
3888  { CMolInfo::eTech_both, "both" },
3889  { CMolInfo::eTech_seq_pept_overlap, "seq-pept-overlap" },
3890  { CMolInfo::eTech_seq_pept_homol, "seq-pept-homol" },
3891  { CMolInfo::eTech_concept_trans_a, "concept-trans-a" },
3892  { CMolInfo::eTech_htgs_1, "htgs 1" },
3893  { CMolInfo::eTech_htgs_2, "htgs 2" },
3894  { CMolInfo::eTech_htgs_3, "htgs 3" },
3895  { CMolInfo::eTech_fli_cdna, "fli cDNA" },
3896  { CMolInfo::eTech_htgs_0, "htgs 0" },
3897  { CMolInfo::eTech_htc, "htc" },
3898  { CMolInfo::eTech_wgs, "wgs" },
3899  { CMolInfo::eTech_barcode, "barcode" },
3900  { CMolInfo::eTech_composite_wgs_htgs, "composite-wgs-htgs" },
3901  { CMolInfo::eTech_tsa, "tsa" }
3902  };
3905 
3906  // print some key-value pairs
3907  const CMolInfo * pMolInfo = sequence::GetMolInfo(bsh);
3908  if (pMolInfo != NULL) {
3909  const CMolInfo & molinfo = *pMolInfo;
3910  if (molinfo.IsSetTech()) {
3911  TTechMap::const_iterator find_iter = sc_TechMap.find(molinfo.GetTech());
3912  if (find_iter != sc_TechMap.end()) {
3913  joiner.Add("tech", CTempString(find_iter->second));
3914  }
3915  }
3916  }
3917  string modifiers;
3918  joiner.Join(&modifiers);
3919  m_MainTitle = (m_MainTitle.empty()) ? modifiers : modifiers + " " + m_MainTitle;
3920  return m_MainTitle;
3921 }
3922 
3923 
3924 // main method
      //
      // Builds the definition line (title) for the sequence behind bsh and
      // returns it as a string.  Outline of the visible logic:
      //   1. set member flags from the record (indexed or non-indexed path);
      //   2. if fShowModifiers is requested, return x_GetModifiers(bsh) instead;
      //   3. keep an existing title (unless m_Reconstruct) or generate one using
      //      the record-type-specific x_SetTitleFrom* helpers;
      //   4. strip old TPA/TSA prefixes and trailing punctuation;
      //   5. compute prefix, suffix and an optional "MAG" label, join them,
      //      clean the result with x_CleanAndCompress and optionally capitalize
      //      the first letter.
      //
      // NOTE(review): this listing omits some source lines (see the gaps in the
      // embedded line numbers, e.g. the signature line 3925 and the declaration
      // of the flags parameter on 3927).  The comments below describe only the
      // statements that are visible here.
3926  const CBioseq_Handle& bsh,
3928 )
3929 
3930 {
      // capitalize: whether the first character of the final string may be
      // upper-cased; cleared below when an existing, non-empty title is kept.
3931  bool capitalize = true;
      // appendComplete: forwarded to x_SetSuffix; set below when completeness is
      // "complete" and the title is non-empty at that point.
3932  bool appendComplete = false;
3933 
3934  string prefix; // from a small set of compile-time constants
3935  string suffix;
3936  string final;
3937 
3938  // set flags from record components
      // (the indexed variant is used whenever an index has been supplied in m_Idx)
3939  if (m_Idx) {
3940  x_SetFlagsIdx (bsh, flags);
3941  } else {
3942  x_SetFlags (bsh, flags);
3943  }
3944 
      // Modifier mode: the result is whatever x_GetModifiers produces.
      // NOTE(review): this early return bypasses the m_Idx.Reset (NULL) at the
      // end of this function -- confirm that leaving m_Idx set is intended.
3945  if (flags & fShowModifiers) {
3946  return x_GetModifiers(bsh);
3947  }
3948 
3949  if (! m_Reconstruct) {
3950  // x_SetFlags set m_MainTitle from a suitable descriptor, if any;
3951  // now strip trailing periods, commas, semicolons, tildes, and spaces.
      // If the title consists solely of such characters (or is empty), pos is
      // NPOS and the title is left unchanged by this step.
3952  size_t pos = m_MainTitle.find_last_not_of (".,;~ ");
3953  if (pos != NPOS) {
3954  m_MainTitle.erase (pos + 1);
3955  }
      // an existing title is kept as written, so do not capitalize it later
3956  if (! m_MainTitle.empty()) {
3957  capitalize = false;
3958  }
3959  }
3960 
3961  // adjust protein partial/organelle/taxname suffix, if necessary
      // NOTE(review): the statements inside both branches (listing lines 3964
      // and 3966) are not visible in this extract.
3962  if ( m_IsAA && ! m_MainTitle.empty() ) {
3963  if (m_Idx) {
3965  } else {
3967  }
3968  }
3969 
3970  // use autodef user object, if present, to regenerate title
      // Only attempted for nucleotide records with no title so far, when the
      // caller has not set fDoNotUseAutoDef and the record is neither TLS nor NZ.
3971  if (m_MainTitle.empty() && m_IsNA && (! (flags & fDoNotUseAutoDef)) && ! m_IsTLS && ! m_IsNZ) {
3972 
      // advance to the first User descriptor whose object type is AutodefOptions
3973  CSeqdesc_CI desc(bsh, CSeqdesc::e_User);
3974  while (desc && desc->GetUser().GetObjectType() != CUser_object::eObjectType_AutodefOptions) {
3975  ++desc;
3976  }
3977 
      // found one: configure autodef from that user object and ask it for a
      // single definition line for this Bioseq
3978  if (desc) {
3979  CAutoDef autodef;
3980  autodef.SetOptionsObject(desc->GetUser());
3981  CAutoDefModifierCombo mod_combo;
3982  CAutoDefOptions options;
3983  options.InitFromUserObject(desc->GetUser());
3984  mod_combo.SetOptions(options);
3985  m_MainTitle = autodef.GetOneDefLine(&mod_combo, bsh);
3987  }
3988  }
3989 
3990  // use appropriate algorithm if title needs to be generated
3991  if (m_MainTitle.empty()) {
3992 
3993  // PDB and patent records do not normally need source data
3994  if (m_IsPDB) {
3995  x_SetTitleFromPDB ();
3996  } else if (m_IsPatent) {
3998  }
3999 
4000  if (m_MainTitle.empty()) {
4001  // set fields from source information
4002  if (m_Idx) {
4003  x_SetBioSrcIdx (bsh);
4004  } else {
4005  x_SetBioSrc (bsh);
4006  }
4007 
4008  // several record types have specific methods
      // (the chain is ordered: the first matching record type wins)
4009  if (m_IsNC) {
4010  x_SetTitleFromNC ();
4011  } else if (m_IsNM && !m_LocalAnnotsOnly) {
4012  x_SetTitleFromNM (bsh);
4013  } else if (m_IsNR) {
4014  x_SetTitleFromNR (bsh);
4015  } else if (m_IsAA && m_Idx) {
4017  } else if (m_IsAA) {
4018  x_SetTitleFromProtein (bsh);
4019  } else if (m_IsSeg && (! m_IsEST_STS_GSS)) {
4020  x_SetTitleFromSegSeq (bsh);
4021  } else if (m_IsTSA || (m_IsWGS && (! m_WGSMaster)) || (m_IsTLS && (! m_TLSMaster))) {
4022  x_SetTitleFromWGS ();
4023  } else if (m_IsMap) {
4024  x_SetTitleFromMap ();
4025  }
4026 
4027  if (m_MainTitle.empty() && m_GpipeMode) {
4029  }
4030 
4031  if (m_MainTitle.empty()) {
4032  // default title using source fields
      // request the "complete" suffix only if a title exists at this point
4034  if (m_MICompleteness == NCBI_COMPLETENESS(complete) && !m_MainTitle.empty()) {
4035  appendComplete = true;
4036  }
4037  }
4038  }
4039 
      // disabled fallback: an empty title is deliberately allowed to pass through
4040  /*
4041  if (m_MainTitle.empty()) {
4042  // last resort title created here
4043  m_MainTitle = "No definition line found";
4044  }
4045  */
4046  }
4047 
4048  // remove TPA or TSA prefix, will rely on other data in record to set
      // (iterates over every entry of the s_tpaPrefixList array)
4049  for (size_t i = 0; i < sizeof (s_tpaPrefixList) / sizeof (const char*); i++) {
4050  string str = s_tpaPrefixList [i];
4052  m_MainTitle.erase (0, str.length());
4053  // strip leading spaces remaining after removal of old MAG before TPA or TSA prefixes
4054  m_MainTitle.erase (0, m_MainTitle.find_first_not_of (' '));
4055  }
4056  }
4057 
4058  // strip leading spaces remaining after removal of old TPA or TSA prefixes
      // (find_first_not_of returns npos for an all-space title, which erases it entirely)
4059  m_MainTitle.erase (0, m_MainTitle.find_first_not_of (' '));
4060 
4062 
4063  // strip trailing commas, semicolons, tildes, and spaces (period may be an sp.
4064  // species)
4065  size_t pos = decoded.find_last_not_of (",;~ ");
4066  if (pos != NPOS) {
4067  decoded.erase (pos + 1);
4068  }
4069 
4070  // calculate prefix
4071  x_SetPrefix(prefix, bsh);
4072 
4073  // calculate suffix
4074  x_SetSuffix (suffix, bsh, appendComplete);
4075 
      // When a metagenome source is present, label the title with "MAG":
      // "MAG: " if it stands alone, "MAG " if another prefix follows it.
4076  string mag;
4077  if (! m_MetaGenomeSource.empty()) {
4078  if ( prefix.empty() ) {
4079  mag = "MAG: ";
4080  } else {
4081  mag = "MAG ";
4082  }
4083  }
4084 
4085  // produce final result
4086  string penult = mag + prefix + decoded + suffix;
4087 
4088  x_CleanAndCompress (final, penult, m_IsAA);
4089 
      // Upper-case a leading lower-case letter, but never for PDB, patent,
      // protein or segmented records, and only if capitalize is still set.
4090  if (! m_IsPDB && ! m_IsPatent && ! m_IsAA && ! m_IsSeg) {
4091  if (!final.empty() && islower ((unsigned char) final[0]) && capitalize) {
4092  final [0] = toupper ((unsigned char) final [0]);
4093  }
4094  }
4095 
      // release the index reference held in m_Idx before returning
4097  m_Idx.Reset (NULL);
4098 
4099  return final;
4100 }
4101 
// GenerateDefline overload taking a caller-supplied CSeqEntryIndex.
// Stores the address of idx in m_Idx and then delegates to
// GenerateDefline(bsh, flags).
// NOTE(review): the signature line and the declaration of the flags
// parameter are not visible in this listing (gaps in the embedded numbers).
4103  const CBioseq_Handle& bsh,
4104  CSeqEntryIndex& idx,
4106 )
4107 
4108 {
4109  m_Idx = &idx;
4110 
4111  return GenerateDefline(bsh, flags);
4112 }
4113 
// GenerateDefline overload for a raw CBioseq plus scope, with a
// caller-supplied CSeqEntryIndex.
// Stores the address of idx in m_Idx and then delegates to
// GenerateDefline(bioseq, scope, flags).
// NOTE(review): the signature line and the declaration of the flags
// parameter are not visible in this listing (gaps in the embedded numbers).
4115  const CBioseq& bioseq,
4116  CScope& scope,
4117  CSeqEntryIndex& idx,
4119 )
4120 
4121 {
4122  m_Idx = &idx;
4123 
4124  return GenerateDefline(bioseq, scope, flags);
4125 }
4126 
// GenerateDefline overload taking a caller-supplied feature tree.
// Sets both feature-tree state flags (m_ConstructedFeatTree and
// m_InitializedFeatTree) to true, points m_Feat_Tree at ftree, and then
// delegates to GenerateDefline(bsh, flags).
// NOTE(review): the signature line and the declaration of the flags
// parameter are not visible in this listing (gaps in the embedded numbers).
4128  const CBioseq_Handle& bsh,
4129  feature::CFeatTree& ftree,
4131 )
4132 
4133 {
4134  m_ConstructedFeatTree = true;
4135  m_InitializedFeatTree = true;
4136  m_Feat_Tree = &ftree;
4137 
4138  return GenerateDefline(bsh, flags);
4139 }
4140 
// GenerateDefline overload for a raw CBioseq plus scope.
// Adds bioseq to scope to obtain a CBioseq_Handle and then delegates to
// GenerateDefline(bsh, flags).
// NOTE(review): the signature line, the declaration of the flags parameter
// and the remaining arguments of the AddBioseq call (listing lines 4149-4150)
// are not visible in this listing.
4142  const CBioseq& bioseq,
4143  CScope& scope,
4145 )
4146 
4147 {
4148  CBioseq_Handle bsh = scope.AddBioseq(bioseq,
4151  return GenerateDefline(bsh, flags);
4152 }
4153 
// GenerateDefline overload for a raw CBioseq plus scope, with a
// caller-supplied feature tree.
// Sets both feature-tree state flags (m_ConstructedFeatTree and
// m_InitializedFeatTree) to true, points m_Feat_Tree at ftree, and then
// delegates to GenerateDefline(bioseq, scope, flags).
// NOTE(review): the signature line and the declaration of the flags
// parameter are not visible in this listing (gaps in the embedded numbers).
4155  const CBioseq& bioseq,
4156  CScope& scope,
4157  feature::CFeatTree& ftree,
4159 )
4160 
4161 {
4162  m_ConstructedFeatTree = true;
4163  m_InitializedFeatTree = true;
4164  m_Feat_Tree = &ftree;
4165 
4166  return GenerateDefline(bioseq, scope, flags);
4167 }
4168 
// Registers three fixed phrases via AddWord, each with its own integer
// code (1, 2, 3), and then calls Prime().
// NOTE(review): the header line of this definition (listing line 4169) is
// not visible here; presumably this is the CLowQualityTextFsm constructor
// body -- confirm against the full file.
4170  AddWord ("heterogeneous population sequenced", 1);
4171  AddWord ("low-quality sequence region", 2);
4172  AddWord ("unextendable partial coding region", 3);
4173  Prime ();
4174 }
4175 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
void SetOptions(const CAutoDefOptions &options)
void InitFromUserObject(const CUser_object &obj)
void SetOptionsObject(const CUser_object &user)
Definition: autodef.cpp:1196
string GetOneDefLine(CAutoDefModifierCombo *mod_combo, const CBioseq_Handle &bh, CRef< feature::CFeatTree > featTree=null)
Definition: autodef.cpp:1095
CBinomialOrgName –.
const string & GetTaxname(void) const
Definition: BioSource.cpp:340
const COrgName & GetOrgname(void) const
Definition: BioSource.cpp:410
bool IsSetOrgname(void) const
Definition: BioSource.cpp:405
bool IsSetTaxname(void) const
Definition: BioSource.cpp:335
CTempString GetSubstrain(void)
Definition: indexer.cpp:2778
int GetPatentSequence(void) const
Definition: indexer.hpp:502
bool IsHTGTech(void)
Definition: indexer.cpp:2416
CSeq_inst::TLength GetLength(void) const
Definition: indexer.hpp:474
CTempString GetCultivar(void)
Definition: indexer.cpp:2736
bool HasClone(void)
Definition: indexer.cpp:2686
CTempString GetMetaGenomeSource(void)
Definition: indexer.cpp:2788
bool IsTPAReasm(void)
Definition: indexer.cpp:2848
bool IsTPAInf(void)
Definition: indexer.cpp:2838
CTempString GetBreed(void)
Definition: indexer.cpp:2726
bool IsChromosome(void)
Definition: indexer.cpp:2606
bool IsWGS(void)
Definition: indexer.cpp:2456
CTempString GetGenus(void)
Definition: indexer.cpp:2556
bool IsNR(void) const
Definition: indexer.hpp:487
CMolInfo::TTech GetTech(void)
Definition: indexer.cpp:2396
CRef< CFeatureIndex > GetFeatureForProduct(void)
Definition: indexer.cpp:2299
bool IsEST_STS_GSS(void)
Definition: indexer.cpp:2466
bool IsWP(void) const
Definition: indexer.hpp:491
CTempString GetStrain(void)
Definition: indexer.cpp:2768
bool IsTSAMaster(void) const
Definition: indexer.hpp:494
bool IsUnreviewed(void)
Definition: indexer.cpp:2938
bool IsNM(void) const
Definition: indexer.hpp:486
bool IsTPAExp(void)
Definition: indexer.cpp:2828
bool IsHTGSPooled(void)
Definition: indexer.cpp:2818
bool IsHTGSCancelled(void)
Definition: indexer.cpp:2798
bool IsDelta(void) const
Definition: indexer.hpp:476
bool IsThirdParty(void) const
Definition: indexer.hpp:492
bool IsMap(void) const
Definition: indexer.hpp:479
bool IsAA(void) const
Definition: indexer.hpp:472
bool IsPatent(void) const
Definition: indexer.hpp:489
bool IsUnverifiedFeature(void)
Definition: indexer.cpp:2898
CTempString GetSpecies(void)
Definition: indexer.cpp:2566
CConstRef< CBioSource > GetBioSource(void)
Definition: indexer.cpp:2486
string GetrEnzyme(void)
Definition: indexer.cpp:3034
bool IsWGSMaster(void) const
Definition: indexer.hpp:493
bool IsTLSMaster(void) const
Definition: indexer.hpp:495
string GetSecondSuperKingdom(void)
Definition: indexer.cpp:2636
CMolInfo::TCompleteness GetCompleteness(void)
Definition: indexer.cpp:2406
int GetPDBChain(void) const
Definition: indexer.hpp:504
string GetPatentCountry(void) const
Definition: indexer.hpp:500
bool IsPlasmid(void)
Definition: indexer.cpp:2596
bool IsCrossKingdom(void)
Definition: indexer.cpp:2646
string GetPDBChainID(void) const
Definition: indexer.hpp:505
bool IsUnordered(void)
Definition: indexer.cpp:2858
bool IsNA(void) const
Definition: indexer.hpp:471
CTempString GetIsolate(void)
Definition: indexer.cpp:2758
CTempString GetPDBCompound(void)
Definition: indexer.cpp:2868
const string & GetTitle(void)
Definition: indexer.cpp:2366
CTempString GetMap(void)
Definition: indexer.cpp:2696
string GetPatentNumber(void) const
Definition: indexer.hpp:501
bool IsUnreviewedUnannotated(void)
Definition: indexer.cpp:2948
bool IsPseudogene(void)
Definition: indexer.cpp:2978
CTempString GetSegment(void)
Definition: indexer.cpp:2716
CTempString GetLinkageGroup(void)
Definition: indexer.cpp:2666
bool IsHTGSDraft(void)
Definition: indexer.cpp:2808
const string & GetTaxname(void)
Definition: indexer.cpp:2496
CSeq_inst::TTopology GetTopology(void) const
Definition: indexer.hpp:473
bool IsNZ(void) const
Definition: indexer.hpp:488
CWeakRef< CBioseqIndex > GetBioseqForProduct(void)
Definition: indexer.cpp:2341
int GetGeneralId(void) const
Definition: indexer.hpp:498
const string & GetOrganelle(void)
Definition: indexer.cpp:2616
CRef< CFeatureIndex > GetBestProteinFeature(void)
Definition: indexer.cpp:2353
string GetGeneralStr(void) const
Definition: indexer.hpp:497
bool IsVirtual(void) const
Definition: indexer.hpp:478
CTempString GetSpecimenVoucher(void)
Definition: indexer.cpp:2747
CTempString GetClone(void)
Definition: indexer.cpp:2676
CBioSource::TGenome GetGenome(void)
Definition: indexer.cpp:2586
CTempString GetChromosome(void)
Definition: indexer.cpp:2656
bool IsUnverifiedMisassembled(void)
Definition: indexer.cpp:2918
CMolInfo::TBiomol GetBiomol(void)
Definition: indexer.cpp:2386
bool IsHTGSUnfinished(void)
Definition: indexer.cpp:2426
CTempString GetPlasmid(void)
Definition: indexer.cpp:2706
string GetFirstSuperKingdom(void)
Definition: indexer.cpp:2626
bool IsUseBiosrc(void)
Definition: indexer.cpp:2476
bool IsTSA(void)
Definition: indexer.cpp:2446
bool IsUnverifiedContaminant(void)
Definition: indexer.cpp:2928
bool IsNC(void) const
Definition: indexer.hpp:485
bool IsMultispecies(void)
Definition: indexer.cpp:2576
bool IsTLS(void)
Definition: indexer.cpp:2436
const string & GetComment(void)
Definition: indexer.cpp:2968
bool IsUnverifiedOrganism(void)
Definition: indexer.cpp:2908
CTempString GetTargetedLocus(void)
Definition: indexer.cpp:2958
bool IsPDB(void) const
Definition: indexer.hpp:490
const string & GetDescTaxname(void)
Definition: indexer.cpp:2506
bool IsUnverified(void)
Definition: indexer.cpp:2888
CBioseq_Handle –.
Definition: Dbtag.hpp:53
bool IsSkippable(void) const
Definition: Dbtag.cpp:281
void ReplaceAndAdd(const CTempString &value, const CTempString &replace_what, const CTempString &replace_with)
void Add(const CTempString &name, const CTempString &value, EHidePart hide=eHideNone)
CDefLineJoiner(bool show_mods=false)
CTextJoiner< 64, CTempString > m_Joiner
void Join(std::string *result) const
CEMBL_block –.
Definition: EMBL_block.hpp:66
CFeatTree.
Definition: feature.hpp:173
CFeat_CI –.
Definition: feat_ci.hpp:64
CRef< CFeatureIndex > GetBestGene(void)
Definition: indexer.cpp:3204
const CMappedFeat GetMappedFeat(void) const
Definition: indexer.hpp:897
const string & GetSomeNumber(void) const
Definition: Id_pat.cpp:96
CMap_ext –.
Definition: Map_ext.hpp:66
CMappedFeat –.
Definition: mapped_feat.hpp:59
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
CPCRPrimerSet –.
CPCRReactionSet –.
CPDB_block –.
Definition: PDB_block.hpp:66
CPartialOrgName –.
CRsite_ref –.
Definition: Rsite_ref.hpp:66
CScope –.
Definition: scope.hpp:92
CRef< CBioseqIndex > GetBioseqIndex(void)
Definition: indexer.cpp:114
CSeqFeatXref –.
Definition: SeqFeatXref.hpp:66
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
CSeq_entry_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
TBase::const_iterator const_iterator
Definition: static_map.hpp:109
static bool NCBI_UseGeoLocNameForCountry(void)
Definition: SubSource.cpp:95
CTaxElement –.
Definition: TaxElement.hpp:66
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
@ eObjectType_AutodefOptions
bool IsUnverifiedMisassembled() const
bool IsUnverifiedOrganism() const
bool IsUnverifiedContaminant() const
bool IsUnverifiedFeature() const
bool IsUnverified() const
bool IsUnreviewedUnannotated() const
bool IsUnreviewed() const
EObjectType GetObjectType() const
Definition: map.hpp:338
USING_SCOPE(objects)
static size_t s_TitleEndsInOrganism(string &title, CTempString taxname)
static bool x_GetSegSeqInfoViaCDS(string &locus, string &product, const char *&completeness, const CBioseq_Handle &bsh)
static void s_AddVoucherAndIsolate(const CTempString &taxname, const CTempString &strain, const CTempString &specimen_voucher, const CTempString &isolate, CDefLineJoiner &joiner)
#define FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(Itr, Var)
void x_CleanAndCompress(string &dest, const CTempString &instr, bool isProt)
#define comma_space
#define FOR_SELECTED_SEQFEAT_ON_BIOSEQ_HANDLE(Itr, Var, Sel)
#define space_comma
static string s_RemoveWhiteSpace(string str)
#define space_semicolon
static string s_RemoveBracketedOrgFromEnd(string str, string taxname)
#define FOR_EACH_SEQFEAT_ON_BIOSEQ_HANDLE(Itr, Var, Chs)
static void x_FlyCG_PtoR(string &s)
#define semicolon_space
#define bracket_space
#define FOR_EACH_SEQFEAT_ON_SCOPE(Itr, Var, Loc, Chs)
static void s_TrimMainTitle(string &str)
#define twocommas
static bool s_EndsWithStrain(const CTempString &taxname, const CTempString &strain)
#define space_bracket
static const char * s_proteinOrganellePrefix[]
static string s_RemoveColonsAndWhiteSpace(string str)
#define twospaces
static CConstRef< CBioSource > x_GetSourceFeatViaCDS(const CBioseq_Handle &bsh)
USING_NCBI_SCOPE
EHidePart
@ eHideValue
@ eHideNone
@ eHideType
static const char * s_tpaPrefixList[]
static bool s_IsVirusOrPhage(const CTempString &taxname)
API (CDeflineGenerator) for computing sequences' titles ("definitions").
static uch flags
std::ofstream out("events_result.xml")
main entry point for tests
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static int type
Definition: getdata.c:31
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
int offset
Definition: replacements.h:160
char data[12]
Definition: iconv.c:80
#define FOR_EACH_USERFIELD_ON_USEROBJECT(Itr, Var)
FOR_EACH_USERFIELD_ON_USEROBJECT EDIT_EACH_USERFIELD_ON_USEROBJECT.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
CConstRef< CSeq_id > GetSeqId(void) const
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
CSeq_id::E_Choice Which(void) const
string GetLabel(const CSeq_id &id)
@ fAcc_master
Definition: Seq_id.hpp:256
void SetMix(TMix &v)
Definition: Seq_loc.hpp:987
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
CMappedFeat GetBestGeneForCds(const CMappedFeat &cds_feat, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0, CFeatTree::EBestGeneType lookup_type=CFeatTree::eBestGene_TreeOnly)
Definition: feature.cpp:3321
CMappedFeat GetBestOverlappingFeat(const CMappedFeat &feat, CSeqFeatData::ESubtype need_subtype, sequence::EOverlapType overlap_type, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0)
Definition: feature.cpp:3653
@ fFGL_Content
Include its content if there is any.
Definition: feature.hpp:73
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
CSeq_loc * SeqLocRevCmpl(const CSeq_loc &loc, CScope *scope)
Get reverse complement of the seq-loc (?)
@ eOverlap_SubsetRev
1st is a subset of 2nd ranges
@ eOverlap_Contained
2nd contained within 1st extremes
void x_AdjustProteinTitleSuffixIdx(const CBioseq_Handle &bsh)
CRef< feature::CFeatTree > m_Feat_Tree
CTempString m_Chromosome
subsource fields
CTempString m_SpecimenVoucher
CDeflineGenerator(void)
Constructor.
void x_SetTitleFromNR(const CBioseq_Handle &bsh)
bool m_Reconstruct
ignoring the existing title is forced for certain types
const CSeq_feat * GetCDSForProduct(const CBioseq &product, CScope *scope)
Get the encoding CDS feature of a given protein sequence.
Definition: sequence.cpp:2549
void x_SetTitleFromGPipe(void)
int TUserFlags
Binary "OR" of EUserFlags.
CTempString m_PDBCompound
pdb block fields
bool m_HTGSCancelled
genbank or embl block keyword fields
void x_DescribeClones(vector< CTempString > &desc, string &buf)
CTempString m_UnreviewedPrefix
void x_SetTitleFromMap(void)
string GenerateDefline(const CBioseq_Handle &bsh, TUserFlags flags=0)
Main method.
CTempString m_LinkageGroup
void x_SetTitleFromWGS(void)
bool m_IsNA
seq-inst fields
void x_SetTitleFromProtein(const CBioseq_Handle &bsh)
CMolInfo::TTech m_MITech
void x_SetBioSrcIdx(const CBioseq_Handle &bsh)
CTempString m_Substrain
void x_SetFlagsIdx(const CBioseq_Handle &bsh, TUserFlags flags)
CSeq_inst::TLength m_Length
string m_rEnzyme
map fields
CTempString m_Organelle
const char * x_OrganelleName(CBioSource::TGenome genome) const
CConstRef< CGene_ref > x_GetGeneRefViaCDS(const CMappedFeat &mapped_cds)
CConstRef< CSeq_feat > x_GetLongestProtein(const CBioseq_Handle &bsh)
CTempString m_Cultivar
CTempString m_UnverifiedPrefix
void x_AdjustProteinTitleSuffix(const CBioseq_Handle &bsh)
void x_SetTitleFromProteinIdx(const CBioseq_Handle &bsh)
const CBioSource * GetBioSource(const CBioseq &bioseq)
Retrieve the BioSource object for a given bioseq handle.
Definition: sequence.cpp:104
CBioSource::TGenome m_Genome
string x_GetModifiers(const CBioseq_Handle &handle)
void x_SetTitleFromPatent(void)
const CMolInfo * GetMolInfo(const CBioseq &bioseq)
Retrieve the MolInfo object for a given bioseq handle.
Definition: sequence.cpp:284
CTempString m_Comment
comment fields
bool x_IsComplete() const
void x_SetTitleFromSegSeq(const CBioseq_Handle &bsh)
void x_SetTitleFromNC(void)
~CDeflineGenerator(void)
Destructor.
bool m_IsNC
seq-id fields
CMappedFeat GetMappedCDSForProduct(const CBioseq_Handle &product)
Definition: sequence.cpp:2568
CConstRef< CBioSource > m_Source
biosource fields
CTempString m_TargetedLocus
static CSafeStatic< CLowQualityTextFsm > ms_p_Low_Quality_Fsa
void x_SetBioSrc(const CBioseq_Handle &bsh)
bool m_IsUnverified
user object fields
void x_SetTitleFromBioSrc(void)
void x_Init(void)
internal methods
void x_SetTitleFromPDB(void)
CSeq_entry_Handle m_TopSEH
internal feature tree for parent mapping
void x_SetSuffix(string &suffix, const CBioseq_Handle &bsh, bool appendComplete)
CMolInfo::TCompleteness m_MICompleteness
void x_SetTitleFromNM(const CBioseq_Handle &bsh)
bool x_CDShasLowQualityException(const CSeq_feat &sft)
void x_SetFlags(const CBioseq_Handle &bsh, TUserFlags flags)
CTempString m_Breed
orgmod fields
CRef< CSeqEntryIndex > m_Idx
index with feature tree for each Bioseq
CTempString m_MetaGenomeSource
CSeq_inst::TTopology m_Topology
void x_SetPrefix(string &prefix, const CBioseq_Handle &bsh)
CMolInfo::TBiomol m_MIBiomol
molinfo fields
@ fLocalAnnotsOnly
Never use related sequences' annotations.
@ fDevMode
Development mode for testing new features.