NCBI C++ ToolKit
wgsread.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: wgsread.cpp 102987 2024-08-16 17:07:33Z vasilche $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Eugene Vasilchenko
27  *
28  * File Description:
29  * Access to WGS files
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
36 #include <corelib/ncbistr.hpp>
37 #include <corelib/ncbifile.hpp>
38 #include <corelib/ncbi_param.hpp>
39 #include <util/line_reader.hpp>
41 #include <objects/seq/seq__.hpp>
48 #include <serial/objistrasnb.hpp>
49 #include <serial/objostrasnb.hpp>
50 #include <serial/serial.hpp>
51 #include <serial/pack_string.hpp>
52 #include <serial/objhook.hpp>
53 #include <serial/objectio.hpp>
54 #include <sra/error_codes.hpp>
55 
57 #include <ncbi/ncbi.h>
58 #include <insdc/insdc.h>
60 #include <vdb/vdb-priv.h>
61 #include <numeric>
62 
63 //#define COLLECT_PROFILE
64 //#define TEST_ACC_VERSION
65 //#define USE_TEST_PATH
66 
67 #define USE_GLOBAL_AMBIGUITY_CACHE
68 
69 #ifdef USE_GLOBAL_AMBIGUITY_CACHE
70 # define DEFAULT_AMBIGUITY_CACHE_SIZE "128MB"
71 #else
72 # define DEFAULT_AMBIGUITY_CACHE_SIZE "32MB"
73 #endif
74 
76 
77 #define NCBI_USE_ERRCODE_X WGSReader
79 
81 
82 
84 NCBI_PARAM_DEF_EX(int, WGS, DEBUG, 0, eParam_NoThread, WGS_DEBUG);
85 
86 
87 static int s_GetDebugLevel(void)
88 {
89  static int value = NCBI_PARAM_TYPE(WGS, DEBUG)::GetDefault();
90  return value;
91 }
92 
93 
94 NCBI_PARAM_DECL(bool, WGS, MASTER_DESCR);
95 NCBI_PARAM_DEF(bool, WGS, MASTER_DESCR, true);
96 
97 
98 NCBI_PARAM_DECL(bool, WGS, CLIP_BY_QUALITY);
99 NCBI_PARAM_DEF_EX(bool, WGS, CLIP_BY_QUALITY, true,
100  eParam_NoThread, CSRA_CLIP_BY_QUALITY);
101 
102 
103 static bool s_GetClipByQuality(void)
104 {
105  static CSafeStatic<NCBI_PARAM_TYPE(WGS, CLIP_BY_QUALITY)> s_Value;
106  return s_Value->Get();
107 }
108 
109 
110 NCBI_PARAM_DECL(bool, WGS, USE_AMBIGUITY_MASK);
111 NCBI_PARAM_DEF_EX(bool, WGS, USE_AMBIGUITY_MASK, true,
112  eParam_NoThread, WGS_USE_AMBIGUITY_MASK);
113 
114 
115 static bool s_UseAmbiguityMask(void)
116 {
117  static bool v = NCBI_PARAM_TYPE(WGS, USE_AMBIGUITY_MASK)::GetDefault();
118  return v;
119 }
120 
121 
122 NCBI_PARAM_DECL(bool, WGS, USE_GAP_INFO);
123 NCBI_PARAM_DEF_EX(bool, WGS, USE_GAP_INFO, true,
124  eParam_NoThread, WGS_USE_GAP_INFO);
125 
126 
127 static bool s_UseGapInfo(void)
128 {
129  static bool v = NCBI_PARAM_TYPE(WGS, USE_GAP_INFO)::GetDefault();
130  return v;
131 }
132 
133 
134 NCBI_PARAM_DECL(bool, WGS, USE_AMBIGUITY_4NA);
135 NCBI_PARAM_DEF_EX(bool, WGS, USE_AMBIGUITY_4NA, true,
136  eParam_NoThread, WGS_USE_AMBIGUITY_4NA);
137 
138 
139 static bool s_UseAmbiguity4na(void)
140 {
141  static bool v = NCBI_PARAM_TYPE(WGS, USE_AMBIGUITY_4NA)::GetDefault();
142  return v;
143 }
144 
145 
146 NCBI_PARAM_DECL(bool, WGS, USE_FULL_4NA_BLOCKS);
147 NCBI_PARAM_DEF_EX(bool, WGS, USE_FULL_4NA_BLOCKS, false,
148  eParam_NoThread, WGS_USE_FULL_4NA_BLOCKS);
149 
150 
151 static bool s_UseFull4naBlocks(void)
152 {
153  static bool v = NCBI_PARAM_TYPE(WGS, USE_FULL_4NA_BLOCKS)::GetDefault();
154  return v;
155 }
156 
157 
158 NCBI_PARAM_DECL(string, WGS, AMBIGUITY_CACHE);
159 NCBI_PARAM_DEF_EX(string, WGS, AMBIGUITY_CACHE, DEFAULT_AMBIGUITY_CACHE_SIZE,
160  eParam_NoThread, WGS_AMBIGUITY_CACHE);
161 
162 
163 static size_t s_GetAmbiguityCacheSize(void)
164 {
165  static size_t v = NStr::StringToUInt8_DataSize(NCBI_PARAM_TYPE(WGS, AMBIGUITY_CACHE)::GetDefault());
166  return v;
167 }
168 
169 
170 #ifdef USE_TEST_PATH
171 NCBI_PARAM_DECL(string, WGS, TEST_PATH);
172 NCBI_PARAM_DEF_EX(string, WGS, TEST_PATH, "",
173  eParam_NoThread, WGS_TEST_PATH);
174 #endif
175 
176 
177 // fixed WGS VDB parameters
178 static const char kSeq_descrFirstByte = 49; // first byte of Seq-descr ASN.1
179 static const TSeqPos kAmbiguityBlockSize = 1024; // defined by WGS VDB schema
180 
181 // split parameters, turn on/off splitting of different pieces of information
182 static bool kEnableSplitQual = true;
183 static bool kEnableSplitData = true;
184 static bool kEnableSplitProd = true;
185 static bool kEnableSplitFeat = true;
186 
187 // split info fixed parameters
189 static int kMainEntryId = 1;
195  kChunkIdStep = 4
196 };
197 
198 // split configurable parameters
199 static const size_t kProdPerChunk = 64;
200 static const size_t kMinFeatCountToSplit = 64;
201 static const size_t kFeatPerChunk = 256;
202 static const TSeqPos kQualChunkSize = 64<<10; // 64KiB
203 static const TSeqPos kDataChunkSize = 256<<10; // 64KiB in 2na encoding
204 static const TSeqPos kMinDataSplitSize = 128<<10;
205 
206 #ifdef COLLECT_PROFILE
207 struct SProfiler
208 {
209  const char* name;
210  size_t count;
211  CStopWatch sw;
212  SProfiler() : name(0), count(0) {}
213  ~SProfiler() {
214  if ( name )
215  cout << name<<" calls: "<<count<<" time: "<<sw.Elapsed()<<endl;
216  }
217 };
218 struct SProfilerGuard
219 {
220  SProfiler& sw;
221  SProfilerGuard(SProfiler& sw, const char* name)
222  : sw(sw)
223  {
224  sw.name = name;
225  sw.count += 1;
226  sw.sw.Start();
227  }
228  ~SProfilerGuard()
229  {
230  sw.sw.Stop();
231  }
232 };
233 
234 static SProfiler sw_Serialize;
235 static SProfiler sw_Feat;
236 static SProfiler sw_GetAccSeq_id;
237 static SProfiler sw_GetBioseq;
238 static SProfiler sw_GetSeq_entry;
239 static SProfiler sw_GetSeq_entryData;
240 static SProfiler sw_GetSplitInfo;
241 static SProfiler sw_GetSplitInfoData;
242 static SProfiler sw_InitSplit;
243 static SProfiler sw_GetFeatLocIdTypeRange;
244 static SProfiler sw_GetFeatLocIdTypeFeat;
245 static SProfiler sw_GetFeatLocIdTypeFeatBytes;
246 static SProfiler sw_GetFeatBytes;
247 static SProfiler sw_GetChunk;
248 static SProfiler sw_CreateQualityChunk;
249 static SProfiler sw_CreateDataChunk;
250 static SProfiler sw_CreateProductsChunk;
251 static SProfiler sw_CreateFeaturesChunk;
252 static SProfiler sw__GetProtFeat;
253 static SProfiler sw___GetProtAnnot;
254 static SProfiler sw___GetProtInst;
255 static SProfiler sw___GetProtDescr;
256 static SProfiler sw____GetProtWGSAcc;
257 static SProfiler sw____GetProtAccVer;
258 static SProfiler sw____GetProtAcc;
259 static SProfiler sw____GetProtGI;
260 static SProfiler sw____GetProtGISeq_id;
261 static SProfiler sw____GetProtGnlSeq_id;
262 static SProfiler sw____GetProtAccSeq_id;
263 static SProfiler sw___GetProtIds;
264 static SProfiler sw__GetProtBioseq;
265 static SProfiler sw_GetProtEntry;
266 static SProfiler sw__GetScaffoldFeat;
267 static SProfiler sw___GetScaffoldQual;
268 static SProfiler sw___GetScaffoldAnnot;
269 static SProfiler sw___GetScaffoldInst;
270 static SProfiler sw___GetScaffoldDescr;
271 static SProfiler sw___GetScaffoldIds;
272 static SProfiler sw__GetScaffoldBioseq;
273 static SProfiler sw_GetScaffoldEntry;
274 static SProfiler sw__GetContigFeat;
275 static SProfiler sw___GetContigQual;
276 static SProfiler sw____GetContigQualSize;
277 static SProfiler sw____GetContigQualData;
278 static SProfiler sw____GetContigQualMinMax;
279 static SProfiler sw___GetContigAnnot;
280 static SProfiler sw____IsGap;
281 static SProfiler sw____Get2naLen;
282 static SProfiler sw____Get4naLen;
283 static SProfiler sw____GetGapLen;
284 static SProfiler sw____GetRaw2na;
285 static SProfiler sw____GetRaw4na;
286 static SProfiler sw____GetAmb2Mask;
287 static SProfiler sw____Get4na2Mask;
288 static SProfiler sw____Scan4na;
289 static SProfiler sw____GetCvt4na;
290 static SProfiler sw____GetAmb4na;
291 static SProfiler sw____GetBlk4na;
292 static SProfiler sw____SetGaps;
293 static SProfiler sw___GetContigInst;
294 static SProfiler sw___GetContigDescr;
295 static SProfiler sw___GetContigIds;
296 static SProfiler sw__GetContigBioseq;
297 static SProfiler sw_GetContigEntry;
298 static SProfiler sw_FeatIterator;
299 static SProfiler sw_ProtIterator;
300 static SProfiler sw_ScafIterator;
301 static SProfiler sw_SeqIterator;
302 static SProfiler sw_WGSOpen;
303 
304 # define PROFILE(var) SProfilerGuard guard(var, #var)
305 #else
306 # define PROFILE(var)
307 #endif
308 
309 /////////////////////////////////////////////////////////////////////////////
310 // CAsnBinData
311 /////////////////////////////////////////////////////////////////////////////
312 
313 
315  : m_MainObject(&obj)
316 {
317 }
318 
319 
321 {
322 }
323 
324 
326 {
327  out << *m_MainObject;
328 }
329 
330 
332 {
333 public:
335  : CAsnBinData(obj),
337  {
338  }
339  virtual ~CWGSAsnBinData(void)
340  {
341  }
342 
343  virtual void Serialize(CObjectOStreamAsnBinary& out) const;
344 
346  struct SFtableInfo {
347  vector<char> m_Bytes;
348 
350  {
351  m_Bytes.insert(m_Bytes.end(), data.begin(), data.end());
352  }
353  };
354  typedef vector<char> TDescrInfo;
355 
357  {
358  m_FtableMap[&ftable].AddFeature(data);
359  }
360  void AddDescr(CBioseq& seq, const CTempString& data)
361  {
362  seq.SetDescr(*m_EmptyDescr);
363  vector<char>& dst = m_DescrMap[&seq];
364  if ( data[0] == kSeq_descrFirstByte ) {
365  // test for DESCR variant with Seqdesc list insead of Seq-descr
366  dst.assign(data.begin()+2, data.end()-2);
367  }
368  else {
369  dst.assign(data.begin(), data.end());
370  }
371  }
372 
378 };
379 
381 {
382 public:
387  : info_map(info_map)
388  {
389  }
390 
392  const CConstObjectInfoCV& variant)
393  {
394  CConstObjectInfo var_info = variant.GetVariant();
395  TKey key = (TKey)var_info.GetObjectPtr();
397  if ( iter != info_map.end() ) {
398  COStreamContainer cont(out, var_info);
400  cont << **it;
401  }
402  const TInfo& info = iter->second;
403  out.Write(info.m_Bytes.data(), info.m_Bytes.size());
404  }
405  else {
406  DefaultWrite(out, variant);
407  }
408  }
409 
411 };
412 
413 
415 {
416 public:
417  typedef const CBioseq* TKey;
421  : info_map(info_map)
422  {
423  }
424 
426  const CConstObjectInfoMI& member)
427  {
428  TKey key = (TKey)member.GetClassObject().GetObjectPtr();
430  if ( iter != info_map.end() ) {
431  COStreamClassMember mem(out, member);
432  const TInfo& info = iter->second;
433  if ( info.data()[0] == kSeq_descrFirstByte ) {
434  // Seq-descr
435  out.Write(info.data(), info.size());
436  }
437  else {
438  CObjectTypeInfo cont = member.GetMemberType();
439  while ( cont.GetTypeFamily() == eTypeFamilyPointer ) {
440  cont = cont.GetPointedType();
441  }
442  COStreamContainer mem(out, cont);
443  out.Write(info.data(), info.size());
444  }
445  }
446  else {
447  DefaultWrite(out, member);
448  }
449  }
450 
452 };
453 
454 
456 {
457  PROFILE(sw_Serialize);
459  CObjectHookGuard<CSeq_annot::TData> guard1("ftable", hook1, &out);
461  CObjectHookGuard<CBioseq> guard2("descr", hook2, &out);
463 }
464 
465 
466 /////////////////////////////////////////////////////////////////////////////
467 // CWGSDb_Impl cursors
468 /////////////////////////////////////////////////////////////////////////////
469 
470 
471 // SSeq0TableCursor is helper accessor structure for SEQUENCE table
473  explicit SSeq0TableCursor(const CVDBTable& table);
474 
476 
479  DECLARE_VDB_COLUMN_AS_STRING(SEQID_GNL_PREFIX);
481  DECLARE_VDB_COLUMN_AS(NCBI_taxid, TAXID);
482 };
483 
484 
485 // SSeqTableCursor is helper accessor structure for SEQUENCE table
487  explicit SSeqTableCursor(const CVDBTable& table);
488 
490 
491  DECLARE_VDB_COLUMN_AS(NCBI_gi, GI);
512  DECLARE_VDB_COLUMN_AS(bool, CIRCULAR);
516  DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_PRODUCT_ROW_ID);
517  typedef pair<TVDBRowId, TVDBRowId> row_range_t;
518  DECLARE_VDB_COLUMN_AS(row_range_t, CONTIG_NAME_ROW_RANGE);
520  DECLARE_VDB_COLUMN_AS(Uint1, AMBIGUITY_MASK);
522  DECLARE_VDB_COLUMN_AS(INSDC_4na_bin, AMBIGUITY_4NA);
523 };
524 
525 
526 // SSeq4naTableCursor is helper accessor structure for the unpacked 4na READ column of SEQUENCE table
528  explicit SSeq4naTableCursor(const CVDBTable& table);
529 
531 
532  DECLARE_VDB_COLUMN_AS(INSDC_4na_bin, READ); // unpacked 4na, one base per byte
533 };
534 
535 
537  : m_Cursor(table),
538  INIT_VDB_COLUMN(ACC_PREFIX),
539  INIT_VDB_COLUMN(ACC_CONTIG_LEN),
540  INIT_OPTIONAL_VDB_COLUMN(SEQID_GNL_PREFIX),
543 {
544 }
545 
546 
548  : m_Cursor(table),
550  INIT_VDB_COLUMN(ACCESSION),
551  INIT_VDB_COLUMN(ACC_VERSION),
552  INIT_VDB_COLUMN(CONTIG_NAME),
554  INIT_VDB_COLUMN(TITLE),
556  INIT_VDB_COLUMN(READ_START),
557  INIT_VDB_COLUMN(READ_LEN),
558  INIT_VDB_COLUMN(TRIM_START),
559  INIT_VDB_COLUMN(TRIM_LEN),
561  INIT_OPTIONAL_VDB_COLUMN(NUC_PROT_DESCR),
563  INIT_OPTIONAL_VDB_COLUMN(GB_STATE),
564  INIT_OPTIONAL_VDB_COLUMN(PUBLIC_COMMENT),
565  INIT_OPTIONAL_VDB_COLUMN(GAP_START),
566  INIT_OPTIONAL_VDB_COLUMN(GAP_LEN),
567  INIT_OPTIONAL_VDB_COLUMN(GAP_PROPS),
568  INIT_OPTIONAL_VDB_COLUMN(GAP_LINKAGE),
569  INIT_OPTIONAL_VDB_COLUMN(QUALITY),
570  INIT_OPTIONAL_VDB_COLUMN(CIRCULAR),
572  INIT_OPTIONAL_VDB_COLUMN(FEAT_ROW_START),
573  INIT_OPTIONAL_VDB_COLUMN(FEAT_ROW_END),
574  INIT_OPTIONAL_VDB_COLUMN(FEAT_PRODUCT_ROW_ID),
575  INIT_OPTIONAL_VDB_COLUMN(CONTIG_NAME_ROW_RANGE),
576  m_READ_2na(m_Cursor, "(INSDC:2na:packed)READ",
577  NULL, CVDBColumn::eMissing_Allow), // packed 2na
578  INIT_OPTIONAL_VDB_COLUMN(AMBIGUITY_MASK),
579  INIT_OPTIONAL_VDB_COLUMN(AMBIGUITY_POS),
580  INIT_OPTIONAL_VDB_COLUMN(AMBIGUITY_4NA)
581 {
582  if ( !s_UseAmbiguityMask() ) {
583  m_AMBIGUITY_MASK = CVDBColumnBits<8>();
584  }
585  if ( !s_UseGapInfo() ) {
586  m_GAP_START = CVDBColumnBits<32>();
587  }
588  if ( s_UseAmbiguity4na() && m_GAP_START && m_GAP_LEN && m_AMBIGUITY_POS && m_AMBIGUITY_4NA ) {
589  // all fields to restore ambiguities are present
590  }
591  else {
592  // otherwise we need 4na data
593  m_AMBIGUITY_POS.Reset();
594  m_AMBIGUITY_4NA.Reset();
595  }
596 
597  // optimization - treat completely empty QUALITY column as inexistent - no quality graphs
598  m_QUALITY.ResetIfAlwaysEmpty(m_Cursor);
599 }
600 
601 
603  : m_Cursor(table),
604  INIT_VDB_COLUMN_AS(READ, INSDC:4na:bin)
605 {
606 }
607 
608 
609 // SScfTableCursor is helper accessor structure for optional SCAFFOLD table
612 
614 
617  DECLARE_VDB_COLUMN_AS_STRING(SEQID_GNL_PREFIX);
623  DECLARE_VDB_COLUMN_AS(bool, CIRCULAR);
626  DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_PRODUCT_ROW_ID);
628 };
629 
630 
632  : m_Cursor(table),
633  INIT_VDB_COLUMN(SCAFFOLD_NAME),
634  INIT_OPTIONAL_VDB_COLUMN(ACCESSION),
635  INIT_OPTIONAL_VDB_COLUMN(SEQID_GNL_PREFIX),
636  INIT_VDB_COLUMN(COMPONENT_ID),
637  INIT_VDB_COLUMN(COMPONENT_START),
638  INIT_VDB_COLUMN(COMPONENT_LEN),
639  INIT_VDB_COLUMN(COMPONENT_PROPS),
640  INIT_OPTIONAL_VDB_COLUMN(COMPONENT_LINKAGE),
641  INIT_OPTIONAL_VDB_COLUMN(CIRCULAR),
642  INIT_OPTIONAL_VDB_COLUMN(FEAT_ROW_START),
643  INIT_OPTIONAL_VDB_COLUMN(FEAT_ROW_END),
644  INIT_OPTIONAL_VDB_COLUMN(FEAT_PRODUCT_ROW_ID),
645  INIT_OPTIONAL_VDB_COLUMN(GB_STATE)
646 {
647 }
648 
649 
650 // SProt0TableCursor is helper accessor structure for optional PROTEIN table
652  explicit SProt0TableCursor(const CVDBTable& table);
653 
655 
656  DECLARE_VDB_COLUMN_AS(NCBI_gi, GI);
657  //DECLARE_VDB_COLUMN_AS_STRING(ACCESSION);
660  DECLARE_VDB_COLUMN_AS_STRING(SEQID_GNL_PREFIX);
662 };
663 
664 
665 // SProtTableCursor is helper accessor structure for optional PROTEIN table
667  explicit SProtTableCursor(const CVDBTable& table);
668 
670 
671  //DECLARE_VDB_COLUMN_AS(NCBI_gi, GI);
672  //DECLARE_VDB_COLUMN_AS_STRING(ACCESSION);
673  //DECLARE_VDB_COLUMN_AS_STRING(GB_ACCESSION);
674  //DECLARE_VDB_COLUMN_AS(uint32_t, ACC_VERSION);
675  //DECLARE_VDB_COLUMN_AS_STRING(SEQID_GNL_PREFIX);
682  //DECLARE_VDB_COLUMN_AS_STRING(PROTEIN_NAME);
684  DECLARE_VDB_COLUMN_AS(NCBI_taxid, TAXID);
689  DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_PRODUCT_ROW_ID);
693 };
694 
695 
697  : m_Cursor(table),
699  //INIT_VDB_COLUMN(ACCESSION),
700  INIT_OPTIONAL_VDB_COLUMN(GB_ACCESSION),
701  INIT_VDB_COLUMN(ACC_VERSION),
702  INIT_OPTIONAL_VDB_COLUMN(SEQID_GNL_PREFIX),
703  INIT_VDB_COLUMN(PROTEIN_NAME)
704 {
705 }
706 
707 
709  : m_Cursor(table),
710  //INIT_OPTIONAL_VDB_COLUMN(GI),
711  //INIT_VDB_COLUMN(ACCESSION),
712  //INIT_OPTIONAL_VDB_COLUMN(GB_ACCESSION),
713  //INIT_VDB_COLUMN(ACC_VERSION),
714  //INIT_OPTIONAL_VDB_COLUMN(SEQID_GNL_PREFIX),
718  INIT_VDB_COLUMN(GB_STATE),
719  INIT_OPTIONAL_VDB_COLUMN(PUBLIC_COMMENT),
720  INIT_VDB_COLUMN(PROTEIN_LEN),
721  //INIT_VDB_COLUMN(PROTEIN_NAME),
722  INIT_OPTIONAL_VDB_COLUMN(PRODUCT_NAME),
724  INIT_OPTIONAL_VDB_COLUMN(REF_ACC),
726  INIT_OPTIONAL_VDB_COLUMN(FEAT_ROW_START),
727  INIT_OPTIONAL_VDB_COLUMN(FEAT_ROW_END),
728  INIT_OPTIONAL_VDB_COLUMN(FEAT_PRODUCT_ROW_ID),
729  INIT_OPTIONAL_VDB_COLUMN(PROTEIN),
730  INIT_OPTIONAL_VDB_COLUMN(REPLACED_BY),
731  INIT_OPTIONAL_VDB_COLUMN(REPLACES)
732 {
733 }
734 
735 
736 // SFeatTableCursor is helper accessor structure for optional FEATURE table
738  explicit SFeatTableCursor(const CVDBTable& table);
739 
749  DECLARE_VDB_COLUMN_AS_STRING(PRODUCT_ACCESSION);
754 
756 };
757 
758 
760  : m_Cursor(table),
761  INIT_VDB_COLUMN(FEAT_TYPE),
762  INIT_VDB_COLUMN(LOC_SEQ_TYPE),
763  INIT_VDB_COLUMN(LOC_ACCESSION),
764  INIT_VDB_COLUMN(LOC_ROW_ID),
765  INIT_VDB_COLUMN(LOC_START),
766  INIT_VDB_COLUMN(LOC_LEN),
767  INIT_VDB_COLUMN(LOC_STRAND),
768  INIT_OPTIONAL_VDB_COLUMN(PRODUCT_SEQ_TYPE),
769  INIT_OPTIONAL_VDB_COLUMN(PRODUCT_ACCESSION),
770  INIT_OPTIONAL_VDB_COLUMN(PRODUCT_ROW_ID),
771  INIT_OPTIONAL_VDB_COLUMN(PRODUCT_START),
772  INIT_OPTIONAL_VDB_COLUMN(PRODUCT_LEN),
773  INIT_VDB_COLUMN(SEQ_FEAT)
774 {
776  if ( 1 ) {
778  type.FindVariant("str")
779  .SetLocalReadHook(m_ObjStr, new CPackStringChoiceHook);
780  }
781  if ( 1 ) {
783  type.FindMember("key")
784  .SetLocalReadHook(m_ObjStr, new CPackStringClassHook(32, 128));
785  }
786  if ( 1 ) {
788  type.FindMember("db")
789  .SetLocalReadHook(m_ObjStr, new CPackStringClassHook);
790  }
791  if ( 1 ) {
792  type = CType<CGb_qual>();
793  type.FindMember("qual")
794  .SetLocalReadHook(m_ObjStr, new CPackStringClassHook);
795  }
796 }
797 
798 
799 // SGiIdxTableCursor is helper accessor structure for optional GI_IDX table
801  explicit SGiIdxTableCursor(const CVDBTable& table);
802 
804 
807 };
808 
809 
811  : m_Cursor(table),
812  INIT_OPTIONAL_VDB_COLUMN(NUC_ROW_ID),
813  INIT_OPTIONAL_VDB_COLUMN(PROT_ROW_ID)
814 {
815 }
816 
817 
818 // SProtIdxTableCursor is helper accessor structure for optional PROT_IDX table
820  explicit SProtIdxTableCursor(const CVDBTable& table);
821 
823 
824  typedef pair<TVDBRowId, TVDBRowId> row_range_t;
827 };
828 
829 
831  : m_Cursor(table),
832  INIT_VDB_COLUMN_BACKUP(NAME_ROW_RANGE, ACCESSION_ROW_RANGE),
833  INIT_VDB_COLUMN_BACKUP(ROW_ID, PROTEIN_ROW_ID)
834 {
835 }
836 
837 
838 /////////////////////////////////////////////////////////////////////////////
839 // CWGSDb_Impl
840 /////////////////////////////////////////////////////////////////////////////
841 
842 
844  CTempString path_or_acc,
845  CTempString vol_path)
846  : m_Mgr(mgr),
847  m_WGSPath(NormalizePathOrAccession(path_or_acc, vol_path)),
848  m_IdVersion(0),
867 {
869  PROFILE(sw_WGSOpen);
870  //static CVDBSchema schema(mgr, "wgs.schema");
871  m_Db = CVDB(mgr, m_WGSPath);
872  m_SeqTable = CVDBTable(m_Db, "SEQUENCE"); // SEQUENCE table must exist
873  x_InitIdParams();
874 }
875 
876 
878 {
879 }
880 
881 
882 inline
884 {
885  CRef<SSeq0TableCursor> curs = m_Seq0.Get(row);
886  if ( !curs ) {
888  curs = new SSeq0TableCursor(SeqTable());
889  }
890  return curs;
891 }
892 
893 
894 inline
896 {
897  CRef<SSeqTableCursor> curs = m_Seq.Get(row);
898  if ( !curs ) {
900  curs = new SSeqTableCursor(SeqTable());
901  }
902  return curs;
903 }
904 
905 
906 inline
908 {
909  CRef<SSeq4naTableCursor> curs; // = m_Seq.Get(row);
910  if ( !curs ) {
912  curs = new SSeq4naTableCursor(SeqTable());
913  }
914  return curs;
915 }
916 
917 
918 inline
920 {
921  CRef<SScfTableCursor> curs = m_Scf.Get(row);
922  if ( !curs ) {
924  if ( const CVDBTable& table = ScfTable() ) {
925  curs = new SScfTableCursor(table);
926  }
927  }
928  return curs;
929 }
930 
931 
932 inline
934 {
935  CRef<SProt0TableCursor> curs = m_Prot0.Get(row);
936  if ( !curs ) {
938  if ( const CVDBTable& table = ProtTable() ) {
939  curs = new SProt0TableCursor(table);
940  }
941  }
942  return curs;
943 }
944 
945 
946 inline
948 {
949  CRef<SProtTableCursor> curs = m_Prot.Get(row);
950  if ( !curs ) {
952  if ( const CVDBTable& table = ProtTable() ) {
953  curs = new SProtTableCursor(table);
954  }
955  }
956  return curs;
957 }
958 
959 
960 inline
962 {
963  CRef<SFeatTableCursor> curs = m_Feat.Get(row);
964  if ( !curs ) {
966  if ( const CVDBTable& table = FeatTable() ) {
967  curs = new SFeatTableCursor(table);
968  }
969  }
970  return curs;
971 }
972 
973 
974 inline
976 {
977  CRef<SGiIdxTableCursor> curs = m_GiIdx.Get(row);
978  if ( !curs ) {
980  if ( const CVDBTable& table = GiIdxTable() ) {
981  curs = new SGiIdxTableCursor(table);
982  }
983  }
984  return curs;
985 }
986 
987 
988 inline
990 {
992  if ( !curs ) {
994  if ( const CVDBTable& table = ProtIdxTable() ) {
995  curs = new SProtIdxTableCursor(table);
996  }
997  }
998  return curs;
999 }
1000 
1001 
1002 inline
1004 {
1005  m_Seq0.Put(curs, row);
1006 }
1007 
1008 
1009 inline
1011 {
1012  m_Seq.Put(curs, row);
1013 }
1014 
1015 
1016 inline
1018 {
1019  //m_Seq.Put(curs, row);
1020 }
1021 
1022 
1023 inline
1025 {
1026  m_Scf.Put(curs, row);
1027 }
1028 
1029 
1030 inline
1032 {
1033  m_Prot0.Put(curs, row);
1034 }
1035 
1036 
1037 inline
1039 {
1040  m_Prot.Put(curs, row);
1041 }
1042 
1043 
1044 inline
1046 {
1047  m_Feat.Put(curs, row);
1048 }
1049 
1050 
1051 inline
1053 {
1054  m_GiIdx.Put(curs, row);
1055 }
1056 
1057 
1058 inline
1060 {
1061  m_ProtIdx.Put(curs, row);
1062 }
1063 
1064 
1066 {
1068 
1070  ~SAmbiguityInfo();
1071 
1072  size_t GetUsedMemory() const;
1073 
1074  vector<Uint1> GetAmbiguityBytes(SSeqTableCursor& cur) {
1075  return m_AmbiguityMask;
1076  }
1077 
1079  TWGSContigGapInfo GetGapInfo() const;
1080 
1083 
1085  CWGSDb_Impl& db, SSeqTableCursor& cur) const;
1087  TSeqPos stop_2na_len, TSeqPos stop_gap_len,
1088  CWGSDb_Impl& db, SSeqTableCursor& cur) const;
1090  CWGSDb_Impl& db, SSeqTableCursor& cur) const;
1091 
1093  SSeqTableCursor& cur) const;
1095  CWGSDb_Impl& db, SSeqTableCursor& cur) const;
1096 
1097  bool x_AmbiguousBlock(size_t block_index) const
1098  {
1099  size_t byte_index = block_index/8;
1100  Uint1 byte_bit = 1<<(block_index%8);
1101  return byte_index < m_AmbiguityMask.size() && (m_AmbiguityMask[byte_index] & byte_bit);
1102  }
1103  void x_SetAmbiguousBlock(size_t block_index)
1104  {
1105  size_t byte_index = block_index/8;
1106  Uint1 byte_bit = 1<<(block_index%8);
1107  m_AmbiguityMask[byte_index] |= byte_bit;
1108  }
1110  void x_Calculate4na(CWGSDb_Impl& db) const;
1111  void x_Need4na(CWGSDb_Impl& db) const
1112  {
1113  if ( !m_HasAmbiguityPos && !m_Has4naBlocks ) {
1114  x_Calculate4na(db);
1115  }
1116  }
1117  bool x_AddAmbiguities(const Uint1* ptr, TSeqPos count,
1118  TSeqPos pos, TWGSContigGapInfo& gap_info) const;
1119  bool x_AddAmbiguousBlock(const Uint1* ptr, TSeqPos count,
1120  TSeqPos pos, TWGSContigGapInfo& gap_info) const;
1121 
1122  string m_Prefix;
1124 
1125  mutable CFastMutex m_Mutex; // for m_4naBlocks update
1126 
1129  mutable bool m_HasAmbiguityPos;
1130  mutable bool m_Has4naBlocks;
1131 
1132  vector<INSDC_coord_zero> m_GapStart;
1133  vector<INSDC_coord_len> m_GapLen;
1134  vector<NCBI_WGS_component_props> m_GapProps;
1135  vector<NCBI_WGS_gap_linkage> m_GapLinkage;
1136 
1137  vector<Uint1> m_AmbiguityMask;
1138  mutable vector<INSDC_coord_zero> m_AmbiguityPos;
1139  mutable vector<INSDC_4na_bin> m_Ambiguity4na;
1140 
1141  struct S4naBlock
1142  {
1143  char m_Packed4na[kAmbiguityBlockSize/2]; // packed 4na - two 4na bases per byte
1144  };
1145 
1147  mutable T4naBlocks m_4naBlocks; // ambiguous blocks
1148 
1149  struct S4naReader
1150  {
1154  };
1155  bool x_IsValid(const S4naReader& reader) const;
1157  enum EBaseType {
1160  eBase_Gap
1161  };
1162  EBaseType GetBaseType(const S4naReader& reader) const;
1163  void Advance(S4naReader& reader) const;
1164 };
1165 
1166 
1167 template<class Value>
1168 static void sx_Assign(vector<Value>& dst, const CVDBValueFor<Value>& src)
1169 {
1170  dst.resize(src.size());
1171  copy_n(src.begin(), src.size(), dst.data());
1172 }
1173 
1174 
1176  : m_Prefix(db.GetIdPrefixWithVersion()),
1177  m_RowId(row_id),
1178  m_HasGapInfo(false),
1179  m_HasAmbiguityMask(false),
1180  m_HasAmbiguityPos(false),
1181  m_Has4naBlocks(false)
1182 {
1183  if ( cur.m_GAP_START ) {
1184  sx_Assign(m_GapStart, cur.GAP_START(m_RowId));
1185  if ( m_GapStart.size() ) {
1186  sx_Assign(m_GapLen, cur.GAP_LEN(m_RowId));
1187  sx_Assign(m_GapProps, cur.GAP_PROPS(m_RowId));
1188  if ( cur.m_GAP_LINKAGE ) {
1189  sx_Assign(m_GapLinkage, cur.GAP_LINKAGE(m_RowId));
1190  }
1191  }
1192  m_HasGapInfo = true;
1193  }
1194  const bool kVerify4na = false;
1195  vector<Uint1> m_ExpectedAmbiguityMask;
1196  vector<INSDC_coord_zero> m_ExpectedAmbiguityPos;
1197  vector<INSDC_4na_bin> m_ExpectedAmbiguity4na;
1198 
1199  if ( kVerify4na ) {
1201  swap(m_ExpectedAmbiguityMask, m_AmbiguityMask);
1202  swap(m_ExpectedAmbiguityPos, m_AmbiguityPos);
1203  swap(m_ExpectedAmbiguity4na, m_Ambiguity4na);
1204  m_HasAmbiguityMask = false;
1205  m_HasAmbiguityPos = false;
1206  }
1207  if ( cur.m_AMBIGUITY_MASK ) {
1208  // number of blocks
1209  sx_Assign(m_AmbiguityMask, cur.AMBIGUITY_MASK(m_RowId));
1210  m_HasAmbiguityMask = true;
1211  }
1212  if ( cur.m_AMBIGUITY_POS && cur.m_AMBIGUITY_4NA ) {
1213  sx_Assign(m_AmbiguityPos, cur.AMBIGUITY_POS(m_RowId));
1214  sx_Assign(m_Ambiguity4na, cur.AMBIGUITY_4NA(m_RowId));
1215  m_HasAmbiguityPos = true;
1216  }
1217  if ( !m_HasAmbiguityMask ) {
1219  }
1220  if ( s_GetDebugLevel() >= 6 ) {
1221  size_t memory = GetUsedMemory();
1222  size_t mask_bit_count = 0;
1223  for ( auto bb : m_AmbiguityMask ) {
1224  while ( bb ) {
1225  ++mask_bit_count;
1226  bb &= bb-1;
1227  }
1228  }
1229  CFastMutexGuard guard(m_Mutex);
1230  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1232  <<NStr::NumericToString(m_AmbiguityMask.size(),NStr::fWithCommas)<<" mask bytes, "
1233  <<NStr::NumericToString(mask_bit_count,NStr::fWithCommas)<<" bits, "
1235  <<"size: "<<NStr::NumericToString(memory,NStr::fWithCommas));
1236  if ( s_GetDebugLevel() >= 7 ) {
1237  for ( size_t i = 0; i < 2 && i < m_AmbiguityPos.size(); ++i ) {
1238  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1239  <<"ambiguity at "<<m_AmbiguityPos[i]<<" - "<<m_Ambiguity4na[i]*1);
1240  }
1241  }
1242  }
1243  if ( kVerify4na ) {
1244  x_Need4na(db);
1245  for ( size_t block_index = 0;
1246  block_index < 8*max(m_AmbiguityMask.size(), m_ExpectedAmbiguityMask.size());
1247  ++block_index ) {
1248  bool bit = x_AmbiguousBlock(block_index);
1249  bool exp_bit;
1250  {{
1251  size_t byte_index = block_index/8;
1252  Uint1 byte_bit = 1<<(block_index%8);
1253  exp_bit = byte_index < m_ExpectedAmbiguityMask.size() &&
1254  (m_ExpectedAmbiguityMask[byte_index] & byte_bit);
1255  }}
1256  if ( bit != exp_bit ) {
1257  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "<<
1258  "mask["<<block_index<<" = "<<oct<<block_index<<dec<<"] "<<bit<<", expected "<<exp_bit);
1259  }
1260  }
1261  size_t index = 0, exp_index = 0;
1262  while ( index < m_AmbiguityPos.size() || exp_index < m_ExpectedAmbiguityPos.size() ) {
1263  TSeqPos pos = index < m_AmbiguityPos.size()? m_AmbiguityPos[index]: kInvalidSeqPos;
1264  int base = index < m_AmbiguityPos.size()? m_Ambiguity4na[index]: 0;
1265  TSeqPos exp_pos = exp_index < m_ExpectedAmbiguityPos.size()? m_ExpectedAmbiguityPos[exp_index]: kInvalidSeqPos;
1266  int exp_base = exp_index < m_ExpectedAmbiguityPos.size()? m_ExpectedAmbiguity4na[exp_index]: 0;
1267  if ( pos == exp_pos ) {
1268  if ( base != exp_base ) {
1269  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "<<
1270  "amb["<<pos<<" = "<<oct<<pos<<dec<<"] "<<base<<", expected "<<exp_base);
1271  }
1272  ++index;
1273  ++exp_index;
1274  }
1275  else if ( pos < exp_pos ) {
1276  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "<<
1277  "amb["<<pos<<" = "<<oct<<pos<<dec<<"] "<<base<<", expected -");
1278  ++index;
1279  }
1280  else {
1281  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "<<
1282  "amb["<<exp_pos<<" = "<<oct<<exp_pos<<dec<<"] -, expected "<<exp_base);
1283  ++exp_index;
1284  }
1285  }
1286  }
1287 }
1288 
1289 
1291 {
1292  if ( s_GetDebugLevel() >= 6 ) {
1293  size_t memory = GetUsedMemory();
1294  CFastMutexGuard guard(m_Mutex);
1295  LOG_POST("~SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1296  <<"final size: "<<NStr::NumericToString(memory,NStr::fWithCommas));
1297  }
1298 }
1299 
1300 
1302 {
1303  const size_t kAllocateGap = sizeof(void*)*2;
1304  size_t ret = kAllocateGap + sizeof(*this);
1305  ret += kAllocateGap + m_GapStart.size()*sizeof(m_GapStart.front());
1306  ret += kAllocateGap + m_GapLen.size()*sizeof(m_GapLen.front());
1307  ret += kAllocateGap + m_GapProps.size()*sizeof(m_GapProps.front());
1308  ret += kAllocateGap + m_GapLinkage.size()*sizeof(m_GapLinkage.front());
1309  ret += kAllocateGap + m_AmbiguityMask.size()*sizeof(m_AmbiguityMask.front());
1310  if ( m_HasAmbiguityPos || m_Has4naBlocks ) {
1311  CFastMutexGuard guard(m_Mutex);
1312  ret += kAllocateGap + m_AmbiguityPos.size()*sizeof(m_AmbiguityPos.front());
1313  ret += kAllocateGap + m_Ambiguity4na.size()*sizeof(m_Ambiguity4na.front());
1314  const size_t kBlockUsedMemory =
1315  kAllocateGap + 4*sizeof(void*) + sizeof(S4naBlock); // including map overhead
1316  ret += kBlockUsedMemory * m_4naBlocks.size();
1317  }
1318  return ret;
1319 }
1320 
1321 
1324 {
1325  TWGSContigGapInfo gap_info;
1326  gap_info.gaps_count = m_GapStart.size();
1327  gap_info.gaps_start = m_GapStart.data();
1328  gap_info.gaps_len = m_GapLen.data();
1329  gap_info.gaps_props = m_GapProps.data();
1330  gap_info.gaps_linkage = m_GapLinkage.data();
1331  return gap_info;
1332 }
1333 
1334 
1335 static const bool kRecoverGaps = true;
1336 
1337 
1338 // 2na encoding has values 0-3, occupying 2 bits
1339 // 4na encoding has values 0-15, occupying 4 bits
1340 // unpacked 4na values occupy one base per byte
1341 // packed 4na bases are stored two per byte, first base in highest bits
1342 // packed 2na bases are stored four per byte, first base in highest bits
1343 
1344 // return true if the 4na value is unambiguous
1345 static inline
1347 {
1348  return b && !(b&(b-1));
1349 }
1350 
1351 
1352 // return pointer to the first ambiguity in an unpacked 4na array
1353 static inline
1354 const Uint1* sx_FindAmbiguity(const Uint1* ptr, const Uint1* end)
1355 {
1356  for ( ; ptr != end; ++ptr ) {
1357  if ( !sx_Is2na(*ptr) ) {
1358  return ptr;
1359  }
1360  }
1361  return ptr;
1362 }
1363 
1364 
1365 static inline
1366 Uint1 sx_Get_4na(const char* ptr, size_t offset)
1367 {
1368  Uint1 b = ptr[offset/2];
1369  if ( offset%2 == 0 ) {
1370  b = b >> 4;
1371  }
1372  return b & 0xf;
1373 }
1374 
1375 
1376 // return pointer to the first ambiguity in an unpacked 4na array
1377 static inline
1378 size_t sx_Find_4na_Ambiguity(const char* ptr, size_t offset, size_t base_count)
1379 {
1380  for ( size_t i = offset; i < offset+base_count; ++i ) {
1381  if ( !sx_Is2na(sx_Get_4na(ptr, i)) ) {
1382  return i;
1383  }
1384  }
1385  return offset+base_count;
1386 }
1387 
1388 
1389 // check if unpacked 4na array has any ambiguity
1390 static inline
1391 bool sx_HasAmbiguity(const Uint1* ptr, const Uint1* end)
1392 {
1393  return sx_FindAmbiguity(ptr, end) != end;
1394 }
1395 
1396 
1397 // check if unpacked 4na array has any ambiguity beside explicit gaps
1398 static inline
1401 {
1402  while ( count ) {
1403  gap_info.SetPos(pos);
1404  if ( gap_info.IsInGap(pos) ) {
1405  // skip gap
1406  TSeqPos gap_len = gap_info.GetGapLength(pos, count);
1407  ptr += gap_len;
1408  pos += gap_len;
1409  count -= gap_len;
1410  }
1411  else {
1412  TSeqPos na_len = gap_info.GetDataLength(pos, count);
1413  if ( sx_HasAmbiguity(ptr, ptr+na_len) ) {
1414  return true;
1415  }
1416  ptr += na_len;
1417  pos += na_len;
1418  count -= na_len;
1419  }
1420  }
1421  return false;
1422 }
1423 
1424 
// Expand the two 2na bases stored in the low 4 bits of bits_2na into one
// packed 4na byte (first base in the high nibble, second in the low nibble).
// Each 2na value v (0-3) maps to the single-bit 4na value 1<<v.
// The upper 4 bits of bits_2na are ignored.
static
inline
char s_ConvertBits_2na_to_4na(char bits_2na)
{
    const unsigned two_bases = bits_2na & 0xf;
    const unsigned first_4na = 1u << (two_bases >> 2);
    const unsigned second_4na = 1u << (two_bases & 3);
    return char((first_4na << 4) | second_4na);
}
1438 
1439 
1440 // convert first 2 bases of packed 2na byte into packed 4na byte
1441 static
1442 inline
1443 char s_ConvertBits_2na_to_4na_1st(char bits_2na)
1444 {
1445  return s_ConvertBits_2na_to_4na(bits_2na >> 4);
1446 }
1447 
1448 
1449 // convert last 2 bases of packed 2na byte into packed 4na byte
1450 static
1451 inline
1452 char s_ConvertBits_2na_to_4na_2nd(char bits_2na)
1453 {
1454  return s_ConvertBits_2na_to_4na(bits_2na);
1455 }
1456 
1457 
1458 // convert packed 2na (4 bases per byte) array into packed 4na (2 bases per byte) array
1459 static
1460 void s_Convert_2na_to_4na(char* dst_4na, const char* src_2na, size_t base_count)
1461 {
1462  while ( base_count >= 4 ) {
1463  char bits_2na = src_2na[0];
1464  dst_4na[0] = s_ConvertBits_2na_to_4na_1st(bits_2na);
1465  dst_4na[1] = s_ConvertBits_2na_to_4na_2nd(bits_2na);
1466  base_count -= 4;
1467  src_2na += 1;
1468  dst_4na += 2;
1469  }
1470  if ( base_count ) {
1471  char bits_2na = src_2na[0] & (0xff00 >> base_count*2);
1472  {{
1473  char bits_4na = s_ConvertBits_2na_to_4na_1st(bits_2na);
1474  if ( base_count < 2 ) {
1475  bits_4na &= 0xf0;
1476  }
1477  dst_4na[0] = bits_4na;
1478  }}
1479  if ( base_count > 2 ) {
1480  dst_4na[1] = s_ConvertBits_2na_to_4na_2nd(bits_2na) & 0xf0;
1481  }
1482  }
1483 }
1484 
1485 
1486 // convert packed 2na (4 bases per byte) vector into packed 4na (2 bases per byte) vector
1487 static
1488 void s_Convert_2na_to_4na(vector<char>& dst_4na_vec,
1489  const vector<char>& src_2na_vec,
1490  size_t base_count)
1491 {
1492  size_t dst_4na_byte_count = (base_count+1)/2;
1493  // allocate 8-byte aligned memory to allow multi-byte operations at end
1494  dst_4na_vec.reserve((dst_4na_byte_count+7)/8*8);
1495  dst_4na_vec.resize(dst_4na_byte_count);
1496  s_Convert_2na_to_4na(dst_4na_vec.data(), src_2na_vec.data(), base_count);
1497 }
1498 
1499 
1500 // set 4na value into a packed 4na vector
1501 static
1502 inline
1503 void s_Set_4na(vector<char>& dst_4na_vec,
1504  size_t offset,
1505  INSDC_4na_bin amb)
1506 {
1507  char& dst = dst_4na_vec[offset/2];
1508  if ( offset%2 == 0 ) {
1509  dst = (dst & 0xf) | (amb << 4);
1510  }
1511  else {
1512  dst = (dst & 0xf0) | amb;
1513  }
1514 }
1515 
1516 
// Mark 'len' consecutive bases starting at base 'offset' of a packed 4na
// vector as gap, i.e. set all four bits of each of those bases.
// Bases outside of the range keep their values.
static
inline
void s_Set_4na_gap(std::vector<char>& dst_4na_vec,
                   size_t offset,
                   size_t len)
{
    if ( !len ) {
        return;
    }
    char* out = dst_4na_vec.data() + offset/2;
    if ( offset%2 ) {
        // range starts in the low nibble of a byte
        *out++ |= 0xf;
        --len;
    }
    // bytes fully covered by the gap
    for ( size_t whole_bytes = len/2; whole_bytes; --whole_bytes ) {
        *out++ = char(0xff);
    }
    if ( len%2 ) {
        // range ends in the high nibble of a byte
        *out |= 0xf0;
    }
}
1541 
1542 
1543 // copy 4na bases with arbitrary offset
// Copies base_count bases from the packed 4na array src_4na, starting at
// base src_offset, into the packed 4na array dst_4na, starting at base
// dst_offset. Both arrays hold two bases per byte, first base in the high
// nibble. Destination nibbles outside of the copied range are preserved.
// The copy is done in three steps: an optional leading base to make the
// destination byte-aligned, then whole destination bytes, then an optional
// trailing base.
1544 static
1545 void s_Copy_4na(char* dst_4na, TSeqPos dst_offset,
1546  const char* src_4na, TSeqPos src_offset,
1547  size_t base_count)
1548 {
1549  if ( !base_count ) {
1550  return;
1551  }
// split each offset into a byte pointer and a nibble index (0 = high, 1 = low)
1552  dst_4na += dst_offset/2;
1553  dst_offset %= 2;
1554  src_4na += src_offset/2;
1555  src_offset %= 2;
1556  // copy first odd dst base
1557  if ( dst_offset != 0 ) {
1558  Uint1 dst_b = dst_4na[0];
1559  Uint1 src_b = src_4na[0];
// if the source base is in the low nibble the source byte is now consumed
1560  src_4na += src_offset;
1561  if ( !src_offset ) {
// source base is in the high nibble, bring it down to the low nibble
1562  src_b = src_b >> 4;
1563  }
// one base was consumed, so the source nibble index flips
1564  src_offset ^= 1;
1565  dst_b = (dst_b & 0xf0) | (src_b & 0xf);
1566  dst_4na[0] = dst_b;
1567  ++dst_4na;
1568  dst_offset = 0;
1569  --base_count;
1570  }
1571  // copy pairs of bases
1572  if ( src_offset == 0 ) {
// source and destination are both byte-aligned: plain byte copy
1573  size_t copy_bytes = base_count / 2;
1574  dst_4na = copy_n(src_4na, copy_bytes, dst_4na);
1575  src_4na += copy_bytes;
1576  base_count %= 2;
1577  }
1578  else {
// source is shifted by one nibble: each destination byte is assembled from
// the low nibble of one source byte and the high nibble of the next one
1579  while ( base_count >= 2 ) {
1580  Uint1 src_b0 = src_4na[0];
1581  Uint1 src_b1 = src_4na[1];
1582  Uint1 dst_b = (src_b0 << 4) | (src_b1 >> 4);
1583  dst_4na[0] = dst_b;
1584  ++src_4na;
1585  ++dst_4na;
1586  base_count -= 2;
1587  }
1588  }
1589  // copy last odd base
1590  if ( base_count ) {
1591  Uint1 dst_b = dst_4na[0];
1592  Uint1 src_b = src_4na[0];
1593  if ( src_offset ) {
// source base is in the low nibble, move it up to the high nibble
1594  src_b = src_b << 4;
1595  }
// only the high nibble of the last destination byte is replaced
1596  dst_b = (dst_b & 0xf) | (src_b & 0xf0);
1597  dst_4na[0] = dst_b;
1598  }
1599 }
1600 
1601 
1602 // convert unpacked 4na (1 base per byte) array into packed 4na (2 bases per byte) array
1603 static
1604 void s_Pack_4na(char* dst_packed_4na,
1605  const Uint1* src_4na,
1606  size_t base_count)
1607 {
1608  while ( base_count >= 2 ) {
1609  auto b0 = src_4na[0];
1610  auto b1 = src_4na[1];
1611  auto packed_bb = (b0 << 4)+b1;
1612  *dst_packed_4na = packed_bb;
1613  base_count -= 2;
1614  src_4na += 2;
1615  ++dst_packed_4na;
1616  }
1617  if ( base_count ) {
1618  auto b0 = src_4na[0];
1619  auto packed_bb = (b0 << 4);
1620  *dst_packed_4na = packed_bb;
1621  }
1622 }
1623 
1624 
1625 static
1626 void s_SetAmbiguitiesPos(vector<char>& dst_4na_vec,
1627  TSeqPos pos, TSeqPos len,
1628  const vector<INSDC_coord_zero>& amb_pos,
1629  const vector<INSDC_4na_bin>& amb_4na)
1630 {
1631  auto iter_pos = lower_bound(amb_pos.begin(), amb_pos.end(), INSDC_coord_zero(pos));
1632  auto iter_4na = amb_4na.begin() + (iter_pos-amb_pos.begin());
1633  INSDC_coord_zero end = pos + len;
1634  for ( ; iter_pos != amb_pos.end() && *iter_pos < end; ++iter_pos, ++iter_4na ) {
1635  s_Set_4na(dst_4na_vec, *iter_pos-pos, *iter_4na);
1636  }
1637 }
1638 
1639 
1640 static
1641 void s_SetAmbiguitiesBlocks(vector<char>& dst_4na_vec,
1642  TSeqPos pos, TSeqPos len,
1644 {
1645  TSeqPos end = pos+len;
1646  TSeqPos block_pos = pos - pos%kAmbiguityBlockSize;
1647  for ( auto iter = blocks.lower_bound(block_pos);
1648  iter != blocks.end() && iter->first < end;
1649  ++iter ) {
1650  TSeqPos block_pos = iter->first;
1651  TSeqPos dst_offset;
1652  TSeqPos src_offset;
1653  TSeqPos copy_len;
1654  if ( block_pos < pos ) {
1655  dst_offset = 0;
1656  src_offset = pos-block_pos;
1657  copy_len = min(len, kAmbiguityBlockSize-src_offset);
1658  }
1659  else {
1660  dst_offset = block_pos-pos;
1661  src_offset = 0;
1662  copy_len = min(end-block_pos, kAmbiguityBlockSize);
1663  }
1664  s_Copy_4na(dst_4na_vec.data(), dst_offset, iter->second.m_Packed4na, src_offset, copy_len);
1665  }
1666 }
1667 
1668 
1669 static
1670 void s_SetGaps(vector<char>& dst_4na_vec,
1671  TSeqPos pos, TSeqPos len,
1673 {
1674  TSeqPos pos0 = pos;
1675  gap_info.SetPos(pos);
1676  for ( ; len > 0; ) {
1677  if ( gap_info.IsInGap(pos) ) {
1678  // add gap
1679  TSeqPos gap_len = gap_info.GetGapLength(pos, len);
1680  _ASSERT(gap_len <= len);
1681  s_Set_4na_gap(dst_4na_vec, pos-pos0, gap_len);
1682  ++gap_info;
1683  len -= gap_len;
1684  pos += gap_len;
1685  _ASSERT(!gap_info || pos <= gap_info.GetFrom());
1686  }
1687  else {
1688  // data segment
1689  TSeqPos rem_len = gap_info.GetDataLength(pos, len);
1690  _ASSERT(rem_len <= len);
1691  len -= rem_len;
1692  pos += rem_len;
1693  }
1694  }
1695 }
1696 
1697 
1699  TSeqPos pos, TWGSContigGapInfo& gap_info) const
1700 {
1701  bool ambiguous = sx_HasAmbiguity(ptr, count, pos, gap_info);
1702  if ( ambiguous ) {
1703  s_Pack_4na(m_4naBlocks[pos].m_Packed4na, ptr, count);
1704  }
1705  return ambiguous;
1706 }
1707 
1708 
1710  TSeqPos pos, TWGSContigGapInfo& gap_info) const
1711 {
1712  bool ambiguous = false;
1713  while ( count ) {
1714  gap_info.SetPos(pos);
1715  if ( gap_info.IsInGap(pos) ) {
1716  // skip gap
1717  TSeqPos gap_len = gap_info.GetGapLength(pos, count);
1718  ptr += gap_len;
1719  pos += gap_len;
1720  count -= gap_len;
1721  }
1722  else {
1723  TSeqPos na_len = gap_info.GetDataLength(pos, count);
1724  for ( TSeqPos i = 0; i < na_len; ++i ) {
1725  auto b = ptr[i];
1726  if ( !sx_Is2na(b) ) {
1727  ambiguous = true;
1728  m_AmbiguityPos.push_back(pos+i);
1729  m_Ambiguity4na.push_back(b);
1730  }
1731  }
1732  ptr += na_len;
1733  pos += na_len;
1734  count -= na_len;
1735  }
1736  }
1737  return ambiguous;
1738 }
1739 
1740 
1742 {
1743  if ( m_HasAmbiguityMask ) {
1744  return;
1745  }
1746  // calculate ambiguity mask using 4na read
1747  if ( m_HasAmbiguityPos ) {
1748  PROFILE(sw____GetAmb2Mask);
1749  // it's faster to use ambiguity position list if present
1750  if ( size_t ambiguity_count = m_AmbiguityPos.size() ) {
1751  size_t last_block_index = m_AmbiguityPos.back() / kAmbiguityBlockSize;
1752  size_t last_byte_index = last_block_index/8;
1753  m_AmbiguityMask.resize(last_byte_index+1);
1754  for ( size_t i = 0; i < ambiguity_count; ++i ) {
1755  x_SetAmbiguousBlock(m_AmbiguityPos[i] / kAmbiguityBlockSize);
1756  }
1757  }
1758  if ( s_GetDebugLevel() >= 6 ) {
1759  size_t memory = GetUsedMemory();
1760  size_t mask_bit_count = 0;
1761  for ( auto bb : m_AmbiguityMask ) {
1762  while ( bb ) {
1763  ++mask_bit_count;
1764  bb &= bb-1;
1765  }
1766  }
1767  CFastMutexGuard guard(m_Mutex);
1768  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1769  <<"calculated mask from ambiguities, "
1770  <<NStr::NumericToString(m_AmbiguityMask.size(),NStr::fWithCommas)<<" mask bytes, "
1771  <<NStr::NumericToString(mask_bit_count,NStr::fWithCommas)<<" bits, "
1772  <<"size: "<<NStr::NumericToString(memory,NStr::fWithCommas));
1773  }
1774  }
1775  else {
1776  // we'll have to scan for ambiguities in 4na data
1777 
1778  // use full 4na blocks or individual 4na points
1779  bool use_full_4na_blocks = s_UseFull4naBlocks();
1780 
1781  CRef<SSeq4naTableCursor> cur4na;
1783  {{
1784  PROFILE(sw____GetRaw4na);
1785  cur4na = db.Seq4na(m_RowId);
1786  read4na = cur4na->READ(m_RowId);
1787  }}
1788 
1789  PROFILE(sw____Get4na2Mask);
1790  TSeqPos read_length = TSeqPos(read4na.size());
1791  size_t block_count = (read_length+kAmbiguityBlockSize-1) / kAmbiguityBlockSize;
1792  size_t mask_bit_count = 0;
1793  m_AmbiguityMask.resize((block_count+7)/8);
1794  TWGSContigGapInfo gap_info = GetGapInfo();
1795  for ( size_t block_index = 0; block_index < block_count; ++block_index ) {
1796  TSeqPos block_pos = TSeqPos(block_index*kAmbiguityBlockSize);
1797  const Uint1* base_ptr = read4na.data() + block_pos;
1798  TSeqPos base_count = min(kAmbiguityBlockSize, read_length-block_pos);
1799  bool ambiguous = false;
1800  if ( use_full_4na_blocks ) {
1801  ambiguous = x_AddAmbiguousBlock(base_ptr, base_count, block_pos, gap_info);
1802  }
1803  else {
1804  ambiguous = x_AddAmbiguities(base_ptr, base_count, block_pos, gap_info);
1805  }
1806  if ( ambiguous ) {
1807  x_SetAmbiguousBlock(block_index);
1808  ++mask_bit_count;
1809  }
1810  }
1811  if ( use_full_4na_blocks ) {
1812  m_Has4naBlocks = true;
1813  }
1814  else {
1815  m_HasAmbiguityPos = true;
1816  }
1817  // db.Put(cur4na, m_RowId); do not store 4na cursor in cache to free memory
1818  if ( s_GetDebugLevel() >= 6 ) {
1819  size_t memory = GetUsedMemory();
1820  CFastMutexGuard guard(m_Mutex);
1821  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1822  "calculated mask from read, "
1823  <<NStr::NumericToString(m_AmbiguityMask.size(),NStr::fWithCommas)<<" mask bytes, "
1824  <<NStr::NumericToString(mask_bit_count,NStr::fWithCommas)<<" bits, "
1825  <<NStr::NumericToString(m_Ambiguity4na.size(),NStr::fWithCommas)<<" ambig, "
1826  <<NStr::NumericToString(m_4naBlocks.size(),NStr::fWithCommas)<<" blocks, "
1827  <<"size: "<<NStr::NumericToString(memory,NStr::fWithCommas));
1828  }
1829  }
1830  m_HasAmbiguityMask = true;
1831 }
1832 
1833 
1835 {
1836  CFastMutexGuard guard(m_Mutex);
1837  if ( m_HasAmbiguityPos || m_Has4naBlocks ) {
1838  return;
1839  }
1840 
1841  // use full 4na blocks or individual 4na points
1842  bool use_full_4na_blocks = s_UseFull4naBlocks();
1843 
1844  CRef<SSeq4naTableCursor> cur4na;
1846  TSeqPos read_length = 0;
1847  size_t bit_count = 0;
1848  size_t wrong_bit_count = 0;
1849  TWGSContigGapInfo gap_info = GetGapInfo();
1850  for ( size_t block_byte = 0; block_byte < m_AmbiguityMask.size(); ++block_byte ) {
1851  if ( auto bits = m_AmbiguityMask[block_byte] ) {
1852  if ( !cur4na ) {
1853  PROFILE(sw____GetRaw4na);
1854  cur4na = db.Seq4na(m_RowId);
1855  read4na = cur4na->READ(m_RowId);
1856  read_length = TSeqPos(read4na.size());
1857  }
1858  for ( size_t block_bit = 0; block_bit < 8; ++block_bit ) {
1859  if ( bits & (1<<block_bit) ) {
1860  PROFILE(sw____Scan4na);
1861  size_t block_index = block_byte*8+block_bit;
1862  TSeqPos block_pos = TSeqPos(block_index * kAmbiguityBlockSize);
1863  const Uint1* base_ptr = read4na.data() + block_pos;
1864  TSeqPos base_count = min(kAmbiguityBlockSize, read_length-block_pos);
1865  bool ambiguous = false;
1866  gap_info.SetPos(block_pos);
1867  if ( use_full_4na_blocks ) {
1868  ambiguous = x_AddAmbiguousBlock(base_ptr, base_count, block_pos, gap_info);
1869  }
1870  else {
1871  ambiguous = x_AddAmbiguities(base_ptr, base_count, block_pos, gap_info);
1872  }
1873  if ( ambiguous ) {
1874  ++bit_count;
1875  }
1876  else {
1877  ++wrong_bit_count;
1878  if ( s_GetDebugLevel() >= 7 && wrong_bit_count <= 2 ) {
1879  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1880  <<"wrong bit set at "<<block_pos);
1881  }
1882  }
1883  }
1884  }
1885  }
1886  }
1887  if ( use_full_4na_blocks ) {
1888  m_Has4naBlocks = true;
1889  }
1890  else {
1891  m_HasAmbiguityPos = true;
1892  }
1893  // db.Put(cur4na, m_RowId); do not store 4na cursor in cache to free memory
1894  if ( s_GetDebugLevel() >= 6 ) {
1895  guard.Release();
1896  size_t memory = GetUsedMemory();
1897  CFastMutexGuard guard(m_Mutex);
1898  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1899  <<"calculated 4na, "
1900  <<NStr::NumericToString(read_length,NStr::fWithCommas)<<" bases, "
1901  <<NStr::NumericToString(bit_count,NStr::fWithCommas)<<" bits, "
1902  <<NStr::NumericToString(m_Ambiguity4na.size(),NStr::fWithCommas)<<" ambig, "
1903  <<NStr::NumericToString(m_4naBlocks.size(),NStr::fWithCommas)<<" blocks, "
1904  <<"size: "<<NStr::NumericToString(memory,NStr::fWithCommas));
1905  if ( s_GetDebugLevel() >= 7 ) {
1906  for ( size_t i = 0; i < 2 && i < m_AmbiguityPos.size(); ++i ) {
1907  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1908  <<"ambiguity at "<<m_AmbiguityPos[i]<<" - "<<m_Ambiguity4na[i]*1);
1909  }
1910  }
1911  }
1912 }
1913 
1914 
1916 {
1917  if ( m_HasAmbiguityPos ) {
1918  // use explicit ambiguities list
1919  _ASSERT(reader.m_AmbiguityIndex == m_AmbiguityPos.size() ||
1920  (reader.m_AmbiguityIndex < m_AmbiguityPos.size() &&
1921  reader.m_Pos <= TSeqPos(m_AmbiguityPos[reader.m_AmbiguityIndex])));
1922  }
1923  else {
1924  // use 4na blocks
1925  _ASSERT(reader.m_4naBlocksIter == m_4naBlocks.end() ||
1926  (reader.m_Pos < reader.m_4naBlocksIter->first + kAmbiguityBlockSize));
1927  }
1928  return true;
1929 }
1930 
1931 
1934  CWGSDb_Impl& db, SSeqTableCursor& cur) const
1935 {
1936  x_Need4na(db);
1937  S4naReader reader;
1938  reader.m_Pos = pos;
1939  if ( m_HasAmbiguityPos ) {
1940  // use explicit ambiguities list
1941  reader.m_AmbiguityIndex =
1942  lower_bound(m_AmbiguityPos.begin(), m_AmbiguityPos.end(), INSDC_coord_zero(pos)) - m_AmbiguityPos.begin();
1943  }
1944  else {
1945  // use 4na blocks
1946  TSeqPos block_pos = pos - pos%kAmbiguityBlockSize;
1947  reader.m_4naBlocksIter = m_4naBlocks.lower_bound(block_pos);
1948  }
1949  return reader;
1950 }
1951 
1952 
1955 {
1956  _ASSERT(x_IsValid(reader));
1957  Uint1 base;
1958  if ( m_HasAmbiguityPos ) {
1959  // use explicit ambiguities list
1960  if ( reader.m_AmbiguityIndex == m_AmbiguityPos.size() ) {
1961  // no more ambiguities
1962  return eBase_2na;
1963  }
1964  // check if next ambiguity is at current position
1965  if ( reader.m_Pos != TSeqPos(m_AmbiguityPos[reader.m_AmbiguityIndex]) ) {
1966  // not an ambiguity yet
1967  return eBase_2na;
1968  }
1969  base = m_Ambiguity4na[reader.m_AmbiguityIndex];
1970  }
1971  else {
1972  // use 4na blocks
1973  if ( reader.m_4naBlocksIter == m_4naBlocks.end() ) {
1974  // no more 4na block
1975  return eBase_2na;
1976  }
1977  if ( reader.m_4naBlocksIter->first > reader.m_Pos ) {
1978  // not in a 4na block yet
1979  return eBase_2na;
1980  }
1981  // check actual 4na base
1982  TSeqPos offset = reader.m_Pos - reader.m_4naBlocksIter->first;
1983  base = sx_Get_4na(reader.m_4naBlocksIter->second.m_Packed4na, offset);
1984  }
1985  return base == 0xf? eBase_Gap: sx_Is2na(base)? eBase_2na: eBase_4na;
1986 }
1987 
1988 
1990 {
1991  _ASSERT(x_IsValid(reader));
1992  // advance
1993  ++reader.m_Pos;
1994  // update iterators
1995  if ( m_HasAmbiguityPos ) {
1996  // use explicit ambiguities list
1997  if ( reader.m_AmbiguityIndex == m_AmbiguityPos.size() ) {
1998  // no more ambiguities
1999  }
2000  else {
2001  // check if next ambiguity was at current position
2002  if ( reader.m_Pos > TSeqPos(m_AmbiguityPos[reader.m_AmbiguityIndex]) ) {
2003  // advance to next ambiguity
2004  ++reader.m_AmbiguityIndex;
2005  }
2006  }
2007  }
2008  else {
2009  // use 4na blocks
2010  if ( reader.m_4naBlocksIter == m_4naBlocks.end() ) {
2011  // no more 4na blocks
2012  }
2013  else {
2014  // check if we move out of current 4na block
2015  if ( reader.m_Pos >= reader.m_4naBlocksIter->first + kAmbiguityBlockSize ) {
2016  // advance to next 4na block
2017  ++reader.m_4naBlocksIter;
2018  }
2019  }
2020  }
2021  _ASSERT(x_IsValid(reader));
2022 }
2023 
2024 
2026  CWGSDb_Impl& db, SSeqTableCursor& cur) const
2027 {
2028  x_Need4na(db);
2029  PROFILE(sw____Get2naLen);
2030  TSeqPos end = pos+len;
2031  if ( m_HasAmbiguityPos ) {
2032  auto iter = lower_bound(m_AmbiguityPos.begin(), m_AmbiguityPos.end(), INSDC_coord_zero(pos));
2033  if ( iter == m_AmbiguityPos.end() || TSeqPos(*iter) >= end ) {
2034  return len;
2035  }
2036  return *iter - pos;
2037  }
2038  else {
2039  // use 4na blocks
2040  TSeqPos block_pos = pos - pos%kAmbiguityBlockSize;
2041  for ( auto block_iter = m_4naBlocks.lower_bound(block_pos);
2042  block_iter != m_4naBlocks.end() && block_iter->first < end;
2043  ++block_iter ) {
2044  size_t in_block_pos = pos <= block_iter->first? 0: pos-block_iter->first;
2045  size_t in_block_len = min(kAmbiguityBlockSize, end-block_iter->first);
2046  TSeqPos amb_pos = TSeqPos(sx_Find_4na_Ambiguity(block_iter->second.m_Packed4na,
2047  in_block_pos, in_block_len));
2048  if ( amb_pos < in_block_pos+in_block_len ) {
2049  return (block_iter->first+amb_pos) - pos;
2050  }
2051  }
2052  return len;
2053  }
2054 }
2055 
2056 
2057 // Calculate 4na length with gap recovering
2059  TSeqPos stop_2na_len,
2060  TSeqPos stop_gap_len,
2061  CWGSDb_Impl& db, SSeqTableCursor& cur) const
2062 {
2063  PROFILE(sw____Get4naLen);
2064  if ( len < stop_2na_len ) {
2065  return len;
2066  }
2067  S4naReader reader = Get4naReader(pos, db, cur);
2068  TSeqPos rem_len = len, len2na = 0, gap_len = 0;
2069  // |-------------------- len -----------------|
2070  // |- 4na -|- len2na -|- gap_len -$- rem_len -|
2071  // $ is current position
2072  // only one of len2na and gap_len can be above zero
2073 
2074  for ( ; rem_len; --rem_len, Advance(reader) ) {
2075  auto base_type = GetBaseType(reader);
2076  if ( base_type == eBase_2na ) {
2077  if ( len2na == stop_2na_len-1 ) { // 1 more 2na is enough
2078  return len-(rem_len+len2na);
2079  }
2080  ++len2na;
2081  if ( kRecoverGaps ) {
2082  gap_len = 0;
2083  }
2084  }
2085  else {
2086  if ( kRecoverGaps && (base_type == eBase_Gap) ) {
2087  if ( gap_len == stop_gap_len-1 ) { // 1 more gap is enough
2088  return len-(rem_len+gap_len);
2089  }
2090  ++gap_len;
2091  }
2092  len2na = 0;
2093  }
2094  }
2095  _ASSERT(len2na < stop_2na_len);
2096  _ASSERT(!kRecoverGaps || gap_len < stop_gap_len);
2097  return len;
2098 }
2099 
2100 
2102  CWGSDb_Impl& db, SSeqTableCursor& cur) const
2103 {
2104  PROFILE(sw____GetGapLen);
2105  S4naReader reader = Get4naReader(pos, db, cur);
2106  TSeqPos rem_len = len;
2107  for ( ; rem_len; --rem_len, Advance(reader) ) {
2108  // check both bases
2109  auto base_type = GetBaseType(reader);
2110  if ( base_type != eBase_Gap ) {
2111  return len-rem_len;
2112  }
2113  }
2114  return len;
2115 }
2116 
2117 
2118 // Return 2na Seq-data for specified range.
2119 // The data mustn't have ambiguities.
2121  SSeqTableCursor& cur) const
2122 {
2123  PROFILE(sw____GetRaw2na);
2124  CRef<CSeq_data> ret(new CSeq_data);
2125  vector<char>& data = ret->SetNcbi2na().Set();
2126  size_t bytes = (len+3)/4;
2127  // allocate 8-byte aligned memory to allow multi-byte operations at end
2128  data.reserve((bytes+7)/8*8);
2129  data.resize(bytes);
2130  cur.m_Cursor.ReadElements(m_RowId, cur.m_READ_2na, 2, pos, len,
2131  data.data());
2132  return ret;
2133 }
2134 
2135 
2136 // return 4na Seq-data for specified range
2138  CWGSDb_Impl& db, SSeqTableCursor& cur) const
2139 {
2140  x_Need4na(db);
2141  CRef<CSeq_data> ret(new CSeq_data);
2142  vector<char>& data = ret->SetNcbi4na().Set();
2143  {{
2144  auto seq_2na = Get2na(pos, len, cur);
2145  PROFILE(sw____GetCvt4na);
2146  s_Convert_2na_to_4na(data, seq_2na->GetNcbi2na().Get(), len);
2147  }}
2148  if ( m_HasAmbiguityPos ) {
2149  // restore 4na by adding ambiguous bases to 2na
2150  PROFILE(sw____GetAmb4na);
2151  // set ambiguities
2152  s_SetAmbiguitiesPos(data, pos, len, m_AmbiguityPos, m_Ambiguity4na);
2153  }
2154  else {
2155  // restore 4na by adding ambiguous blocks to 2na
2156  PROFILE(sw____GetBlk4na);
2157  s_SetAmbiguitiesBlocks(data, pos, len, m_4naBlocks);
2158  }
2159  {{
2160  PROFILE(sw____SetGaps);
2161  s_SetGaps(data, pos, len, GetGapInfo());
2162  }}
2163  return ret;
2164 }
2165 
2166 
2168 {
// Count bases from pos, up to len, that lie in blocks of
// kAmbiguityBlockSize bases for which x_AmbiguousBlock() is false.
// The scan advances block by block and stops at the first block for which
// x_AmbiguousBlock() is true, so the result is 0 when pos itself is in
// such a block.
2169  TSeqPos pos0 = pos;
2170  TSeqPos end = pos+len;
2171  while ( pos != end ) {
2172  TSeqPos block_index = pos/kAmbiguityBlockSize;
2173  if ( x_AmbiguousBlock(block_index) ) {
2174  // 4na
2175  break;
2176  }
// jump to the start of the next block, but not beyond the requested range
2177  pos = min(end, (block_index+1)*kAmbiguityBlockSize);
2178  }
2179  return pos-pos0;
2180 }
2181 
2182 
2184 {
// Count bases from pos, up to len, that lie in blocks of
// kAmbiguityBlockSize bases for which x_AmbiguousBlock() is true.
// The scan advances block by block and stops at the first block for which
// x_AmbiguousBlock() is false, so the result is 0 when pos itself is in
// such a block.
2185  TSeqPos pos0 = pos;
2186  TSeqPos end = pos+len;
2187  while ( pos != end ) {
2188  TSeqPos block_index = pos/kAmbiguityBlockSize;
2189  if ( !x_AmbiguousBlock(block_index) ) {
2190  // 2na
2191  break;
2192  }
// jump to the start of the next block, but not beyond the requested range
2193  pos = min(end, (block_index+1)*kAmbiguityBlockSize);
2194  }
2195  return pos-pos0;
2196 }
2197 
2198 
2199 #ifdef USE_GLOBAL_AMBIGUITY_CACHE
2200 
2201 DEFINE_STATIC_FAST_MUTEX(s_GlobalAmbiguityCacheMutex);
2204 public:
2207 };
2209 #endif
2210 
2212 {
2213 #ifdef USE_GLOBAL_AMBIGUITY_CACHE
2214  CFastMutexGuard guard(s_GlobalAmbiguityCacheMutex);
2215  return s_GlobalAmbiguityCache->get(make_pair(GetWGSPath(), row));
2216 #else
2218  return m_AmbiguityCache.get(row);
2219 #endif
2220 }
2221 
2222 
2224 {
2225  if ( !info ) {
2226  return;
2227  }
2228  size_t used_memory = info->GetUsedMemory();
2229 #ifdef USE_GLOBAL_AMBIGUITY_CACHE
2230  CFastMutexGuard guard(s_GlobalAmbiguityCacheMutex);
2231  s_GlobalAmbiguityCache->put(make_pair(GetWGSPath(), info->m_RowId), info, used_memory);
2232 #else
2234  m_AmbiguityCache.put(info->m_RowId, info, used_memory);
2235 #endif
2236 }
2237 
2238 
2240 {
2241  CRef<SSeq0TableCursor> seq = Seq0();
2242  if ( !seq->m_Cursor.TryOpenRow(1) ) {
2243  m_IdPrefixWithVersion.erase();
2244  m_IdPrefix.erase();
2245  m_IdVersion = 1;
2246  m_IdRowDigits = 0;
2247  return;
2248  }
2249  CTempString acc = *seq->ACC_PREFIX(1);
2250  const SIZE_TYPE prefix_len = acc.find_first_of("0123456789");
2251  m_IdRowDigits = *seq->ACC_CONTIG_LEN(1);
2252  if ( m_IdRowDigits < 6 || m_IdRowDigits > 8 ) {
2253  NCBI_THROW_FMT(CSraException, eInitFailed,
2254  "CWGSDb: bad WGS accession format: "<<acc);
2255  }
2256  m_IdPrefixWithVersion = acc.substr(0, prefix_len+2);
2257  m_IdPrefix = acc.substr(0, prefix_len);
2258  m_IdVersion = NStr::StringToNumeric<int>(acc.substr(prefix_len, 2));
2259  if ( seq->m_MOL ) {
2260  // explicit contig type
2261  m_ContigMolType = CSeq_inst::TMol(*seq->MOL(1));
2262  }
2263  else {
2264  // deduce contig type from accession prefix
2265  switch ( acc[0] ) {
2266  case 'G':
2267  case 'H':
2268  case 'I':
2270  break;
2271  default:
2273  break;
2274  }
2275  }
2276  m_IdPrefixDbWithVersion = (IsTSA()? "TSA:": "WGS:")+m_IdPrefixWithVersion;
2277  m_IdPrefixDb = (IsTSA()? "TSA:": "WGS:")+m_IdPrefix;
2278  m_HasNoDefaultGnlId = seq->m_SEQID_GNL_PREFIX && seq->SEQID_GNL_PREFIX(1).empty();
2279  bool has_static_taxid = seq->m_TAXID && seq->m_TAXID.IsStatic(seq->m_Cursor);
2280  TTaxId static_taxid = ZERO_TAX_ID;
2281  if ( has_static_taxid ) {
2282  auto value = seq->TAXID(1);
2283  if ( value.size() != 1 ) {
2284  has_static_taxid = false;
2285  }
2286  else {
2287  static_taxid = value[0];
2288  }
2289  }
2290  Put(seq);
2291 
2292  if ( CKMetadata meta = CKMetadata(SeqTable()) ) {
2293  if ( CKMDataNode node = CKMDataNode(meta, "GB_STATE", CKMDataNode::eMissing_Allow) ) {
2294  m_ProjectGBState = NCBI_gb_state(node.GetUint8());
2295  }
2296  if ( CKMDataNode node = CKMDataNode(meta, "REPLACED_BY", CKMDataNode::eMissing_Allow) ) {
2297  size_t size = node.GetSize();
2298  m_ReplacedBy.resize(size);
2299  node.GetData(&m_ReplacedBy[0], size);
2300  }
2301  if ( CKMDataNode node = CKMDataNode(meta, "SEQ_ID_TYPE", CKMDataNode::eMissing_Allow) ) {
2302  m_SeqIdType = CSeq_id::E_Choice(node.GetUint8());
2303  }
2304  if ( CKMDataNode node = CKMDataNode(meta, "EXTRA_TAXIDS", CKMDataNode::eMissing_Allow) ) {
2305  // all tax ids are separate
2306  }
2307  else if ( CKMDataNode node = CKMDataNode(meta, "TAXID", CKMDataNode::eMissing_Allow) ) {
2308  // common taxid
2309  if ( node.GetSize() != 0 ) {
2310  m_CommonTaxId = node.GetUint4();
2311  m_HasCommonTaxId = true;
2312  if ( has_static_taxid && static_taxid != m_CommonTaxId ) {
2314  m_HasCommonTaxId = false;
2315  }
2316  }
2317  }
2318  }
2319 }
2320 
2321 
2323  CTempString vol_path)
2324 {
2325 #ifdef USE_TEST_PATH
2326  {
2327  string test_path = NCBI_PARAM_TYPE(WGS, TEST_PATH)::GetDefault();
2328  if ( !test_path.empty() ) {
2329  string file_path = CDirEntry::MakePath(test_path, path_or_acc);
2330  if ( CDirEntry(file_path).Exists() ) {
2331  LOG_POST(Warning<<"Using local test file: "<<file_path);
2332  return file_path;
2333  }
2334  }
2335  }
2336 #endif
2337  if ( !vol_path.empty() ) {
2338  vector<CTempString> dirs;
2339  NStr::Split(vol_path, ":", dirs);
2340  ITERATE ( vector<CTempString>, it, dirs ) {
2341  string path = CDirEntry::MakePath(*it, path_or_acc);
2342  if ( CDirEntry(path).Exists() ) {
2343  return path;
2344  }
2345  }
2346  string path = CDirEntry::MakePath(vol_path, path_or_acc);
2347  if ( CDirEntry(path).Exists() ) {
2348  return path;
2349  }
2350  }
2351  if ( CVPath::IsPlainAccession(path_or_acc) &&
2352  path_or_acc.find('.') == string::npos ) { // no WGS accession sub-version
2353  // parse WGS accession
2354  const SIZE_TYPE start = 0;
2355  // ID-5322 : WGS prefix can consist of 4 or 6 characters, with optional
2356  // 2-digit version.
2357  // If no version is specified, set it to a default value 00, which is
2358  // resolved to a real version via a symlink on the file system.
2359  string acc = path_or_acc.substr(start);
2360  size_t acclen = acc.size();
2361  size_t digit_pos = acc.find_first_of("0123456789");
2362  if (digit_pos == string::npos && (acclen == 4 || acclen == 6)) {
2363  return string(path_or_acc) + "00";
2364  } else if ((digit_pos == 4 || digit_pos == 6) &&
2365  acclen > digit_pos + 2) {
2366  // remove contig/scaffold id
2367  return path_or_acc.substr(0, start+digit_pos+2);
2368  }
2369  }
2370  return path_or_acc;
2371 }
2372 
2373 
2374 inline
2376  atomic<bool>& table_is_opened,
2377  const char* table_name)
2378 {
2380  if ( !table_is_opened.load(memory_order_acquire) ) {
2382  table_is_opened.store(true, memory_order_release);
2383  }
2384 }
2385 
2386 
2387 inline
2389  CVDBTableIndex& index,
2390  atomic<Int1>& index_is_opened,
2391  const char* index_name,
2392  const char* backup_index_name)
2393 {
2394  if ( table ) {
2396  if ( !index_is_opened.load(memory_order_acquire) ) {
2397  Int1 type = -1;
2398  index = CVDBTableIndex(table, index_name,
2400  if ( index ) {
2401  type = 1;
2402  }
2403  else if ( backup_index_name ) {
2404  index = CVDBTableIndex(table, backup_index_name,
2406  if ( index ) {
2407  type = 2;
2408  }
2409  }
2410  index_is_opened.store(type, memory_order_release);
2411  }
2412  }
2413  else {
2414  index_is_opened.store(-1, memory_order_release);
2415  }
2416 }
2417 
2418 
2420 {
2421  OpenTable(m_ScfTable, m_ScfTableIsOpened, "SCAFFOLD");
2422 }
2423 
2424 
2426 {
2428 }
2429 
2430 
2432 {
2434 }
2435 
2436 
2438 {
2440 }
2441 
2442 
2444 {
2446 }
2447 
2448 
2450 {
2452  "contig_name_uc", "contig_name");
2453 }
2454 
2455 
2457 {
2459  "scaffold_name_uc", "scaffold_name");
2460 }
2461 
2462 
2464 {
2466  "protein_name_uc", "protein_name");
2467 }
2468 
2469 
2471 {
2473  "product_name_uc", "product_name");
2474 }
2475 
2476 
2478 {
2480  "gb_accession");
2481 }
2482 
2483 
2484 pair<TVDBRowId, CWGSDb_Impl::ERowType>
2486  TAllowRowType allow_type)
2487 {
2488  pair<TVDBRowId, ERowType> ret(0, eRowType_contig);
2489  const SIZE_TYPE start = 0;
2490  SIZE_TYPE prefix_len = acc.find_first_of("0123456789");
2491  if (prefix_len == NPOS || prefix_len >= acc.size() - 2)
2492  return ret;
2493  else prefix_len += 2;
2494 
2495  CTempString row = acc.substr(start+prefix_len);
2496  if ( row[0] == 'S' ) {
2497  if ( !(allow_type & fAllowRowType_scaffold) ) {
2498  return ret;
2499  }
2500  ret.second = eRowType_scaffold;
2501  row = row.substr(1); // skip scaffold prefix
2502  }
2503  else if ( row[0] == 'P' ) {
2504  if ( !(allow_type & fAllowRowType_protein) ) {
2505  return ret;
2506  }
2507  ret.second = eRowType_protein;
2508  row = row.substr(1); // skip scaffold prefix
2509  }
2510  else {
2511  if ( !(allow_type & fAllowRowType_contig) ) {
2512  return ret;
2513  }
2514  }
2515  ret.first = NStr::StringToNumeric<TVDBRowId>(row, NStr::fConvErr_NoThrow);
2516  if ( ret.first < 0 ) {
2517  ret.first = 0;
2518  }
2519  return ret;
2520 }
2521 
2522 
2523 TVDBRowId CWGSDb_Impl::ParseRow(CTempString acc, bool* is_scaffold) const
2524 {
2525  TAllowRowType allow_type = fAllowRowType_contig;
2526  if ( is_scaffold ) {
2527  allow_type |= fAllowRowType_scaffold;
2528  }
2529  pair<TVDBRowId, TRowType> rt = ParseRowType(acc, allow_type);
2530  if ( is_scaffold ) {
2531  *is_scaffold = rt.second == eRowType_scaffold;
2532  }
2533  return rt.first;
2534 }
2535 
2536 
2538 
2540 {
2541  if ( const CTextseq_id* text_id = id.GetTextseq_Id() ) {
2542  const_cast<CTextseq_id*>(text_id)->SetVersion(version);
2543  return true;
2544  }
2545  return false;
2546 }
2547 
2548 
2550 {
2551  if ( const CTextseq_id* text_id = id.GetTextseq_Id() ) {
2552  const_cast<CTextseq_id*>(text_id)->SetAccession(accession);
2553  return true;
2554  }
2555  return false;
2556 }
2557 
2558 
2560 
2561 
2562 inline
2564 {
2566 }
2567 
2569 {
2570  const bool kSetErrno = 0;
2571  const bool kSetNcbiError = 0;
2572 
2573  int error = 0, ret = -1;
2574  size_t len = str.size();
2575  if ( !len ) {
2576  error = EINVAL;
2577  }
2578  else {
2579  unsigned v = str.data()[0] - '0';
2580  if (v > 9) {
2581  error = EINVAL;
2582  }
2583  else {
2584  for (size_t i = 1; i < len; ++i) {
2585  unsigned d = str.data()[i] - '0';
2586  if (d > 9) {
2587  error = EINVAL;
2588  break;
2589  }
2590  unsigned nv = v * 10 + d;
2591  const unsigned kOverflowLimit = (INT_MAX - 9) / 10 + 1;
2592  if ( v >= kOverflowLimit ) {
2593  // possible overflow
2594  if ( v > kOverflowLimit || nv > INT_MAX) {
2595  error = ERANGE;
2596  break;
2597  }
2598  }
2599  v = nv;
2600  }
2601  if (!error) {
2602  ret = static_cast<int>(v);
2603  }
2604  }
2605  }
2606  if (kSetErrno) {
2607  errno = error;
2608  }
2609  if (kSetNcbiError && error) {
2611  }
2612  return ret;
2613 }
2614 
2615 
2616 // return non-negative integer if the string is its canonical representation -
2617 // no leading zeros or spaces,
2618 // otherwise return -1
2620 {
2621  int id = sx_StringToNonNegativeInt(str);
2622  if ( id >= 0 ) {
2623  if ( str.size() == 1 || str.data()[0] != '0' ) { // no leading zeroes
2624  return id;
2625  }
2626  }
2627  return -1;
2628 }
2629 
2630 /*
2631 CRange<int> sx_GetPatentRange(const CUser_object& obj, CTempString prefix)
2632 {
2633  int from = -1;
2634  int to = -1;
2635  if ( auto field = obj.GetFieldRef("Patent_accession_first") ) {
2636  if ( field->GetData().IsStr() ) {
2637  CTempString str = field->GetData().GetStr();
2638  if ( NStr::StartsWith(str, prefix) ) {
2639  from = sx_StringToNonNegativeInt(str.substr(prefix.size()));
2640  }
2641  }
2642  }
2643  if ( auto field = obj.GetFieldRef("Patent_accession_last") ) {
2644  if ( field->GetData().IsStr() ) {
2645  CTempString str = field->GetData().GetStr();
2646  if ( NStr::StartsWith(str, prefix) ) {
2647  to = sx_StringToNonNegativeInt(str.substr(prefix.size()));
2648  }
2649  }
2650  }
2651  if ( from >= 0 && to >= from ) {
2652  return CRange<int>(from, to);
2653  }
2654  else {
2655  return CRange<int>::GetEmpty();
2656  }
2657 }
2658 */
2659 
2661 {
2662  CObject_id& oid = tag.SetTag();
2663  int id = sx_GetStringId(str);
2664  if ( id >= 0 ) {
2665  oid.SetId(id);
2666  }
2667  else {
2668  oid.SetStr(str);
2669  }
2670 }
2671 
2672 
2674 {
2675  if ( !bytes.empty() ) {
2676  CObjectIStreamAsnBinary in(bytes.data(), bytes.size());
2677  // hack to determine if the data
2678  // is of type Seq-descr (starts with byte 49)
2679  // or of type Seqdesc (starts with byte >= 160)
2680  if ( bytes[0] == kSeq_descrFirstByte ) {
2681  CSeq_descr tmp;
2682  in >> tmp;
2683  for ( auto& desc : tmp.Set() ) {
2684  descr.Set().push_back(desc);
2685  }
2686  }
2687  else {
2688  while ( in.HaveMoreData() ) {
2689  CRef<CSeqdesc> desc(new CSeqdesc);
2690  in >> *desc;
2691  descr.Set().push_back(desc);
2692  }
2693  }
2694  }
2695 }
2696 
2697 
2699 {
2700  if ( !bytes.empty() ) {
2701  CObjectIStreamAsnBinary in(bytes.data(), bytes.size());
2702  while ( in.HaveMoreData() ) {
2703  CRef<CSeq_annot> annot(new CSeq_annot);
2704  in >> *annot;
2705  annot_set.push_back(annot);
2706  }
2707  }
2708 }
2709 
2710 
2712 {
2713  if ( id.IsGi() ) {
2714  split_id.SetGi(id.GetGi());
2715  }
2716  else {
2717  split_id.SetSeq_id(id);
2718  }
2719 }
2720 
2721 
2723 {
2724  if ( id.IsGi() ) {
2725  split_id.SetGi(id.GetGi());
2726  }
2727  else {
2728  split_id.SetSeq_id(id);
2729  }
2730 }
2731 
2732 
2734 {
2736  sx_SetSplitId(*split_id, id);
2737  split_ids.push_back(split_id);
2738 }
2739 
2740 
2742  const CBioseq::TId& ids)
2743 {
2744  ITERATE ( CBioseq::TId, it, ids ) {
2745  sx_AddSplitId(split_ids, it->GetNCObject());
2746  }
2747 }
2748 
2749 
2751  TSeqPos pos, TSeqPos end)
2752 {
2753  if ( id.IsGi() ) {
2754  CID2S_Gi_Interval& loc_gi = split_loc.SetGi_interval();
2755  loc_gi.SetGi(id.GetGi());
2756  loc_gi.SetStart(pos);
2757  loc_gi.SetLength(end-pos);
2758  }
2759  else {
2760  CID2S_Seq_id_Interval& loc_id = split_loc.SetSeq_id_interval();
2761  loc_id.SetSeq_id(id);
2762  loc_id.SetStart(pos);
2763  loc_id.SetLength(end-pos);
2764  }
2765 }
2766 
2767 
2769 
2770 
2771 bool CWGSDb_Impl::IsTSA(void) const
2772 {
2774 }
2775 
2776 
2778  CTempString tag) const
2779 {
2780  if ( prefix.empty() ) {
2781  return null;
2782  }
2783  else {
2784  CRef<CSeq_id> id(new CSeq_id);
2785  CDbtag& dbtag = id->SetGeneral();
2786  dbtag.SetDb(prefix);
2787  sx_SetTag(dbtag, tag);
2788  return id;
2789  }
2790 }
2791 
2792 
2794  TGnlIdFlags gnl_id_flags) const
2795 {
2796  CRef<CSeq_id> id;
2797  if ( m_HasNoDefaultGnlId ) {
2798  return id;
2799  }
2800  id = new CSeq_id;
2801  CDbtag& dbtag = id->SetGeneral();
2802  SIZE_TYPE colon = tag.rfind(':');
2803  if ( colon != NPOS ) {
2804  dbtag.SetDb(tag.substr(0, colon));
2805  tag = tag.substr(colon+1);
2806  }
2807  else {
2808  const string& db =
2810  dbtag.SetDb(db);
2811  if ( NStr::StartsWith(tag, db) &&
2812  tag[db.size()] == ':' ) {
2813  tag = tag.substr(db.size()+1);
2814  }
2815  }
2816  sx_SetTag(dbtag, tag);
2817  return id;
2818 }
2819 
2820 
2822 {
2823  CRef<CSeq_id> seq_id(new CSeq_id);
2824  CPatent_seq_id& pat_id = seq_id->SetPatent();
2826  pat_id.SetSeqid(id);
2827  return seq_id;
2828 }
2829 
2830 
2833  TVDBRowId row,
2834  TGnlIdFlags gnl_id_flags) const
2835 {
2836  if ( str.empty() ) {
2837  return null;
2838  }
2839  int id = sx_GetStringId(str);
2840  if ( id >= 0 && HasPatentId() ) {
2841  return GetPatentSeq_id(id);
2842  }
2843  if ( gnl_id_flags & fGnlId_NoWGSId ) {
2844  return null;
2845  }
2846  return GetGeneralSeq_id(str, gnl_id_flags);
2847 }
2848 
2849 
2852  const SSeq0TableCursor& cur,
2853  TVDBRowId row) const
2854 {
2855  if ( str.empty() ) {
2856  return null;
2857  }
2858  int id = sx_GetStringId(str);
2859  if ( id >= 0 && HasPatentId() ) {
2860  return GetPatentSeq_id(id);
2861  }
2862  if ( cur.m_SEQID_GNL_PREFIX ) {
2863  return GetGeneralSeq_id(cur.SEQID_GNL_PREFIX(1), str);
2864  }
2865  else {
2866  return GetGeneralSeq_id(str);
2867  }
2868 }
2869 
2870 
2873  const SScfTableCursor& cur,
2874  TVDBRowId row) const
2875 {
2876  if ( str.empty() ) {
2877  return null;
2878  }
2879  int id = sx_GetStringId(str);
2880  if ( id >= 0 && HasPatentId() ) {
2881  return GetPatentSeq_id(id);
2882  }
2883  if ( cur.m_SEQID_GNL_PREFIX ) {
2884  return GetGeneralSeq_id(cur.SEQID_GNL_PREFIX(1), str);
2885  }
2886  else {
2887  return GetGeneralSeq_id(str);
2888  }
2889 }
2890 
2891 
2894  const SProt0TableCursor& cur,
2895  TVDBRowId row) const
2896 {
2897  if ( str.empty() ) {
2898  return null;
2899  }
2900  int id = sx_GetStringId(str);
2901  if ( id >= 0 && HasPatentId() ) {
2902  return GetPatentSeq_id(id);
2903  }
2904  if ( cur.m_SEQID_GNL_PREFIX ) {
2905  return GetGeneralSeq_id(cur.SEQID_GNL_PREFIX(1), str);
2906  }
2907  else {
2909  }
2910 }
2911 
2912 
2914 {
2915  PROFILE(sw_GetAccSeq_id);
2916  CRef<CSeq_id> id;
2917  if ( !acc.empty() ) {
2918  if ( m_SeqIdType != CSeq_id::e_not_set ) {
2919  id = new CSeq_id();
2920  id->Select(m_SeqIdType);
2921  sx_SetAccession(*id, acc);
2922  }
2923  else {
2924  id = new CSeq_id(acc);
2925  }
2926  sx_SetVersion(*id, version);
2927  }
2928  return id;
2929 }
2930 
2931 
2933  TVDBRowId row_id,
2934  int version) const
2935 {
2936  CRef<CSeq_id> id;
2937  if ( m_IdPrefixWithVersion.empty() ) {
2938  return id;
2939  }
2942  if ( type != eRowType_contig ) {
2943  str << char(type);
2944  }
2945  str << setfill('0') << setw(m_IdRowDigits) << row_id;
2946  string id_str = CNcbiOstrstreamToString(str);
2947  id = new CSeq_id(id_str);
2948  sx_SetVersion(*id, version);
2949  return id;
2950 }
2951 
2952 
2954 {
2955  CRef<CSeq_id> id;
2956  if ( m_IdPrefix.empty() ) {
2957  return id;
2958  }
2959  string master_acc = m_IdPrefix;
2960  master_acc.resize(master_acc.size() + 2 + m_IdRowDigits, '0');
2961  id = new CSeq_id(master_acc);
2962  if ( !sx_SetVersion(*id, m_IdVersion) ) {
2963  id = null;
2964  }
2965  return id;
2966 }
2967 
2968 
2970 {
2971  return GetAccSeq_id(eRowType_contig, row_id, 1);
2972 }
2973 
2974 
2976 {
2977  return GetAccSeq_id(eRowType_scaffold, row_id, 1);
2978 }
2979 
2980 
2982 {
2983  return GetAccSeq_id(eRowType_protein, row_id, 1);
2984 }
2985 
2986 
2988 {
2989  return m_ContigMolType;
2990 }
2991 
2992 
2994 {
2995  return CSeq_inst::eMol_dna;
2996 }
2997 
2998 
3000 {
3001  return CSeq_inst::eMol_aa;
3002 }
3003 
3004 
3006 {
3007  m_MasterDescr.clear();
3008  m_IsSetMasterDescr = false;
3009 }
3010 
3011 
3013 {
3014  if ( !IsSetMasterDescr() &&
3015  NCBI_PARAM_TYPE(WGS, MASTER_DESCR)::GetDefault() ) {
3016  x_LoadMasterDescr(filter);
3017  }
3018  return IsSetMasterDescr();
3019 }
3020 
3021 
3023 {
3024  buffer.clear();
3025  CKMetadata meta(SeqTable());
3026  if ( !meta ) {
3027  return 0;
3028  }
3029  CKMDataNode node(meta, "MASTER", CKMDataNode::eMissing_Allow);
3030  if ( !node ) {
3031  return 0;
3032  }
3033  size_t size = node.GetSize();
3034  if ( !size ) {
3035  return 0;
3036  }
3037  buffer.resize_mem(size);
3038  node.GetData(buffer.data(), size);
3039  return size;
3040 }
3041 
3042 
3044 {
3045  if ( !m_MasterEntry ) {
3047  if ( !m_MasterEntry ) {
3049  if ( !GetMasterDescrBytes(buffer) ) {
3050  return null;
3051  }
3052 
3053  CObjectIStreamAsnBinary str(buffer.data(), buffer.size());
3054  CRef<CSeq_entry> master_entry(new CSeq_entry());
3055  str >> *master_entry;
3056  m_MasterEntry = master_entry;
3057  }
3058  if ( m_MasterEntry->IsSeq() ) {
3059  for ( auto& id : m_MasterEntry->GetSeq().GetId() ) {
3060  if ( id->IsPatent() ) {
3061  SetPatentId(id);
3062  break;
3063  }
3064  }
3065  /*
3066  m_PatentSeqIdRangeNuc = CRange<int>::GetEmpty();
3067  m_PatentSeqIdRangeProt = CRange<int>::GetEmpty();
3068  if ( HasPatentId() && m_MasterEntry->GetSeq().IsSetDescr() ) {
3069  for ( auto& d : m_MasterEntry->GetSeq().GetDescr().Get() ) {
3070  const CSeqdesc& desc = *d;
3071  if ( desc.IsUser() ) {
3072  const CUser_object& obj = desc.GetUser();
3073  const CObject_id& type = obj.GetType();
3074  if ( type.IsStr() &&
3075  type.GetStr() == "PatentProjects" ) {
3076  m_PatentSeqIdRangeNuc = sx_GetPatentRange(obj, GetIdPrefixWithVersion());
3077  m_PatentSeqIdRangeProt = m_PatentSeqIdRangeNuc;
3078  }
3079  if ( type.IsStr() &&
3080  type.GetStr() == "PatentProjectsNucleotide" ) {
3081  m_PatentSeqIdRangeNuc = sx_GetPatentRange(obj, GetIdPrefixWithVersion());
3082  }
3083  if ( type.IsStr() &&
3084  type.GetStr() == "PatentProjectsProtein" ) {
3085  m_PatentSeqIdRangeProt = sx_GetPatentRange(obj, GetIdPrefixWithVersion());
3086  }
3087  }
3088  }
3089  }
3090  */
3091  }
3092  }
3093  return m_MasterEntry;
3094 }
3095 
3096 
3098 {
3099  if ( CRef<CSeq_entry> master_entry = GetMasterDescrEntry() ) {
3100  if ( master_entry->IsSetDescr() ) {
3101  SetMasterDescr(master_entry->GetDescr().Get(), filter);
3102  }
3103  }
3104 }
3105 
3106 
3109 {
3110  switch ( desc.Which() ) {
3111  case CSeqdesc::e_Pub:
3112  case CSeqdesc::e_Comment:
3113  return eDescr_force;
3114  case CSeqdesc::e_Source:
3115  case CSeqdesc::e_Molinfo:
3118  case CSeqdesc::e_Genbank:
3119  case CSeqdesc::e_Embl:
3120  return eDescr_default;
3121  case CSeqdesc::e_User:
3122  if ( desc.GetUser().GetType().IsStr() ) {
3123  // only specific user objects are passed from WGS master
3124  const string& name = desc.GetUser().GetType().GetStr();
3125  if ( name == "DBLink" ||
3126  name == "GenomeProjectsDB" ||
3127  name == "StructuredComment" ||
3128  name == "FeatureFetchPolicy" ||
3129  name == "Unverified") {
3130  return eDescr_default;
3131  }
3132  }
3133  return eDescr_skip;
3134  default:
3135  return eDescr_skip;
3136  }
3137 }
3138 
3139 
3141  int filter)
3142 {
3143  if ( filter == CWGSDb::eDescrDefaultFilter ) {
3144  TMasterDescr descr2;
3145  ITERATE ( CSeq_descr::Tdata, it, descr ) {
3147  descr2.push_back(Ref(SerialClone(**it)));
3148  }
3149  }
3151  return;
3152  }
3153  m_MasterDescr = descr;
3154  m_IsSetMasterDescr = true;
3155 }
3156 
3157 static string
3159 {
3160  string uo_type;
3161 
3162  if (desc.IsUser() && desc.GetUser().GetType().IsStr()) {
3163  uo_type = desc.GetUser().GetType().GetStr();
3164  if (uo_type == "StructuredComment") {
3165  ITERATE (CUser_object::TData, it, desc.GetUser().GetData()) {
3166  if ((*it)->GetLabel().IsStr() &&
3167  (*it)->GetLabel().GetStr() == "StructuredCommentPrefix") {
3168  string data = ((*it)->GetData().IsStr() ?
3169  (string) (*it)->GetData().GetStr() :
3170  NStr::IntToString((*it)->GetData().GetInt()));
3171  uo_type += "|" + data;
3172  break;
3173  }
3174  }
3175  }
3176  }
3177 
3178  return uo_type;
3179 }
3180 
3181 static void
3182 s_AddUserObjectType(const CSeqdesc& desc, set<string>& existing_uo_types)
3183 {
3184  string uo_type = s_GetUserObjectType(desc);
3185 
3186  if (!uo_type.empty() && existing_uo_types.count(uo_type) == 0) {
3187  existing_uo_types.insert(uo_type);
3188  }
3189 }
3190 
3191 void CWGSDb_Impl::AddMasterDescr(CSeq_descr& descr, const CBioseq* main_seq, TFlags flags) const
3192 {
3193  if ( !GetMasterDescr().empty() ) {
3194  unsigned type_mask = 0;
3195  set<string> existing_uo_types;
3196 
3197  ITERATE ( CSeq_descr::Tdata, it, descr.Get() ) {
3198  const CSeqdesc& desc = **it;
3199  type_mask |= 1 << desc.Which();
3200  s_AddUserObjectType(desc, existing_uo_types);
3201  }
3202 
3203  if (main_seq && main_seq->IsSetDescr()) {
3204  for (auto& desc : main_seq->GetDescr().Get()) {
3205  type_mask |= 1 << desc->Which();
3206  s_AddUserObjectType(*desc, existing_uo_types);
3207  }
3208  }
3209 
3210  string kMasterDescrMark = "WithMasterDescr";
3211  if ( existing_uo_types.find(kMasterDescrMark) == existing_uo_types.end() ) {
3212  ITERATE ( TMasterDescr, it, GetMasterDescr() ) {
3213  const CSeqdesc& desc = **it;
3215  (type_mask & (1 << desc.Which())) ) {
3216  bool skip = true;
3217  string uo_type = s_GetUserObjectType(desc);
3218  if (!uo_type.empty() && existing_uo_types.count(uo_type) == 0)
3219  skip = false;
3220  // omit master descr if contig already has one of that type
3221  if (skip)
3222  continue;
3223  }
3224  descr.Set().push_back(*it);
3225  }
3226  if ( flags & fMasterDescrMark ) {
3227  CRef<CSeqdesc> desc(new CSeqdesc);
3228  auto& user_object = desc->SetUser();
3229  user_object.SetType().SetStr(kMasterDescrMark);
3230  user_object.SetData();
3231  descr.Set().push_back(desc);
3232  }
3233  }
3234  }
3235 }
3236 
3237 
3239 {
3240  if ( m_MasterEntry ) {
3241  return m_MasterEntry;
3242  }
3243 
3244  // generate one
3245  CRef<CSeq_entry> entry(new CSeq_entry);
3246  CBioseq& seq = entry->SetSeq();
3247  seq.SetId().push_back(GetMasterSeq_id());
3248  if ( !m_MasterDescr.empty() ) {
3249  seq.SetDescr().Set() = m_MasterDescr;
3250  }
3251  CSeq_inst& inst = seq.SetInst();
3254  return entry;
3255 }
3256 
3257 
3259 {
3260  m_PatentId = id;
3261 }
3262 
3263 /*
3264 int CWGSDb_Impl::GetPatentSeqIdNuc(CTempString str_id) const
3265 {
3266  if ( !HasPatentId() ) {
3267  return 0;
3268  }
3269  int id = sx_GetStringId(str_id);
3270  return id >= 0 && IsValidPatentSeqIdNuc(id)? id: 0;
3271 }
3272 
3273 
3274 int CWGSDb_Impl::GetPatentSeqIdProt(CTempString str_id) const
3275 {
3276  if ( !HasPatentId() ) {
3277  return 0;
3278  }
3279  int id = sx_GetStringId(str_id);
3280  return id >= 0 && IsValidPatentSeqIdProt(id)? id: 0;
3281 }
3282 */
3283 
3285 {
3286  if ( m_MasterEntry ) {
3287  const CBioseq::TId& ids = m_MasterEntry->GetSeq().GetId();
3288  ITERATE ( CBioseq::TId, it, ids ) {
3289  const CSeq_id& id = **it;
3290  if ( id.IsGi() ) {
3291  return id.GetGi();
3292  }
3293  }
3294  }
3295  return ZERO_GI;
3296 }
3297 
3298 
3299 static inline TGi s_ToGi(TVDBRowId gi, const char* method)
3300 {
3301  if ( gi < 0 ||
3302  (sizeof(TIntId) != sizeof(gi) && TVDBRowId(TIntId(gi)) != gi) ) {
3303  NCBI_THROW_FMT(CSraException, eDataError,
3304  method<<": GI is too big: "<<gi);
3305  }
3306  return GI_FROM(TVDBRowId, gi);
3307 }
3308 
3309 
3310 pair<TGi, TGi> CWGSDb_Impl::GetNucGiRange(void)
3311 {
3312  pair<TGi, TGi> ret;
3313  if ( CRef<SGiIdxTableCursor> idx = GiIdx() ) {
3314  if ( idx->m_NUC_ROW_ID ) {
3315  TVDBRowIdRange row_range =
3316  idx->m_NUC_ROW_ID.GetRowIdRange(idx->m_Cursor);
3317  if ( row_range.second ) {
3318  ret.first = s_ToGi(row_range.first,
3319  "CWGSDb::GetNucGiRange()");
3320  ret.second = s_ToGi(row_range.first + row_range.second - 1,
3321  "CWGSDb::GetNucGiRange()");
3322  }
3323  }
3324  Put(idx);
3325  }
3326  return ret;
3327 }
3328 
3329 
3330 pair<TGi, TGi> CWGSDb_Impl::GetProtGiRange(void)
3331 {
3332  pair<TGi, TGi> ret;
3333  if ( CRef<SGiIdxTableCursor> idx = GiIdx() ) {
3334  if ( idx->m_PROT_ROW_ID ) {
3335  TVDBRowIdRange row_range =
3336  idx->m_PROT_ROW_ID.GetRowIdRange(idx->m_Cursor);
3337  if ( row_range.second ) {
3338  ret.first = s_ToGi(row_range.first,
3339  "CWGSDb::GetProtGiRange()");
3340  ret.second = s_ToGi(row_range.first + row_range.second - 1,
3341  "CWGSDb::GetProtGiRange()");
3342  }
3343  }
3344  Put(idx);
3345  }
3346  return ret;
3347 }
3348 
3349 
3351 {
3352  if ( ranges.empty() ) {
3353  return;
3354  }
3355  sort(ranges.begin(), ranges.end());
3356  TGiRanges::iterator dst = ranges.begin();
3357  for ( TGiRanges::iterator i = dst+1; i != ranges.end(); ++i ) {
3358  if ( i->GetFrom() == dst->GetToOpen() ) {
3359  dst->SetToOpen(i->GetToOpen());
3360  }
3361  else {
3362  *++dst = *i;
3363  }
3364  }
3365  ranges.erase(dst+1, ranges.end());
3366 }
3367 
3368 
3370 {
3371  TGiRanges ranges;
3372  TVDBRowId row_id = 0;
3373  CRef<SSeqTableCursor> seq = Seq();
3374  if ( seq->m_GI ) {
3375  TIntId gi_start = -1, gi_end = -1;
3376  TVDBRowIdRange row_range = seq->m_GI.GetRowIdRange(seq->m_Cursor);
3377  for ( TVDBRowCount i = 0; i < row_range.second; ++i ) {
3378  row_id = row_range.first+i;
3379  TIntId gi = GI_TO(TIntId, s_ToGi(*seq->GI(row_id), "CWGSDb::GetNucGiRanges()"));
3380  if ( !gi ) {
3381  continue;
3382  }
3383  if ( gi != gi_end ) {
3384  if ( gi_end != gi_start ) {
3385  ranges.push_back(TGiRange(gi_start, gi_end));
3386  }
3387  gi_start = gi;
3388  }
3389  gi_end = gi+1;
3390  }
3391  if ( gi_end != gi_start ) {
3392  ranges.push_back(TGiRange(gi_start, gi_end));
3393  }
3394  x_SortGiRanges(ranges);
3395  }
3396  Put(seq, row_id);
3397  return ranges;
3398 }
3399 
3400 
3402 {
3403  TGiRanges ranges;
3404  return ranges;
3405 }
3406 
3407 
3409  : m_IdLength(0)
3410 {
3411  SIZE_TYPE prefix = 0;
3412  while ( prefix < acc.size() && isalpha(acc[prefix]&0xff) ) {
3413  ++prefix;
3414  }
3415  if ( prefix == acc.size() || prefix == 0 || acc.size()-prefix > 9 ) {
3416  // no prefix, or no digits, or too many digits
3417  return;
3418  }
3419  Uint4 v = 0;
3420  for ( SIZE_TYPE i = prefix; i < acc.size(); ++i ) {
3421  char c = acc[i];
3422  if ( c < '0' || c > '9' ) {
3423  return;
3424  }
3425  v = v*10 + (c-'0');
3426  }
3427  id = v;
3428  m_AccPrefix = acc.substr(0, prefix);
3430  m_IdLength = Uint4(acc.size());
3431 }
3432 
3433 
3435 {
3436  string acc = m_AccPrefix;
3437  acc.resize(m_IdLength, '0');
3438  for ( SIZE_TYPE i = m_IdLength; id; id /= 10 ) {
3439  acc[--i] += id % 10;
3440  }
3441  return acc;
3442 }
3443 
3444 
3446 {
3447  TProtAccRanges ranges;
3448  if ( CRef<SProt0TableCursor> seq = Prot0() ) {
3449  TVDBRowId row_id = 0;
3450  TVDBRowIdRange row_range = seq->m_GB_ACCESSION.GetRowIdRange(seq->m_Cursor);
3451  for ( TVDBRowCount i = 0; i < row_range.second; ++i ) {
3452  row_id = row_range.first+i;
3453  CTempString acc = *seq->GB_ACCESSION(row_id);
3454  if ( acc.empty() ) {
3455  continue;
3456  }
3457  Uint4 id;
3458  SProtAccInfo info(acc, id);
3459  if ( !info ) {
3460  continue;
3461  }
3463  if ( it == ranges.end() || it->first != info ) {
3464  TIdRange range(id, id+1);
3466  }
3467  else {
3468  if ( id < it->second.GetFrom() ) {
3469  it->second.SetFrom(id);
3470  }
3471  else if ( id >= it->second.GetToOpen() ) {
3472  it->second.SetTo(id);
3473  }
3474  }
3475  }
3476  Put(seq, row_id);
3477  }
3478  return ranges;
3479 }
3480 
3481 
3482 pair<TVDBRowId, bool> CWGSDb_Impl::GetGiRowId(TGi gi)
3483 {
3484  pair<TVDBRowId, bool> ret;
3485  TIntId row_id = GI_TO(TIntId, gi);
3486  if ( CRef<SGiIdxTableCursor> idx = GiIdx(row_id) ) {
3487  if ( idx->m_NUC_ROW_ID ) {
3489  idx->NUC_ROW_ID(row_id, CVDBValue::eMissing_Allow);
3490  if ( !value.empty() ) {
3491  ret.first = *value;
3492  }
3493  }
3494  if ( !ret.first && idx->m_PROT_ROW_ID ) {
3496  idx->PROT_ROW_ID(row_id, CVDBValue::eMissing_Allow);
3497  if ( !value.empty() ) {
3498  ret.first = *value;
3499  }
3500  }
3501  Put(idx, row_id);
3502  }
3503  return ret;
3504 }
3505 
3506 
3508 {
3509  TVDBRowId ret = 0;
3510  TIntId row_id = GI_TO(TIntId, gi);
3511  if ( CRef<SGiIdxTableCursor> idx = GiIdx(row_id) ) {
3512  if ( idx->m_NUC_ROW_ID ) {
3514  idx->NUC_ROW_ID(row_id, CVDBValue::eMissing_Allow);
3515  if ( !value.empty() ) {
3516  ret = *value;
3517  }
3518  }
3519  Put(idx, row_id);
3520  }
3521  return ret;
3522 }
3523 
3524 
3526 {
3527  TVDBRowId ret = 0;
3528  TIntId row_id = GI_TO(TIntId, gi);
3529  if ( CRef<SGiIdxTableCursor> idx = GiIdx(row_id) ) {
3530  if ( idx->m_PROT_ROW_ID ) {
3532  idx->PROT_ROW_ID(row_id, CVDBValue::eMissing_Allow);
3533  if ( !value.empty() ) {
3534  ret = *value;
3535  }
3536  }
3537  Put(idx, row_id);
3538  }
3539  return ret;
3540 }
3541 
3542 
3543 inline
3544 TVDBRowId CWGSDb_Impl::Lookup(const string& name,
3545  const CVDBTableIndex& index,
3546  bool upcase)
3547 {
3548  if ( !index ) {
3549  return 0;
3550  }
3551  if ( upcase && !NStr::IsUpper(name) ) {
3552  // upcase
3553  string tmp = name;
3555  return range.second? range.first: 0;
3556  }
3557  else {
3558  TVDBRowIdRange range = index.Find(name);
3559  return range.second? range.first: 0;
3560  }
3561 }
3562 
3563 
3565 {
3566  if ( 1 ) {
3567  CRef<SSeqTableCursor> seq = Seq();
3568  if ( seq->m_CONTIG_NAME_ROW_RANGE ) {
3569  seq->m_Cursor.SetParam("CONTIG_NAME_QUERY", name);
3572  seq->CONTIG_NAME_ROW_RANGE(0, CVDBValue::eMissing_Allow);
3573  if ( !value.empty() ) {
3574  range = *value;
3575  }
3576  Put(seq);
3577  return range.first;
3578  }
3579  Put(seq);
3580  }
3581  const CVDBTableIndex& index = ContigNameIndex();
3582  return Lookup(name, index, m_ContigNameIndexIsOpened.load(memory_order_relaxed) == 2);
3583 }
3584 
3585 
3587 {
3588  const CVDBTableIndex& index = ScaffoldNameIndex();
3589  return Lookup(name, index, m_ScaffoldNameIndexIsOpened.load(memory_order_relaxed) == 2);
3590 }
3591 
3592 
3594 {
3595  const CVDBTableIndex& index = ProteinNameIndex();
3596  return Lookup(name, index, m_ProteinNameIndexIsOpened.load(memory_order_relaxed) == 2);
3597 }
3598 
3599 
3601 {
3602  const CVDBTableIndex& index = ProductNameIndex();
3603  return Lookup(name, index, m_ProductNameIndexIsOpened.load(memory_order_relaxed) == 2);
3604 }
3605 
3606 
3607 TVDBRowId CWGSDb_Impl::GetProtAccRowId(const string& acc, int ask_version)
3608 {
3609  TVDBRowId prot_row_id = 0;
3610  if ( CRef<SProtIdxTableCursor> idx = ProtIdx() ) {
3611  CVDBMgr::CRequestContextUpdater ctx_updater;
3613  string tmp = acc;
3614  const char* query_param_name;
3615  if ( NStr::Equal(idx->m_ROW_ID.GetName(), "ROW_ID") ) {
3616  query_param_name = "NAME_QUERY";
3617  }
3618  else {
3619  query_param_name = "ACCESSION_QUERY";
3620  }
3621  idx->m_Cursor.SetParam(query_param_name, NStr::ToUpper(tmp));
3623  idx->NAME_ROW_RANGE(0, CVDBValue::eMissing_Allow);
3624  if ( !value.empty() ) {
3625  range = *value;
3626  }
3627  if ( range.first && range.first <= range.second ) {
3628  CVDBValueFor<TVDBRowId> prot_rows = idx->ROW_ID(range.first);
3629  if ( !prot_rows.empty() ) {
3630  if ( ask_version > 0 ) {
3631  // check if version exists
3632  size_t version_index = size_t(prot_rows.size() == 1? 0: ask_version-1);
3633  if ( version_index < prot_rows.size() ) {
3634  // check if version matches
3635  prot_row_id = prot_rows[version_index];
3636  if ( prot_row_id ) {
3637  CRef<SProt0TableCursor> prot = Prot0(prot_row_id);
3638  int actual_version = *prot->ACC_VERSION(prot_row_id);
3639  Put(prot, prot_row_id);
3640  if ( actual_version != ask_version ) {
3641  // version mismatch
3642  prot_row_id = 0;
3643  }
3644  }
3645  }
3646  }
3647  else if ( ask_version == -1 ) {
3648  // last version
3649  prot_row_id = prot_rows[prot_rows.size()-1];
3650  }
3651  }
3652  }
3653  Put(idx);
3654  }
3655  return prot_row_id;
3656 }
3657 
3658 
3660 {
3661  bool can_have_gis = false;
3662  auto cur = Seq();
3663  if (cur->m_GI) {
3664  auto gi_range = cur->m_Cursor.GetRowIdRange(cur->m_GI.GetIndex());
3665  if ( gi_range.second ) {
3666  auto value = cur->GI(gi_range.first, CVDBValue::eMissing_Allow);
3667  if (value.size() == 1 && *value != 0) {
3668  can_have_gis = true;
3669  }
3670  }
3671  }
3672  Put(cur);
3673  return can_have_gis;
3674 }
3675 
3676 
3678 {
3679  TVDBRowCount feature_count = 0;
3680  if ( auto cur = Feat() ) {
3681  feature_count = cur->m_Cursor.GetRowIdRange().second;
3682  Put(cur);
3683  }
3684  return feature_count;
3685 }
3686 
3687 
3689 {
3690  return GetTotalFeatureCount() > 0;
3691 }
3692 
3693 
3695 {
3696  if (GetWGSPath().find_first_of("\\/.:") != NPOS) {
3697  // non-standard path
3698  return false;
3699  }
3700  if (IsReplaced()) {
3701  // old or replaced WGS project
3702  return false;
3703  }
3705  // disabled WGS project
3706  return false;
3707  }
3708  return true;
3709 }
3710 
3711 
3713 {
3714  // assume no feature id correction
3715  EFeatLocIdType loc_id_type = eFeatLocIdGi;
3716  if (HasStandardFeatLocIdType()) {
3717  // shortcut for regular VDB files
3718  if (!CanHaveGis()) {
3719  loc_id_type = eFeatLocIdAccVer;
3720  }
3721  return loc_id_type;
3722  }
3723  try {
3724  if ( CRef<SFeatTableCursor> cur = Feat() ) {
3725  TVDBRowId feat_row_id = 1;
3726  try {
3727  PROFILE(sw_GetFeatLocIdTypeRange);
3728  CRef<SSeqTableCursor> seq = Seq();
3729  auto row_range = seq->m_Cursor.GetRowIdRange(seq->m_FEAT_ROW_START.GetIndex());
3730  for ( TVDBRowCount i = 0; i < row_range.second; ++i ) {
3731  auto seq_row_id = row_range.first+i;
3732  auto row_start = seq->FEAT_ROW_START(seq_row_id);
3733  if ( !row_start.empty() ) {
3734  feat_row_id = *row_start;
3735  break;
3736  }
3737  }
3738  Put(seq);
3739  }
3740  catch ( exception& /*ignored*/ ) {
3741  // use first feature in the file
3742  }
3743  PROFILE(sw_GetFeatLocIdTypeFeat);
3744  CRef<CSeq_feat> feat(new CSeq_feat);
3745  CTempString bytes;
3746  {{
3747  PROFILE(sw_GetFeatLocIdTypeFeatBytes);
3748  bytes = *cur->SEQ_FEAT(feat_row_id);
3749  }}
3750  cur.GetNCObject().m_ObjStr.OpenFromBuffer(bytes.data(), bytes.size());
3751  cur.GetNCObject().m_ObjStr >> *feat;
3752  Put(cur);
3754  if ( const CTextseq_id* id = !seq_id? 0: seq_id->GetTextseq_Id() ) {
3755  if ( id->IsSetVersion() ) {
3756  loc_id_type = eFeatLocIdAccVer;
3757  }
3758  else {
3759  loc_id_type = eFeatLocIdAccNoVer;
3760  }
3761  }
3762  }
3763  }
3764  catch ( exception& /*ignored*/ ) {
3765  // assume no feature id correction
3766  }
3767  return loc_id_type;
3768 }
3769 
3770 
3772 {
3773  auto loc_id_type = m_FeatLocIdType.load(memory_order_relaxed);
3774  if ( loc_id_type == eFeatLocIdUninitialized ) {
3775  // determine and cache for the future
3776  loc_id_type = DetermineFeatLocIdType();
3777  m_FeatLocIdType.store(loc_id_type, memory_order_relaxed);
3778  }
3779  return loc_id_type;
3780 }
3781 
3782 
3783 /////////////////////////////////////////////////////////////////////////////
3784 // CWGSSeqIterator
3785 /////////////////////////////////////////////////////////////////////////////
3786 
3787 
3789 {
3790  // skip gaps starting before the requested position
3791  while ( *this && GetToOpen() <= pos ) {
3792  ++*this;
3793  }
3794 }
3795 
3796 
3798  eFromFlags
3799 };
3802 };
3803 
3804 
3806 {
3807  explicit
3809  : db(db),
3811  split_prod(false),
3812  split_data(false),
3813  split_feat(false),
3814  split_qual(false),
3815  split_version(kAssignedDefaultSplitVersion)
3816  {
3817  }
3818 
3820  : SWGSCreateInfo(db)
3821  {
3822  if ( flags != fDefaultFlags ) {
3823  x_SetFlags(flags);
3824  }
3825  }
3826 
3828  : SWGSCreateInfo(db)
3829  {
3830  if ( split_version != kDefaultSplitVersion ) {
3831  x_SetSplitVersion(split_version);
3832  }
3833  }
3834 
3835  // set flags and corresponding split_version
3836  void x_SetFlags(TFlags flags);
3837 
3838  // set split_version and corresponding flags
3839  void x_SetSplitVersion(TSplitVersion split_version);
3840 
3842  TFlags flags;
3843  bool split_prod, split_data, split_feat, split_qual;
3852 
3853  template<class Iter>
3854  void x_SetId(Iter& it)
3855  {
3856  main_id = it.GetId(flags);
3857  feat_id = main_id;
3858  // fix feature ids
3859  // it can be accession.version and accession
3860  if ( feat_id->IsGi() ) {
3861  EFeatLocIdType feat_loc_id_type = db->GetFeatLocIdType();
3862  if ( feat_loc_id_type != eFeatLocIdGi ) {
3863  feat_id = it.GetId(flags & ~fIds_gi);
3864  }
3865  }
3866  }
3867  void x_ResetId()
3868  {
3869  main_id = null;
3870  feat_id = null;
3871  }
3872  template<class Iter>
3873  void x_SetSeq(Iter& it)
3874  {
3875  main_seq = new CBioseq();
3876  x_SetId(it);
3877  }
3879  {
3880  main_seq = new CBioseq();
3881  x_SetId(it);
3882  }
3883  void x_ResetSeq()
3884  {
3885  main_seq = null;
3886  x_ResetId();
3887  }
3888 
3889  void x_AddDescr(CTempString bytes);
3890  void x_AddFeature(const CWGSFeatureIterator& it,
3892  void x_AddFeaturesDirect(TVDBRowIdRange range,
3893  vector<TVDBRowId>& product_row_ids);
3894  void x_AddFeaturesSplit(TVDBRowIdRange range,
3895  vector<TVDBRowId>& product_row_ids);
3896  void x_AddFeatures(TVDBRowIdRange range,
3897  vector<TVDBRowId>& product_row_ids);
3898  void x_AddFeatures(TVDBRowIdRange range);
3899  CBioseq_set& x_GetProtSet(void);
3900  void x_CreateProtSet(TVDBRowIdRange range);
3901  void x_AddProducts(const vector<TVDBRowId>& product_row_ids);
3902 };
3903 
3904 
3906 {
3907  if ( *this ) {
3908  auto state = GetGBState();
3909  // skip artificial entries with 'missing' state
3911  return true;
3912  }
3913  // skip not included entries
3914  if ( !(m_IncludeFlags & TIncludeFlags(1 << state)) ) {
3915  return true;
3916  }
3917  }
3918  return false;
3919 }
3920 
3921 
3923 {
3924  if ( m_Cur0 ) {
3925  if ( m_Db ) {
3926  GetDb().Put(m_Cur0, m_CurrId);
3927  if ( m_Cur ) {
3928  GetDb().Put(m_Cur, m_CurrId);
3929  }
3930  }
3931  else {
3932  m_Cur.Reset();
3933  m_Cur0.Reset();
3934  }
3935  }
3936  m_Db.Reset();
3937  m_CurrId = m_FirstGoodId = m_FirstBadId = 0;
3938  m_AccVersion = eLatest;
3939 }
3940 
3941 
3943  : m_AccVersion(eLatest)
3944 {
3945  *this = iter;
3946 }
3947 
3948 
3950 {
3951  if ( this != &iter ) {
3952  Reset();
3953  m_Db = iter.m_Db;
3954  m_Cur0 = iter.m_Cur0;
3955  m_Cur = iter.m_Cur;
3956  m_CurrId = iter.m_CurrId;
3957  m_AccVersion = iter.m_AccVersion;
3959  m_FirstBadId = iter.m_FirstBadId;
3962  }
3963  return *this;
3964 }
3965 
3966 
3968  : m_CurrId(0),
3969  m_FirstGoodId(0),
3970  m_FirstBadId(0),
3971  m_AccVersion(eLatest),
3972  m_IncludeFlags(fIncludeDefault),
3973  m_ClipByQuality(true)
3974 {
3975 }
3976 
3977 
3979  EIncludeFlags include_flags,
3980  EClipType clip_type)
3981  : m_AccVersion(eLatest)
3982 {
3983  x_Select(wgs_db, include_flags, clip_type);
3984 }
3985 
3986 
3988  TVDBRowId row,
3989  EIncludeFlags include_flags,
3990  EClipType clip_type)
3991  : m_AccVersion(eLatest)
3992 {
3993  x_Select(wgs_db, include_flags, clip_type, row);
3994 }
3995 
3996 
3998  TVDBRowId first_row,
3999  TVDBRowId last_row,
4000  EIncludeFlags include_flags,
4001  EClipType clip_type)
4002  : m_AccVersion(eLatest)
4003 {
4004  x_Select(wgs_db, include_flags, clip_type, first_row, last_row);
4005 }
4006 
4007 
4009  CTempString acc,
4010  EIncludeFlags include_flags,
4011  EClipType clip_type)
4012  : m_AccVersion(eLatest)
4013 {
4014  x_Select(wgs_db, include_flags, clip_type, acc);
4015 }
4016 
4017 
4019  TIncludeFlags include_flags,
4020  EClipType clip_type)
4021  : m_AccVersion(eLatest)
4022 {
4023  x_Select(wgs_db, include_flags, clip_type);
4024 }
4025 
4026 
4028  TVDBRowId row,
4029  TIncludeFlags include_flags,
4030  EClipType clip_type)
4031  : m_AccVersion(eLatest)
4032 {
4033  x_Select(wgs_db, include_flags, clip_type, row);
4034 }
4035 
4036 
4038  TVDBRowId first_row,
4039  TVDBRowId last_row,
4040  TIncludeFlags include_flags,
4041  EClipType clip_type)
4042  : m_AccVersion(eLatest)
4043 {
4044  x_Select(wgs_db, include_flags, clip_type, first_row, last_row);
4045 }
4046 
4047 
4049  CTempString acc,
4050  TIncludeFlags include_flags,
4051  EClipType clip_type)
4052  : m_AccVersion(eLatest)
4053 {
4054  x_Select(wgs_db, include_flags, clip_type, acc);
4055 }
4056 
4057 
// Converts the legacy EWithdrawn selector into a TIncludeFlags mask, choosing
// between two results depending on whether 'withdrawn' equals
// CWGSSeqIterator::eIncludeWithdrawn.
// NOTE(review): the statements inside both branches (original lines 4062 and
// 4065) are absent from this listing, so the flag value produced by each
// branch cannot be stated here -- consult the real source file.
4058 static inline
4059 CWGSSeqIterator::TIncludeFlags s_ToFlags(CWGSSeqIterator::EWithdrawn withdrawn)
4060 {
4061  if ( withdrawn == CWGSSeqIterator::eIncludeWithdrawn ) {
4063  }
4064  else {
4066  }
4067 }
4068 
4069 
4071  EWithdrawn withdrawn,
4072  EClipType clip_type)
4073  : m_AccVersion(eLatest)
4074 {
4075  x_Select(wgs_db, s_ToFlags(withdrawn), clip_type);
4076 }
4077 
4078 
4080  TVDBRowId row,
4081  EWithdrawn withdrawn,
4082  EClipType clip_type)
4083  : m_AccVersion(eLatest)
4084 {
4085  x_Select(wgs_db, s_ToFlags(withdrawn), clip_type, row);
4086 }
4087 
4088 
4090  TVDBRowId first_row,
4091  TVDBRowId last_row,
4092  EWithdrawn withdrawn,
4093  EClipType clip_type)
4094  : m_AccVersion(eLatest)
4095 {
4096  x_Select(wgs_db, s_ToFlags(withdrawn), clip_type, first_row, last_row);
4097 }
4098 
4099 
4101  CTempString acc,
4102  EWithdrawn withdrawn,
4103  EClipType clip_type)
4104  : m_AccVersion(eLatest)
4105 {
4106  x_Select(wgs_db, s_ToFlags(withdrawn), clip_type, acc);
4107 }
4108 
4109 
4111 {
4112  Reset();
4113 }
4114 
4115 
4116 
4118  TIncludeFlags include_flags,
4119  EClipType clip_type)
4120 {
4121  x_Init(wgs_db, include_flags, clip_type, 0);
4122  x_Settle();
4123 }
4124 
4125 
4127  TIncludeFlags include_flags,
4128  EClipType clip_type,
4129  TVDBRowId row)
4130 {
4131  CVDBMgr::CRequestContextUpdater ctx_updater;
4132  x_Init(wgs_db, include_flags, clip_type, row);
4133  SelectRow(row);
4134 }
4135 
4136 
4138  TIncludeFlags include_flags,
4139  EClipType clip_type,
4140  TVDBRowId first_row,
4141  TVDBRowId last_row)
4142 {
4143  CVDBMgr::CRequestContextUpdater ctx_updater;
4144  x_Init(wgs_db, include_flags, clip_type, first_row);
4145  if ( m_FirstBadId == 0 ) {
4146  return;
4147  }
4148  if ( first_row > m_FirstGoodId ) {
4149  m_CurrId = m_FirstGoodId = first_row;
4151  }
4152  if ( last_row < m_FirstBadId-1 ) {
4153  m_FirstBadId = last_row+1;
4154  }
4155  x_Settle();
4156 }
4157 
4158 
4160  TIncludeFlags include_flags,
4161  EClipType clip_type,
4162  CTempString acc)
4163 {
4164  CVDBMgr::CRequestContextUpdater ctx_updater;
4165  if ( TVDBRowId row = wgs_db.ParseContigRow(acc) ) {
4166  x_Init(wgs_db, include_flags, clip_type, row);
4167  SelectRow(row);
4168  }
4169  else {
4170  // bad format
4173  }
4174 }
4175 
4176 
// Binds this iterator to 'wgs_db' and sets up its row window.
//
// wgs_db         database to iterate over; a null handle leaves the iterator
//                unbound (early return).
// include_flags  stored in m_IncludeFlags.
// clip_type      eNoClip / eClipByQuality select m_ClipByQuality directly;
//                the statement of the 'default' branch (original line 4204)
//                is absent from this listing.
// get_row        row id handed to Seq0()/Seq() when the cursors are obtained.
//
// NOTE(review): original lines 4183-4184 are absent from this listing; any
// state they touch is not described here.
4177 void CWGSSeqIterator::x_Init(const CWGSDb& wgs_db,
4178  TIncludeFlags include_flags,
4179  EClipType clip_type,
4180  TVDBRowId get_row)
4181 {
4182  PROFILE(sw_SeqIterator);
4185  m_ClipByQuality = false;
4186  if ( !wgs_db ) {
        // no database: nothing else is initialized
4187  return;
4188  }
        // obtain both sequence-table cursors for the requested row
4189  m_Cur0 = wgs_db.GetNCObject().Seq0(get_row);
4190  m_Cur = wgs_db.GetNCObject().Seq(get_row);
4191  if ( !m_Cur ) {
        // m_Db is deliberately not assigned on this path, although m_Cur0 may
        // already be set
4192  return;
4193  }
4194  m_Db = wgs_db;
4195  m_IncludeFlags = include_flags;
4196  switch ( clip_type ) {
4197  case eNoClip:
4198  m_ClipByQuality = false;
4199  break;
4200  case eClipByQuality:
4201  m_ClipByQuality = true;
4202  break;
4203  default:
4205  break;
4206  }
        // The row window comes from the CONTIG_NAME column: range.first is the
        // first row and range.second is added to it, i.e. it is used as a row
        // count, so m_FirstBadId is one past the last row.
4207  TVDBRowIdRange range = m_Cur->m_CONTIG_NAME.GetRowIdRange(m_Cur->m_Cursor);
4208  m_FirstGoodId = m_CurrId = range.first;
4209  m_FirstBadId = range.first+range.second;
4210 }
4211 
4212 
4214 {
4215  if ( row < m_FirstGoodId ) {
4216  // before the first id
4218  }
4219  else {
4220  m_CurrId = row;
4221  if ( x_Excluded() ) {
4223  }
4224  }
4226  return *this;
4227 }
4228 
4229 
4231 {
4232  x_CheckValid("CWGSSeqIterator::operator++");
4233  m_AmbiguityInfo = null;
4234  ++m_CurrId;
4235  x_Settle();
4236  return *this;
4237 }
4238 
4239 
4241 {
4242  while ( *this && x_Excluded() ) {
4243  ++m_CurrId;
4245  }
4246 }
4247 
4248 
// Reports use of an iterator that is not in a valid state by throwing
// CSraException with code eInvalidState.
//
// method  text placed between "CWGSSeqIterator::" and "()" in the message.
//
// NOTE(review): the x_CheckValid() call sites in this file pass strings that
// already begin with "CWGSSeqIterator::" (e.g. "CWGSSeqIterator::GetGi").
// If x_CheckValid() forwards its argument here unchanged, the class prefix
// appears twice in the resulting message -- confirm against the
// x_CheckValid() definition, which is not part of this listing.
4249 void CWGSSeqIterator::x_ReportInvalid(const char* method) const
4250 {
4251  NCBI_THROW_FMT(CSraException, eInvalidState,
4252  "CWGSSeqIterator::"<<method<<"(): Invalid iterator state");
4253 }
4254 
4255 
// Returns true when the sequence cursor has a GI column (m_Cur->m_GI) and
// GetGi() yields something other than ZERO_GI for the current row.
// Note: m_Cur->m_GI is evaluated without a preceding x_CheckValid() call in
// this function; GetGi() is only reached when the column exists.
4256 bool CWGSSeqIterator::HasGi(void) const
4257 {
4258  return m_Cur->m_GI && GetGi() != ZERO_GI;
4259 }
4260 
4261 
4263 {
4264  x_CheckValid("CWGSSeqIterator::GetGi");
4265  if ( !m_Cur->m_GI || m_AccVersion.m_Offset != 0 ) {
4266  return ZERO_GI;
4267  }
4269  return gi.empty()? ZERO_GI: s_ToGi(*gi, "CWGSSeqIterator::GetGi()");
4270 }
4271 
4272 
4274 {
4275  x_CheckValid("CWGSSeqIterator::GetAccession");
4276  return *CVDBStringValue(m_Cur->ACCESSION(m_CurrId));
4277 }
4278 
4279 
4281 {
4282  CVDBMgr::CRequestContextUpdater ctx_updater;
4283  x_CheckValid("CWGSSeqIterator::GetLatestAccVersion");
4284  return *m_Cur->ACC_VERSION(m_CurrId);
4285 }
4286 
4287 
4289 {
4290  x_CheckValid("CWGSSeqIterator::GetAccVersionCount");
4291 #ifdef TEST_ACC_VERSION
4292  if ( GetLatestAccVersion() > 1 &&
4293  m_Cur->TRIM_START(m_CurrId).size() == 1 ) {
4294  return 2;
4295  }
4296 #endif
4297  return unsigned(m_Cur->TRIM_START(m_CurrId).size());
4298 }
4299 
4300 
4302 {
4303  CVDBMgr::CRequestContextUpdater ctx_updater;
4304  if ( version == -1 ) {
4305  // latest version
4306  return true;
4307  }
4308  int latest_version = GetLatestAccVersion();
4309  return version <= latest_version &&
4310  version > int(latest_version - GetAccVersionCount());
4311 }
4312 
4313 
4316 {
4318  if ( version != -1 ) {
4319  int latest_version = GetLatestAccVersion();
4320  int oldest_version = latest_version - GetAccVersionCount() + 1;
4321  if ( version > latest_version || version < oldest_version ) {
4322  NCBI_THROW_FMT(CSraException, eDataError,
4323  "CWGSSeqIterator: "<<
4324  GetDb().m_IdPrefixWithVersion<<"/"<<m_CurrId<<
4325  " version "<<version<<
4326  " is out of VDB version range: "<<
4327  oldest_version<<"-"<<latest_version);
4328  }
4329  ret.m_Offset = version - latest_version;
4330  }
4331  return ret;
4332 }
4333 
4334 
4336 {
4337  CVDBMgr::CRequestContextUpdater ctx_updater;
4339 }
4340 
4341 
4343 {
4344  CVDBMgr::CRequestContextUpdater ctx_updater;
4346 }
4347 
4348 
4350 {
4351  CRef<CSeq_id> id;
4352  if ( m_Cur->m_GI ) {
4353  CSeq_id::TGi gi = GetGi();
4354  if ( gi != ZERO_GI ) {
4355  id = new CSeq_id;
4356  id->SetGi(gi);
4357  }
4358  }
4359  return id;
4360 }
4361 
4362 
4364 {
4366 }
4367 
4368 
4370 {
4371  return GetGeneralOrPatentSeq_id();
4372 }
4373 
4374 
4376 {
4377  x_CheckValid("CWGSSeqIterator::GetContigName");
4378  return *m_Cur->CONTIG_NAME(m_CurrId);
4379 }
4380 
4382 {
4383  x_CheckValid("CWGSSeqIterator::HasTitle");
4384  return m_Cur->m_TITLE && !m_Cur->TITLE(m_CurrId).empty();
4385 }
4386 
4388 {
4389  x_CheckValid("CWGSSeqIterator::GetTitle");
4390  return *m_Cur->TITLE(m_CurrId);
4391 }
4392 
4394 {
4395  return value.empty()? ZERO_TAX_ID: TAX_ID_FROM(int, value[0]);
4396 }
4397 
4398 
4400 {
4401  return GetDb().HasCommonTaxId() || m_Cur0->m_TAXID;
4402 }
4403 
4404 
4406 {
4407  x_CheckValid("CWGSSeqIterator::GetTaxId");
4408  if ( GetDb().HasCommonTaxId() ) {
4409  return GetDb().GetCommonTaxId();
4410  }
4411  return s_GetTaxId(m_Cur0->TAXID(m_CurrId));
4412 }
4413 
4414 
4416 {
4417  x_CheckValid("CWGSSeqIterator::GetSeqHash");
4418  return m_Cur->m_HASH;
4419 }
4420 
4421 
4423 {
4424  return HasSeqHash()? *m_Cur->HASH(m_CurrId): 0;
4425 }
4426 
4427 
4429 {
4430  return *m_Cur->READ_LEN(m_CurrId);
4431 }
4432 
4433 
4435 {
4437 #ifdef TEST_ACC_VERSION
4438  if ( GetLatestAccVersion() > 1 && m_AccVersion.m_Offset != 0 && arr.size() == 1 ) {
4439  return *arr - 5*m_AccVersion.m_Offset;
4440  }
4441 #endif
4442  return arr[arr.size()-1+m_AccVersion.m_Offset];
4443 }
4444 
4445 
4447 {
4449 #ifdef TEST_ACC_VERSION
4450  if ( GetLatestAccVersion() > 1 && m_AccVersion.m_Offset != 0 && arr.size() == 1 ) {
4452  return len < *arr? len: 0;
4453  }
4454 #endif
4455  return arr[arr.size()-1+m_AccVersion.m_Offset];
4456 }
4457 
4458 
4460 {
4461  if ( GetClipQualityLeft() != 0 ) {
4462  return true;
4463  }
4464  if ( GetClipQualityLength() != GetRawSeqLength() ) {
4465  return true;
4466  }
4467  return false;
4468 }
4469 
4470 
4472 {
4473  return GetClipByQualityFlag(clip_type)?
4474  GetClipQualityLeft(): 0;
4475 }
4476 
4477 
4479 {
4480  return GetClipByQualityFlag(clip_type)?
4482 }
4483 
4484 
4486 {
4487  if ( flags & fIds_gi ) {
4488  // gi
4489  if ( CRef<CSeq_id> id = GetGiSeq_id() ) {
4490  return id;
4491  }
4492  }
4493 
4494  if ( flags & fIds_acc ) {
4495  // acc.ver
4496  if ( CRef<CSeq_id> id = GetAccSeq_id() ) {
4497  return id;
4498  }
4499  }
4500 
4501  if ( flags & fIds_gnl ) {
4502  // gnl
4503  if ( CRef<CSeq_id> id = GetGeneralOrPatentSeq_id() ) {
4504  return id;
4505  }
4506  }
4507 
4508  NCBI_THROW_FMT(CSraException, eDataError,
4509  "CWGSSeqIterator::GetId("<<flags<<"): "
4510  "no valid id found: "<<
4511  GetDb().m_IdPrefixWithVersion<<"/"<<m_CurrId);
4512 }
4513 
4514 
4516 {
4517  CVDBMgr::CRequestContextUpdater ctx_updater;
4518  PROFILE(sw___GetContigIds);
4519  if ( flags & fIds_acc ) {
4520  // acc.ver
4521  if ( CRef<CSeq_id> id = GetAccSeq_id() ) {
4522  ids.push_back(id);
4523  }
4524  }
4525 
4526  if ( flags & fIds_gnl ) {
4527  // gnl
4528  if ( CRef<CSeq_id> id = GetGeneralOrPatentSeq_id() ) {
4529  ids.push_back(id);
4530  }
4531  }
4532 
4533  if ( flags & fIds_gi ) {
4534  // gi
4535  if ( CRef<CSeq_id> id = GetGiSeq_id() ) {
4536  ids.push_back(id);
4537  }
4538  }
4539 }
4540 
4541 
4543 {
4544  x_CheckValid("CWGSSeqIterator::HasSeqDescrBytes");
4545  return m_Cur->m_DESCR && !m_Cur->DESCR(m_CurrId).empty();
4546 }
4547 
4548 
4550 {
4551  x_CheckValid("CWGSSeqIterator::GetSeqDescrBytes");
4552  CTempString descr_bytes;
4553  if ( m_Cur->m_DESCR ) {
4554  descr_bytes = m_Cur->DESCR(m_CurrId);
4555  }
4556  return descr_bytes;
4557 }
4558 
4559 
4561 {
4562  x_CheckValid("CWGSSeqIterator::HasNucProtDescrBytes");
4563  return m_Cur->m_NUC_PROT_DESCR && !m_Cur->NUC_PROT_DESCR(m_CurrId).empty();
4564 }
4565 
4566 
4568 {
4569  x_CheckValid("CWGSSeqIterator::GetNucProtDescrBytes");
4570  CTempString descr_bytes;
4571  if ( m_Cur->m_NUC_PROT_DESCR ) {
4572  descr_bytes = m_Cur->NUC_PROT_DESCR(m_CurrId);
4573  }
4574  return descr_bytes;
4575 }
4576 
4577 
4579 {
4580  x_CheckValid("CWGSSeqIterator::HasSeq_descr");
4581  if ( flags & fSeqDescr ) {
4582  if ( HasSeqDescrBytes() ) {
4583  return true;
4584  }
4585  }
4586  if ( flags & fNucProtDescr ) {
4587  if ( HasNucProtDescrBytes() ) {
4588  return true;
4589  }
4590  }
4591  if ( flags & fMasterDescr ) {
4592  if ( !GetDb().GetMasterDescr().empty() ) {
4593  return true;
4594  }
4595  }
4596  return false;
4597 }
4598 
4599 
4601 {
4602  x_CheckValid("CWGSSeqIterator::GetSeq_descr");
4603  CRef<CSeq_descr> ret(new CSeq_descr);
4604  if ( (flags & fSeqDescr) && m_Cur->m_DESCR ) {
4605  sx_AddDescrBytes(*ret, *m_Cur->DESCR(m_CurrId));
4606  }
4607  if ( (flags & fNucProtDescr) && m_Cur->m_NUC_PROT_DESCR ) {
4608  sx_AddDescrBytes(*ret, *m_Cur->NUC_PROT_DESCR(m_CurrId));
4609  }
4610  if ( flags & fMasterDescr ) {
4611  GetDb().AddMasterDescr(*ret, nullptr, flags);
4612  }
4613  if ( ret->Get().empty() ) {
4614  ret.Reset();
4615  }
4616  return ret;
4617 }
4618 
4619 
4621 {
4622  x_CheckValid("CWGSSeqIterator::GetLocFeatRowIdRange");
4623 
4624  if ( !m_Cur->m_FEAT_ROW_START ) {
4625  return TVDBRowIdRange(0, 0);
4626  }
4627  CVDBValueFor<TVDBRowId> start_val = m_Cur->FEAT_ROW_START(m_CurrId);
4628  if ( start_val.empty() ) {
4629  return TVDBRowIdRange(0, 0);
4630  }
4631  TVDBRowId start = *start_val;
4632  TVDBRowId end = *m_Cur->FEAT_ROW_END(m_CurrId);
4633  if ( end < start ) {
4634  NCBI_THROW_FMT(CSraException, eDataError,
4635  "CWGSSeqIterator::GetLocFeatRowIdRange: "
4636  "feature row range is invalid: "<<start<<","<<end);
4637  }
4638  return TVDBRowIdRange(start, end-start+1);
4639 }
4640 
4641 
4643 {
4644  x_CheckValid("CWGSSeqIterator::HasAnnotSet");
4645  return m_Cur->m_ANNOT && !m_Cur->ANNOT(m_CurrId).empty();
4646 }
4647 
4648 
4650 {
4651  x_CheckValid("CWGSSeqIterator::GetAnnotBytes");
4652  return *m_Cur->ANNOT(m_CurrId);
4653 }
4654 
4655 
// Collects the annotations stored with the current row into 'annot_set'.
//
// annot_set  destination container, passed on to sx_AddAnnotBytes().
// flags      nothing is done unless fSeqAnnot is set.
//
// The call is also a no-op when the cursor has no ANNOT column. Otherwise the
// row's ANNOT bytes are handed to sx_AddAnnotBytes(), which presumably decodes
// them and appends the result to annot_set -- confirm against that helper.
4656 void CWGSSeqIterator::GetAnnotSet(TAnnotSet& annot_set, TFlags flags) const
4657 {
4658  x_CheckValid("CWGSSeqIterator::GetAnnotSet");
4659  if ( (flags & fSeqAnnot) && m_Cur->m_ANNOT ) {
4660  sx_AddAnnotBytes(annot_set, *m_Cur->ANNOT(m_CurrId));
4661  }
4662 }
4663 
4664 
4666 {
4667  x_CheckValid("CWGSSeqIterator::CanHaveQualityGraph");
4668  return m_Cur->m_QUALITY;
4669 }
4670 
4671 
4673 {
4674  x_CheckValid("CWGSSeqIterator::HasQualityGraph");
4675  return m_Cur->m_QUALITY && !m_Cur->QUALITY(m_CurrId).empty();
4676 }
4677 
4678 
4679 inline
4681 {
4682  PROFILE(sw____GetContigQualSize);
4683  return m_Cur->m_Cursor.GetElementCount(m_CurrId, m_Cur->m_QUALITY, 8);
4684 }
4685 
4686 
// Fills 'quality_vec' with the QUALITY column values of the current row for
// positions [pos, end), where pos is GetSeqOffset().
// The vector is cleared and nothing is read when end <= pos.
//
// NOTE(review): the declaration of 'end' (original line 4693) is absent from
// this listing, so its source is not described here.
// NOTE(review): the name passed to x_CheckValid() is "GetQualityArray" while
// the method is GetQualityVec -- looks like a leftover from a rename; confirm.
4687 void
4688 CWGSSeqIterator::GetQualityVec(vector<INSDC_quality_phred>& quality_vec) const
4689 {
4690  x_CheckValid("CWGSSeqIterator::GetQualityArray");
4691 
4692  TSeqPos pos = GetSeqOffset();
4694  if ( end <= pos ) {
4695  quality_vec.clear();
4696  return;
4697  }
4698  TSeqPos size = end-pos;
        // capacity is rounded up to a multiple of 8 before the vector is sized
        // to exactly 'size' elements; the reason for the rounding is not
        // visible in this function
4699  quality_vec.reserve((size+7)/8*8);
4700  quality_vec.resize(size);
        // reads 'size' elements starting at 'pos' straight into the vector
        // storage; the literal 8 is presumably the element width in bits --
        // TODO confirm against ReadElements()
4701  m_Cur->m_Cursor.ReadElements(m_CurrId, m_Cur->m_QUALITY, 8, pos, size,
4702  quality_vec.data());
4703 }
4704 
4705 
4707 {
4708  return "Phrap Graph";
4709 }
4710 
4711 
// Computes the smallest and largest byte in arr[0 .. size).
//
// arr    input bytes.
// size   number of bytes to scan.
// min_v  receives the minimum.
// max_v  receives the maximum.
//
// The main loop consumes four bytes per iteration and keeps a separate
// min/max pair for each position within the group, so the four comparisons
// do not depend on one another. The remaining size % 4 bytes are folded into
// pair 0, and the four pairs are merged at the end.
// For size == 0 the results are min_v == 0xff and max_v == 0.
4712 static inline void s_GetMinMax(const Uint1* arr, size_t size,
4713  Uint1& min_v, Uint1& max_v)
4714 {
        // start every pair at the identity values for min and max
4715  Uint1 min_v0 = 0xff, max_v0 = 0;
4716  Uint1 min_v1 = 0xff, max_v1 = 0;
4717  Uint1 min_v2 = 0xff, max_v2 = 0;
4718  Uint1 min_v3 = 0xff, max_v3 = 0;
        // groups of four bytes
4719  for ( ; size >= 4; arr += 4, size -= 4 ) {
4720  Uint1 v0 = arr[0];
4721  Uint1 v1 = arr[1];
4722  Uint1 v2 = arr[2];
4723  Uint1 v3 = arr[3];
4724  if ( v0 < min_v0 ) min_v0 = v0;
4725  if ( v1 < min_v1 ) min_v1 = v1;
4726  if ( v2 < min_v2 ) min_v2 = v2;
4727  if ( v3 < min_v3 ) min_v3 = v3;
4728  if ( v0 > max_v0 ) max_v0 = v0;
4729  if ( v1 > max_v1 ) max_v1 = v1;
4730  if ( v2 > max_v2 ) max_v2 = v2;
4731  if ( v3 > max_v3 ) max_v3 = v3;
4732  }
        // leftover 0..3 bytes go into pair 0
4733  for ( ; size > 0; arr += 1, size -= 1 ) {
4734  Uint1 v0 = arr[0];
4735  if ( v0 < min_v0 ) min_v0 = v0;
4736  if ( v0 > max_v0 ) max_v0 = v0;
4737  }
        // merge the pairs: (0,2) and (1,3) first, then the two results
4738  min_v0 = min(min_v0, min_v2);
4739  max_v0 = max(max_v0, max_v2);
4740  min_v1 = min(min_v1, min_v3);
4741  max_v1 = max(max_v1, max_v3);
4742  min_v = min(min_v0, min_v1);
4743  max_v = max(max_v0, max_v1);
4744 }
4745 
4746 
4748  TFlags flags) const
4749 {
4751  info.x_SetId(*this);
4752  x_GetQualityAnnot(annot_set, info);
4753 }
4754 
4755 
4758  TSeqPos pos,
4759  TSeqPos len) const
4760 {
4761  x_CheckValid("CWGSSeqIterator::GetQualityAnnot");
4762  if ( !(info.flags & fQualityGraph) || !m_Cur->m_QUALITY ) {
4763  return;
4764  }
4765 
4766  PROFILE(sw___GetContigQual);
4767  TSeqPos end = len == kInvalidSeqPos? kInvalidSeqPos: pos + len;
4768  pos = max(pos, GetSeqOffset());
4769  end = min(end, x_GetQualityArraySize());
4770  if ( end <= pos ) {
4771  return;
4772  }
4773  TSeqPos size = end-pos;
4774  CByte_graph::TValues values;
4775  {
4776  PROFILE(sw____GetContigQualData);
4777  values.reserve((size+7)/8*8);
4778  values.resize(size);
4779  m_Cur->m_Cursor.ReadElements(m_CurrId, m_Cur->m_QUALITY, 8, pos, size,
4780  values.data());
4781  }
4782 
4783  Uint1 min_q = 0, max_q = 0;
4784  {
4785  PROFILE(sw____GetContigQualMinMax);
4786  s_GetMinMax((const Uint1*)values.data(), values.size(), min_q, max_q);
4787  }
4788  if ( max_q == 0 ) {
4789  return;
4790  }
4791 
4792  CRef<CSeq_annot> annot(new CSeq_annot);
4793  CRef<CAnnotdesc> name(new CAnnotdesc);
4794  name->SetName(GetQualityAnnotName());
4795  annot->SetDesc().Set().push_back(name);
4796  CRef<CSeq_graph> graph(new CSeq_graph);
4797  graph->SetTitle("Phrap Quality");
4798  CSeq_interval& loc = graph->SetLoc().SetInt();
4799  loc.SetId(*info.main_id);
4800  loc.SetFrom(pos);
4801  loc.SetTo(end-1);
4802  graph->SetNumval(TSeqPos(size));
4803  CByte_graph& bytes = graph->SetGraph().SetByte();
4804  bytes.SetValues().swap(values);
4805  bytes.SetAxis(0);
4806  bytes.SetMin(min_q);
4807  bytes.SetMax(max_q);
4808  annot->SetData().SetGraph().push_back(graph);
4809  annot_set.push_back(annot);
4810 }
4811 
4812 
4814 {
4816 }
4817 
4818 
4820 {
4821  x_CheckValid("CWGSSeqIterator::GetGBState");
4822 
4823  NCBI_gb_state state = 0;
4824  if ( type & eGBStateRaw ) {
4825  state = GetRawGBState();
4826  }
4827  if ( !state && (type & eGBStateProject) ) {
4828  state = m_Db->GetProjectGBState();
4829  }
4830  return state;
4831 }
4832 
4833 
4835 {
4836  x_CheckValid("CWGSSeqIterator::GetRawGBState");
4837 
4838  if ( m_AccVersion.m_Offset != 0 ) {
4839  // not the last version of sequence
4841  }
4842  CVDBMgr::CRequestContextUpdater ctx_updater;
4843  NCBI_gb_state state = 0;
4844  if ( m_Cur->m_GB_STATE ) {
4845  state = *m_Cur->GB_STATE(m_CurrId);
4846  }
4847  return state;
4848 }
4849 
4850 
4852 {
4853  x_CheckValid("CWGSSeqIterator::HasPublicComment");
4854 
4855  if ( !m_Cur->m_PUBLIC_COMMENT ) {
4856  return false;
4857  }
4858  return !m_Cur->PUBLIC_COMMENT(m_CurrId).empty();
4859 }
4860 
4861 
4863 {
4864  x_CheckValid("CWGSSeqIterator::GetPublicComment");
4865 
4866  if ( !m_Cur->m_PUBLIC_COMMENT ) {
4867  return string();
4868  }
4869  return *m_Cur->PUBLIC_COMMENT(m_CurrId);
4870 }
4871 
4872 
4874 {
4875  x_CheckValid("CWGSSeqIterator::IsCircular");
4876 
4877  return m_Cur->m_CIRCULAR && *m_Cur->CIRCULAR(m_CurrId);
4878 }
4879 
4880 
4882 {
4883  return m_Cur->m_GAP_START;
4884 }
4885 
4886 
4889  CWGSDb_Impl& db,
4891  TVDBRowId row_id)
4892  : m_Db(&db),
4893  m_Seq(seq),
4895  {
4896  if ( !m_AmbiguityInfo ) {
4897  m_AmbiguityInfo = db.GetAmbiguityInfo(row_id);
4898  if ( !m_AmbiguityInfo ) {
4900  }
4902  }
4903  }
4905  {
4906  if ( m_AmbiguityInfo ) {
4908  }
4909  }
4910 
4912  {
4913  return m_AmbiguityInfo.GetNCPointer();
4914  }
4915 
4916  vector<Uint1> GetAmbiguityBytes() const
4917  {
4919  }
4920 
4922  {
4923  return operator->()->Get2naLengthExact(pos, len,
4925  }
4927  TSeqPos stop_2na_len, TSeqPos stop_gap_len) const
4928  {
4929  return operator->()->Get4naLengthExact(pos, len, stop_2na_len, stop_gap_len,
4931  }
4933  {
4934  return operator->()->GetGapLengthExact(pos, len,
4936  }
4937 
4939  {
4940  return operator->()->Get2na(pos, len,
4941  m_Seq.GetNCObject());
4942  }
4944  {
4945  return operator->()->Get4na(pos, len,
4947  }
4948 
4949 private:
4951  void operator=(const SAmbiguityAccess&) = delete;
4952 
4956 };
4957 
4958 
4960 {
4962 }
4963 
4964 
4966 {
4967  x_CheckValid("CWGSSeqIterator::GetGapInfo");
4968 
4969  if ( HasGapInfo() ) {
4970  gap_info = GetAmbiguity()->GetGapInfo();
4971  }
4972  else {
4973  gap_info = TWGSContigGapInfo();
4974  }
4975 }
4976 
4977 static
4979 {
4981  evidence->SetType(type);
4982  gap.SetLinkage_evidence().push_back(evidence);
4983 }
4984 
4985 
4986 static
4989  NCBI_WGS_gap_linkage gap_linkage)
4990 {
4992  static const int kLenTypeMask =
4995  static const int kGapTypeMask =
5004  _ASSERT(props < 0);
5005  int len_type = -(-props & kLenTypeMask);
5006  int gap_type = -(-props & kGapTypeMask);
5007  literal->SetLength(len);
5008  if ( len_type == NCBI_WGS_gap_unknown ) {
5009  literal->SetFuzz().SetLim(CInt_fuzz::eLim_unk);
5010  }
5011  if ( gap_type || gap_linkage ) {
5012  CSeq_gap& gap = literal->SetSeq_data().SetGap();
5013  switch ( gap_type ) {
5014  case 0:
5016  break;
5017  case NCBI_WGS_gap_scaffold:
5019  break;
5020  case NCBI_WGS_gap_contig:
5022  break;
5025  break;
5028  break;
5031  break;
5032  case NCBI_WGS_gap_telomere:
5034  break;
5035  case NCBI_WGS_gap_repeat:
5037  break;
5040  break;
5041  default:
5042  break;
5043  }
5044  // linkage-evidence bits should be in order of ASN.1 specification
5045  if ( gap_linkage & NCBI_WGS_gap_linkage_linked ) {
5046  gap.SetLinkage(gap.eLinkage_linked);
5047  }
5050  for ( ; bit && bit <= gap_linkage; bit<<=1, ++type ) {
5051  if ( gap_linkage & bit ) {
5052  sx_AddEvidence(gap, type);
5053  }
5054  }
5055  }
5056  return literal;
5057 }
5058 
5059 
5061 {
5062  return GetAmbiguity().GetAmbiguityBytes();
5063 }
5064 
5065 
5067 {
5068  return GetAmbiguity().Get2na(pos, len);
5069 }
5070 
5071 
5073 {
5074  return GetAmbiguity().Get4na(pos, len);
5075 }
5076 
5077 
5078 /////////////////////////////////////////////////////////////////////////////
5079 // delta control constants
5080 
5081 // kMin2naSize is the minimal size of 2na segment that will
5082 // save memory if inserted in between 4na segments.
5083 // It's determined by the formula MinLen = 8*MemoryOverheadOfSegment.
5084 // The memory overhead of a segment in total is
5085 // (assuming allocation overhead equal to 2 pointers):
5086 // 18*sizeof(void*)+7*sizeof(int)
5087 // (+sizeof(int) on some 64-bit platforms due to alignment).
5088 // So one segment memory overhead is 100 bytes on 32-bit platform,
5089 // and 176 bytes on 64-bit platform.
5090 // This leads to threshold size of 800 bases on 32-bit platforms and
5091 // 1408 bases on most 64-bit platforms.
5092 // We'll use slightly bigger threshold to take into account
5093 // possible CPU overhead for 2na operations.
5094 // static const TSeqPos kMin2naSize = 2048;
5095 // Actually use kAmbiguityBlockSize (=1024): it's optimal enough
5096 // and allows the precomputed ambiguity info to be used directly.
5098 
5099 // size of chunks if the segment is split
// Both chunk sizes amount to the same 32KiB of packed data: 4na packs two
// bases into a byte, 2na packs four, hence twice as many bases per 2na chunk.
5100 static const TSeqPos kChunk4naSize = 1<<16; // 64Ki bases or 32KiB
5101 static const TSeqPos kChunk2naSize = 1<<17; // 128Ki bases or 32KiB
5102 
5103 // min size of segment to split
5105 static const TSeqPos kSplit2naSize = kChunk2naSize; //+kChunk2naSize/4;
5106 
5107 // end of delta control constants
5108 /////////////////////////////////////////////////////////////////////////////
5109 
5112  eDelta_split
5113 };
5114 
5115 
5116 inline
5118  TSeqPos pos, TSeqPos len,
5119  const TWGSContigGapInfo& gap_info) const
5120 {
5121  SSegment seg;
5122  seg.range.SetFrom(pos);
5123  seg.range.SetLength(len);
5124  seg.is_gap = true;
5125  NCBI_WGS_component_props props = *gap_info.gaps_props;
5126  NCBI_WGS_gap_linkage linkage = 0;
5127  if ( gap_info.gaps_linkage ) {
5128  linkage = *gap_info.gaps_linkage;
5129  }
5130  seg.literal = sx_MakeGapLiteral(len, props, linkage);
5131  segments.push_back(seg);
5132 }
5133 
5134 
5135 inline
5138 {
5139  range.SetToOpen(min(range.GetToOpen(), GetSeqLength()));
5140  return range;
5141 }
5142 
5143 
5144 // add raw data as delta segments with explicit gap info
5147  TWGSContigGapInfo gap_info,
5148  TInstSegmentFlags flags) const
5149 {
5151  TSeqPos raw_offset = GetSeqOffset();
5152  TSeqPos pos = range.GetFrom() + raw_offset;
5153  TSeqPos len = range.GetLength();
5154 
5155  gap_info.SetPos(pos);
5156  auto ambiguity = GetAmbiguity();
5157 
5158  for ( ; len > 0; ) {
5159  if ( gap_info.IsInGap(pos) ) {
5160  // add gap
5161  TSeqPos gap_len = gap_info.GetGapLength(pos, len);
5162  _ASSERT(gap_len <= len);
5163  if ( flags & fInst_MakeGaps) {
5164  x_AddGap(segments, pos - raw_offset, gap_len, gap_info);
5165  }
5166  ++gap_info;
5167  len -= gap_len;
5168  pos += gap_len;
5169  _ASSERT(!gap_info || pos <= gap_info.GetFrom());
5170  continue;
5171  }
5172 
5173  // data segment
5174  TSeqPos rem_len = gap_info.GetDataLength(pos, len);
5175  _ASSERT(rem_len <= len);
5176 
5177  if ( flags & fInst_Split ) {
5178  // break data at the next chunk boundary
5179  TSeqPos chunk_start =
5180  (pos-raw_offset)/kDataChunkSize*kDataChunkSize;
5181  TSeqPos chunk_end = chunk_start + kDataChunkSize;
5182  rem_len = min(rem_len, chunk_end - pos);
5183  }
5184 
5185  bool is_2na;
5186  TSeqPos seg_len;
5187  if ( flags & fInst_Minimal ) {
5188  // whole region is either 2na or 4na
5189  seg_len = ambiguity->Get2naLengthBlock(pos, rem_len);
5190  if ( seg_len == rem_len ) {
5191  // 2na
5192  is_2na = true;
5193  }
5194  else {
5195  // 4na
5196  seg_len = rem_len;
5197  is_2na = false;
5198  }
5199  }
5200  else {
5201  // determine optimal sequence piece for regular delta
5202  seg_len = ambiguity->Get2naLengthBlock(pos, min(rem_len, kSplit2naSize));
5203  if ( seg_len >= kMin2naSize || seg_len == rem_len ) {
5204  if ( seg_len > kSplit2naSize ) {
5205  seg_len = kChunk2naSize;
5206  }
5207  // 2na
5208  is_2na = true;
5209  }
5210  else {
5211  _ASSERT(seg_len < kSplit4naSize && seg_len < rem_len);
5212  TSeqPos seg_len_2na = seg_len;
5213  seg_len += ambiguity->Get4naLengthBlock(pos+seg_len,
5214  min(rem_len, kSplit4naSize)-seg_len);
5215  if ( seg_len == seg_len_2na ) {
5216  // no 4na added, so encode 2na, even if it's small
5217  _ASSERT(seg_len > 0);
5218  is_2na = true;
5219  }
5220  else {
5221  // 4na
5222  // limit too long 4na segments
5223  if ( seg_len >= kSplit4naSize ) {
5224  seg_len = kChunk4naSize;
5225  }
5226  is_2na = false;
5227  }
5228  }
5229  }
5230 
5231  SSegment seg;
5232  seg.is_gap = false;
5233  seg.range.SetFrom(pos - raw_offset);
5234  seg.range.SetLength(seg_len);
5235  if ( flags & fInst_MakeData ) {
5236  // actually generate Seq-data
5237  seg.literal = new CSeq_literal;
5238  seg.literal->SetLength(seg_len);
5239  if ( is_2na ) {
5240  // 2na
5241  seg.literal->SetSeq_data(*ambiguity.Get2na(pos, seg_len));
5242  _ASSERT(seg.literal->GetSeq_data().GetNcbi2na().Get().size() == (seg_len+3)/4);
5243  }
5244  else {
5245  seg.literal->SetSeq_data(*ambiguity.Get4na(pos, seg_len));
5246  _ASSERT(seg.literal->GetSeq_data().GetNcbi4na().Get().size() == (seg_len+1)/2);
5247  }
5248  }
5249  segments.push_back(seg);
5250  pos += seg_len;
5251  len -= seg_len;
5252  }
5253 }
5254 
5255 
5256 // add raw data as delta segments with gap recovering
5258  COpenRange<TSeqPos> range) const
5259 {
5261  TSeqPos raw_offset = GetSeqOffset();
5262  TSeqPos pos = range.GetFrom() + raw_offset;
5263  TSeqPos len = range.GetLength();
5264 
5265  // max size of gap segment
5266  const TSeqPos kMinGapSize = 20;
5267  // size of gap segment if its actual size is unknown
5268  const TSeqPos kUnknownGapSize = 100;
5269  SAmbiguityAccess ambiguity = GetAmbiguity();
5270 
5271  for ( ; len > 0; ) {
5272  SSegment seg;
5273  seg.range.SetFrom(pos - raw_offset);
5274  seg.is_gap = false;
5275  seg.literal = new CSeq_literal;
5276 
5277  TSeqPos rem_len = len;
5278  TSeqPos seg_len = ambiguity.Get2naLengthExact(pos, min(rem_len, kSplit2naSize));
5279  if ( seg_len >= kMin2naSize || seg_len == len ) {
5280  if ( seg_len > kSplit2naSize ) {
5281  seg_len = kChunk2naSize;
5282  }
5283  seg.literal->SetSeq_data(*ambiguity.Get2na(pos, seg_len));
5284  _ASSERT(seg.literal->GetSeq_data().GetNcbi2na().Get().size() == (seg_len+3)/4);
5285  }
5286  else {
5287  TSeqPos seg_len_2na = seg_len;
5288  seg_len += ambiguity.Get4naLengthExact(pos+seg_len,
5289  min(rem_len, kSplit4naSize)-seg_len,
5290  kMin2naSize, kMinGapSize);
5291  if ( kRecoverGaps && seg_len == 0 ) {
5292  seg_len = ambiguity.GetGapLengthExact(pos, rem_len);
5293  _ASSERT(seg_len > 0);
5294  seg.is_gap = true;
5295  if ( seg_len == kUnknownGapSize ) {
5296  seg.literal->SetFuzz().SetLim(CInt_fuzz::eLim_unk);
5297  }
5298  }
5299  else if ( seg_len == seg_len_2na ) {
5300  seg.literal->SetSeq_data(*ambiguity.Get2na(pos, seg_len));
5301  _ASSERT(seg.literal->GetSeq_data().GetNcbi2na().Get().size() == (seg_len+3)/4);
5302  }
5303  else {
5304  if ( seg_len >= kSplit4naSize ) {
5305  seg_len = kChunk4naSize;
5306  }
5307  seg.literal->SetSeq_data(*ambiguity.Get4na(pos, seg_len));
5308  _ASSERT(seg.literal->GetSeq_data().GetNcbi4na().Get().size() == (seg_len+1)/2);
5309  }
5310  }
5311 
5312  seg.range.SetLength(seg_len);
5313  seg.literal->SetLength(seg_len);
5314  segments.push_back(seg);
5315  pos += seg_len;
5316  len -= seg_len;
5317  }
5318 }
5319 
5320 
5322  const TSegments& segments) const
5323 {
5325  TSeqPos pos = 0;
5326  CDelta_ext::Tdata& delta = inst.SetExt().SetDelta().Set();
5327  ITERATE ( TSegments, it, segments ) {
5328  CRef<CDelta_seq> seq(new CDelta_seq);
5329  _ASSERT(it->range.GetFrom() == pos);
5330  if ( it->literal ) {
5331  _ASSERT(it->range.GetLength() == it->literal->GetLength());
5332  seq->SetLiteral(it->literal.GetNCObject());
5333  }
5334  else {
5335  seq->SetLiteral().SetLength(it->range.GetLength());
5336  }
5337  delta.push_back(seq);
5338  pos += it->range.GetLength();
5339  }
5340  _VERIFY(pos == inst.GetLength());
5341 }
5342 
5343 
5345  const TSegments& segments) const
5346 {
5347  if ( segments.size() == 1 && !segments[0].is_gap ) {
5348  // plain single data segment, delta is not necessary
5349  _ASSERT(segments[0].literal);
5350  _ASSERT(!segments[0].literal->IsSetFuzz());
5351  _ASSERT(segments[0].literal->IsSetSeq_data());
5353  inst.SetSeq_data(segments[0].literal.GetNCObject().SetSeq_data());
5354  inst.ResetStrand();
5355  inst.ResetExt();
5356  }
5357  else {
5358  x_SetDelta(inst, segments);
5359  }
5360 }
5361 
5362 
5364 {
      // Builds and returns the Seq-inst for the current contig row.
      // `info` (creation context) supplies the flags, the split settings and,
      // when splitting, the split-info object that receives data chunk entries.
      // NOTE(review): the signature line is not present in this listing.
5365  CVDBMgr::CRequestContextUpdater ctx_updater;
5366  PROFILE(sw___GetContigInst);
5367  x_CheckValid("CWGSSeqIterator::GetSeq_inst");
5368  auto ambiguity = GetAmbiguity();
5369  CRef<CSeq_inst> inst(new CSeq_inst);
5370  inst->SetMol(GetDb().GetContigMolType());
5371  if ( IsCircular() ) {
5373  }
5374  TSeqPos length = GetSeqLength();
5375  inst->SetLength(length);
5376  if ( length == 0 ) {
      // empty sequence: nothing more to fill in
5378  return inst;
5379  }
5380  COpenRange<TSeqPos> whole(0, length);
5381  if ( (info.flags & fMaskInst) == fInst_ncbi4na ) {
      // caller asked for 4na: take the data for the whole sequence from `ambiguity`
5383  inst->SetSeq_data(*ambiguity.Get4na(GetSeqOffset(), length));
5384  }
5385  else if ( HasGapInfo() ) {
      // explicit gap information is available: build segments around the gaps
5386  CRef<CSeq_id> id = GetAccSeq_id();
5387  TWGSContigGapInfo gap_info;
5388  GetGapInfo(gap_info);
5389  if ( !info.split_data ) {
      // not split: segments carry both gaps and sequence data
5390  TSegments segments;
5391  TInstSegmentFlags inst_flags = fInst_MakeGaps|fInst_MakeData;
5392  x_GetSegmentsWithExplicitGaps(segments, whole, gap_info, inst_flags);
5393  x_SetDeltaOrData(*inst, segments);
5394  }
5395  else {
5396  // split
5397  TSegments segments;
5398  TInstSegmentFlags inst_flags = fInst_MakeGaps|fInst_Split;
5399  x_GetSegmentsWithExplicitGaps(segments, whole, gap_info, inst_flags);
5400  x_SetDelta(*inst, segments);
5401 
      // Register every non-gap segment in the split info. The chunk id is
      // derived from the segment start position, and a new chunk entry is
      // created only when that id differs from the current chunk's id.
5402  CRef<CID2S_Chunk_Info> chunk;
5403  ITERATE ( TSegments, it, segments ) {
5404  if ( it->is_gap ) {
5405  continue;
5406  }
5407  TSeqPos pos = it->range.GetFrom();
5408  TSeqPos end = it->range.GetToOpen();
5409  int chunk_id = pos/kDataChunkSize*kChunkIdStep + eChunk_data;
5410  if ( !chunk || chunk->GetId() != chunk_id ) {
5411  chunk = new CID2S_Chunk_Info;
5412  chunk->SetId().Set(chunk_id);
5413  info.split->SetChunks().push_back(chunk);
5414  }
5416  chunk->SetContent().push_back(content);
5417  sx_SetSplitInterval(content->SetSeq_data(), *info.main_id,
5418  pos, end);
5419  }
5420  }
5421  }
5422  else {
      // no explicit gap info: segments are produced by a call whose line is
      // not visible in this listing
5423  TSegments segments;
5425  x_SetDeltaOrData(*inst, segments);
5426  }
5427  return inst;
5428 }
5429 
5430 
5432 {
      // Thin wrapper: returns whatever x_GetSeq_inst() builds for `info`.
      // NOTE(review): the line that declares `info` is missing from this listing.
5434  return x_GetSeq_inst(info);
5435 }
5436 
5437 
5439 {
      // Stores the `flags` argument in the member of the same name.
      // NOTE(review): one further statement of this body is missing from the listing.
5440  this->flags = flags;
5442 }
5443 
5444 
5446 {
      // Selects `flags` for the given split version and remembers the version.
      // Both visible branches assign fDefaultFlags (their conditions are not
      // shown in this listing); every other version throws
      // CSraException(eInvalidArg).
5448  flags = fDefaultFlags;
5449  }
5451  flags = fDefaultFlags;
5452  }
5453  else {
5454  NCBI_THROW_FMT(CSraException, eInvalidArg,
5455  "SWGSCreateInfo::SetSplitVersion("<<split_version<<"): "
5456  "unknown split version");
5457  }
      // reached only for a recognised version
5458  this->split_version = split_version;
5459 }
5460 
5461 
5462 inline
5464 {
      // Attaches serialized descriptor `bytes` to the main sequence.
      // Empty input is ignored. When `data` is set the bytes are handed to
      // data->AddDescr(); otherwise they go through sx_AddDescrBytes() into
      // main_seq's descriptor set.
5465  if ( bytes.empty() ) {
5466  return;
5467  }
5468  if ( data ) {
5469  data->AddDescr(*main_seq, bytes);
5470  }
5471  else {
5472  sx_AddDescrBytes(main_seq->SetDescr(), bytes);
5473  }
5474 }
5475 
5476 
5477 inline
5480 {
      // Appends the feature at iterator `it` to feature table `dst`.
      // When `data` is set, the feature's serialized bytes are passed to
      // data->AddFeature(); otherwise the Seq-feat object is pushed to `dst`.
5481  if ( data ) {
5482  data->AddFeature(dst, it.GetSeq_featBytes());
5483  }
5484  else {
5485  dst.push_back(it.GetSeq_feat());
5486  //LOG_POST(MSerial_AsnText<<*dst.back());
5487  }
5488 }
5489 
5490 
5492 {
      // Returns the Bioseq-set held by `entry`, creating it on first use.
      // On creation the set receives one Seq-entry wrapping main_seq, and its
      // id is set to kMainEntryId when features or products are split.
5493  if ( !entry->IsSet() ) {
5494  CRef<CBioseq_set> seqset(new CBioseq_set);
5496  if ( split_feat || split_prod ) {
5497  seqset->SetId().SetId(kMainEntryId);
5498  }
5499  CRef<CSeq_entry> main_entry(new CSeq_entry);
5500  main_entry->SetSeq(*main_seq);
5501  seqset->SetSeq_set().push_back(main_entry);
5502  entry->SetSet(*seqset);
5503  }
5504  return entry->SetSet();
5505 }
5506 
5507 
5509  vector<TVDBRowId>& product_row_ids)
5510 {
      // Adds every feature in `range` directly to the entry (no splitting).
      // Features with a product row go into an annot on the set returned by
      // x_GetProtSet() and their product row id is appended to
      // `product_row_ids`; all other features go into an annot on main_seq.
      // Each of the two annots is created on the first matching feature.
5511  CSeq_annot::TData::TFtable* main_features = 0;
5512  CSeq_annot::TData::TFtable* product_features = 0;
5513  for ( CWGSFeatureIterator feat_it(db, range); feat_it; ++feat_it ) {
5515  if ( TVDBRowId product_row_id = feat_it.GetProductRowId() ) {
5516  // product feature
5517  product_row_ids.push_back(product_row_id);
5518  if ( !product_features ) {
5519  CRef<CSeq_annot> annot(new CSeq_annot);
5520  x_GetProtSet().SetAnnot().push_back(annot);
5521  product_features = &annot->SetData().SetFtable();
5522  }
5523  dst = product_features;
5524  }
5525  else {
5526  // plain feature
5527  if ( !main_features ) {
5528  CRef<CSeq_annot> annot(new CSeq_annot);
5529  main_seq->SetAnnot().push_back(annot);
5530  main_features = &annot->SetData().SetFtable();
5531  }
5532  dst = main_features;
5533  }
5534  x_AddFeature(feat_it, *dst);
5535  }
5536 }
5537 
5538 
5543 
      // number of features added since the last Reset()
5544  size_t feat_count;
5545 
      // Feature types and merged location ranges collected for one chunk.
5546  struct SFeatureSet {
5547  vector< COpenRange<TSeqPos> > loc_ranges;
5548  bitset<CSeqFeatData::e_MaxChoice> feat_types;
5549 
5550  void Reset() {
5551  loc_ranges.clear();
5552  feat_types.reset();
5553  }
5554 
5556  {
      // out-of-range type values are recorded as e_not_set
5557  if ( feat_type >= CSeqFeatData::e_MaxChoice ) {
5558  feat_type = CSeqFeatData::e_not_set;
5559  }
5560  feat_types.set(feat_type);
5561  }
5562 
5563  static
5565  {
      // Tries to grow `dst` so that it also covers `src`.
      // Returns false, leaving `dst` untouched, when the two ranges are more
      // than kMaxGap apart; otherwise extends `dst` as needed and returns true.
5566  static const TSeqPos kMaxGap = 100000;
5567  if ( src.GetFrom() >= dst.GetFrom() ) {
5568  // after
5569  if ( src.GetFrom() > dst.GetToOpen() + kMaxGap ) {
5570  // too far
5571  return false;
5572  }
5573  }
5574  else {
5575  // before
5576  if ( src.GetToOpen() + kMaxGap < dst.GetFrom() ) {
5577  // too far
5578  return false;
5579  }
5580  dst.SetFrom(src.GetFrom());
5581  }
5582  if ( src.GetToOpen() > dst.GetToOpen() ) {
5583  dst.SetToOpen(src.GetToOpen());
5584  }
5585  return true;
5586  }
5588  {
      // merge into the most recently added range if possible, else start a new one
5589  if ( loc_ranges.empty() || !ExpandRange(loc_ranges.back(), range) ) {
5590  loc_ranges.push_back(range);
5591  }
5592  }
5594  {
5595  AddFeatType(type);
5597  }
5598 
      // true once at least one feature type has been recorded
5599  bool HasFeatures() const
5600  {
5601  return feat_types.any();
5602  }
5603  void AddContent(CID2S_Chunk_Info& chunk, CSeq_id& feat_id);
5604  };
5605  SFeatureSet features[2]; // w/o and w/ product
5606 
5608  : main_id(&main_id),
5609  feat_id(&feat_id),
5610  seq_place(new CID2S_Bioseq_Ids::C_E)
5611  {
5613  Reset();
5614  }
5615 
      // clears the feature counter and both feature sets
5616  void Reset() {
5617  feat_count = 0;
5618  for ( auto& fs : features ) {
5619  fs.Reset();
5620  }
5621  }
5622 
5624  {
      // `with_product` selects which of the two feature sets receives the feature
5625  features[with_product].AddFeature(type, range);
5626  ++feat_count;
5627  }
5628 
5630  CWGSProteinIterator& prot_it,
5631  const vector<TVDBRowId>& product_row_ids,
5632  size_t product_index);
5633 };
5634 
5635 
5637  CSeq_id::TGi gi_range_start,
5638  CSeq_id::TGi gi_range_stop)
5639 {
      // Appends the half-open GI range [gi_range_start, gi_range_stop) to
      // `loc_set`. An empty range adds nothing, a range of exactly one GI is
      // stored as whole-gi, and anything longer as whole-gi-range (start + count).
5640  if ( gi_range_stop == gi_range_start ) {
5641  return;
5642  }
5644  if ( gi_range_stop == gi_range_start+GI_CONST(1) ) {
5645  loc->SetWhole_gi(gi_range_start);
5646  }
5647  else {
5648  CID2S_Gi_Range& gi_range = loc->SetWhole_gi_range();
5649  gi_range.SetStart(gi_range_start);
5650  gi_range.SetCount(GI_TO(CID2S_Gi_Range::TCount, gi_range_stop - gi_range_start));
5651  }
5652  loc_set.push_back(loc);
5653 }
5654 
5655 
5657  CWGSProteinIterator& prot_it,
5658  const vector<TVDBRowId>& product_row_ids,
5659  size_t product_index)
5660 {
      // Builds the chunk-info entry for the features accumulated so far,
      // resets the accumulator, and returns the entry.
      // `product_row_ids` entries from `product_index` onward are the protein
      // rows whose locations are added for the with-product feature set.
5661  // pack sorted locations once more
5662  //sort(loc_ranges.begin(), loc_ranges.end());
5664  chunk->SetId().Set(index*kChunkIdStep+eChunk_feat);
5665  CRef<CID2S_Chunk_Content> content;
5666  if ( features[0].HasFeatures() ) {
5667  // add annot place on sequence
5668  content = new CID2S_Chunk_Content;
5669  chunk->SetContent().push_back(content);
5670  content->SetSeq_annot_place().SetBioseqs().Set().push_back(seq_place);
5671 
5672  // add annot types and locations
5673  features[0].AddContent(*chunk, *feat_id);
5674  }
5675  if ( features[1].HasFeatures() ) {
5676  // add annot place on nuc-prot-set
5677  content = new CID2S_Chunk_Content;
5678  chunk->SetContent().push_back(content);
5679  content->SetSeq_annot_place().SetBioseq_sets().Set().push_back(kMainEntryId);
5680 
5681  // add annot types and locations
5682  features[1].AddContent(*chunk, *feat_id);
5683 
5684  // add annot types and locations for products
      // The location just written by AddContent() is kept and becomes the
      // first element of a loc-set, so product locations can be appended to it.
5685  CID2S_Seq_annot_Info& annot_info = chunk->SetContent().back()->SetSeq_annot();
5686  CRef<CID2S_Seq_loc> old_loc(&annot_info.SetSeq_loc());
5687  annot_info.ResetSeq_loc();
5688  auto& loc_set = annot_info.SetSeq_loc().SetLoc_set();
5689  loc_set.push_back(old_loc);
5690  EFeatLocIdType feat_loc_id_type = eFeatLocIdUninitialized;
      // Consecutive GIs are accumulated in [gi_range_start, gi_range_stop)
      // and flushed with s_AddGiRange() whenever the run is broken.
5691  CSeq_id::TGi gi_range_start = ZERO_GI, gi_range_stop = ZERO_GI;
5692  for ( auto it = product_row_ids.begin()+product_index; it != product_row_ids.end(); ++it ) {
5693  if ( !prot_it.SelectRow(*it) ) {
5694  ERR_POST_X(11, "CWGSDb::x_AddProducts: "
5695  "invalid protein row id: "<<*it);
5696  continue;
5697  }
5698  // fix feature ids
5699  // it can be accession.version and accession
5700  CSeq_id::TGi gi = prot_it.GetGi();
5701  if ( gi != ZERO_GI ) {
      // the id type is looked up lazily, on the first protein that has a GI
5702  if ( feat_loc_id_type == eFeatLocIdUninitialized ) {
5703  feat_loc_id_type = prot_it.GetDb().GetFeatLocIdType();
5704  }
5705  if ( feat_loc_id_type == eFeatLocIdGi ) {
5706  if ( gi != gi_range_stop ) {
      // not adjacent to the current run: flush it and start a new run
5707  s_AddGiRange(loc_set, gi_range_start, gi_range_stop);
5708  gi_range_start = gi;
5709  }
5710  gi_range_stop = gi+GI_CONST(1);
5711  continue;
5712  }
5713  }
      // protein is not addressed by GI: flush any pending GI run, then add a
      // whole-seq-id location
5714  s_AddGiRange(loc_set, gi_range_start, gi_range_stop);
5716  //LOG_POST("Feat info for "<<feat_id->AsFastaString());
5718  loc->SetWhole_seq_id(*feat_id);
5719  loc_set.push_back(loc);
5720  }
      // flush the last pending GI run, if any
5721  s_AddGiRange(loc_set, gi_range_start, gi_range_stop);
5722  }
5723 
5724  // add empty feat-ids to prevent loading by id
5725  content = new CID2S_Chunk_Content;
5726  chunk->SetContent().push_back(content);
5727  content->SetFeat_ids();
5728 
5729  // done
5730  Reset();
5731  return chunk;
5732 }
5733 
5734 
5736 {
      // Appends one seq-annot content entry to `chunk`, listing every feature
      // type recorded in feat_types and one interval on `feat_id` per entry
      // of loc_ranges.
5737  // add features
5738  CRef<CID2S_Chunk_Content> content;
5739  content = new CID2S_Chunk_Content;
5740  chunk.SetContent().push_back(content);
5741  CID2S_Seq_annot_Info& annot_info = content->SetSeq_annot();
5742  // types
5744  if ( feat_types[type] ) {
5746  type_info->SetType(CSeqFeatData::E_Choice(type));
5747  annot_info.SetFeat().push_back(type_info);
5748  }
5749  }
5750  // locations
5751  CID2S_Seq_id_Ints& intervals = annot_info.SetSeq_loc().SetSeq_id_ints();
5752  intervals.SetSeq_id(feat_id);
5753  //LOG_POST("Feat info for "<<feat_id.AsFastaString());
5754  for ( auto r : loc_ranges ) {
5755  CRef<CID2S_Interval> interval(new CID2S_Interval);
5756  interval->SetStart(r.GetFrom());
5757  interval->SetLength(r.GetLength());
5758  intervals.SetInts().push_back(interval);
5759  }
5760 }
5761 
5762 
5764  vector<TVDBRowId>& product_row_ids)
5765 {
      // Describes the features in `range` as split chunks instead of adding
      // them to the entry. A chunk-info entry is emitted after every
      // kFeatPerChunk features, plus one for any remainder. Product row ids
      // are collected into `product_row_ids`, and x_GetProtSet() is called if
      // any were found.
5766  // for each chunk we need to create:
5767  // ID2S-Chunk-Info.content.seq-annot-place. nuc-prot-set entry id or contig Seq-id
5768  // ID2S-Chunk-Info.content.seq-annot.feat&seq-loc
5769  int chunk_index = 0;
5771  COpenRange<TSeqPos> seq_range;
      // index of the first product row id that belongs to the chunk being collected
5772  size_t product_index = 0;
5773  CWGSProteinIterator prot_it(db);
5774  for ( CWGSFeatureIterator feat_it(db, range); feat_it; ++feat_it ) {
5775  bool with_product = false;
5776  if ( TVDBRowId product_row_id = feat_it.GetProductRowId() ) {
5777  // product feature
5778  product_row_ids.push_back(product_row_id);
5779  with_product = true;
5780  }
5781  c.AddFeature(with_product, feat_it.GetFeatType(), feat_it.GetLocRange());
5782  if ( c.feat_count == kFeatPerChunk ) {
5783  split->SetChunks().push_back(c.CreateChunkInfo(chunk_index++,
5784  prot_it,
5785  product_row_ids, product_index));
5786  product_index = product_row_ids.size();
5787  }
5788  }
      // last, partially filled chunk
5789  if ( c.feat_count ) {
5790  split->SetChunks().push_back(c.CreateChunkInfo(chunk_index,
5791  prot_it,
5792  product_row_ids, product_index));
5793  }
5794  if ( !product_row_ids.empty() ) {
5795  x_GetProtSet();
5796  }
5797 }
5798 
5799 
5800 inline
5802  vector<TVDBRowId>& product_row_ids)
5803 {
      // Dispatches on split_feat: split description vs. direct insertion.
5804  if ( split_feat ) {
5805  x_AddFeaturesSplit(range, product_row_ids);
5806  }
5807  else {
5808  x_AddFeaturesDirect(range, product_row_ids);
5809  }
5810 }
5811 
5812 
5814 {
      // Adds every feature in `range` to a single annot on main_seq; the
      // annot is created when the first feature is seen. Product rows are
      // not examined here.
5815  CSeq_annot::TData::TFtable* main_features = 0;
5816  for ( CWGSFeatureIterator feat_it(db, range); feat_it; ++feat_it ) {
5817  // plain feature
5818  if ( !main_features ) {
5819  CRef<CSeq_annot> annot(new CSeq_annot);
5820  main_seq->SetAnnot().push_back(annot);
5821  main_features = &annot->SetData().SetFtable();
5822  }
5823  x_AddFeature(feat_it, *main_features);
5824  }
5825 }
5826 
5827 
5829 {
      // Adds one quality-graph chunk-info entry to info.split for each
      // kQualChunkSize-long piece of the sequence. Every entry holds a
      // seq-annot description (annot name, graph, covered interval) and the
      // annot place on the main sequence.
5831  sx_SetSplitId(*place, *info.main_id);
5832 
5834  for ( TSeqPos k = 0, pos = 0; pos < size; ++k, pos += kQualChunkSize ) {
      // the last piece may be shorter than kQualChunkSize
5835  TSeqPos end = min(size, pos + kQualChunkSize);
5836  int chunk_id = k*kChunkIdStep + eChunk_qual;
5837 
5839  info.split->SetChunks().push_back(chunk);
5840 
5841  chunk->SetId().Set(chunk_id);
5842 
5843  CRef<CID2S_Chunk_Content> content;
5844 
5845  // content of quality annot
5846  content = new CID2S_Chunk_Content;
5847  chunk->SetContent().push_back(content);
5848  content->SetFeat_ids();
5849  CID2S_Seq_annot_Info& annot_info =
5850  content->SetSeq_annot();
5851  annot_info.SetName(GetQualityAnnotName());
5852  annot_info.SetGraph();
5853  sx_SetSplitInterval(annot_info.SetSeq_loc(), *info.main_id, pos, end);
5854 
5855  // place of quality annot
5856  content = new CID2S_Chunk_Content;
5857  chunk->SetContent().push_back(content);
5858  content->SetSeq_annot_place().SetBioseqs().Set().push_back(place);
5859  }
5860 }
5861 
5862 
5864 {
      // Fills info.main_seq for the current contig: ids, then descriptors and
      // annotations as selected by info.flags, and finally the Seq-inst.
      // If info.entry exists it is made to hold the new Bioseq.
5865  CVDBMgr::CRequestContextUpdater ctx_updater;
5866  PROFILE(sw__GetContigBioseq);
5867  _ASSERT(!info.main_seq);
5868  info.x_SetSeq(*this);
5869  if ( info.entry ) {
5870  _ASSERT(info.entry->Which() == CSeq_entry::e_not_set);
5871  info.entry->SetSeq(*info.main_seq);
5872  }
5873  GetIds(info.main_seq->SetId(), info.flags);
5874  if ( info.flags & fMaskDescr ) {
5875  PROFILE(sw___GetContigDescr);
5876  if ( (info.flags & fMaskDescr) == fSeqDescr && !(info.flags & fSeqDescrObj) ) {
5877  // only own descriptors
      // the stored DESCR bytes are passed on as-is via x_AddDescr()
5878  if ( m_Cur->m_DESCR ) {
5879  CVDBStringValue descr = m_Cur->DESCR(m_CurrId);
5880  if ( !descr.empty() ) {
5881  info.x_AddDescr(*descr);
5882  }
5883  }
5884  }
5885  else {
5886  // full descriptor collection
5887  if ( CRef<CSeq_descr> descr = GetSeq_descr(info.flags) ) {
5888  info.main_seq->SetDescr(*descr);
5889  }
5890  }
5891  }
5892  if ( info.flags & fMaskAnnot ) {
5893  PROFILE(sw___GetContigAnnot);
5894  GetAnnotSet(info.main_seq->SetAnnot(), info.flags);
5895  bool has_split_annot = false;
5896  if ( (info.flags & fQualityGraph) && CanHaveQualityGraph() ) {
5897  if ( info.split_qual ) {
5898  x_AddQualityChunkInfo(info);
5899  has_split_annot = true;
5900  }
5901  else {
5902  x_GetQualityAnnot(info.main_seq->SetAnnot(), info);
5903  }
5904  }
      // SetAnnot() above marks the field as set; undo that when nothing was
      // added and no split quality chunks were registered
5905  if ( !has_split_annot && info.main_seq->GetAnnot().empty() ) {
5906  info.main_seq->ResetAnnot();
5907  }
5908  }
5909  info.main_seq->SetInst(*x_GetSeq_inst(info));
5910 }
5911 
5912 
5913 static
5915 {
      // Returns true when `range` contains more than `count` features that
      // have a product row; stops scanning as soon as that is known.
5916  for ( CWGSFeatureIterator feat_it(db, range); feat_it; ++feat_it ) {
5917  if ( feat_it.GetProductRowId() ) {
5918  if ( count ) {
5919  --count;
5920  continue;
5921  }
5922  return true;
5923  }
5924  }
5925  return false;
5926 }
5927 
5928 
      // Creates a Bioseq for every protein row in `product_row_ids` and
      // stores it either in the current chunk's data (when `chunk` is set) or
      // in the Seq-set of the current entry. The fields flags, main_seq,
      // main_id, feat_id and entry are overwritten while proteins are built
      // and restored before returning. Invalid row ids are logged and skipped.
5929 void SWGSCreateInfo::x_AddProducts(const vector<TVDBRowId>& product_row_ids)
5930 {
5931  // add products
5932  TFlags save_flags = flags;
5933  CRef<CBioseq> save_seq = main_seq;
5934  CRef<CSeq_id> save_main_id = main_id;
5935  CRef<CSeq_id> save_feat_id = feat_id;
5936  CRef<CSeq_entry> save_entry = entry;
5937  CWGSProteinIterator prot_it(db);
      // proteins are built with the protein iterator's defaults, minus master descriptors
5938  flags = prot_it.fDefaultFlags & ~prot_it.fMasterDescr;
5940  CID2S_Chunk_Data::TBioseqs* bioseqs = 0;
5941  if ( chunk ) {
5942  CRef<CID2S_Chunk_Data> chunk_data(new CID2S_Chunk_Data);
5943  chunk->SetData().push_back(chunk_data);
5944  chunk_data->SetId().SetBioseq_set(kMainEntryId);
5945  bioseqs = &chunk_data->SetBioseqs();
5946  }
5947  else {
5948  entries = &save_entry->SetSet().SetSeq_set();
5949  }
5950  ITERATE ( vector<TVDBRowId>, it, product_row_ids ) {
5951  if ( !prot_it.SelectRow(*it) ) {
5952  ERR_POST_X(11, "CWGSDb::x_AddProducts: "
5953  "invalid protein row id: "<<*it);
5954  continue;
5955  }
      // start from a clean slate so x_CreateBioseq() fills a fresh main_seq
5956  entry = null;
5957  x_ResetSeq();
5958  prot_it.x_CreateBioseq(*this);
5959  if ( entries ) {
5960  CRef<CSeq_entry> entry(new CSeq_entry);
5961  entry->SetSeq(*main_seq);
5962  entries->push_back(entry);
5963  }
5964  else {
5965  bioseqs->push_back(main_seq);
5966  }
5967  }
      // restore the state saved at the top
5968  flags = save_flags;
5969  main_seq = save_seq;
5970  main_id = save_main_id;
5971  feat_id = save_feat_id;
5972  entry = save_entry;
5973 }
5974 
5975 
5977 {
      // Adds the features in `range` and then handles their protein products.
      // With split_prod, the products are only described: one chunk-info
      // entry per kProdPerChunk product rows, listing the protein ids to be
      // placed in the set kMainEntryId. Without it, the protein Bioseqs are
      // added right away through x_AddProducts().
5978  CVDBMgr::CRequestContextUpdater ctx_updater;
5979  _ASSERT(entry->IsSeq() && &entry->GetSeq() == main_seq);
5980  vector<TVDBRowId> product_row_ids;
5981  {
5982  PROFILE(sw__GetContigFeat);
5983  x_AddFeatures(range, product_row_ids);
5984  }
5985  if ( !product_row_ids.empty() ) {
5986  if ( split_prod ) {
5987  _ASSERT(entry && entry->IsSet());
5988  entry->SetSet().SetId().SetId(kMainEntryId);
5989  int chunk_index = 0;
5990  size_t prod_count = 0;
5991  CID2S_Bioseq_Ids::Tdata* ids = 0;
5992  CWGSProteinIterator prot_it(db);
5993  ITERATE ( vector<TVDBRowId>, it, product_row_ids ) {
      // open a new chunk at the start and whenever the current one is full
5994  if ( !ids || prod_count == kProdPerChunk ) {
5996  split->SetChunks().push_back(chunk);
5997  chunk->SetId().Set(chunk_index*kChunkIdStep + eChunk_prod);
5998  prod_count = 0;
5999  ++chunk_index;
6000 
6001  CRef<CID2S_Chunk_Content> content;
6002  content = new CID2S_Chunk_Content;
6003  chunk->SetContent().push_back(content);
6004  content->SetFeat_ids();
6005 
6006  content = new CID2S_Chunk_Content;
6007  chunk->SetContent().push_back(content);
6009  content->SetBioseq_place().push_back(place_info);
6010  place_info->SetBioseq_set(kMainEntryId);
6011  ids = &place_info->SetSeq_ids().Set();
6012  }
      // the row is counted before validation, so invalid rows still use up a
      // slot in the chunk
6013  ++prod_count;
6014  if ( !prot_it.SelectRow(*it) ) {
6015  ERR_POST_X(12, "CWGSDb::x_CreateProtSet: "
6016  "invalid protein row id: "<<*it);
6017  continue;
6018  }
6019  CBioseq::TId prot_ids;
6020  prot_it.GetIds(prot_ids, flags);
6021  sx_AddSplitIds(*ids, prot_ids);
6022  }
6023  }
6024  else {
6025  x_AddProducts(product_row_ids);
6026  }
6027  }
6028 }
6029 
6030 
      // Adds the database's master descriptors to info.entry's descriptor
      // set, passing info.main_seq and `flags` through to AddMasterDescr().
      // Does nothing when the database has no master descriptors.
6031 static
6032 void sx_AddMasterDescr(const CWGSDb& db, SWGSCreateInfo& info, SWGSDb_Defs::TFlags flags)
6033 {
6034  if ( !db->GetMasterDescr().empty() ) {
6035  db->AddMasterDescr(info.entry->SetDescr(), info.main_seq, flags);
6036  }
6037 }
6038 
6039 
6041 {
      // Builds the Seq-entry for the current contig. Without annotations
      // requested, or without a feature table, only the Bioseq is created.
      // Otherwise the Bioseq is created first, then features and products,
      // and finally nuc-prot and master descriptors are added at entry level.
6042  CVDBMgr::CRequestContextUpdater ctx_updater;
6043  PROFILE(sw_GetContigEntry);
6044  if ( !(info.flags & fSeqAnnot) || !info.db->FeatTable() ) {
6045  // plain sequence only without FEATURE table
6046  x_CreateBioseq(info);
6047  }
6048  else {
      // master and nuc-prot descriptors are withheld while the Bioseq is
      // built; they are applied to the entry below
6049  TFlags flags = info.flags;
6050  info.flags = flags & ~(fMasterDescr | fNucProtDescr);
6051  if ( flags & fMasterDescr ) {
6052  // we need main sequence descriptors deserialized
6053  info.flags |= fSeqDescrObj;
6054  }
6055  x_CreateBioseq(info);
6056  info.flags = flags;
6057  info.x_CreateProtSet(GetLocFeatRowIdRange());
6058  if ( flags & (fNucProtDescr | fMasterDescr) ) {
6059  if ( (flags & fNucProtDescr) && m_Cur->m_NUC_PROT_DESCR ) {
6060  CVDBStringValue descr = m_Cur->NUC_PROT_DESCR(m_CurrId);
6061  if ( !descr.empty() ) {
6062  sx_AddDescrBytes(info.entry->SetDescr(), descr);
6063  }
6064  }
6065  if ( flags & fMasterDescr ) {
6066  sx_AddMasterDescr(m_Db, info, flags);
6067  }
6068  }
6069  }
6070 }
6071 
6072 
6074 {
      // Decides which parts of the current contig are to be split (sequence
      // data, products, features, quality graph) and records the decisions
      // in `info`. Returns false if nothing is split; otherwise creates
      // info.entry and info.split (with the entry as skeleton) and returns true.
6075  // split data if...
6076  PROFILE(sw_InitSplit);
6077  info.split = null;
6078  info.data = null;
6079  if ( kEnableSplitData && (info.flags & fSplitSeqData) &&
6080  ((info.flags & fMaskInst) == fInst_delta) && // delta is requested
6081  HasGapInfo() && // we have explicit gap info
6082  GetSeqLength() >= kMinDataSplitSize // data is big enough
6083  ) {
6084  info.split_data = true;
6085  }
6086  if ( kEnableSplitProd && (info.flags & fSplitProducts) &&
6087  sx_HasMoreProducts(m_Db, GetLocFeatRowIdRange(), kProdPerChunk) ) {
6088  // split products if there are many enough
6089  info.split_prod = true;
6090  }
6091  if ( kEnableSplitFeat && (info.flags & fSplitFeatures) && // if split is enabled and requested
6092  GetLocFeatRowIdRange().second >= kMinFeatCountToSplit && // if there are enough features
6093  GetDb().GetFeatLocIdType() != eFeatLocIdAccNoVer ) { // if feat Seq-ids are unambiguous
6094  // split features if there are many enough
6095  info.split_feat = true;
6096  }
6097  if ( kEnableSplitQual && (info.flags & fSplitQualityGraph) &&
6098  CanHaveQualityGraph() ) {
6099  info.split_qual = true;
6100  }
6101  if ( !info.split_data && !info.split_prod && !info.split_feat &&
6102  !info.split_qual ) {
6103  return false;
6104  }
6105  info.entry = new CSeq_entry;
6106  info.split = new CID2S_Split_Info;
6107  info.split->SetSkeleton(*info.entry);
6108  info.split->SetChunks();
6109  return true;
6110 }
6111 
6112 
6114 {
      // Fills the split info simply by building the entry with `info`.
6115  x_CreateEntry(info);
6116 }
6117 
6118 
6120  unsigned index) const
6121 {
      // Appends to info.chunk a data element, placed on the main sequence id,
      // whose annots are filled by x_GetQualityAnnot().
      // NOTE(review): the continuation line of the x_GetQualityAnnot() call is
      // missing from this listing, so the range it covers is not visible here.
6122  PROFILE(sw_CreateQualityChunk);
6124  sx_SetSplitId(data->SetId(), *info.main_id);
6125  x_GetQualityAnnot(data->SetAnnots(), info,
6127  info.chunk->SetData().push_back(data);
6128 }
6129 
6130 
6132  unsigned index) const
6133 {
      // Appends to info.chunk the sequence data of data chunk number `index`,
      // i.e. positions [index*kDataChunkSize, +kDataChunkSize). Each segment
      // returned for that range becomes one seq-data piece.
6134  CVDBMgr::CRequestContextUpdater ctx_updater;
6135  PROFILE(sw_CreateDataChunk);
6137  sx_SetSplitId(data->SetId(), *info.main_id);
6139  range.SetFrom(index*kDataChunkSize);
6140  range.SetLength(kDataChunkSize);
6141 
6142  TWGSContigGapInfo gap_info;
6143  GetGapInfo(gap_info);
6144  TSegments segments;
      // only data segments are requested (no fInst_MakeGaps), as asserted below
6145  TInstSegmentFlags inst_flags = fInst_MakeData;
6146  x_GetSegmentsWithExplicitGaps(segments, range, gap_info, inst_flags);
6147  ITERATE ( TSegments, it, segments ) {
6148  _ASSERT(!it->is_gap);
6149  _ASSERT(it->literal && it->literal->IsSetSeq_data());
6151  piece->SetStart(it->range.GetFrom());
6152  piece->SetData().push_back(it->literal);
6153  data->SetSeq_data().push_back(piece);
6154  }
6155  info.chunk->SetData().push_back(data);
6156 }
6157 
6158 
6160  unsigned index) const
6161 {
      // Collects the product row ids that belong to products chunk `index`
      // (skipping the first index*kProdPerChunk product features and taking
      // at most kProdPerChunk) and adds those proteins via info.x_AddProducts().
6162  PROFILE(sw_CreateProductsChunk);
6163  vector<TVDBRowId> product_row_ids;
6164  TVDBRowId skip = index*kProdPerChunk;
6165  for ( CWGSFeatureIterator feat_it(m_Db, GetLocFeatRowIdRange()); feat_it; ++feat_it ) {
6166  if ( TVDBRowId row_id = feat_it.GetProductRowId() ) {
6167  if ( skip ) {
6168  --skip;
6169  continue;
6170  }
6171  product_row_ids.push_back(row_id);
6172  if ( product_row_ids.size() == kProdPerChunk ) {
6173  break;
6174  }
6175  }
6176  }
6177  info.x_AddProducts(product_row_ids);
6178 }
6179 
6180 
6182  unsigned index) const
6183 {
      // Fills info.chunk with the features of features chunk `index`: rows
      // [first + kFeatPerChunk*index, +kFeatPerChunk), clipped to the contig's
      // feature row range. Features with a product go into an annot placed on
      // bioseq-set kMainEntryId, all others into an annot placed on the main
      // sequence id; each annot is created on first use.
6184  PROFILE(sw_CreateFeaturesChunk);
6185  // select range of feature table rows
6186  auto range = GetLocFeatRowIdRange();
6187  auto feat_start = range.first + kFeatPerChunk*index;
6188  auto feat_stop = min(range.first+range.second, feat_start+kFeatPerChunk);
6189  range.first = feat_start;
      // max() keeps the count at zero when feat_start lies beyond the last row
6190  range.second = max(feat_start, feat_stop)-feat_start;
6191  // create features
6192  info.chunk->SetData();
6193  CSeq_annot::TData::TFtable* main_features = 0;
6194  CSeq_annot::TData::TFtable* product_features = 0;
6195  for ( CWGSFeatureIterator feat_it(m_Db, range); feat_it; ++feat_it ) {
6197  if ( feat_it.GetProductRowId() ) {
6198  // product feature
6199  if ( !product_features ) {
6201  info.chunk->SetData().push_back(data);
6202  data->SetId().SetBioseq_set(kMainEntryId);
6203  CRef<CSeq_annot> annot(new CSeq_annot);
6204  data->SetAnnots().push_back(annot);
6205  product_features = &annot->SetData().SetFtable();
6206  }
6207  dst = product_features;
6208  }
6209  else {
6210  // plain feature
6211  if ( !main_features ) {
6213  info.chunk->SetData().push_back(data);
6214  data->SetId().SetSeq_id(*info.main_id);
6215  CRef<CSeq_annot> annot(new CSeq_annot);
6216  data->SetAnnots().push_back(annot);
6217  main_features = &annot->SetData().SetFtable();
6218  }
6219  dst = main_features;
6220  }
6221  info.x_AddFeature(feat_it, *dst);
6222  }
6223 }
6224 
6225 
6227  TChunkId chunk_id) const
6228 {
      // Decodes `chunk_id` into a chunk type (chunk_id % kChunkIdStep) and an
      // index (chunk_id / kChunkIdStep) and dispatches to the matching chunk
      // builder. Throws CSraException(eInvalidArg) for an unknown type.
6229  CVDBMgr::CRequestContextUpdater ctx_updater;
6230  PROFILE(sw_GetChunk);
6231  info.x_SetId(*this);
6232  EChunkType type = EChunkType(chunk_id%kChunkIdStep);
6233  unsigned index = chunk_id/kChunkIdStep;
6234  if ( type == eChunk_qual ) {
6235  x_CreateQualityChunk(info, index);
6236  }
6237  else if ( type == eChunk_prod ) {
6238  x_CreateProductsChunk(info, index);
6239  }
6240  else if ( type == eChunk_feat ) {
6241  x_CreateFeaturesChunk(info, index);
6242  }
6243  else if ( type == eChunk_data ) {
6244  x_CreateDataChunk(info, index);
6245  }
6246  else {
6247  NCBI_THROW_FMT(CSraException, eInvalidArg,
6248  "CWGSSeqIterator::CreateChunk("<<chunk_id<<"): "
6249  "unsupported chunk type: "<<type);
6250  }
6251 }
6252 
6253 
6255 {
      // Validates the iterator, builds the Bioseq for the current row and
      // returns it.
      // NOTE(review): the line that declares `info` is missing from this listing.
6256  PROFILE(sw_GetBioseq);
6257  x_CheckValid("CWGSSeqIterator::GetBioseq");
6259  x_CreateBioseq(info);
6260  return info.main_seq;
6261 }
6262 
6263 
6265 {
      // Validates the iterator, builds a new Seq-entry for the current row
      // and returns it.
      // NOTE(review): the line that declares `info` is missing from this listing.
6266  PROFILE(sw_GetSeq_entry);
6267  x_CheckValid("CWGSSeqIterator::GetSeq_entry");
6269  info.entry = new CSeq_entry;
6270  x_CreateEntry(info);
6271  return info.entry;
6272 }
6273 
6274 
6276 {
      // Like the Seq-entry getter, but the entry is wrapped in a
      // CWGSAsnBinData object, which is what gets returned.
      // NOTE(review): the line that declares `info` is missing from this listing.
6277  PROFILE(sw_GetSeq_entryData);
6278  x_CheckValid("CWGSSeqIterator::GetSeq_entryData");
6280  info.entry = new CSeq_entry;
6281  info.data = new CWGSAsnBinData(*info.entry);
6282  x_CreateEntry(info);
6283  return CRef<CAsnBinData>(info.data);
6284 }
6285 
6286 
6288 {
      // Convenience form: returns only the split info, dropping the version.
6289  return GetSplitInfoAndVersion(flags).first;
6290 }
6291 
6292 
6293 pair<CRef<CID2S_Split_Info>, CWGSSeqIterator::TSplitVersion>
6295 {
      // Returns the split info for the current row together with the split
      // version held in `info`. The split info stays as x_InitSplit() left it
      // (reset to null) when x_InitSplit() reports that nothing is split.
      // NOTE(review): the line that declares `info` is missing from this listing.
6296  CVDBMgr::CRequestContextUpdater ctx_updater;
6297  PROFILE(sw_GetSplitInfo);
6298  x_CheckValid("CWGSSeqIterator::GetSplitInfo");
6300  if ( x_InitSplit(info) ) {
6301  x_CreateSplit(info);
6302  }
6303  return make_pair(info.split, info.split_version);
6304 }
6305 
6306 
6308 {
      // Convenience form: returns only the data object, dropping the version.
6309  return GetSplitInfoDataAndVersion(flags).first;
6310 }
6311 
6312 
6313 pair<CRef<CAsnBinData>, CWGSSeqIterator::TSplitVersion>
6315 {
      // Same as the split-info getter, but the split info is wrapped in a
      // CWGSAsnBinData object. info.data is set only when x_InitSplit()
      // returns true.
      // NOTE(review): the line that declares `info` is missing from this listing.
6316  CVDBMgr::CRequestContextUpdater ctx_updater;
6317  PROFILE(sw_GetSplitInfoData);
6318  x_CheckValid("CWGSSeqIterator::GetSplitInfoData");
6320  if ( x_InitSplit(info) ) {
6321  info.data = new CWGSAsnBinData(*info.split);
6322  x_CreateSplit(info);
6323  }
6324  return make_pair(CRef<CAsnBinData>(info.data), info.split_version);
6325 }
6326 
6327 
6329  TFlags flags) const
6330 {
      // Builds and returns chunk `chunk_id` for the current row.
      // NOTE(review): the line that constructs `info` is missing from this listing.
6331  x_CheckValid("CWGSSeqIterator::GetChunk");
6333  info.chunk = new CID2S_Chunk;
6334  x_CreateChunk(info, chunk_id);
6335  return info.chunk;
6336 }
6337 
6338 
6340  TSplitVersion split_version) const
6341 {
      // Builds and returns chunk `chunk_id`, with the creation context set up
      // from `split_version` (eFromSplitVersion).
6342  x_CheckValid("CWGSSeqIterator::GetChunk");
6343  SWGSCreateInfo info(m_Db, eFromSplitVersion, split_version);
6344  info.chunk = new CID2S_Chunk;
6345  x_CreateChunk(info, chunk_id);
6346  return info.chunk;
6347 }
6348 
6349 
6351  TFlags flags) const
6352 {
      // Builds chunk `chunk_id` and returns it wrapped in a CWGSAsnBinData object.
      // NOTE(review): the line that constructs `info` is missing from this listing.
6353  x_CheckValid("CWGSSeqIterator::GetChunkData");
6355  info.chunk = new CID2S_Chunk;
6356  info.data = new CWGSAsnBinData(*info.chunk);
6357  x_CreateChunk(info, chunk_id);
6358  return CRef<CAsnBinData>(info.data);
6359 }
6360 
6361 
6363  TSplitVersion split_version) const
6364 {
      // Builds chunk `chunk_id` with a context set up from `split_version`
      // and returns it wrapped in a CWGSAsnBinData object.
6365  x_CheckValid("CWGSSeqIterator::GetChunkData");
6366  SWGSCreateInfo info(m_Db, eFromSplitVersion, split_version);
6367  info.chunk = new CID2S_Chunk;
6368  info.data = new CWGSAsnBinData(*info.chunk);
6369  x_CreateChunk(info, chunk_id);
6370  return CRef<CAsnBinData>(info.data);
6371 }
6372 
6373 
6374 /////////////////////////////////////////////////////////////////////////////
6375 // CWGSScaffoldIterator
6376 /////////////////////////////////////////////////////////////////////////////
6377 
6378 
6380 {
      // Returns the iterator to the empty state: the cursor is handed back to
      // the database via Put() when a database is attached (otherwise it is
      // just released), the database reference is dropped, and all row ids
      // are zeroed.
6381  if ( m_Cur ) {
6382  if ( m_Db ) {
6383  GetDb().Put(m_Cur);
6384  }
6385  else {
6386  m_Cur.Reset();
6387  }
6388  }
6389  m_Db.Reset();
6390  m_CurrId = m_FirstGoodId = m_FirstBadId = 0;
6391 }
6392 
6393 
      // Copy construction: zero the row ids, then reuse the assignment operator.
6395  : m_CurrId(0),
6396  m_FirstGoodId(0),
6397  m_FirstBadId(0)
6398 {
6399  *this = iter;
6400 }
6401 
6402 
6405 {
      // Assignment: self-assignment is a no-op; otherwise the current state
      // is released with Reset() and the members are copied from `iter`.
      // NOTE(review): one member assignment line is missing from this listing.
6406  if ( this != &iter ) {
6407  Reset();
6408  m_Db = iter.m_Db;
6409  m_Cur = iter.m_Cur;
6410  m_CurrId = iter.m_CurrId;
6412  m_FirstBadId = iter.m_FirstBadId;
6413  }
6414  return *this;
6415 }
6416 
6417 
      // Default construction: all row ids are zero; no database or cursor is attached.
6419  : m_CurrId(0),
6420  m_FirstGoodId(0),
6421  m_FirstBadId(0)
6422 {
6423 }
6424 
6425 
6427 {
      // Attach to `wgs_db`; x_Init() sets the row ids from the table's row range.
6428  x_Init(wgs_db);
6429 }
6430 
6431 
6433  TVDBRowId row)
6434 {
      // Attach to `wgs_db`, then position on `row`.
6435  x_Init(wgs_db);
6436  SelectRow(row);
6437 }
6438 
6439 
6441  CTempString acc)
6442 {
      // Attach to `wgs_db` and position on the scaffold row parsed from
      // accession `acc`. A zero row from ParseScaffoldRow() takes the
      // "bad format" branch, whose statement is not visible in this listing.
6443  if ( TVDBRowId row = wgs_db.ParseScaffoldRow(acc) ) {
6444  x_Init(wgs_db);
6445  SelectRow(row);
6446  }
6447  else {
6448  // bad format
6450  }
6451 }
6452 
6453 
6455 {
      // Release the cursor and database reference.
6456  Reset();
6457 }
6458 
6459 
6461 {
      // Attaches the iterator to `wgs_db` and positions it on the first
      // scaffold row. Returns early, leaving the iterator unattached, when
      // `wgs_db` is null or Scf() yields no cursor.
6462  PROFILE(sw_ScafIterator);
6464  if ( !wgs_db ) {
6465  return;
6466  }
6467  m_Cur = wgs_db.GetNCObject().Scf();
6468  if ( !m_Cur ) {
6469  return;
6470  }
6471  m_Db = wgs_db;
      // the row range is taken from the SCAFFOLD_NAME column; m_FirstBadId is
      // one past the last row of that range
6472  TVDBRowIdRange range = m_Cur->m_SCAFFOLD_NAME.GetRowIdRange(m_Cur->m_Cursor);
6473  m_FirstGoodId = m_CurrId = range.first;
6474  m_FirstBadId = range.first+range.second;
6475 }
6476 
6477 
6479 {
      // Positions the iterator on `row` and returns *this. Rows below
      // m_FirstGoodId take a separate branch whose statement is not visible
      // in this listing.
6480  if ( row < m_FirstGoodId ) {
6482  }
6483  else {
6484  m_CurrId = row;
6485  }
6486  return *this;
6487 }
6488 
6489 
      // Always throws CSraException(eInvalidState); `method` is embedded in
      // the exception message.
6490 void CWGSScaffoldIterator::x_ReportInvalid(const char* method) const
6491 {
6492  NCBI_THROW_FMT(CSraException, eInvalidState,
6493  "CWGSScaffoldIterator::"<<method<<"(): "
6494  "Invalid iterator state");
6495 }
6496 
6497 
6499 {
      // Returns the ACCESSION value of the current row, or an empty string
      // when the table has no ACCESSION column.
6500  x_CheckValid("CWGSScaffoldIterator::GetAccession");
6501  if ( !m_Cur->m_ACCESSION ) {
6502  return CTempString();
6503  }
6504  CVDBMgr::CRequestContextUpdater ctx_updater;
6505  return *CVDBStringValue(m_Cur->ACCESSION(m_CurrId));
6506 }
6507 
6508 
6510 {
6511  // scaffolds always have version 1
6512  return 1;
6513 }
6514 
6515 
6517 {
      // No-argument form: same as GetGBState(eGBStateAll).
6518  return GetGBState(eGBStateAll);
6519 }
6520 
6521 
6523 {
      // Returns the GenBank state selected by the `type` bit mask: the row's
      // own state if eGBStateRaw is requested, falling back to the project
      // state when that is zero and eGBStateProject is requested.
6524  x_CheckValid("CWGSScaffoldIterator::GetGBState");
6525 
6526  NCBI_gb_state state = 0;
6527  if ( type & eGBStateRaw ) {
6528  state = GetRawGBState();
6529  }
6530  if ( !state && (type & eGBStateProject) ) {
6531  state = m_Db->GetProjectGBState();
6532  }
6533  return state;
6534 }
6535 
6536 
6538 {
      // Returns the GB_STATE value of the current row, or 0 when the table
      // has no GB_STATE column.
6539  x_CheckValid("CWGSScaffoldIterator::GetRawGBState");
6540 
6541  CVDBMgr::CRequestContextUpdater ctx_updater;
6542  NCBI_gb_state state = 0;
6543  if ( m_Cur->m_GB_STATE ) {
6544  state = *m_Cur->GB_STATE(m_CurrId);
6545  }
6546  return state;
6547 }
6548 
6549 
6551 {
      // Returns the accession Seq-id of the current scaffold. A non-empty
      // stored accession is combined with GetAccVersion(); the branch for an
      // empty accession is not visible in this listing.
6552  CVDBMgr::CRequestContextUpdater ctx_updater;
6553  CRef<CSeq_id> id;
6554  CTempString acc = GetAccession();
6555  if ( !acc.empty() ) {
6556  id = GetDb().GetAccSeq_id(acc, GetAccVersion());
6557  }
6558  else {
6560  }
6561  return id;
6562 }
6563 
6564 
6566 {
      // Returns a general Seq-id built from the scaffold name, or null when
      // the name is empty or sx_GetStringId(name) equals the current row id.
6567  CTempString name = GetScaffoldName();
6568  if ( name.empty() || sx_GetStringId(name) == m_CurrId ) {
6569  return null;
6570  }
6571  return GetDb().GetGeneralSeq_id(name);
6572 }
6573 
6574 
6576 {
      // Plain forwarder to GetGeneralOrPatentSeq_id().
6577  return GetGeneralOrPatentSeq_id();
6578 }
6579 
6580 
6582 {
      // Always returns null.
6583  return null;
6584 }
6585 
6586 
6588 {
      // Returns the first available Seq-id among those enabled in `flags`,
      // tried in the order gi, accession, general. Throws
      // CSraException(eDataError) when none is available.
6589  if ( flags & fIds_gi ) {
6590  // gi
6591  if ( CRef<CSeq_id> id = GetGiSeq_id() ) {
6592  return id;
6593  }
6594  }
6595 
6596  if ( flags & fIds_acc ) {
6597  // acc.ver
6598  if ( CRef<CSeq_id> id = GetAccSeq_id() ) {
6599  return id;
6600  }
6601  }
6602 
6603  if ( flags & fIds_gnl ) {
6604  // gnl
6605  if ( CRef<CSeq_id> id = GetGeneralOrPatentSeq_id() ) {
6606  return id;
6607  }
6608  }
6609 
6610  NCBI_THROW_FMT(CSraException, eDataError,
6611  "CWGSScaffoldIterator::GetId("<<flags<<"): "
6612  "no valid id found: "<<
6613  GetDb().m_IdPrefixWithVersion<<"/"<<m_CurrId);
6614 }
6615 
6616 
6618 {
      // Appends to `ids` every available Seq-id enabled in `flags`, in the
      // order accession, general, gi (a different order from GetId()).
6619  CVDBMgr::CRequestContextUpdater ctx_updater;
6620  PROFILE(sw___GetScaffoldIds);
6621  if ( flags & fIds_acc ) {
6622  // acc.ver
6623  if ( CRef<CSeq_id> id = GetAccSeq_id() ) {
6624  ids.push_back(id);
6625  }
6626  }
6627 
6628  if ( flags & fIds_gnl ) {
6629  // gnl
6630  if ( CRef<CSeq_id> id = GetGeneralOrPatentSeq_id() ) {
6631  ids.push_back(id);
6632  }
6633  }
6634 
6635  if ( flags & fIds_gi ) {
6636  // gi
6637  if ( CRef<CSeq_id> id = GetGiSeq_id() ) {
6638  ids.push_back(id);
6639  }
6640  }
6641 }
6642 
6643 
6645 {
      // Returns the SCAFFOLD_NAME value of the current row.
6646  x_CheckValid("CWGSScaffoldIterator::GetScaffoldName");
6647  return *CVDBStringValue(m_Cur->SCAFFOLD_NAME(m_CurrId));
6648 }
6649 
6650 
6652 {
      // True only when master descriptors are requested in `flags` and the
      // database has some.
6653  x_CheckValid("CWGSScaffoldIterator::HasSeq_descr");
6654 
6655  return (flags & fMasterDescr) && !GetDb().GetMasterDescr().empty();
6656 }
6657 
6658 
6660 {
      // Returns a descriptor set holding the master descriptors (when
      // requested in `flags`), or a null reference if the set ends up empty.
6661  x_CheckValid("CWGSScaffoldIterator::GetSeq_descr");
6662 
6663  CRef<CSeq_descr> ret(new CSeq_descr);
6664  if ( flags & fMasterDescr ) {
6665  GetDb().AddMasterDescr(*ret, nullptr, flags);
6666  }
6667  if ( ret->Get().empty() ) {
6668  ret.Reset();
6669  }
6670  return ret;
6671 }
6672 
6673 
6675 {
      // Returns the scaffold length, computed as the sum of all
      // COMPONENT_LEN values of the current row.
6676  x_CheckValid("CWGSScaffoldIterator::GetSeqLength");
6677 
6678  TSeqPos length = 0;
6679  CVDBValueFor<INSDC_coord_len> lens = m_Cur->COMPONENT_LEN(m_CurrId);
6680  for ( size_t i = 0; i < lens.size(); ++i ) {
6681  TSeqPos len = lens[i];
6682  length += len;
6683  }
6684  return length;
6685 }
6686 
6687 
6689 {
      // True when the table has a CIRCULAR column and its value for the
      // current row is non-zero.
6690  x_CheckValid("CWGSScaffoldIterator::IsCircular");
6691 
6692  return m_Cur->m_CIRCULAR && *m_Cur->CIRCULAR(m_CurrId);
6693 }
6694 
6695 
6697 {
      // Returns the feature-table row range of the current scaffold as
      // (first row, row count). The result is (0, 0) when the FEAT_ROW_START
      // column is absent or has no value for this row. Throws
      // CSraException(eDataError) when the stored end precedes the start.
6698  x_CheckValid("CWGSScaffoldIterator::GetLocFeatRowIdRange");
6699 
6700  if ( !m_Cur->m_FEAT_ROW_START ) {
6701  return TVDBRowIdRange(0, 0);
6702  }
6703  CVDBValueFor<TVDBRowId> start_val = m_Cur->FEAT_ROW_START(m_CurrId);
6704  if ( start_val.empty() ) {
6705  return TVDBRowIdRange(0, 0);
6706  }
6707  TVDBRowId start = *start_val;
6708  TVDBRowId end = *m_Cur->FEAT_ROW_END(m_CurrId);
6709  if ( end < start ) {
6710  NCBI_THROW_FMT(CSraException, eDataError,
6711  "CWGSScaffoldIterator::GetLocFeatRowIdRange: "
6712  "feature row range is invalid: "<<start<<","<<end);
6713  }
      // the stored end row is inclusive, hence the +1 in the count
6714  return TVDBRowIdRange(start, end-start+1);
6715 }
6716 
6717 
6719 {
6720  CVDBMgr::CRequestContextUpdater ctx_updater;
6721  x_CheckValid("CWGSScaffoldIterator::GetSeq_inst");
6722 
6723  CRef<CSeq_inst> inst(new CSeq_inst);
6724  TSeqPos length = 0;
6725  inst->SetMol(GetDb().GetScaffoldMolType());
6726  if ( IsCircular() ) {
6728  }
6731  int id_ind = 0;
6732  CVDBValueFor<TVDBRowId> ids = m_Cur->COMPONENT_ID(m_CurrId);
6733  CVDBValueFor<INSDC_coord_len> lens = m_Cur->COMPONENT_LEN(m_CurrId);
6734  CVDBValueFor<INSDC_coord_one> starts = m_Cur->COMPONENT_START(m_CurrId);
6735  CVDBValueFor<NCBI_WGS_component_props> propss = m_Cur->COMPONENT_PROPS(m_CurrId);
6736  const NCBI_WGS_gap_linkage* linkages = 0;
6737  if ( m_Cur->m_COMPONENT_LINKAGE ) {
6738  CVDBValueFor<NCBI_WGS_gap_linkage> linkages_val = m_Cur->COMPONENT_LINKAGE(m_CurrId);
6739  if ( !linkages_val.empty() ) {
6740  size_t gaps_count = 0;
6741  for ( size_t i = 0; i < lens.size(); ++i ) {
6742  NCBI_WGS_component_props props = propss[i];
6743  if ( props < 0 ) {
6744  // gap
6745  ++gaps_count;
6746  }
6747  }
6748  if ( linkages_val.size() != gaps_count ) {
6749  NCBI_THROW(CSraException, eDataError,
6750  "CWGSScaffoldIterator: inconsistent gap info");
6751  }
6752  linkages = linkages_val.data();
6753  }
6754  }
6755  CDelta_ext::Tdata& delta = inst->SetExt().SetDelta().Set();
6756  for ( size_t i = 0; i < lens.size(); ++i ) {
6757  TSeqPos len = lens[i];
6758  NCBI_WGS_component_props props = propss[i];
6759  CRef<CDelta_seq> seg(new CDelta_seq);
6760  if ( props < 0 ) {
6761  // gap
6763  sx_MakeGapLiteral(len, props, linkages? *linkages++: 0);
6764  seg->SetLiteral(*literal);
6765  }
6766  else {
6767  // contig
6768  TSeqPos start = starts[i];
6769  if ( start == 0 || len == 0 ) {
6770  NCBI_THROW_FMT(CSraException, eDataError,
6771  "CWGSScaffoldIterator: component is bad for "+
6772  GetAccSeq_id()->AsFastaString());
6773  }
6774  --start; // make start zero-based
6775  TVDBRowId row_id = ids[id_ind++];
6776  CSeq_interval& interval = seg->SetLoc().SetInt();
6777  interval.SetFrom(start);
6778  interval.SetTo(start+len-1);
6779  interval.SetId(*GetDb().GetContigSeq_id(row_id));
6780  if ( props & NCBI_WGS_strand_minus ) {
6781  interval.SetStrand(eNa_strand_minus);
6782  }
6783  else if ( props & NCBI_WGS_strand_plus ) {
6784  // default Seq-interval strand is plus
6785  }
6786  else {
6787  interval.SetStrand(eNa_strand_unknown);
6788  }
6789  }
6790  delta.push_back(seg);
6791  length += len;
6792  }
6793  inst->SetLength(length);
6794  return inst;
6795 }
6796 
6797 
6799 {
6800  CVDBMgr::CRequestContextUpdater ctx_updater;
6801  PROFILE(sw__GetScaffoldBioseq);
6802  _ASSERT(!info.main_seq);
6803  info.x_SetSeq(*this);
6804  if ( info.entry ) {
6805  _ASSERT(info.entry->Which() == CSeq_entry::e_not_set);
6806  info.entry->SetSeq(*info.main_seq);
6807  }
6808  GetIds(info.main_seq->SetId(), info.flags);
6809  if ( info.flags & fMaskDescr ) {
6810  PROFILE(sw___GetContigDescr);
6811  if ( (info.flags & fMaskDescr) == fSeqDescr ) {
6812  // only own descriptors
6813  /*
6814  if ( m_Cur->m_DESCR ) {
6815  CVDBStringValue descr = m_Cur->DESCR(m_CurrId);
6816  if ( !descr.empty() ) {
6817  info.x_AddDescr(*descr);
6818  }
6819  }
6820  */
6821  }
6822  else {
6823  // full descriptor collection
6824  if ( CRef<CSeq_descr> descr = GetSeq_descr(info.flags) ) {
6825  info.main_seq->SetDescr(*descr);
6826  }
6827  }
6828  }
6829  info.main_seq->SetInst(*GetSeq_inst(info.flags));
6830 }
6831 
6832 
6834 {
6835  CVDBMgr::CRequestContextUpdater ctx_updater;
6836  PROFILE(sw_GetScaffoldEntry);
6837  if ( !(info.flags & fSeqAnnot) || !info.db->FeatTable() ) {
6838  // plain sequence only without FEATURE table
6840  }
6841  else {
6842  TFlags flags = info.flags;
6843  info.flags = flags & ~fSeqAnnot & ~fMasterDescr;
6845  info.flags = flags;
6846  info.x_CreateProtSet(GetLocFeatRowIdRange());
6847  if ( flags & fMasterDescr ) {
6849  }
6850  }
6851 }
6852 
6853 
6855 {
6856  x_CheckValid("CWGSScaffoldIterator::GetBioseq");
6859  return info.main_seq;
6860 }
6861 
6862 
6864 {
6865  x_CheckValid("CWGSScaffoldIterator::GetSeq_entry");
6867  info.entry = new CSeq_entry;
6869  return info.entry;
6870 }
6871 
6872 
6873 /////////////////////////////////////////////////////////////////////////////
6874 // CWGSGiIterator
6875 /////////////////////////////////////////////////////////////////////////////
6876 
6877 
6879 {
6880  if ( m_Cur ) {
6881  if ( m_Db ) {
6883  }
6884  else {
6885  m_Cur.Reset();
6886  }
6887  }
6888  m_Db.Reset();
6890  m_CurrRowId = 0;
6891  m_CurrSeqType = eAll;
6892 }
6893 
6894 
6896  : m_CurrGi(ZERO_GI), m_FirstBadGi(ZERO_GI)
6897 {
6898 }
6899 
6900 
6902  : m_CurrGi(ZERO_GI), m_FirstBadGi(ZERO_GI)
6903 {
6904  *this = iter;
6905 }
6906 
6907 
6910 {
6911  if ( this != &iter ) {
6912  Reset();
6913  m_Db = iter.m_Db;
6914  m_Cur = iter.m_Cur;
6915  m_CurrGi = iter.m_CurrGi;
6916  m_FirstBadGi = iter.m_FirstBadGi;
6917  m_CurrRowId = iter.m_CurrRowId;
6920  }
6921  return *this;
6922 }
6923 
6924 
6926 {
6927  x_Init(wgs_db, seq_type);
6928  x_Settle();
6929 }
6930 
6931 
6933 {
6934  x_Init(wgs_db, seq_type);
6935  if ( *this ) {
6936  TGi first_gi = m_CurrGi;
6937  m_CurrGi = gi;
6938  if ( gi < first_gi || gi >= m_FirstBadGi ) {
6939  m_CurrRowId = 0;
6940  m_CurrSeqType = eAll;
6941  m_FirstBadGi = gi;
6942  }
6943  else if ( x_Excluded() ) {
6944  m_FirstBadGi = gi;
6945  }
6946  }
6947 }
6948 
6949 
6951 {
6952  Reset();
6953 }
6954 
6955 
6956 void CWGSGiIterator::x_Init(const CWGSDb& wgs_db, ESeqType seq_type)
6957 {
6958  m_Db = wgs_db;
6959  m_Cur = GetDb().GiIdx();
6960  if ( !m_Cur ) {
6961  m_Db.Reset();
6963  m_CurrGi = ZERO_GI;
6964  m_CurrRowId = 0;
6965  m_CurrSeqType = eAll;
6966  return;
6967  }
6968  m_FilterSeqType = seq_type;
6969  if ( (seq_type == eProt || !m_Cur->m_NUC_ROW_ID) &&
6970  (seq_type == eNuc || !m_Cur->m_PROT_ROW_ID) ) {
6971  // no asked type of sequences in index
6972  Reset();
6973  return;
6974  }
6976  m_FirstBadGi = GI_FROM(TIntId, range.first+range.second);
6977  m_CurrGi = GI_FROM(TIntId, range.first);
6978 }
6979 
6980 
6982 {
6983  if ( m_FilterSeqType != eProt && m_Cur->m_NUC_ROW_ID ) {
6986  if ( !value.empty() ) {
6987  m_CurrRowId = *value;
6988  if ( m_CurrRowId ) {
6989  m_CurrSeqType = eNuc;
6990  return false;
6991  }
6992  }
6993  }
6994  if ( m_FilterSeqType != eNuc && m_Cur->m_PROT_ROW_ID ) {
6997  if ( !value.empty() ) {
6998  m_CurrRowId = *value;
6999  if ( m_CurrRowId ) {
7000  m_CurrSeqType = eProt;
7001  return false;
7002  }
7003  }
7004  }
7005  m_CurrSeqType = eAll;
7006  m_CurrRowId = 0;
7007  return true;
7008 }
7009 
7010 
7012 {
7013  while ( *this && x_Excluded() ) {
7014  ++m_CurrGi;
7015  }
7016 }
7017 
7018 
7019 /////////////////////////////////////////////////////////////////////////////
7020 // CWGSProteinIterator
7021 /////////////////////////////////////////////////////////////////////////////
7022 
7023 
7025 {
7026  if ( m_Cur0 ) {
7027  if ( m_Db ) {
7028  GetDb().Put(m_Cur0);
7029  if ( m_Cur ) {
7030  GetDb().Put(m_Cur);
7031  }
7032  }
7033  else {
7034  m_Cur.Reset();
7035  }
7036  }
7037  m_Db.Reset();
7039 }
7040 
7041 
7043  : m_CurrId(0), m_FirstGoodId(0), m_FirstBadId(0)
7044 {
7045 }
7046 
7047 
7049  : m_CurrId(0), m_FirstGoodId(0), m_FirstBadId(0)
7050 {
7051  *this = iter;
7052 }
7053 
7054 
7057 {
7058  if ( this != &iter ) {
7059  Reset();
7060  m_Db = iter.m_Db;
7061  m_Cur0 = iter.m_Cur0;
7062  m_Cur = iter.m_Cur;
7063  m_CurrId = iter.m_CurrId;
7065  m_FirstBadId = iter.m_FirstBadId;
7066  }
7067  return *this;
7068 }
7069 
7070 
7072 {
7073  x_Init(wgs_db);
7074 }
7075 
7076 
7078 {
7079  x_Init(wgs_db);
7080  SelectRow(row);
7081 }
7082 
7083 
7085 {
7086  if ( TVDBRowId row = wgs_db.ParseProteinRow(acc) ) {
7087  x_Init(wgs_db);
7088  SelectRow(row);
7089  }
7090  else {
7091  // bad format
7093  }
7094 }
7095 
7096 
7098 {
7099  Reset();
7100 }
7101 
7102 
7104 {
7105  PROFILE(sw_ProtIterator);
7107  if ( !wgs_db ) {
7108  return;
7109  }
7110  m_Cur0 = wgs_db.GetNCObject().Prot0();
7111  if ( !m_Cur0 ) {
7112  return;
7113  }
7114  m_Db = wgs_db;
7116  m_FirstGoodId = m_CurrId = range.first;
7117  m_FirstBadId = range.first+range.second;
7118 }
7119 
7120 
7122 {
7123  if ( !m_Cur ) {
7124  const_cast<CWGSProteinIterator*>(this)->m_Cur = GetDb().Prot();
7125  }
7126 }
7127 
7128 
7130 {
7131  if ( row < m_FirstGoodId ) {
7133  }
7134  else {
7135  m_CurrId = row;
7136  }
7137  return *this;
7138 }
7139 
7140 
7141 void CWGSProteinIterator::x_ReportInvalid(const char* method) const
7142 {
7143  NCBI_THROW_FMT(CSraException, eInvalidState,
7144  "CWGSProteinIterator::"<<method<<"(): "
7145  "Invalid iterator state");
7146 }
7147 
7148 
7150 {
7151  return m_Cur0->m_GI && GetGi() != ZERO_GI;
7152 }
7153 
7154 
7156 {
7157  x_CheckValid("CWGSProteinIterator::GetGi");
7158  if ( m_Cur0->m_GI ) {
7160  if ( !gi.empty() ) {
7161  return s_ToGi(*gi, "CWGSProteinIterator::GetGi()");
7162  }
7163  }
7164  return ZERO_GI;
7165 }
7166 
7167 
7169 {
7170  PROFILE(sw____GetProtAcc);
7171  x_CheckValid("CWGSProteinIterator::GetAccession");
7172  if ( m_Cur0->m_GB_ACCESSION ) {
7173  return *CVDBStringValue(m_Cur0->GB_ACCESSION(m_CurrId));
7174  }
7175  else {
7176  return CTempString();
7177  }
7178 }
7179 
7180 
7182 {
7183  PROFILE(sw____GetProtAccVer);
7184  x_CheckValid("CWGSProteinIterator::GetAccVersion");
7185  return *m_Cur0->ACC_VERSION(m_CurrId);
7186 }
7187 
7188 
7190 {
7191  CVDBMgr::CRequestContextUpdater ctx_updater;
7192  PROFILE(sw____GetProtAccSeq_id);
7193  CRef<CSeq_id> id;
7194  CTempString acc = GetAccession();
7195  if ( !acc.empty() ) {
7196  id = GetDb().GetAccSeq_id(acc, GetAccVersion());
7197  }
7198  else {
7200  }
7201  return id;
7202 }
7203 
7204 
7206 {
7208 }
7209 
7210 
7212 {
7213  return GetGeneralOrPatentSeq_id();
7214 }
7215 
7216 
7218 {
7219  PROFILE(sw____GetProtGISeq_id);
7220  CRef<CSeq_id> id;
7221  CSeq_id::TGi gi = GetGi();
7222  if ( gi != ZERO_GI ) {
7223  id = new CSeq_id;
7224  id->SetGi(gi);
7225  }
7226  return id;
7227 }
7228 
7229 
7231 {
7232  if ( flags & fIds_gi ) {
7233  // gi
7234  if ( CRef<CSeq_id> id = GetGiSeq_id() ) {
7235  return id;
7236  }
7237  }
7238 
7239  if ( flags & fIds_acc ) {
7240  // acc.ver
7241  if ( CRef<CSeq_id> id = GetAccSeq_id() ) {
7242  return id;
7243  }
7244  }
7245 
7246  if ( flags & fIds_gnl ) {
7247  // gnl
7248  if ( CRef<CSeq_id> id = GetGeneralOrPatentSeq_id() ) {
7249  return id;
7250  }
7251  }
7252 
7253  NCBI_THROW_FMT(CSraException, eDataError,
7254  "CWGSProteinIterator::GetId("<<flags<<"): "
7255  "no valid id found: "<<
7256  GetDb().m_IdPrefixWithVersion<<"/"<<m_CurrId);
7257 }
7258 
7259 
7261 {
7262  CVDBMgr::CRequestContextUpdater ctx_updater;
7263  PROFILE(sw___GetProtIds);
7264  if ( flags & fIds_acc ) {
7265  // acc.ver
7266  if ( CRef<CSeq_id> id = GetAccSeq_id() ) {
7267  ids.push_back(id);
7268  }
7269  }
7270 
7271  if ( flags & fIds_gnl ) {
7272  // gnl
7273  if ( CRef<CSeq_id> id = GetGeneralOrPatentSeq_id() ) {
7274  ids.push_back(id);
7275  }
7276  }
7277 
7278  if ( flags & fIds_gi ) {
7279  // gi
7280  if ( CRef<CSeq_id> id = GetGiSeq_id() ) {
7281  ids.push_back(id);
7282  }
7283  }
7284 }
7285 
7286 
7288 {
7289  x_CheckValid("CWGSProteinIterator::GetProteinName");
7290  return *CVDBStringValue(m_Cur0->PROTEIN_NAME(m_CurrId));
7291 }
7292 
7293 
7295 {
7296  x_CheckValid("CWGSProteinIterator::GetProductName");
7297  x_Cur();
7298  return *CVDBStringValue(m_Cur->PRODUCT_NAME(m_CurrId));
7299 }
7300 
7301 
7303 {
7304  if ( GetDb().HasCommonTaxId() ) {
7305  return true;
7306  }
7307  x_Cur();
7308  return m_Cur->m_TAXID;
7309 }
7310 
7311 
7313 {
7314  x_CheckValid("CWGSProteinIterator::GetTaxId");
7315  if ( GetDb().HasCommonTaxId() ) {
7316  return GetDb().GetCommonTaxId();
7317  }
7318  x_Cur();
7319  return s_GetTaxId(m_Cur->TAXID(m_CurrId));
7320 }
7321 
7322 
7324 {
7325  x_CheckValid("CWGSProteinIterator::GetSeqHash");
7326  x_Cur();
7327  return m_Cur->m_HASH;
7328 }
7329 
7330 
7332 {
7333  return HasSeqHash()? *m_Cur->HASH(m_CurrId): 0;
7334 }
7335 
7336 
7338 {
7339  x_CheckValid("CWGSProteinIterator::GetSeqLength");
7340  x_Cur();
7341  return *m_Cur->PROTEIN_LEN(m_CurrId);
7342 }
7343 
7344 
7346 {
7347  x_CheckValid("CWGSProteinIterator::HasRefAcc");
7348  x_Cur();
7349  return m_Cur->m_REF_ACC;
7350 }
7351 
7352 
7354 {
7355  x_CheckValid("CWGSProteinIterator::GetRefAcc");
7356  x_Cur();
7357  return *CVDBStringValue(m_Cur->REF_ACC(m_CurrId));
7358 }
7359 
7360 
7362 {
7363  return GetGBState(eGBStateAll);
7364 }
7365 
7366 
7368 {
7369  x_CheckValid("CWGSProteinIterator::GetGBState");
7370 
7371  NCBI_gb_state state = 0;
7372  if ( type & eGBStateRaw ) {
7373  state = GetRawGBState();
7374  }
7375  if ( !state && (type & eGBStateProject) ) {
7376  state = m_Db->GetProjectGBState();
7377  }
7378  return state;
7379 }
7380 
7381 
7383 {
7384  x_CheckValid("CWGSProteinIterator::GetRawGBState");
7385  x_Cur();
7386  CVDBMgr::CRequestContextUpdater ctx_updater;
7387  NCBI_gb_state state = 0;
7388  if ( m_Cur->m_GB_STATE ) {
7389  state = *m_Cur->GB_STATE(m_CurrId);
7390  }
7391  return state;
7392 }
7393 
7394 
7396 {
7397  x_CheckValid("CWGSProteinIterator::HasPublicComment");
7398  x_Cur();
7399  if ( !m_Cur->m_PUBLIC_COMMENT ) {
7400  return false;
7401  }
7402  return !m_Cur->PUBLIC_COMMENT(m_CurrId).empty();
7403 }
7404 
7405 
7407 {
7408  x_CheckValid("CWGSProteinIterator::GetPublicComment");
7409  x_Cur();
7410  if ( !m_Cur->m_PUBLIC_COMMENT ) {
7411  return string();
7412  }
7413  return *m_Cur->PUBLIC_COMMENT(m_CurrId);
7414 }
7415 
7416 
7418 {
7419  x_CheckValid("CWGSProteinIterator::HasTitle");
7420  x_Cur();
7421  return m_Cur->m_TITLE && !m_Cur->TITLE(m_CurrId).empty();
7422 }
7423 
7424 
7426 {
7427  x_CheckValid("CWGSProteinIterator::GetTitle");
7428  x_Cur();
7429  if ( !m_Cur->m_TITLE ) {
7430  return CTempString();
7431  }
7432  return *CVDBStringValue(m_Cur->TITLE(m_CurrId));
7433 }
7434 
7435 
7437 {
7438  x_CheckValid("CWGSProteinIterator::GetLocFeatRowIdRange");
7439  x_Cur();
7440  if ( !m_Cur->m_FEAT_ROW_START ) {
7441  return TVDBRowIdRange(0, 0);
7442  }
7443  CVDBValueFor<TVDBRowId> start_val = m_Cur->FEAT_ROW_START(m_CurrId);
7444  if ( start_val.empty() ) {
7445  return TVDBRowIdRange(0, 0);
7446  }
7447  TVDBRowId start = *start_val;
7448  TVDBRowId end = *m_Cur->FEAT_ROW_END(m_CurrId);
7449  if ( end < start ) {
7450  NCBI_THROW_FMT(CSraException, eDataError,
7451  "CWGSProteinIterator::GetLocFeatRowIdRange: "
7452  "feature row range is invalid: "<<start<<","<<end);
7453  }
7454  return TVDBRowIdRange(start, end-start+1);
7455 }
7456 
7457 
7459 {
7460  x_CheckValid("CWGSProteinIterator::GetProductFeatCount");
7461  x_Cur();
7462  if ( !m_Cur->m_FEAT_PRODUCT_ROW_ID ) {
7463  return 0;
7464  }
7465  return m_Cur->FEAT_PRODUCT_ROW_ID(m_CurrId).size();
7466 }
7467 
7468 
7470 {
7471  x_CheckValid("CWGSProteinIterator::GetProductFeatRowId");
7472  x_Cur();
7473  if ( !m_Cur->m_FEAT_PRODUCT_ROW_ID ) {
7474  return 0;
7475  }
7476  return m_Cur->FEAT_PRODUCT_ROW_ID(m_CurrId)[index];
7477 }
7478 
7479 
7481 {
7482  x_CheckValid("CWGSProteinIterator::GetBestProductFeatRowId");
7483  x_Cur();
7484  if ( !m_Cur->m_FEAT_PRODUCT_ROW_ID ) {
7485  return 0;
7486  }
7487  CVDBValueFor<TVDBRowId> row = m_Cur->FEAT_PRODUCT_ROW_ID(m_CurrId);
7488  return row.empty()? 0: row[row.size()-1];
7489 }
7490 
7491 
7493 {
7494  x_CheckValid("CWGSProteinIterator::GetProductFeatRowId");
7495  x_Cur();
7496  if ( !m_Cur->m_FEAT_PRODUCT_ROW_ID ) {
7497  return 0;
7498  }
7499  CVDBValueFor<TVDBRowId> row = m_Cur->FEAT_PRODUCT_ROW_ID(m_CurrId);
7500  return row.empty()? 0: *row;
7501 }
7502 
7503 
7505 {
7506  x_CheckValid("CWGSProteinIterator::GetReplacedByRowId");
7507  x_Cur();
7508  if ( m_Cur->m_REPLACED_BY ) {
7509  CVDBValueFor<TVDBRowId> value = m_Cur->REPLACED_BY(m_CurrId);
7510  if ( !value.empty() ) {
7511  return *value;
7512  }
7513  }
7514 #ifdef TEST_ACC_VERSION
7515  if ( m_CurrId % 3 != 0 ) {
7516  return m_CurrId+1;
7517  }
7518 #endif
7519  return 0;
7520 }
7521 
7522 
7524 {
7525  x_CheckValid("CWGSProteinIterator::GetReplacesRowId");
7526  x_Cur();
7527  if ( m_Cur->m_REPLACES ) {
7529  if ( !value.empty() ) {
7530  return *value;
7531  }
7532  }
7533 #ifdef TEST_ACC_VERSION
7534  if ( m_CurrId % 3 != 1 ) {
7535  return m_CurrId-1;
7536  }
7537 #endif
7538  return 0;
7539 }
7540 
7541 
7543 {
7544  x_CheckValid("CWGSProteinIterator::HasSeq_descr");
7545  x_Cur();
7546  if ( flags & fSeqDescr ) {
7547  if ( m_Cur->m_DESCR && !m_Cur->DESCR(m_CurrId).empty() ) {
7548  return true;
7549  }
7550  if ( !GetTitle().empty() ) {
7551  return true;
7552  }
7553  }
7554  if ( (flags & fMasterDescr) && !GetDb().GetMasterDescr().empty() ) {
7555  return true;
7556  }
7557  return false;
7558 }
7559 
7560 
7562 {
7563  x_CheckValid("CWGSProteinIterator::GetSeq_descr");
7564  x_Cur();
7565  CRef<CSeq_descr> ret(new CSeq_descr);
7566  if ( flags & fSeqDescr ) {
7567  if ( m_Cur->m_DESCR ) {
7568  sx_AddDescrBytes(*ret, *m_Cur->DESCR(m_CurrId));
7569  }
7570  else {
7571  CTempString title = GetTitle();
7572  if ( !title.empty() ) {
7573  CRef<CSeqdesc> desc(new CSeqdesc);
7574  desc->SetTitle(title);
7575  ret->Set().push_back(desc);
7576  }
7577  }
7578  }
7579  if ( flags & fMasterDescr ) {
7580  GetDb().AddMasterDescr(*ret, nullptr, flags);
7581  }
7582  if ( ret->Get().empty() ) {
7583  ret.Reset();
7584  }
7585  return ret;
7586 }
7587 
7588 
7590 {
7591  x_CheckValid("CWGSProteinIterator::HasAnnotSet");
7592  x_Cur();
7593  return m_Cur->m_ANNOT && !m_Cur->ANNOT(m_CurrId).empty();
7594 }
7595 
7596 
7597 void CWGSProteinIterator::GetAnnotSet(TAnnotSet& annot_set, TFlags flags) const
7598 {
7599  x_CheckValid("CWGSProteinIterator::GetAnnotSet");
7600  x_Cur();
7601  if ( (flags & fSeqAnnot) && m_Cur->m_ANNOT ) {
7602  sx_AddAnnotBytes(annot_set, *m_Cur->ANNOT(m_CurrId));
7603  }
7604 }
7605 
7606 
7608 {
7609  PROFILE(sw___GetProtInst);
7610  x_CheckValid("CWGSProteinIterator::GetSeq_inst");
7611  x_Cur();
7612  CRef<CSeq_inst> inst(new CSeq_inst);
7613  TSeqPos length = GetSeqLength();
7614  inst->SetMol(GetDb().GetProteinMolType());
7615  inst->SetLength(length);
7617  inst->SetSeq_data().SetNcbieaa().Set() = *m_Cur->PROTEIN(m_CurrId);
7618  if ( 1 ) {
7619  // add history info
7620  TVDBRowId replaced_by_row = GetReplacedByRowId();
7621  TVDBRowId replaces_row = GetReplacesRowId();
7622  if ( replaced_by_row || replaces_row ) {
7623  CSeq_hist& hist = inst->SetHist();
7624  if ( replaced_by_row ) {
7625  CWGSProteinIterator it(m_Db, replaced_by_row);
7626  hist.SetReplaced_by().SetIds().push_back(it.GetId());
7627  }
7628  if ( replaces_row ) {
7629  CWGSProteinIterator it(m_Db, replaces_row);
7630  hist.SetReplaces().SetIds().push_back(it.GetId());
7631  }
7632  }
7633  }
7634  return inst;
7635 }
7636 
7637 
7639 {
7640  CVDBMgr::CRequestContextUpdater ctx_updater;
7641  PROFILE(sw__GetProtBioseq);
7642  _ASSERT(!info.main_seq);
7643  x_Cur();
7644  info.x_SetSeq(*this);
7645  if ( info.entry ) {
7646  _ASSERT(info.entry->Which() == CSeq_entry::e_not_set);
7647  info.entry->SetSeq(*info.main_seq);
7648  }
7649  GetIds(info.main_seq->SetId(), info.flags);
7650  if ( info.flags & fMaskDescr ) {
7651  PROFILE(sw___GetProtDescr);
7652  if ( (info.flags & fMaskDescr) == fSeqDescr ) {
7653  // only own descriptors
7654  if ( m_Cur->m_DESCR ) {
7655  CVDBStringValue descr = m_Cur->DESCR(m_CurrId);
7656  if ( !descr.empty() ) {
7657  info.x_AddDescr(*descr);
7658  }
7659  }
7660  }
7661  else {
7662  // full descriptor collection
7663  if ( CRef<CSeq_descr> descr = GetSeq_descr(info.flags) ) {
7664  info.main_seq->SetDescr(*descr);
7665  }
7666  }
7667  }
7668  if ( info.flags & fMaskAnnot ) {
7669  if ( !info.db->FeatTable() ) {
7670  // plain sequence only without FEATURE table
7671  PROFILE(sw___GetProtAnnot);
7672  GetAnnotSet(info.main_seq->SetAnnot());
7673  if ( info.main_seq->GetAnnot().empty() ) {
7674  info.main_seq->ResetAnnot();
7675  }
7676  }
7677  else {
7678  // generate features from FEATURE table
7679  PROFILE(sw__GetProtFeat);
7680  info.x_AddFeatures(GetLocFeatRowIdRange());
7681  }
7682  }
7683  info.main_seq->SetInst(*GetSeq_inst(info.flags));
7684 }
7685 
7686 
7688 {
7689  PROFILE(sw_GetProtEntry);
7691 }
7692 
7693 
7695 {
7696  x_CheckValid("CWGSProteinIterator::GetBioseq");
7699  return info.main_seq;
7700 }
7701 
7702 
7704 {
7705  x_CheckValid("CWGSProteinIterator::GetSeq_entry");
7707  info.entry = new CSeq_entry;
7709  return info.entry;
7710 }
7711 
7712 
7713 /////////////////////////////////////////////////////////////////////////////
7714 // CWGSFeatureIterator
7715 /////////////////////////////////////////////////////////////////////////////
7716 
7717 
7719 {
7720  if ( m_Cur ) {
7721  if ( m_Db ) {
7722  GetDb().Put(m_Cur);
7723  }
7724  else {
7725  m_Cur.Reset();
7726  }
7727  }
7728  m_Db.Reset();
7730 }
7731 
7732 
7734  : m_CurrId(0),
7735  m_FirstGoodId(0),
7736  m_FirstBadId(0)
7737 {
7738 }
7739 
7740 
7742  : m_CurrId(0),
7743  m_FirstGoodId(0),
7744  m_FirstBadId(0)
7745 {
7746  *this = iter;
7747 }
7748 
7749 
7752 {
7753  if ( this != &iter ) {
7754  Reset();
7755  m_Db = iter.m_Db;
7756  m_Cur = iter.m_Cur;
7757  m_CurrId = iter.m_CurrId;
7759  m_FirstBadId = iter.m_FirstBadId;
7760  }
7761  return *this;
7762 }
7763 
7764 
7766 {
7767  x_Init(wgs);
7768 }
7769 
7770 
7772 {
7773  x_Init(wgs);
7774  SelectRow(row);
7775 }
7776 
7777 
7779  TVDBRowIdRange row_range)
7780 {
7781  x_Init(db);
7782  if ( !m_Db ) {
7783  return;
7784  }
7785  m_FirstGoodId = m_CurrId = max(m_FirstGoodId, row_range.first);
7786  m_FirstBadId = min(m_FirstBadId, TVDBRowId(row_range.first+row_range.second));
7787 }
7788 
7789 
7791 {
7792  Reset();
7793 }
7794 
7795 
7798 {
7799  if ( row < m_FirstGoodId ) {
7801  }
7802  else {
7803  m_CurrId = row;
7804  }
7805  return *this;
7806 }
7807 
7808 
7811 {
7813  m_FirstGoodId = m_CurrId = max(range.first, row_range.first);
7814  m_FirstBadId = min(range.first+range.second,
7815  row_range.first+row_range.second);
7816  return *this;
7817 }
7818 
7819 
7821 {
7822  PROFILE(sw_FeatIterator);
7824  if ( !wgs ) {
7825  return;
7826  }
7827  m_Cur = wgs.GetNCObject().Feat();
7828  if ( !m_Cur ) {
7829  return;
7830  }
7831  m_Db = wgs;
7833  m_FirstGoodId = m_CurrId = range.first;
7834  m_FirstBadId = range.first+range.second;
7835 }
7836 
7837 
7838 void CWGSFeatureIterator::x_ReportInvalid(const char* method) const
7839 {
7840  NCBI_THROW_FMT(CSraException, eInvalidState,
7841  "CWGSFeatureIterator::"<<method<<"(): "
7842  "Invalid iterator state");
7843 }
7844 
7845 
7847 {
7848  x_CheckValid("CWGSFeatureIterator::GetLocSeqType");
7849  return *m_Cur->LOC_SEQ_TYPE(m_CurrId);
7850 }
7851 
7852 
7854 {
7855  x_CheckValid("CWGSFeatureIterator::GetProductSeqType");
7856  return *m_Cur->PRODUCT_SEQ_TYPE(m_CurrId);
7857 }
7858 
7859 
7861 {
7862  x_CheckValid("CWGSFeatureIterator::GetLocRowId");
7863  return *m_Cur->LOC_ROW_ID(m_CurrId);
7864 }
7865 
7866 
7868 {
7869  x_CheckValid("CWGSFeatureIterator::GetProductRowId");
7870  if ( !m_Cur->m_PRODUCT_ROW_ID ) {
7871  return 0;
7872  }
7873  CVDBValueFor<TVDBRowId> row = m_Cur->PRODUCT_ROW_ID(m_CurrId);
7874  return row.empty()? 0: *row;
7875 }
7876 
7877 
7879 {
7880  x_CheckValid("CWGSFeatureIterator::GetFeatType");
7881  return *m_Cur->FEAT_TYPE(m_CurrId);
7882 }
7883 
7884 
7886 {
7887  x_CheckValid("CWGSFeatureIterator::GetLocStart");
7888  return *m_Cur->LOC_START(m_CurrId);
7889 }
7890 
7891 
7893 {
7894  x_CheckValid("CWGSFeatureIterator::GetLocLength");
7895  return *m_Cur->LOC_LEN(m_CurrId);
7896 }
7897 
7898 
7900 {
7901  x_CheckValid("CWGSFeatureIterator::GetLocRange");
7903  range.SetFrom(*m_Cur->LOC_START(m_CurrId));
7904  range.SetLength(*m_Cur->LOC_LEN(m_CurrId));
7905  return range;
7906 }
7907 
7908 
7910 {
7911  PROFILE(sw_GetFeatBytes);
7912  return *m_Cur->SEQ_FEAT(m_CurrId);
7913 }
7914 
7915 
7917 {
7918  PROFILE(sw_Feat);
7919  CRef<CSeq_feat> feat(new CSeq_feat);
7920  CTempString bytes = GetSeq_featBytes();
7921  m_Cur.GetNCObject().m_ObjStr.OpenFromBuffer(bytes.data(), bytes.size());
7922  m_Cur.GetNCObject().m_ObjStr >> *feat;
7923  return feat;
7924 }
7925 
7926 
#define NAME
#define READ(buf, off, bytes, endian)
Definition: bytes.c:42
CAnnotdesc –.
Definition: Annotdesc.hpp:66
CAsnBinData(CSerialObject &obj)
Definition: wgsread.cpp:314
virtual void Serialize(CObjectOStreamAsnBinary &out) const
Definition: wgsread.cpp:325
CRef< CSerialObject > m_MainObject
Definition: wgsread.hpp:136
virtual ~CAsnBinData(void)
Definition: wgsread.cpp:320
CByte_graph –.
Definition: Byte_graph.hpp:66
CConstObjectInfoCV –.
Definition: objectiter.hpp:557
CConstObjectInfoMI –.
Definition: objectiter.hpp:397
CConstObjectInfo –.
Definition: objectinfo.hpp:421
Definition: Dbtag.hpp:53
CDelta_seq –.
Definition: Delta_seq.hpp:66
const CBioseq * TKey
Definition: wgsread.cpp:417
const TInfoMap & info_map
Definition: wgsread.cpp:451
CWGSAsnBinData::TDescrInfo TInfo
Definition: wgsread.cpp:418
map< TKey, TInfo > TInfoMap
Definition: wgsread.cpp:419
virtual void WriteClassMember(CObjectOStream &out, const CConstObjectInfoMI &member)
Definition: wgsread.cpp:425
CDescrWriteHook(const TInfoMap &info_map)
Definition: wgsread.cpp:420
CDirEntry –.
Definition: ncbifile.hpp:262
CFastMutex –.
Definition: ncbimtx.hpp:667
virtual void WriteChoiceVariant(CObjectOStream &out, const CConstObjectInfoCV &variant)
Definition: wgsread.cpp:391
map< TKey, TInfo > TInfoMap
Definition: wgsread.cpp:385
const CSeq_annot::TData::TFtable * TKey
Definition: wgsread.cpp:383
CWGSAsnBinData::SFtableInfo TInfo
Definition: wgsread.cpp:384
const TInfoMap & info_map
Definition: wgsread.cpp:410
CFtableWriteHook(const TInfoMap &info_map)
Definition: wgsread.cpp:386
void Release()
Manually force the resource to be released.
Definition: guard.hpp:166
CID2S_Bioseq_Ids –.
CID2S_Bioseq_place_Info –.
CID2S_Chunk_Content –.
place of data to insert
CID2S_Chunk_Data –.
CID2S_Chunk_Info –.
CID2S_Chunk –.
Definition: ID2S_Chunk.hpp:66
CID2S_Feat_type_Info –.
CID2S_Gi_Interval –.
CID2S_Gi_Range –.
CID2S_Interval –.
CID2S_Seq_annot_Info –.
CID2S_Seq_id_Interval –.
CID2S_Seq_id_Ints –.
CID2S_Seq_loc –.
CID2S_Sequence_Piece –.
CID2S_Split_Info –.
@ eMissing_Allow
Definition: kdbread.hpp:86
void GetData(char *buffer, size_t size, size_t offset=0) const
Definition: kdbread.cpp:166
size_t GetSize(void) const
Definition: kdbread.hpp:105
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
Writing class members.
Definition: objectio.hpp:101
Writing containers (SET OF, SEQUENCE OF).
Definition: objectio.hpp:237
Helper class: installs hooks in constructor, and uninstalls in destructor.
Definition: objhook.hpp:397
CObjectIStreamAsnBinary –.
Definition: objistrasnb.hpp:59
CObjectOStreamAsnBinary –.
Definition: objostrasnb.hpp:58
CObjectOStream –.
Definition: objostr.hpp:83
CObjectTypeInfo –.
Definition: objectinfo.hpp:94
CObject –.
Definition: ncbiobj.hpp:180
CSafeStatic<>::
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeq_hist –.
Definition: Seq_hist.hpp:66
Base class for all serializable objects.
Definition: serialbase.hpp:150
CStopWatch –.
Definition: ncbitime.hpp:1937
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
uint32_t GetElementCount(TVDBRowId row, const CVDBColumn &column, uint32_t elem_bits) const
Definition: vdbread.cpp:1449
void ReadElements(TVDBRowId row, const CVDBColumn &column, uint32_t elem_bits, uint32_t start, uint32_t count, void *buffer) const
Definition: vdbread.cpp:1468
TVDBRowIdRange GetRowIdRange(TVDBColumnIdx column=0) const
Definition: vdbread.cpp:1417
TVDBRowIdRange Find(const string &value) const
Definition: vdbread.cpp:1317
@ eMissing_Allow
Definition: vdbread.hpp:316
const TValue * data() const
Definition: vdbread.hpp:973
const_iterator begin() const
Definition: vdbread.hpp:1002
bool empty(void) const
Definition: vdbread.hpp:703
size_t size(void) const
Definition: vdbread.hpp:707
@ eMissing_Allow
Definition: vdbread.hpp:633
static bool IsPlainAccession(const string &acc_or_path)
Definition: vdbread.cpp:407
void AddFeature(TFtable &ftable, const CTempString &data)
Definition: wgsread.cpp:356
vector< char > TDescrInfo
Definition: wgsread.cpp:354
void AddDescr(CBioseq &seq, const CTempString &data)
Definition: wgsread.cpp:360
CWGSAsnBinData(CSerialObject &obj)
Definition: wgsread.cpp:334
virtual void Serialize(CObjectOStreamAsnBinary &out) const
Definition: wgsread.cpp:455
TFtableMap m_FtableMap
Definition: wgsread.cpp:375
CSeq_annot::TData::TFtable TFtable
Definition: wgsread.cpp:345
map< const TFtable *, SFtableInfo > TFtableMap
Definition: wgsread.cpp:373
virtual ~CWGSAsnBinData(void)
Definition: wgsread.cpp:339
map< const CBioseq *, TDescrInfo > TDescrMap
Definition: wgsread.cpp:374
CRef< CSeq_descr > m_EmptyDescr
Definition: wgsread.cpp:377
TDescrMap m_DescrMap
Definition: wgsread.cpp:376
CVDBTableIndex m_ProteinNameIndex
Definition: wgsread.hpp:628
CRef< CSeq_id > GetGeneralOrPatentSeq_id(CTempString str, TVDBRowId row, TGnlIdFlags gnl_id_flags=fGnlId_Default) const
Definition: wgsread.cpp:2832
string m_IdPrefix
Definition: wgsread.hpp:594
CVDBObjectCache< SProtIdxTableCursor > m_ProtIdx
Definition: wgsread.hpp:624
CRef< SSeqTableCursor > Seq(TVDBRowId row=0)
Definition: wgsread.cpp:895
CVDBObjectCache< SProt0TableCursor > m_Prot0
Definition: wgsread.hpp:620
atomic< bool > m_GiIdxTableIsOpened
Definition: wgsread.hpp:604
NCBI_gb_state m_ProjectGBState
Definition: wgsread.hpp:643
Uint4 m_IdRowDigits
Definition: wgsread.hpp:598
const CVDBTableIndex & ContigNameIndex(void)
Definition: wgsread.hpp:497
const CVDBTableIndex & ProductNameIndex(void)
Definition: wgsread.hpp:515
pair< TGi, TGi > GetNucGiRange(void)
Definition: wgsread.cpp:3310
CRef< CSeq_id > GetGeneralSeq_id(CTempString prefix, CTempString tag) const
Definition: wgsread.cpp:2777
bool LoadMasterDescr(int filter)
Definition: wgsread.cpp:3012
const CVDBTable & GiIdxTable(void)
Definition: wgsread.hpp:479
COpenRange< TIntId > TGiRange
Definition: wgsread.hpp:360
EFeatLocIdType GetFeatLocIdType()
Definition: wgsread.cpp:3771
CRef< CSeq_entry > GetMasterDescrEntry(void)
Definition: wgsread.cpp:3043
atomic< bool > m_FeatTableIsOpened
Definition: wgsread.hpp:603
CVDBTableIndex m_ScaffoldNameIndex
Definition: wgsread.hpp:627
atomic< bool > m_ScfTableIsOpened
Definition: wgsread.hpp:601
string m_IdPrefixDbWithVersion
Definition: wgsread.hpp:595
CRef< SSeq0TableCursor > Seq0(TVDBRowId row=0)
Definition: wgsread.cpp:883
CRef< SProtTableCursor > Prot(TVDBRowId row=0)
Definition: wgsread.cpp:947
bool HasCommonTaxId(void) const
Definition: wgsread.hpp:418
bool IsTSA(void) const
Definition: wgsread.cpp:2771
CFastMutex m_TableMutex
Definition: wgsread.hpp:600
CRef< CSeq_id > GetAccSeq_id(CTempString acc, int version) const
Definition: wgsread.cpp:2913
CRef< SGiIdxTableCursor > GiIdx(TVDBRowId row=0)
Definition: wgsread.cpp:975
void ResetMasterDescr(void)
Definition: wgsread.cpp:3005
CVDBTable m_ProtIdxTable
Definition: wgsread.hpp:615
TAmbiguityCache m_AmbiguityCache
Definition: wgsread.hpp:633
TVDBRowId GetNucGiRowId(TGi gi)
Definition: wgsread.cpp:3507
const CVDBTableIndex & ScaffoldNameIndex(void)
Definition: wgsread.hpp:503
CRef< CSeq_entry > GetMasterSeq_entry(void) const
Definition: wgsread.cpp:3238
void SetMasterDescr(const TMasterDescr &descr, int filter)
Definition: wgsread.cpp:3140
void OpenScaffoldNameIndex(void)
Definition: wgsread.cpp:2456
void OpenProtTable(void)
Definition: wgsread.cpp:2425
CRef< SAmbiguityInfo > GetAmbiguityInfo(TVDBRowId row)
Definition: wgsread.cpp:2211
TTaxId GetCommonTaxId(void) const
Definition: wgsread.hpp:422
void PutAmbiguityInfo(CRef< SAmbiguityInfo > &ambiguity)
Definition: wgsread.cpp:2223
pair< TVDBRowId, bool > GetGiRowId(TGi gi)
Definition: wgsread.cpp:3482
CVDBMgr m_Mgr
Definition: wgsread.hpp:589
int m_IdVersion
Definition: wgsread.hpp:597
TVDBRowCount GetTotalFeatureCount()
Definition: wgsread.cpp:3677
void x_SortGiRanges(TGiRanges &ranges)
Definition: wgsread.cpp:3350
CRef< SProtIdxTableCursor > ProtIdx(TVDBRowId row=0)
Definition: wgsread.cpp:989
void OpenGiIdxTable(void)
Definition: wgsread.cpp:2437
CVDBTable m_FeatTable
Definition: wgsread.hpp:613
EFeatLocIdType DetermineFeatLocIdType()
Definition: wgsread.cpp:3712
const CVDBTable & ScfTable(void)
Definition: wgsread.hpp:461
@ fAllowRowType_contig
Definition: wgsread.hpp:233
@ fAllowRowType_scaffold
Definition: wgsread.hpp:234
@ fAllowRowType_protein
Definition: wgsread.hpp:235
bool HasFeatures()
Definition: wgsread.cpp:3688
static pair< TVDBRowId, ERowType > ParseRowType(CTempString acc, TAllowRowType allow)
Definition: wgsread.cpp:2485
void AddMasterDescr(CSeq_descr &descr, const CBioseq *main_seq=0, TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:3191
CVDBTableIndex m_ProtAccIndex
Definition: wgsread.hpp:625
TMasterDescr m_MasterDescr
Definition: wgsread.hpp:641
CRef< CSeq_entry > m_MasterEntry
Definition: wgsread.hpp:640
void OpenIndex(const CVDBTable &table, CVDBTableIndex &index, atomic< Int1 > &index_is_opened, const char *index_name, const char *backup_index_name=0)
Definition: wgsread.cpp:2388
void OpenProteinNameIndex(void)
Definition: wgsread.cpp:2463
string m_ReplacedBy
Definition: wgsread.hpp:645
TVDBRowId GetScaffoldNameRowId(const string &name)
Definition: wgsread.cpp:3586
TVDBRowId GetContigNameRowId(const string &name)
Definition: wgsread.cpp:3564
CRef< CSeq_id > GetProteinSeq_id(TVDBRowId row_id) const
Definition: wgsread.cpp:2981
CVDBObjectCache< SSeqTableCursor > m_Seq
Definition: wgsread.hpp:618
TTaxId m_CommonTaxId
Definition: wgsread.hpp:646
CSeq_inst::TMol GetScaffoldMolType(void) const
Definition: wgsread.cpp:2993
atomic< Int1 > m_ScaffoldNameIndexIsOpened
Definition: wgsread.hpp:608
atomic< bool > m_ProtIdxTableIsOpened
Definition: wgsread.hpp:605
NCBI_gb_state GetProjectGBState() const
Definition: wgsread.hpp:408
CRef< SProt0TableCursor > Prot0(TVDBRowId row=0)
Definition: wgsread.cpp:933
TVDBRowId GetProductNameRowId(const string &name)
Definition: wgsread.cpp:3600
CVDBTable m_SeqTable
Definition: wgsread.hpp:592
CRef< CSeq_id > GetMasterSeq_id(void) const
Definition: wgsread.cpp:2953
TVDBRowId GetProtAccRowId(const string &acc, int version=-1)
Definition: wgsread.cpp:3607
CVDBObjectCache< SProtTableCursor > m_Prot
Definition: wgsread.hpp:621
CRef< SFeatTableCursor > Feat(TVDBRowId row=0)
Definition: wgsread.cpp:961
TVDBRowId GetProteinNameRowId(const string &name)
Definition: wgsread.cpp:3593
atomic< Int1 > m_ProductNameIndexIsOpened
Definition: wgsread.hpp:610
vector< TGiRange > TGiRanges
Definition: wgsread.hpp:361
TVDBRowId Lookup(const string &name, const CVDBTableIndex &index, bool upcase)
Definition: wgsread.cpp:3544
CRef< CSeq_id > GetPatentSeq_id(int id) const
Definition: wgsread.cpp:2821
virtual ~CWGSDb_Impl(void)
Definition: wgsread.cpp:877
void OpenProtIdxTable(void)
Definition: wgsread.cpp:2443
string m_WGSPath
Definition: wgsread.hpp:590
list< CRef< CSeqdesc > > TMasterDescr
Definition: wgsread.hpp:305
TProtAccRanges GetProtAccRanges(void)
Definition: wgsread.cpp:3445
CFastMutex m_AmbiguityCacheMutex
Definition: wgsread.hpp:631
static string NormalizePathOrAccession(CTempString path_or_acc, CTempString vol_path=CTempString())
Definition: wgsread.cpp:2322
const CVDBTableIndex & ProteinNameIndex(void)
Definition: wgsread.hpp:509
TGiRanges GetProtGiRanges(void)
Definition: wgsread.cpp:3401
TVDBRowId ParseRow(CTempString acc, bool *is_scaffold) const
Definition: wgsread.cpp:2523
TGiRanges GetNucGiRanges(void)
Definition: wgsread.cpp:3369
CVDBObjectCache< SScfTableCursor > m_Scf
Definition: wgsread.hpp:619
bool CanHaveGis()
Definition: wgsread.cpp:3659
void Put(CRef< SSeq0TableCursor > &curs, TVDBRowId row=0)
Definition: wgsread.cpp:1003
CRef< SSeq4naTableCursor > Seq4na(TVDBRowId row=0)
Definition: wgsread.cpp:907
bool IsSetMasterDescr(void) const
Definition: wgsread.hpp:307
void OpenFeatTable(void)
Definition: wgsread.cpp:2431
CVDBTable m_GiIdxTable
Definition: wgsread.hpp:614
CVDBTableIndex m_ContigNameIndex
Definition: wgsread.hpp:626
void OpenProtAccIndex(void)
Definition: wgsread.cpp:2477
bool m_HasNoDefaultGnlId
Definition: wgsread.hpp:637
CVDBObjectCache< SSeq0TableCursor > m_Seq0
Definition: wgsread.hpp:617
CSeq_inst::TMol GetContigMolType(void) const
Definition: wgsread.cpp:2987
void x_LoadMasterDescr(int filter)
Definition: wgsread.cpp:3097
bool IsReplaced() const
Definition: wgsread.hpp:414
CRef< CSeq_id > m_PatentId
Definition: wgsread.hpp:642
void SetPatentId(CRef< CSeq_id > id)
Definition: wgsread.cpp:3258
CVDBTable m_ScfTable
Definition: wgsread.hpp:611
atomic< bool > m_ProtTableIsOpened
Definition: wgsread.hpp:602
TGi GetMasterGi(void) const
Definition: wgsread.cpp:3284
bool HasPatentId() const
Definition: wgsread.hpp:327
CRef< SScfTableCursor > Scf(TVDBRowId row=0)
Definition: wgsread.cpp:919
CVDBTable m_ProtTable
Definition: wgsread.hpp:612
const CVDBTable & FeatTable(void)
Definition: wgsread.hpp:473
const CVDBTable & SeqTable(void)
Definition: wgsread.hpp:458
CRef< CSeq_id > GetContigSeq_id(TVDBRowId row_id) const
Definition: wgsread.cpp:2969
pair< TGi, TGi > GetProtGiRange(void)
Definition: wgsread.cpp:3330
CSeq_inst::TMol m_ContigMolType
Definition: wgsread.hpp:635
CWGSDb_Impl(CVDBMgr &mgr, CTempString path_or_acc, CTempString vol_path=CTempString())
Definition: wgsread.cpp:843
string m_IdPrefixWithVersion
Definition: wgsread.hpp:593
size_t GetMasterDescrBytes(TMasterDescrBytes &buffer)
Definition: wgsread.cpp:3022
void OpenProductNameIndex(void)
Definition: wgsread.cpp:2470
atomic< Int1 > m_ProtAccIndexIsOpened
Definition: wgsread.hpp:606
CSeq_id::E_Choice m_SeqIdType
Definition: wgsread.hpp:644
string m_IdPrefixDb
Definition: wgsread.hpp:596
const TMasterDescr & GetMasterDescr(void) const
Definition: wgsread.hpp:310
atomic< EFeatLocIdType > m_FeatLocIdType
Definition: wgsread.hpp:639
CVDBTableIndex m_ProductNameIndex
Definition: wgsread.hpp:629
CVDBObjectCache< SFeatTableCursor > m_Feat
Definition: wgsread.hpp:622
void OpenContigNameIndex(void)
Definition: wgsread.cpp:2449
@ fGnlId_NoWGSId
Definition: wgsread.hpp:275
@ fGnlId_NoWGSVersion
Definition: wgsread.hpp:274
void OpenScfTable(void)
Definition: wgsread.cpp:2419
const CVDBTable & ProtIdxTable(void)
Definition: wgsread.hpp:485
const CVDBTable & ProtTable(void)
Definition: wgsread.hpp:467
bool HasStandardFeatLocIdType()
Definition: wgsread.cpp:3694
bool m_HasCommonTaxId
Definition: wgsread.hpp:638
CSeq_inst::TMol GetProteinMolType(void) const
Definition: wgsread.cpp:2999
CVDBObjectCache< SGiIdxTableCursor > m_GiIdx
Definition: wgsread.hpp:623
atomic< Int1 > m_ProteinNameIndexIsOpened
Definition: wgsread.hpp:609
void OpenTable(CVDBTable &table, atomic< bool > &table_is_opened, const char *table_name)
Definition: wgsread.cpp:2375
bool m_IsSetMasterDescr
Definition: wgsread.hpp:636
CRef< CSeq_id > GetScaffoldSeq_id(TVDBRowId row_id) const
Definition: wgsread.cpp:2975
void x_InitIdParams(void)
Definition: wgsread.cpp:2239
@ eRowType_protein
Definition: wgsread.hpp:229
@ eRowType_contig
Definition: wgsread.hpp:227
@ eRowType_scaffold
Definition: wgsread.hpp:228
TVDBRowId GetProtGiRowId(TGi gi)
Definition: wgsread.cpp:3525
const string & GetWGSPath(void) const
Definition: wgsread.hpp:215
atomic< Int1 > m_ContigNameIndexIsOpened
Definition: wgsread.hpp:607
@ eDescrDefaultFilter
Definition: wgsread.hpp:774
@ eDescrNoFilter
Definition: wgsread.hpp:773
static TVDBRowId ParseProteinRow(CTempString acc)
Definition: wgsread.hpp:696
EDescrType
Definition: wgsread.hpp:788
@ eDescr_default
Definition: wgsread.hpp:790
@ eDescr_skip
Definition: wgsread.hpp:789
static TVDBRowId ParseScaffoldRow(CTempString acc)
Definition: wgsread.hpp:690
static EDescrType GetMasterDescrType(const CSeqdesc &desc)
Definition: wgsread.cpp:3108
static TVDBRowId ParseContigRow(CTempString acc)
Definition: wgsread.hpp:684
TVDBRowId m_FirstGoodId
Definition: wgsread.hpp:1612
CRange< TSeqPos > GetLocRange(void) const
Definition: wgsread.cpp:7899
TVDBRowId m_FirstBadId
Definition: wgsread.hpp:1612
CWGSDb_Impl & GetDb(void) const
Definition: wgsread.hpp:1596
TVDBRowId GetLocRowId(void) const
Definition: wgsread.cpp:7860
NCBI_WGS_seqtype GetProductSeqType(void) const
Definition: wgsread.cpp:7853
NCBI_WGS_feattype GetFeatType(void) const
Definition: wgsread.cpp:7878
TVDBRowId m_CurrId
Definition: wgsread.hpp:1612
TVDBRowId GetProductRowId(void) const
Definition: wgsread.cpp:7867
TSeqPos GetLocLength(void) const
Definition: wgsread.cpp:7892
CRef< CSeq_feat > GetSeq_feat() const
Definition: wgsread.cpp:7916
void Reset(void)
Definition: wgsread.cpp:7718
CWGSFeatureIterator & SelectRow(TVDBRowId row)
Definition: wgsread.cpp:7797
CTempString GetSeq_featBytes(void) const
Definition: wgsread.cpp:7909
CWGSFeatureIterator & operator=(const CWGSFeatureIterator &iter)
Definition: wgsread.cpp:7751
void x_ReportInvalid(const char *method) const
Definition: wgsread.cpp:7838
CWGSFeatureIterator & SelectRowRange(TVDBRowIdRange row_range)
Definition: wgsread.cpp:7810
NCBI_WGS_seqtype GetLocSeqType(void) const
Definition: wgsread.cpp:7846
TSeqPos GetLocStart(void) const
Definition: wgsread.cpp:7885
CRef< CWGSDb_Impl::SFeatTableCursor > m_Cur
Definition: wgsread.hpp:1611
void x_Init(const CWGSDb &wgs_db)
Definition: wgsread.cpp:7820
~CWGSFeatureIterator(void)
Definition: wgsread.cpp:7790
void x_CheckValid(const char *method) const
Definition: wgsread.hpp:1603
void x_Settle(void)
Definition: wgsread.cpp:7011
bool x_Excluded(void)
Definition: wgsread.cpp:6981
void x_Init(const CWGSDb &wgs_db, ESeqType seq_type)
Definition: wgsread.cpp:6956
ESeqType m_CurrSeqType
Definition: wgsread.hpp:1403
CWGSGiIterator(void)
Definition: wgsread.cpp:6895
TVDBRowId m_CurrRowId
Definition: wgsread.hpp:1402
CRef< CWGSDb_Impl::SGiIdxTableCursor > m_Cur
Definition: wgsread.hpp:1400
CWGSDb_Impl & GetDb(void) const
Definition: wgsread.hpp:1391
void Reset(void)
Definition: wgsread.cpp:6878
~CWGSGiIterator(void)
Definition: wgsread.cpp:6950
CWGSGiIterator & operator=(const CWGSGiIterator &iter)
Definition: wgsread.cpp:6909
ESeqType m_FilterSeqType
Definition: wgsread.hpp:1403
CRef< CSeq_id > GetAccSeq_id(void) const
Definition: wgsread.cpp:7189
CBioseq::TAnnot TAnnotSet
Definition: wgsread.hpp:1502
TVDBRowId GetReplacesRowId(void) const
Definition: wgsread.cpp:7523
void x_CreateEntry(SWGSCreateInfo &info) const
Definition: wgsread.cpp:7687
bool HasGi(void) const
Definition: wgsread.cpp:7149
CTempString GetPublicComment(void) const
Definition: wgsread.cpp:7406
CWGSProteinIterator & SelectRow(TVDBRowId row)
Definition: wgsread.cpp:7129
void x_Cur() const
Definition: wgsread.cpp:7121
CRef< CBioseq > GetBioseq(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:7694
CRef< CSeq_id > GetGiSeq_id(void) const
Definition: wgsread.cpp:7217
NCBI_gb_state GetGBState(void) const
Definition: wgsread.cpp:7361
size_t GetProductFeatCount(void) const
Definition: wgsread.cpp:7458
bool HasTaxId(void) const
Definition: wgsread.cpp:7302
void Reset(void)
Definition: wgsread.cpp:7024
TVDBRowId GetBestProductFeatRowId(void) const
Definition: wgsread.cpp:7480
TSeqPos GetSeqLength(void) const
Definition: wgsread.cpp:7337
CWGSDb_Impl & GetDb(void) const
Definition: wgsread.hpp:1518
bool HasTitle(void) const
Definition: wgsread.cpp:7417
NCBI_gb_state GetRawGBState(void) const
Definition: wgsread.cpp:7382
TVDBRowIdRange GetLocFeatRowIdRange(void) const
Definition: wgsread.cpp:7436
TVDBRowId GetProductFeatRowId(void) const
Definition: wgsread.cpp:7492
CWGSProteinIterator & operator=(const CWGSProteinIterator &iter)
Definition: wgsread.cpp:7056
CRef< CSeq_inst > GetSeq_inst(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:7607
CSeq_id::TGi GetGi(void) const
Definition: wgsread.cpp:7155
THash GetSeqHash(void) const
Definition: wgsread.cpp:7331
bool HasAnnotSet(void) const
Definition: wgsread.cpp:7589
void x_CheckValid(const char *method) const
Definition: wgsread.hpp:1523
CRef< CWGSDb_Impl::SProtTableCursor > m_Cur
Definition: wgsread.hpp:1535
CRef< CSeq_entry > GetSeq_entry(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:7703
TVDBRowId m_CurrId
Definition: wgsread.hpp:1536
int GetAccVersion(void) const
Definition: wgsread.cpp:7181
void x_CreateBioseq(SWGSCreateInfo &info) const
Definition: wgsread.cpp:7638
CTempString GetTitle(void) const
Definition: wgsread.cpp:7425
void GetIds(CBioseq::TId &ids, TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:7260
~CWGSProteinIterator(void)
Definition: wgsread.cpp:7097
TVDBRowId m_FirstBadId
Definition: wgsread.hpp:1536
void x_Init(const CWGSDb &wgs_db)
Definition: wgsread.cpp:7103
CTempString GetAccession(void) const
Definition: wgsread.cpp:7168
CRef< CSeq_id > GetGeneralOrPatentSeq_id(void) const
Definition: wgsread.cpp:7205
CRef< CSeq_id > GetId(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:7230
TVDBRowId m_FirstGoodId
Definition: wgsread.hpp:1536
bool HasPublicComment(void) const
Definition: wgsread.cpp:7395
CRef< CSeq_id > GetGeneralSeq_id(void) const
Definition: wgsread.cpp:7211
NCBI_WGS_hash THash
Definition: wgsread.hpp:1464
bool HasSeqHash(void) const
Definition: wgsread.cpp:7323
void GetAnnotSet(TAnnotSet &annot_set, TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:7597
CTempString GetRefAcc(void) const
Definition: wgsread.cpp:7353
TTaxId GetTaxId(void) const
Definition: wgsread.cpp:7312
CTempString GetProductName(void) const
Definition: wgsread.cpp:7294
void x_ReportInvalid(const char *method) const
Definition: wgsread.cpp:7141
CTempString GetProteinName(void) const
Definition: wgsread.cpp:7287
CRef< CWGSDb_Impl::SProt0TableCursor > m_Cur0
Definition: wgsread.hpp:1534
CRef< CSeq_descr > GetSeq_descr(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:7561
TVDBRowId GetReplacedByRowId(void) const
Definition: wgsread.cpp:7504
bool HasRefAcc(void) const
Definition: wgsread.cpp:7345
bool HasSeq_descr(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:7542
void x_Init(const CWGSDb &wgs_db)
Definition: wgsread.cpp:6460
void x_ReportInvalid(const char *method) const
Definition: wgsread.cpp:6490
bool HasSeq_descr(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6651
void x_CreateEntry(SWGSCreateInfo &info) const
Definition: wgsread.cpp:6833
CRef< CSeq_id > GetAccSeq_id(void) const
Definition: wgsread.cpp:6550
CWGSScaffoldIterator & SelectRow(TVDBRowId row)
Definition: wgsread.cpp:6478
CTempString GetScaffoldName(void) const
Definition: wgsread.cpp:6644
TVDBRowId m_FirstBadId
Definition: wgsread.hpp:1343
void x_CreateBioseq(SWGSCreateInfo &info) const
Definition: wgsread.cpp:6798
TVDBRowId m_FirstGoodId
Definition: wgsread.hpp:1343
TVDBRowIdRange GetLocFeatRowIdRange(void) const
Definition: wgsread.cpp:6696
CRef< CSeq_id > GetGeneralOrPatentSeq_id(void) const
Definition: wgsread.cpp:6565
bool IsCircular(void) const
Definition: wgsread.cpp:6688
CRef< CSeq_descr > GetSeq_descr(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6659
NCBI_gb_state GetRawGBState(void) const
Definition: wgsread.cpp:6537
CRef< CBioseq > GetBioseq(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6854
void GetIds(CBioseq::TId &ids, TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6617
CRef< CSeq_entry > GetSeq_entry(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6863
CRef< CSeq_inst > GetSeq_inst(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6718
CWGSScaffoldIterator & operator=(const CWGSScaffoldIterator &iter)
Definition: wgsread.cpp:6404
CRef< CWGSDb_Impl::SScfTableCursor > m_Cur
Definition: wgsread.hpp:1342
int GetAccVersion(void) const
Definition: wgsread.cpp:6509
CRef< CSeq_id > GetGeneralSeq_id(void) const
Definition: wgsread.cpp:6575
TSeqPos GetSeqLength(void) const
Definition: wgsread.cpp:6674
CRef< CSeq_id > GetId(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6587
NCBI_gb_state GetGBState(void) const
Definition: wgsread.cpp:6516
void x_CheckValid(const char *method) const
Definition: wgsread.hpp:1331
CWGSDb_Impl & GetDb(void) const
Definition: wgsread.hpp:1326
CTempString GetAccession(void) const
Definition: wgsread.cpp:6498
CRef< CSeq_id > GetGiSeq_id(void) const
Definition: wgsread.cpp:6581
void x_CreateChunk(SWGSCreateInfo &info, TChunkId chunk_id) const
Definition: wgsread.cpp:6226
CRef< CAsnBinData > GetSeq_entryData(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6275
CRef< CSeq_data > Get4na(TSeqPos pos, TSeqPos len) const
Definition: wgsread.cpp:5072
void x_Select(const CWGSDb &wgs_db, TIncludeFlags include_flags, EClipType clip_type)
Definition: wgsread.cpp:4117
void x_Init(const CWGSDb &wgs_db, TIncludeFlags include_flags, EClipType clip_type, TVDBRowId get_row)
Definition: wgsread.cpp:4177
bool x_Excluded(void) const
Definition: wgsread.cpp:3905
void x_CreateSplit(SWGSCreateInfo &info) const
Definition: wgsread.cpp:6113
CRef< CSeq_descr > GetSeq_descr(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:4600
void x_SetDelta(CSeq_inst &inst, const TSegments &segments) const
Definition: wgsread.cpp:5321
CTempString GetTitle(void) const
Definition: wgsread.cpp:4387
bool CanHaveQualityGraph(void) const
Definition: wgsread.cpp:4665
TIncludeFlags m_IncludeFlags
Definition: wgsread.hpp:1244
CTempString GetPublicComment(void) const
Definition: wgsread.cpp:4862
bool x_InitSplit(SWGSCreateInfo &info) const
Definition: wgsread.cpp:6073
CRef< CAsnBinData > GetChunkDataForVersion(TChunkId chunk_id, TSplitVersion split_version) const
Definition: wgsread.cpp:6362
void x_Settle(void)
Definition: wgsread.cpp:4240
CTempString GetContigName(void) const
Definition: wgsread.cpp:4375
CWGSSeqIterator & SelectRow(TVDBRowId row)
Definition: wgsread.cpp:4213
CRef< CSeq_id > GetGeneralOrPatentSeq_id(void) const
Definition: wgsread.cpp:4363
TSeqPos GetRawSeqLength(void) const
Definition: wgsread.cpp:4428
CWGSSeqIterator & operator++(void)
Definition: wgsread.cpp:4230
CRef< CID2S_Chunk > GetChunkForVersion(TChunkId chunk_id, TSplitVersion split_version) const
Definition: wgsread.cpp:6339
CRef< CAsnBinData > GetChunkData(TChunkId chunk_id, TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6350
SAmbiguityAccess GetAmbiguity() const
Definition: wgsread.cpp:4959
void GetQualityAnnot(TAnnotSet &annot_set, TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:4747
TTaxId GetTaxId(void) const
Definition: wgsread.cpp:4405
CTempString GetAccession(void) const
Definition: wgsread.cpp:4273
TVDBRowIdRange GetLocFeatRowIdRange(void) const
Definition: wgsread.cpp:4620
void x_AddQualityChunkInfo(SWGSCreateInfo &info) const
Definition: wgsread.cpp:5828
bool IsCircular(void) const
Definition: wgsread.cpp:4873
CWGSSeqIterator(void)
Definition: wgsread.cpp:3967
CRef< CSeq_id > GetGeneralSeq_id(void) const
Definition: wgsread.cpp:4369
bool HasQualityGraph(void) const
Definition: wgsread.cpp:4672
bool HasClippingInfo(void) const
Definition: wgsread.cpp:4459
CRef< CAsnBinData > GetSplitInfoData(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6307
TSeqPos GetClipQualityLength(void) const
Definition: wgsread.cpp:4446
string GetQualityAnnotName(void) const
Definition: wgsread.cpp:4706
NCBI_gb_state GetRawGBState(void) const
Definition: wgsread.cpp:4834
TVDBRowId m_FirstGoodId
Definition: wgsread.hpp:1242
NCBI_WGS_hash THash
Definition: wgsread.hpp:987
COpenRange< TSeqPos > x_NormalizeSeqRange(COpenRange< TSeqPos > range) const
Definition: wgsread.cpp:5137
void x_ReportInvalid(const char *method) const
Definition: wgsread.cpp:4249
bool HasGi(void) const
Definition: wgsread.cpp:4256
void x_GetSegmentsWithRecoveredGaps(TSegments &segments, COpenRange< TSeqPos > range) const
Definition: wgsread.cpp:5257
bool HasPublicComment(void) const
Definition: wgsread.cpp:4851
TVDBRowId m_CurrId
Definition: wgsread.hpp:1242
CTempString GetNucProtDescrBytes(void) const
Definition: wgsread.cpp:4567
CSeq_id::TGi GetGi(void) const
Definition: wgsread.cpp:4262
void SelectAccVersion(int version)
Definition: wgsread.cpp:4335
void x_CreateBioseq(SWGSCreateInfo &info) const
Definition: wgsread.cpp:5863
bool HasGapInfo(void) const
Definition: wgsread.cpp:4881
CWGSSeqIterator & operator=(const CWGSSeqIterator &iter)
Definition: wgsread.cpp:3949
bool HasTitle(void) const
Definition: wgsread.cpp:4381
void GetQualityVec(vector< INSDC_quality_phred > &quality_vec) const
Definition: wgsread.cpp:4688
CRef< CSeq_inst > x_GetSeq_inst(SWGSCreateInfo &info) const
Definition: wgsread.cpp:5363
TVDBRowId m_FirstBadId
Definition: wgsread.hpp:1242
CRef< CSeq_id > GetGiSeq_id(void) const
Definition: wgsread.cpp:4349
~CWGSSeqIterator(void)
Definition: wgsread.cpp:4110
void x_CreateProductsChunk(SWGSCreateInfo &info, unsigned index) const
Definition: wgsread.cpp:6159
CRef< CWGSDb_Impl::SSeq0TableCursor > m_Cur0
Definition: wgsread.hpp:1239
void Reset(void)
Definition: wgsread.cpp:3922
bool HasAnnotSet(void) const
Definition: wgsread.cpp:4642
CRef< CSeq_id > GetAccSeq_id(void) const
Definition: wgsread.cpp:4342
int GetAccVersion(void) const
Definition: wgsread.hpp:915
bool HasSeqHash(void) const
Definition: wgsread.cpp:4415
CTempString GetAnnotBytes(void) const
Definition: wgsread.cpp:4649
THash GetSeqHash(void) const
Definition: wgsread.cpp:4422
SVersionSelector x_GetAccVersionSelector(int version) const
Definition: wgsread.cpp:4315
CRef< CID2S_Chunk > GetChunk(TChunkId chunk_id, TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6328
pair< CRef< CAsnBinData >, TSplitVersion > GetSplitInfoDataAndVersion(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6314
void x_CreateDataChunk(SWGSCreateInfo &info, unsigned index) const
Definition: wgsread.cpp:6131
void x_GetSegmentsWithExplicitGaps(TSegments &data, COpenRange< TSeqPos > range, TWGSContigGapInfo gap_info, TInstSegmentFlags flags) const
Definition: wgsread.cpp:5145
void x_CreateEntry(SWGSCreateInfo &info) const
Definition: wgsread.cpp:6040
void x_GetQualityAnnot(TAnnotSet &annot_set, SWGSCreateInfo &info, TSeqPos pos=0, TSeqPos len=kInvalidSeqPos) const
Definition: wgsread.cpp:4756
bool HasSeq_descr(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:4578
void GetGapInfo(TWGSContigGapInfo &gap_info) const
Definition: wgsread.cpp:4965
void x_CreateQualityChunk(SWGSCreateInfo &info, unsigned index) const
Definition: wgsread.cpp:6119
CRef< CSeq_data > Get2na(TSeqPos pos, TSeqPos len) const
Definition: wgsread.cpp:5066
bool HasTaxId(void) const
Definition: wgsread.cpp:4399
CBioseq::TAnnot TAnnotSet
Definition: wgsread.hpp:1059
struct CWGSSeqIterator::SWGSContigGapInfo TWGSContigGapInfo
TSeqPos GetSeqLength(EClipType clip_type=eDefaultClip) const
Definition: wgsread.cpp:4478
CRef< CBioseq > GetBioseq(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6254
CRef< CWGSDb_Impl::SAmbiguityInfo > m_AmbiguityInfo
Definition: wgsread.hpp:1241
SVersionSelector m_AccVersion
Definition: wgsread.hpp:1243
TSeqPos x_GetQualityArraySize(void) const
Definition: wgsread.cpp:4680
CRef< CWGSDb_Impl::SSeqTableCursor > m_Cur
Definition: wgsread.hpp:1240
void x_SetDeltaOrData(CSeq_inst &inst, const TSegments &segments) const
Definition: wgsread.cpp:5344
TSeqPos GetClipQualityLeft(void) const
Definition: wgsread.cpp:4434
@ fExcludeProjectGBState
Definition: wgsread.hpp:818
pair< CRef< CID2S_Split_Info >, TSplitVersion > GetSplitInfoAndVersion(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6294
CRef< CID2S_Split_Info > GetSplitInfo(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6287
bool HasAccVersion(int version) const
Definition: wgsread.cpp:4301
unsigned GetAccVersionCount(void) const
Definition: wgsread.cpp:4288
void GetAnnotSet(TAnnotSet &annot_set, TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:4656
CRef< CSeq_inst > GetSeq_inst(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:5431
vector< Uint1 > GetAmbiguityBytes() const
Definition: wgsread.cpp:5060
bool HasNucProtDescrBytes(void) const
Definition: wgsread.cpp:4560
CTempString GetSeqDescrBytes(void) const
Definition: wgsread.cpp:4549
int GetLatestAccVersion(void) const
Definition: wgsread.cpp:4280
vector< SSegment > TSegments
Definition: wgsread.hpp:1219
TSeqPos GetSeqOffset(EClipType clip_type=eDefaultClip) const
Definition: wgsread.cpp:4471
void GetIds(CBioseq::TId &ids, TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:4515
CRef< CSeq_entry > GetSeq_entry(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6264
bool HasSeqDescrBytes(void) const
Definition: wgsread.cpp:4542
void x_AddGap(TSegments &segments, TSeqPos pos, TSeqPos len, const TWGSContigGapInfo &gap_info) const
Definition: wgsread.cpp:5117
bool m_ClipByQuality
Definition: wgsread.hpp:1245
void x_CreateFeaturesChunk(SWGSCreateInfo &info, unsigned index) const
Definition: wgsread.cpp:6181
bool GetClipByQualityFlag(EClipType clip_type=eDefaultClip) const
Definition: wgsread.hpp:959
void x_CheckValid(const char *method) const
Definition: wgsread.hpp:1161
CRef< CSeq_id > GetId(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:4485
NCBI_gb_state GetGBState(void) const
Definition: wgsread.cpp:4813
CWGSDb_Impl & GetDb(void) const
Definition: wgsread.hpp:1153
Write hook for a choice variant (CHOICE)
Definition: objhook.hpp:190
Write hook for data member of a containing object (eg, SEQUENCE)
Definition: objhook.hpp:175
void Cleanup(TGlobalAmbiguityCache &)
Definition: wgsread.cpp:2206
TGlobalAmbiguityCache * Create()
Definition: wgsread.cpp:2205
void put(const key_type &key, const mapped_type &value, const resource_type &resource_used)
mapped_type get(const key_type &key)
container_type::const_iterator const_iterator
Definition: map.hpp:53
container_type::iterator iterator
Definition: map.hpp:54
const_iterator end() const
Definition: map.hpp:152
const_iterator lower_bound(const key_type &key) const
Definition: map.hpp:154
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
container_type::value_type value_type
Definition: map.hpp:52
const_iterator find(const key_type &key) const
Definition: map.hpp:153
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
The NCBI C++ standard methods for dealing with std::string.
int32_t INSDC_coord_one
Definition: csraread.hpp:48
uint32_t INSDC_coord_len
Definition: csraread.hpp:49
static ulg bb
static uch flags
int GetSeqLength(const CBioseq &bioseq)
Definition: cuSequence.cpp:216
std::ofstream out("events_result.xml")
main entry point for tests
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
static const char table_name[]
Definition: bcp.c:249
static int type
Definition: getdata.c:31
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
int offset
Definition: replacements.h:160
char data[12]
Definition: iconv.c:80
Uint4 uint32_t
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
Int8 TIntId
Definition: ncbimisc.hpp:999
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
#define GI_CONST(gi)
Definition: ncbimisc.hpp:1087
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
string
Definition: cgiapp.hpp:690
#define NULL
Definition: ncbistd.hpp:225
#define _VERIFY(expr)
Definition: ncbidbg.hpp:161
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
#define NCBI_THROW_FMT(exception_class, err_code, message)
The same as NCBI_THROW but with message processed as output to ostream.
Definition: ncbiexpt.hpp:719
static void SetErrno(int errno_code)
Set last error using errno code.
Definition: ncbierror.cpp:190
static string MakePath(const string &dir=kEmptyStr, const string &base=kEmptyStr, const string &ext=kEmptyStr)
Assemble a path from basic components.
Definition: ncbifile.cpp:413
const CVect2< U > & v2
Definition: globals.hpp:440
TPrim & Set(void)
Definition: serialbase.hpp:351
const TPrim & Get(void) const
Definition: serialbase.hpp:347
C * SerialClone(const C &src)
Create on heap a clone of the source object.
Definition: serialbase.hpp:512
@ eTypeFamilyPointer
Definition: serialdef.hpp:143
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
CObjectTypeInfo GetPointedType(void) const
Get type information of data to which this type refers.
Definition: objectinfo.cpp:91
CObjectTypeInfo GetMemberType(void) const
Get data type information.
const CConstObjectInfo & GetClassObject(void) const
Get containing class data.
void DefaultWrite(CObjectOStream &out, const CConstObjectInfoCV &variant)
Definition: objhook.cpp:231
CConstObjectInfo GetVariant(void) const
Get variant data.
ETypeFamily GetTypeFamily(void) const
Get data type family.
TConstObjectPtr GetObjectPtr(void) const
Get pointer to object.
void DefaultWrite(CObjectOStream &out, const CConstObjectInfoMI &member)
Definition: objhook.cpp:218
void OpenFromBuffer(const char *buffer, size_t size)
Attach reader to a data source.
Definition: objistr.cpp:501
Uint8 TCount
Alias for value type of counter.
Definition: ncbiobj.hpp:310
TObjectType * GetNCPointer(void) const THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:1174
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
TObjectType & GetNCObject(void) const
Get object.
Definition: ncbiobj.hpp:1187
#define NCBI_PARAM_TYPE(section, name)
Generate typename for a parameter from its {section, name} attributes.
Definition: ncbi_param.hpp:149
@ eParam_NoThread
Do not use per-thread values.
Definition: ncbi_param.hpp:418
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int8_t Int1
1-byte (8-bit) signed integer
Definition: ncbitype.h:98
TThisType & SetFrom(position_type from)
Definition: range.hpp:170
TThisType & SetToOpen(position_type toOpen)
Definition: range.hpp:175
position_type GetToOpen(void) const
Definition: range.hpp:138
position_type GetFrom(void) const
Definition: range.hpp:134
TThisType & SetLength(position_type length)
Definition: range.hpp:194
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
CTempString literal(const char(&str)[Size])
Templatized initialization from a string literal.
Definition: tempstr.hpp:441
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static Uint8 StringToUInt8_DataSize(const CTempString str, TStringToNumFlags flags=0)
Convert string that can contain "software" qualifiers to Uint8.
Definition: ncbistr.cpp:1530
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:776
static bool IsUpper(const CTempString str)
Checks if all letters in the given string are upper case.
Definition: ncbistr.cpp:445
size_type find_first_of(const CTempString match, size_type pos=0) const
Find the first occurrence of any character in the matching string within the current string, beginning at an optional offset.
Definition: tempstr.hpp:538
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an optional offset.
Definition: tempstr.hpp:655
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5378
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ fWithCommas
Use commas as thousands separator.
Definition: ncbistr.hpp:254
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2775
void Stop(void)
Suspend the timer.
Definition: ncbitime.hpp:2792
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2764
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
const TData & GetData(void) const
Get the Data member data.
void SetType(TType &value)
Assign a value to Type data member.
const TType & GetType(void) const
Get the Type member data.
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
TId & SetId(void)
Select the variant.
Definition: Object_id_.hpp:277
vector< CRef< CUser_field > > TData
@ eLim_unk
unknown
Definition: Int_fuzz_.hpp:210
TGi & SetGi(void)
Select the variant.
void SetName(const TName &value)
Assign a value to Name data member.
void SetStart(TStart value)
Assign a value to Start data member.
list< CRef< C_E > > Tdata
TSeq_id_interval & SetSeq_id_interval(void)
Select the variant.
list< CRef< CID2S_Seq_loc > > TLoc_set
TSeq_id & SetSeq_id(void)
Select the variant.
TWhole_gi_range & SetWhole_gi_range(void)
Select the variant.
TGi_interval & SetGi_interval(void)
Select the variant.
TSeq_id & SetSeq_id(void)
Select the variant.
void SetGi(TGi value)
Assign a value to Gi data member.
void SetSeq_id(TSeq_id &value)
Assign a value to Seq_id data member.
void SetSeq_loc(TSeq_loc &value)
Assign a value to Seq_loc data member.
TGi & SetGi(void)
Select the variant.
void SetCount(TCount value)
Assign a value to Count data member.
list< CRef< CBioseq > > TBioseqs
void SetStart(TStart value)
Assign a value to Start data member.
void SetStart(TStart value)
Assign a value to Start data member.
TWhole_gi & SetWhole_gi(void)
Select the variant.
TFeat & SetFeat(void)
Assign a value to Feat data member.
TContent & SetContent(void)
Assign a value to Content data member.
TWhole_seq_id & SetWhole_seq_id(void)
Select the variant.
void SetSeq_id(TSeq_id &value)
Assign a value to Seq_id data member.
void SetGraph(void)
Set NULL data member (assign 'NULL' value to Graph data member).
void SetLength(TLength value)
Assign a value to Length data member.
void ResetSeq_loc(void)
Reset Seq_loc data member.
TInts & SetInts(void)
Assign a value to Ints data member.
void SetLength(TLength value)
Assign a value to Length data member.
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
E_Choice
Choice variants.
@ e_not_set
No variant selected.
@ e_MaxChoice
== e_Variation+1
void SetSeqid(TSeqid value)
Assign a value to Seqid data member.
NCBI_NS_NCBI::TGi TGi
Definition: Seq_id_.hpp:180
void SetTo(TTo value)
Assign a value to To data member.
TPatent & SetPatent(void)
Select the variant.
Definition: Seq_id_.cpp:331
void SetId(TId &value)
Assign a value to Id data member.
void SetFrom(TFrom value)
Assign a value to From data member.
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
TGi & SetGi(void)
Select the variant.
Definition: Seq_id_.hpp:896
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
bool IsGi(void) const
Check if variant Gi is selected.
Definition: Seq_id_.hpp:883
void SetStrand(TStrand value)
Assign a value to Strand data member.
void SetCit(TCit &value)
Assign a value to Cit data member.
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ e_not_set
No variant selected.
Definition: Seq_id_.hpp:94
void SetMin(TMin value)
Assign a value to Min data member.
void SetTitle(const TTitle &value)
Assign a value to Title data member.
Definition: Seq_graph_.hpp:784
vector< char > TValues
Definition: Byte_graph_.hpp:89
void SetNumval(TNumval value)
Assign a value to Numval data member.
TValues & SetValues(void)
Assign a value to Values data member.
void SetGraph(TGraph &value)
Assign a value to Graph data member.
Definition: Seq_graph_.cpp:250
void SetMax(TMax value)
Assign a value to Max data member.
void SetLoc(TLoc &value)
Assign a value to Loc data member.
Definition: Seq_graph_.cpp:224
void SetAxis(TAxis value)
Assign a value to Axis data member.
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
void SetId(TId &value)
Assign a value to Id data member.
Definition: Bioseq_set_.cpp:93
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
void SetClass(TClass value)
Assign a value to Class data member.
list< CRef< CSeq_entry > > TSeq_set
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
@ e_not_set
No variant selected.
Definition: Seq_entry_.hpp:88
void ResetStrand(void)
Reset Strand data member.
Definition: Seq_inst_.hpp:770
void SetLength(TLength value)
Assign a value to Length data member.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
void SetDesc(TDesc &value)
Assign a value to Desc data member.
Definition: Seq_annot_.cpp:223
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_inst_.cpp:147
void SetHist(THist &value)
Assign a value to Hist data member.
Definition: Seq_inst_.cpp:164
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
TName & SetName(void)
Select the variant.
Definition: Annotdesc_.hpp:508
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_gap_.hpp:291
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
void SetReplaces(TReplaces &value)
Assign a value to Replaces data member.
Definition: Seq_hist_.cpp:162
void SetFuzz(TFuzz &value)
Assign a value to Fuzz data member.
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
bool IsSetDescr(void) const
Check if a value has been assigned to the Descr (descriptors) data member.
Definition: Bioseq_.hpp:303
void SetTopology(TTopology value)
Assign a value to Topology data member.
Definition: Seq_inst_.hpp:739
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
const TNcbi4na & GetNcbi4na(void) const
Get the variant data.
Definition: Seq_data_.hpp:570
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
TNcbi2na & SetNcbi2na(void)
Select the variant.
Definition: Seq_data_.hpp:557
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
const TNcbi2na & GetNcbi2na(void) const
Get the variant data.
Definition: Seq_data_.hpp:550
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seqdesc_.hpp:903
void SetReplaced_by(TReplaced_by &value)
Assign a value to Replaced_by data member.
Definition: Seq_hist_.cpp:179
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
list< CRef< CDelta_seq > > Tdata
Definition: Delta_ext_.hpp:89
void SetStrand(TStrand value)
Assign a value to Strand data member.
Definition: Seq_inst_.hpp:786
TLinkage_evidence & SetLinkage_evidence(void)
Assign a value to Linkage_evidence data member.
Definition: Seq_gap_.hpp:375
void SetLinkage(TLinkage value)
Assign a value to Linkage data member.
Definition: Seq_gap_.hpp:338
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
Definition: Seq_inst_.cpp:130
void ResetExt(void)
Reset Ext data member.
Definition: Seq_inst_.cpp:142
TNcbi4na & SetNcbi4na(void)
Select the variant.
Definition: Seq_data_.hpp:577
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
void SetMol(TMol value)
Assign a value to Mol data member.
Definition: Seq_inst_.hpp:621
bool IsUser(void) const
Check if variant User is selected.
Definition: Seqdesc_.hpp:1122
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eRepr_virtual
no seq data
Definition: Seq_inst_.hpp:93
@ eRepr_not_set
empty
Definition: Seq_inst_.hpp:92
@ e_Embl
EMBL specific information.
Definition: Seqdesc_.hpp:127
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Update_date
date of last update
Definition: Seqdesc_.hpp:129
@ e_Pub
a reference to the publication
Definition: Seqdesc_.hpp:122
@ e_Genbank
GenBank specific info.
Definition: Seqdesc_.hpp:121
@ e_Comment
a more extensive comment
Definition: Seqdesc_.hpp:117
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Create_date
date entry first created/released
Definition: Seqdesc_.hpp:128
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
@ eType_heterochromatin
Definition: Seq_gap_.hpp:93
@ eStrand_ds
double strand
Definition: Seq_inst_.hpp:136
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
static CStopWatch sw
#define DEBUG
Definition: config.h:32
Definition of all error codes used in SRA C++ support libraries.
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is whole
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
@ e_not_set
int i
int len
Lightweight interface for getting lines of data with minimal memory copying.
static MDB_envinfo info
Definition: mdb_load.c:37
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
constexpr bool empty(list< Ts... >) noexcept
const string version
version string
Definition: variables.hpp:66
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
string s_Value(TValue value)
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
const char * tag
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
#define LABEL
Definition: newick.tab.cpp:71
T max(T x_, T y_)
T min(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
Int4 delta(size_t dimension_, const Int4 *score_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void split(std::vector< std::string > *strVec, const std::string &str_, const std::string &split_)
#define count
static uint8_t * buffer
Definition: pcre2test.c:1016
static bool GetIds(const T &d, set< string > &labels, const string name="", bool detect=false, bool found=false)
#define row(bind, expected)
Definition: string_bind.c:73
void AddFeature(const CTempString &data)
Definition: wgsread.cpp:349
vector< char > m_Bytes
Definition: wgsread.cpp:347
char m_Packed4na[kAmbiguityBlockSize/2]
Definition: wgsread.cpp:1143
T4naBlocks::const_iterator m_4naBlocksIter
Definition: wgsread.cpp:1153
EBaseType GetBaseType(const S4naReader &reader) const
Definition: wgsread.cpp:1954
CWGSDb_Impl::SSeqTableCursor SSeqTableCursor
Definition: wgsread.cpp:1067
void x_CalculateAmbiguityMask(CWGSDb_Impl &db)
Definition: wgsread.cpp:1741
SAmbiguityInfo(TVDBRowId row_id, CWGSDb_Impl &db, SSeqTableCursor &cur)
Definition: wgsread.cpp:1175
bool x_AmbiguousBlock(size_t block_index) const
Definition: wgsread.cpp:1097
bool x_IsValid(const S4naReader &reader) const
Definition: wgsread.cpp:1915
vector< INSDC_coord_len > m_GapLen
Definition: wgsread.cpp:1133
vector< NCBI_WGS_component_props > m_GapProps
Definition: wgsread.cpp:1134
bool x_AddAmbiguousBlock(const Uint1 *ptr, TSeqPos count, TSeqPos pos, TWGSContigGapInfo &gap_info) const
Definition: wgsread.cpp:1698
bool x_AddAmbiguities(const Uint1 *ptr, TSeqPos count, TSeqPos pos, TWGSContigGapInfo &gap_info) const
Definition: wgsread.cpp:1709
vector< INSDC_coord_zero > m_GapStart
Definition: wgsread.cpp:1132
vector< Uint1 > GetAmbiguityBytes(SSeqTableCursor &cur)
Definition: wgsread.cpp:1074
void Advance(S4naReader &reader) const
Definition: wgsread.cpp:1989
TWGSContigGapInfo GetGapInfo() const
Definition: wgsread.cpp:1323
void x_SetAmbiguousBlock(size_t block_index)
Definition: wgsread.cpp:1103
map< TSeqPos, S4naBlock > T4naBlocks
Definition: wgsread.cpp:1146
CWGSSeqIterator::TWGSContigGapInfo TWGSContigGapInfo
Definition: wgsread.cpp:1078
TSeqPos Get2naLengthExact(TSeqPos pos, TSeqPos len, CWGSDb_Impl &db, SSeqTableCursor &cur) const
Definition: wgsread.cpp:2025
CRef< CSeq_data > Get4na(TSeqPos pos, TSeqPos len, CWGSDb_Impl &db, SSeqTableCursor &cur) const
Definition: wgsread.cpp:2137
void x_Calculate4na(CWGSDb_Impl &db) const
Definition: wgsread.cpp:1834
S4naReader Get4naReader(TSeqPos pos, CWGSDb_Impl &db, SSeqTableCursor &cur) const
Definition: wgsread.cpp:1933
vector< INSDC_4na_bin > m_Ambiguity4na
Definition: wgsread.cpp:1139
vector< Uint1 > m_AmbiguityMask
Definition: wgsread.cpp:1137
CRef< CSeq_data > Get2na(TSeqPos pos, TSeqPos len, SSeqTableCursor &cur) const
Definition: wgsread.cpp:2120
TSeqPos Get4naLengthBlock(TSeqPos pos, TSeqPos len) const
Definition: wgsread.cpp:2183
TSeqPos Get2naLengthBlock(TSeqPos pos, TSeqPos len) const
Definition: wgsread.cpp:2167
vector< NCBI_WGS_gap_linkage > m_GapLinkage
Definition: wgsread.cpp:1135
TSeqPos GetGapLengthExact(TSeqPos pos, TSeqPos len, CWGSDb_Impl &db, SSeqTableCursor &cur) const
Definition: wgsread.cpp:2101
vector< INSDC_coord_zero > m_AmbiguityPos
Definition: wgsread.cpp:1138
void x_Need4na(CWGSDb_Impl &db) const
Definition: wgsread.cpp:1111
size_t GetUsedMemory() const
Definition: wgsread.cpp:1301
TSeqPos Get4naLengthExact(TSeqPos pos, TSeqPos len, TSeqPos stop_2na_len, TSeqPos stop_gap_len, CWGSDb_Impl &db, SSeqTableCursor &cur) const
Definition: wgsread.cpp:2058
SFeatTableCursor(const CVDBTable &table)
Definition: wgsread.cpp:759
DECLARE_VDB_COLUMN_AS(TVDBRowId, PRODUCT_ROW_ID)
DECLARE_VDB_COLUMN_AS(INSDC_coord_zero, PRODUCT_START)
CObjectIStreamAsnBinary m_ObjStr
Definition: wgsread.cpp:755
DECLARE_VDB_COLUMN_AS_STRING(LOC_ACCESSION)
DECLARE_VDB_COLUMN_AS(NCBI_WGS_feattype, FEAT_TYPE)
DECLARE_VDB_COLUMN_AS(NCBI_WGS_seqtype, LOC_SEQ_TYPE)
DECLARE_VDB_COLUMN_AS_STRING(PRODUCT_ACCESSION)
DECLARE_VDB_COLUMN_AS(NCBI_WGS_seqtype, PRODUCT_SEQ_TYPE)
DECLARE_VDB_COLUMN_AS(TVDBRowId, LOC_ROW_ID)
DECLARE_VDB_COLUMN_AS(INSDC_coord_zero, LOC_START)
DECLARE_VDB_COLUMN_AS(NCBI_WGS_loc_strand, LOC_STRAND)
DECLARE_VDB_COLUMN_AS(INSDC_coord_len, LOC_LEN)
DECLARE_VDB_COLUMN_AS(INSDC_coord_len, PRODUCT_LEN)
SGiIdxTableCursor(const CVDBTable &table)
Definition: wgsread.cpp:810
DECLARE_VDB_COLUMN_AS(TVDBRowId, PROT_ROW_ID)
DECLARE_VDB_COLUMN_AS(TVDBRowId, NUC_ROW_ID)
SProt0TableCursor(const CVDBTable &table)
Definition: wgsread.cpp:696
DECLARE_VDB_COLUMN_AS_STRING(GB_ACCESSION)
DECLARE_VDB_COLUMN_AS(uint32_t, ACC_VERSION)
DECLARE_VDB_COLUMN_AS_STRING(SEQID_GNL_PREFIX)
DECLARE_VDB_COLUMN_AS_STRING(PROTEIN_NAME)
string GetAcc(Uint4 id) const
Definition: wgsread.cpp:3434
pair< TVDBRowId, TVDBRowId > row_range_t
Definition: wgsread.cpp:824
DECLARE_VDB_COLUMN_AS(row_range_t, NAME_ROW_RANGE)
SProtIdxTableCursor(const CVDBTable &table)
Definition: wgsread.cpp:830
DECLARE_VDB_COLUMN_AS(TVDBRowId, ROW_ID)
DECLARE_VDB_COLUMN_AS_STRING(PUBLIC_COMMENT)
DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_ROW_START)
DECLARE_VDB_COLUMN_AS(TVDBRowId, REPLACED_BY)
SProtTableCursor(const CVDBTable &table)
Definition: wgsread.cpp:708
DECLARE_VDB_COLUMN_AS(NCBI_taxid, TAXID)
DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_ROW_END)
DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_PRODUCT_ROW_ID)
DECLARE_VDB_COLUMN_AS(INSDC_coord_len, PROTEIN_LEN)
DECLARE_VDB_COLUMN_AS(NCBI_gb_state, GB_STATE)
DECLARE_VDB_COLUMN_AS(NCBI_WGS_hash, HASH)
DECLARE_VDB_COLUMN_AS_STRING(PRODUCT_NAME)
DECLARE_VDB_COLUMN_AS(TVDBRowId, REPLACES)
DECLARE_VDB_COLUMN_AS(NCBI_WGS_component_props, COMPONENT_PROPS)
DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_ROW_START)
SScfTableCursor(const CVDBTable &table)
Definition: wgsread.cpp:631
DECLARE_VDB_COLUMN_AS(NCBI_WGS_gap_linkage, COMPONENT_LINKAGE)
DECLARE_VDB_COLUMN_AS_STRING(ACCESSION)
DECLARE_VDB_COLUMN_AS(TVDBRowId, COMPONENT_ID)
DECLARE_VDB_COLUMN_AS(INSDC_coord_len, COMPONENT_LEN)
DECLARE_VDB_COLUMN_AS(NCBI_gb_state, GB_STATE)
DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_PRODUCT_ROW_ID)
DECLARE_VDB_COLUMN_AS(INSDC_coord_one, COMPONENT_START)
DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_ROW_END)
DECLARE_VDB_COLUMN_AS_STRING(SCAFFOLD_NAME)
DECLARE_VDB_COLUMN_AS(bool, CIRCULAR)
DECLARE_VDB_COLUMN_AS_STRING(SEQID_GNL_PREFIX)
SSeq0TableCursor(const CVDBTable &table)
Definition: wgsread.cpp:536
DECLARE_VDB_COLUMN_AS(NCBI_taxid, TAXID)
DECLARE_VDB_COLUMN_AS_STRING(SEQID_GNL_PREFIX)
DECLARE_VDB_COLUMN_AS_STRING(ACC_PREFIX)
DECLARE_VDB_COLUMN_AS(INSDC_coord_len, ACC_CONTIG_LEN)
DECLARE_VDB_COLUMN_AS(INSDC_4na_bin, READ)
SSeq4naTableCursor(const CVDBTable &table)
Definition: wgsread.cpp:602
SSeqTableCursor(const CVDBTable &table)
Definition: wgsread.cpp:547
DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_PRODUCT_ROW_ID)
DECLARE_VDB_COLUMN_AS(NCBI_WGS_hash, HASH)
CVDBColumnBits< 2 > m_READ_2na
Definition: wgsread.cpp:519
DECLARE_VDB_COLUMN_AS(INSDC_coord_zero, TRIM_START)
DECLARE_VDB_COLUMN_AS(INSDC_quality_phred, QUALITY)
DECLARE_VDB_COLUMN_AS_STRING(CONTIG_NAME)
DECLARE_VDB_COLUMN_AS(NCBI_WGS_gap_linkage, GAP_LINKAGE)
DECLARE_VDB_COLUMN_AS(INSDC_4na_bin, AMBIGUITY_4NA)
DECLARE_VDB_COLUMN_AS(NCBI_gb_state, GB_STATE)
DECLARE_VDB_COLUMN_AS(INSDC_coord_zero, AMBIGUITY_POS)
DECLARE_VDB_COLUMN_AS(INSDC_coord_len, GAP_LEN)
DECLARE_VDB_COLUMN_AS(NCBI_gi, GI)
DECLARE_VDB_COLUMN_AS(INSDC_coord_zero, READ_START)
DECLARE_VDB_COLUMN_AS(INSDC_coord_len, READ_LEN)
DECLARE_VDB_COLUMN_AS(Uint1, AMBIGUITY_MASK)
DECLARE_VDB_COLUMN_AS(bool, CIRCULAR)
DECLARE_VDB_COLUMN_AS_STRING(ACCESSION)
DECLARE_VDB_COLUMN_AS(INSDC_coord_zero, GAP_START)
DECLARE_VDB_COLUMN_AS(row_range_t, CONTIG_NAME_ROW_RANGE)
DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_ROW_START)
DECLARE_VDB_COLUMN_AS(uint32_t, ACC_VERSION)
DECLARE_VDB_COLUMN_AS_STRING(PUBLIC_COMMENT)
DECLARE_VDB_COLUMN_AS(NCBI_WGS_component_props, GAP_PROPS)
DECLARE_VDB_COLUMN_AS_STRING(NUC_PROT_DESCR)
pair< TVDBRowId, TVDBRowId > row_range_t
Definition: wgsread.cpp:517
DECLARE_VDB_COLUMN_AS(INSDC_coord_len, TRIM_LEN)
DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_ROW_END)
CRef< CSeq_data > Get4na(TSeqPos pos, TSeqPos len) const
Definition: wgsread.cpp:4943
TSeqPos Get4naLengthExact(TSeqPos pos, TSeqPos len, TSeqPos stop_2na_len, TSeqPos stop_gap_len) const
Definition: wgsread.cpp:4926
TSeqPos Get2naLengthExact(TSeqPos pos, TSeqPos len) const
Definition: wgsread.cpp:4921
CRef< CWGSDb_Impl::SAmbiguityInfo > m_AmbiguityInfo
Definition: wgsread.cpp:4955
CRef< CWGSDb_Impl::SSeqTableCursor > m_Seq
Definition: wgsread.cpp:4954
vector< Uint1 > GetAmbiguityBytes() const
Definition: wgsread.cpp:4916
SAmbiguityInfo * operator->() const
Definition: wgsread.cpp:4911
SAmbiguityAccess(CRef< CWGSDb_Impl::SAmbiguityInfo > &info, CWGSDb_Impl &db, const CRef< CWGSDb_Impl::SSeqTableCursor > &seq, TVDBRowId row_id)
Definition: wgsread.cpp:4888
CRef< CSeq_data > Get2na(TSeqPos pos, TSeqPos len) const
Definition: wgsread.cpp:4938
SAmbiguityAccess(const SAmbiguityAccess &)=delete
TSeqPos GetGapLengthExact(TSeqPos pos, TSeqPos len) const
Definition: wgsread.cpp:4932
void operator=(const SAmbiguityAccess &)=delete
CRef< CSeq_literal > literal
Definition: wgsread.hpp:1217
COpenRange< TSeqPos > range
Definition: wgsread.hpp:1215
const INSDC_coord_len * gaps_len
Definition: wgsread.hpp:996
bool IsInGap(TSeqPos pos) const
Definition: wgsread.hpp:1027
TSeqPos GetGapLength(TSeqPos pos, TSeqPos len) const
Definition: wgsread.hpp:1029
const NCBI_WGS_component_props * gaps_props
Definition: wgsread.hpp:997
const INSDC_coord_zero * gaps_start
Definition: wgsread.hpp:995
TSeqPos GetDataLength(TSeqPos pos, TSeqPos len) const
Definition: wgsread.hpp:1033
const NCBI_WGS_gap_linkage * gaps_linkage
Definition: wgsread.hpp:998
CRef< CID2S_Split_Info > split
Definition: wgsread.cpp:3848
CRef< CBioseq > main_seq
Definition: wgsread.cpp:3846
void x_CreateProtSet(TVDBRowIdRange range)
Definition: wgsread.cpp:5976
void x_AddFeature(const CWGSFeatureIterator &it, CSeq_annot::TData::TFtable &dst)
Definition: wgsread.cpp:5478
void x_AddProducts(const vector< TVDBRowId > &product_row_ids)
Definition: wgsread.cpp:5929
void x_AddDescr(CTempString bytes)
Definition: wgsread.cpp:5463
SWGSCreateInfo(const CWGSDb &db)
Definition: wgsread.cpp:3808
CRef< CSeq_id > main_id
Definition: wgsread.cpp:3844
void x_SetSplitVersion(TSplitVersion split_version)
Definition: wgsread.cpp:5445
void x_SetId(Iter &it)
Definition: wgsread.cpp:3854
void x_ResetId()
Definition: wgsread.cpp:3867
void x_SetFlags(TFlags flags)
Definition: wgsread.cpp:5438
void x_SetSeq(CWGSProteinIterator &it)
Definition: wgsread.cpp:3878
CRef< CSeq_id > feat_id
Definition: wgsread.cpp:3845
void x_AddFeaturesSplit(TVDBRowIdRange range, vector< TVDBRowId > &product_row_ids)
Definition: wgsread.cpp:5763
void x_ResetSeq()
Definition: wgsread.cpp:3883
CRef< CWGSAsnBinData > data
Definition: wgsread.cpp:3851
CRef< CSeq_entry > entry
Definition: wgsread.cpp:3847
void x_SetSeq(Iter &it)
Definition: wgsread.cpp:3873
CBioseq_set & x_GetProtSet(void)
Definition: wgsread.cpp:5491
CRef< CID2S_Chunk > chunk
Definition: wgsread.cpp:3850
void x_AddFeatures(TVDBRowIdRange range, vector< TVDBRowId > &product_row_ids)
Definition: wgsread.cpp:5801
SWGSCreateInfo(const CWGSDb &db, EFromFlags, TFlags flags)
Definition: wgsread.cpp:3819
SWGSCreateInfo(const CWGSDb &db, EFromSplitVersion, TSplitVersion split_version)
Definition: wgsread.cpp:3827
TSplitVersion split_version
Definition: wgsread.cpp:3849
void x_AddFeaturesDirect(TVDBRowIdRange range, vector< TVDBRowId > &product_row_ids)
Definition: wgsread.cpp:5508
int TSplitVersion
Definition: wgsread.hpp:193
@ eFeatLocIdAccVer
Definition: wgsread.hpp:182
@ eFeatLocIdAccNoVer
Definition: wgsread.hpp:183
@ eFeatLocIdUninitialized
Definition: wgsread.hpp:180
@ fSplitProducts
Definition: wgsread.hpp:169
@ fMasterDescrMark
Definition: wgsread.hpp:157
@ fSplitQualityGraph
Definition: wgsread.hpp:167
@ fSplitFeatures
Definition: wgsread.hpp:170
@ eGBStateProject
Definition: wgsread.hpp:189
static const TSplitVersion kDefaultSplitVersion
Definition: wgsread.hpp:194
int TChunkId
Definition: wgsread.hpp:196
void AddFeature(NCBI_WGS_feattype type, COpenRange< TSeqPos > range)
Definition: wgsread.cpp:5593
void AddFeatType(NCBI_WGS_feattype feat_type)
Definition: wgsread.cpp:5555
static bool ExpandRange(COpenRange< TSeqPos > &dst, COpenRange< TSeqPos > src)
Definition: wgsread.cpp:5564
void AddFeatRange(COpenRange< TSeqPos > range)
Definition: wgsread.cpp:5587
bitset< CSeqFeatData::e_MaxChoice > feat_types
Definition: wgsread.cpp:5548
vector< COpenRange< TSeqPos > > loc_ranges
Definition: wgsread.cpp:5547
void AddContent(CID2S_Chunk_Info &chunk, CSeq_id &feat_id)
Definition: wgsread.cpp:5735
CRef< CSeq_id > main_id
Definition: wgsread.cpp:5540
CRef< CSeq_id > feat_id
Definition: wgsread.cpp:5541
void AddFeature(bool with_product, NCBI_WGS_feattype type, COpenRange< TSeqPos > range)
Definition: wgsread.cpp:5623
SFeatureSet features[2]
Definition: wgsread.cpp:5605
CRef< CID2S_Chunk_Info > CreateChunkInfo(int index, CWGSProteinIterator &prot_it, const vector< TVDBRowId > &product_row_ids, size_t product_index)
Definition: wgsread.cpp:5656
CRef< CID2S_Bioseq_Ids::C_E > seq_place
Definition: wgsread.cpp:5542
SWGSFeatChunkInfo(CSeq_id &main_id, CSeq_id &feat_id)
Definition: wgsread.cpp:5607
static DP_BlockInfo * blocks
Definition: type.c:6
#define _ASSERT
#define ftable
Definition: utilfeat.h:37
#define INIT_VDB_COLUMN(name)
Definition: vdbread.hpp:610
uint64_t TVDBRowCount
Definition: vdbread.hpp:83
#define INIT_VDB_COLUMN_AS(name, type)
Definition: vdbread.hpp:614
#define INIT_VDB_COLUMN_BACKUP(name, backup_name)
Definition: vdbread.hpp:612
pair< TVDBRowId, TVDBRowCount > TVDBRowIdRange
Definition: vdbread.hpp:84
#define INIT_OPTIONAL_VDB_COLUMN(name)
Definition: vdbread.hpp:616
int64_t TVDBRowId
Definition: vdbread.hpp:80
int32_t NCBI_WGS_gap_linkage
Definition: wgs-contig.h:111
int16_t NCBI_WGS_component_props
Definition: wgs-contig.h:54
@ NCBI_WGS_strand_minus
Definition: wgs-contig.h:92
@ NCBI_WGS_strand_plus
Definition: wgs-contig.h:91
@ NCBI_WGS_gap_linkage_evidence_paired_ends
Definition: wgs-contig.h:152
@ NCBI_WGS_gap_linkage_linked
Definition: wgs-contig.h:151
@ NCBI_WGS_gap_contig
Definition: wgs-contig.h:125
@ NCBI_WGS_gap_short_arm
Definition: wgs-contig.h:127
@ NCBI_WGS_gap_unknown_type
Definition: wgs-contig.h:131
@ NCBI_WGS_gap_telomere
Definition: wgs-contig.h:129
@ NCBI_WGS_gap_repeat
Definition: wgs-contig.h:130
@ NCBI_WGS_gap_centromere
Definition: wgs-contig.h:126
@ NCBI_WGS_gap_heterochromatin
Definition: wgs-contig.h:128
@ NCBI_WGS_gap_scaffold
Definition: wgs-contig.h:124
@ NCBI_WGS_gap_unknown
Definition: wgs-contig.h:105
@ NCBI_WGS_gap_known
Definition: wgs-contig.h:104
static const char kMasterDescrMark[]
Definition: wgsmaster.cpp:52
static const bool kRecoverGaps
Definition: wgsread.cpp:1335
static const size_t kProdPerChunk
Definition: wgsread.cpp:199
static TTaxId s_GetTaxId(const CVDBValueFor< NCBI_taxid > &value)
Definition: wgsread.cpp:4393
BEGIN_NAMESPACE(objects)
#define DEFAULT_AMBIGUITY_CACHE_SIZE
Definition: wgsread.cpp:70
void sx_SetSplitInterval(CID2S_Seq_loc &split_loc, CSeq_id &id, TSeqPos pos, TSeqPos end)
Definition: wgsread.cpp:2750
static CSafeStatic< TGlobalAmbiguityCache, SStaticGlobalAmbiguityCacheCallbacks > s_GlobalAmbiguityCache
Definition: wgsread.cpp:2208
static bool s_UseAmbiguity4na(void)
Definition: wgsread.cpp:139
static void s_Convert_2na_to_4na(char *dst_4na, const char *src_2na, size_t base_count)
Definition: wgsread.cpp:1460
static bool sx_HasAmbiguity(const Uint1 *ptr, const Uint1 *end)
Definition: wgsread.cpp:1391
static void s_Set_4na_gap(vector< char > &dst_4na_vec, size_t offset, size_t len)
Definition: wgsread.cpp:1520
static int kAssignedDefaultSplitVersion
Definition: wgsread.cpp:188
static CRef< CSeq_literal > sx_MakeGapLiteral(TSeqPos len, NCBI_WGS_component_props props, NCBI_WGS_gap_linkage gap_linkage)
Definition: wgsread.cpp:4987
static const size_t kMinFeatCountToSplit
Definition: wgsread.cpp:200
static bool s_UseFull4naBlocks(void)
Definition: wgsread.cpp:151
static void s_AddUserObjectType(const CSeqdesc &desc, set< string > &existing_uo_types)
Definition: wgsread.cpp:3182
BEGIN_LOCAL_NAMESPACE
Definition: wgsread.cpp:2537
static const TSeqPos kDataChunkSize
Definition: wgsread.cpp:203
static CWGSSeqIterator::TIncludeFlags s_ToFlags(CWGSSeqIterator::EWithdrawn withdrawn)
Definition: wgsread.cpp:4059
static const size_t kFeatPerChunk
Definition: wgsread.cpp:201
static const Uint1 * sx_FindAmbiguity(const Uint1 *ptr, const Uint1 *end)
Definition: wgsread.cpp:1354
static const TSeqPos kChunk2naSize
Definition: wgsread.cpp:5101
static void s_GetMinMax(const Uint1 *arr, size_t size, Uint1 &min_v, Uint1 &max_v)
Definition: wgsread.cpp:4712
static void s_Copy_4na(char *dst_4na, TSeqPos dst_offset, const char *src_4na, TSeqPos src_offset, size_t base_count)
Definition: wgsread.cpp:1545
static char s_ConvertBits_2na_to_4na_2nd(char bits_2na)
Definition: wgsread.cpp:1452
static size_t sx_Find_4na_Ambiguity(const char *ptr, size_t offset, size_t base_count)
Definition: wgsread.cpp:1378
void sx_SetSplitId(CID2S_Bioseq_Ids::C_E &split_id, CSeq_id &id)
Definition: wgsread.cpp:2711
void sx_AddAnnotBytes(CBioseq::TAnnot &annot_set, CTempString bytes)
Definition: wgsread.cpp:2698
static int kMainEntryId
Definition: wgsread.cpp:189
static bool s_GetClipByQuality(void)
Definition: wgsread.cpp:103
#define PROFILE(var)
Definition: wgsread.cpp:306
static const TSeqPos kMinDataSplitSize
Definition: wgsread.cpp:204
END_LOCAL_NAMESPACE
Definition: wgsread.cpp:2768
static char s_ConvertBits_2na_to_4na(char bits_2na)
Definition: wgsread.cpp:1428
static bool kEnableSplitData
Definition: wgsread.cpp:183
static void s_Set_4na(vector< char > &dst_4na_vec, size_t offset, INSDC_4na_bin amb)
Definition: wgsread.cpp:1503
static bool kEnableSplitProd
Definition: wgsread.cpp:184
static Uint1 sx_Get_4na(const char *ptr, size_t offset)
Definition: wgsread.cpp:1366
EFromFlags
Definition: wgsread.cpp:3797
@ eFromFlags
Definition: wgsread.cpp:3798
static void sx_AddMasterDescr(const CWGSDb &db, SWGSCreateInfo &info, SWGSDb_Defs::TFlags flags)
Definition: wgsread.cpp:6032
int sx_StringToNonNegativeInt(const CTempString &str)
Definition: wgsread.cpp:2563
void sx_AddSplitIds(CID2S_Bioseq_Ids::Tdata &split_ids, const CBioseq::TId &ids)
Definition: wgsread.cpp:2741
static bool s_UseAmbiguityMask(void)
Definition: wgsread.cpp:115
static const TSeqPos kMin2naSize
Definition: wgsread.cpp:5097
static bool sx_Is2na(Uint1 b)
Definition: wgsread.cpp:1346
bool sx_SetAccession(CSeq_id &id, CTempString accession)
Definition: wgsread.cpp:2549
static const char kSeq_descrFirstByte
Definition: wgsread.cpp:178
static const TSeqPos kChunk4naSize
Definition: wgsread.cpp:5100
void sx_SetTag(CDbtag &tag, CTempString str)
Definition: wgsread.cpp:2660
static int s_GetDebugLevel(void)
Definition: wgsread.cpp:87
static bool kEnableSplitQual
Definition: wgsread.cpp:182
static const TSeqPos kAmbiguityBlockSize
Definition: wgsread.cpp:179
int sx_GetStringId(CTempString str)
Definition: wgsread.cpp:2619
static void sx_Assign(vector< Value > &dst, const CVDBValueFor< Value > &src)
Definition: wgsread.cpp:1168
void sx_AddDescrBytes(CSeq_descr &descr, CTempString bytes)
Definition: wgsread.cpp:2673
static bool kEnableSplitFeat
Definition: wgsread.cpp:185
NCBI_PARAM_DEF_EX(int, WGS, DEBUG, 0, eParam_NoThread, WGS_DEBUG)
static void sx_AddEvidence(CSeq_gap &gap, CLinkage_evidence::TType type)
Definition: wgsread.cpp:4978
END_NCBI_NAMESPACE
Definition: wgsread.cpp:7928
static TGi s_ToGi(TVDBRowId gi, const char *method)
Definition: wgsread.cpp:3299
limited_resource_map< pair< string, TVDBRowId >, CRef< CWGSDb_Impl::SAmbiguityInfo >, size_t > TGlobalAmbiguityCache
Definition: wgsread.cpp:2202
static char s_ConvertBits_2na_to_4na_1st(char bits_2na)
Definition: wgsread.cpp:1443
static bool s_UseGapInfo(void)
Definition: wgsread.cpp:127
static void s_SetAmbiguitiesPos(vector< char > &dst_4na_vec, TSeqPos pos, TSeqPos len, const vector< INSDC_coord_zero > &amb_pos, const vector< INSDC_4na_bin > &amb_4na)
Definition: wgsread.cpp:1626
static bool sx_HasMoreProducts(const CWGSDb &db, TVDBRowIdRange range, size_t count)
Definition: wgsread.cpp:5914
END_NAMESPACE(objects)
BEGIN_NCBI_NAMESPACE
Definition: wgsread.cpp:75
NCBI_DEFINE_ERR_SUBCODE_X(19)
DEFINE_STATIC_FAST_MUTEX(s_GlobalAmbiguityCacheMutex)
static void s_AddGiRange(CID2S_Seq_loc::TLoc_set &loc_set, CSeq_id::TGi gi_range_start, CSeq_id::TGi gi_range_stop)
Definition: wgsread.cpp:5636
EFromSplitVersion
Definition: wgsread.cpp:3800
@ eFromSplitVersion
Definition: wgsread.cpp:3801
static void s_SetGaps(vector< char > &dst_4na_vec, TSeqPos pos, TSeqPos len, CWGSSeqIterator::TWGSContigGapInfo gap_info)
Definition: wgsread.cpp:1670
static size_t s_GetAmbiguityCacheSize(void)
Definition: wgsread.cpp:163
static const TSeqPos kQualChunkSize
Definition: wgsread.cpp:202
static const TSeqPos kSplit2naSize
Definition: wgsread.cpp:5105
bool sx_SetVersion(CSeq_id &id, int version)
Definition: wgsread.cpp:2539
static void s_SetAmbiguitiesBlocks(vector< char > &dst_4na_vec, TSeqPos pos, TSeqPos len, const CWGSDb_Impl::SAmbiguityInfo::T4naBlocks &blocks)
Definition: wgsread.cpp:1641
static string s_GetUserObjectType(const CSeqdesc &desc)
Definition: wgsread.cpp:3158
void sx_AddSplitId(CID2S_Bioseq_Ids::Tdata &split_ids, CSeq_id &id)
Definition: wgsread.cpp:2733
EChunkType
Definition: wgsread.cpp:190
@ eChunk_qual
Definition: wgsread.cpp:194
@ eChunk_feat
Definition: wgsread.cpp:193
@ kChunkIdStep
Definition: wgsread.cpp:195
@ eChunk_prod
Definition: wgsread.cpp:191
@ eChunk_data
Definition: wgsread.cpp:192
NCBI_PARAM_DEF(bool, WGS, MASTER_DESCR, true)
EDeltaType
Definition: wgsread.cpp:5110
@ eDelta_all
Definition: wgsread.cpp:5111
@ eDelta_split
Definition: wgsread.cpp:5112
static const TSeqPos kSplit4naSize
Definition: wgsread.cpp:5104
int sx_NewStringToNonNegativeInt(CTempString str)
Definition: wgsread.cpp:2568
static void s_Pack_4na(char *dst_packed_4na, const Uint1 *src_4na, size_t base_count)
Definition: wgsread.cpp:1604
NCBI_PARAM_DECL(int, WGS, DEBUG)
uint8_t NCBI_WGS_seqtype
Definition: wgsread.hpp:64
@ NCBI_gb_state_eWGSGenBankReplaced
Definition: wgsread.hpp:88
@ NCBI_gb_state_eWGSGenBankMissing
Definition: wgsread.hpp:91
@ NCBI_gb_state_eWGSGenBankLive
Definition: wgsread.hpp:86
uint8_t INSDC_quality_phred
Definition: wgsread.hpp:58
int32_t NCBI_WGS_hash
Definition: wgsread.hpp:95
uint8_t NCBI_WGS_loc_strand
Definition: wgsread.hpp:75
int32_t INSDC_coord_zero
Definition: wgsread.hpp:55
uint8_t NCBI_WGS_feattype
Definition: wgsread.hpp:73
uint32_t NCBI_gb_state
Definition: wgsread.hpp:53
static wxAcceleratorEntry entries[3]
Modified on Fri Sep 20 14:57:19 2024 by modify_doxy.py rev. 669887