NCBI C++ ToolKit
wgsread.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: wgsread.cpp 100559 2023-08-10 21:33:55Z vasilche $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Eugene Vasilchenko
27  *
28  * File Description:
29  * Access to WGS files
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
36 #include <corelib/ncbistr.hpp>
37 #include <corelib/ncbifile.hpp>
38 #include <corelib/ncbi_param.hpp>
39 #include <util/line_reader.hpp>
41 #include <objects/seq/seq__.hpp>
48 #include <serial/objistrasnb.hpp>
49 #include <serial/objostrasnb.hpp>
50 #include <serial/serial.hpp>
51 #include <serial/pack_string.hpp>
52 #include <serial/objhook.hpp>
53 #include <serial/objectio.hpp>
54 #include <sra/error_codes.hpp>
55 
57 #include <ncbi/ncbi.h>
58 #include <insdc/insdc.h>
60 #include <vdb/vdb-priv.h>
61 #include <numeric>
62 
63 //#define COLLECT_PROFILE
64 //#define TEST_ACC_VERSION
65 //#define USE_TEST_PATH
66 
67 #define USE_GLOBAL_AMBIGUITY_CACHE
68 
69 #ifdef USE_GLOBAL_AMBIGUITY_CACHE
70 # define DEFAULT_AMBIGUITY_CACHE_SIZE "128MB"
71 #else
72 # define DEFAULT_AMBIGUITY_CACHE_SIZE "32MB"
73 #endif
74 
76 
77 #define NCBI_USE_ERRCODE_X WGSReader
79 
81 
82 
84 NCBI_PARAM_DEF_EX(int, WGS, DEBUG, 0, eParam_NoThread, WGS_DEBUG);
85 
86 
87 static int s_GetDebugLevel(void)
88 {
89  static int value = NCBI_PARAM_TYPE(WGS, DEBUG)::GetDefault();
90  return value;
91 }
92 
93 
94 NCBI_PARAM_DECL(bool, WGS, MASTER_DESCR);
95 NCBI_PARAM_DEF(bool, WGS, MASTER_DESCR, true);
96 
97 
98 NCBI_PARAM_DECL(bool, WGS, CLIP_BY_QUALITY);
99 NCBI_PARAM_DEF_EX(bool, WGS, CLIP_BY_QUALITY, true,
100  eParam_NoThread, CSRA_CLIP_BY_QUALITY);
101 
102 
103 static bool s_GetClipByQuality(void)
104 {
105  static CSafeStatic<NCBI_PARAM_TYPE(WGS, CLIP_BY_QUALITY)> s_Value;
106  return s_Value->Get();
107 }
108 
109 
110 NCBI_PARAM_DECL(bool, WGS, USE_AMBIGUITY_MASK);
111 NCBI_PARAM_DEF_EX(bool, WGS, USE_AMBIGUITY_MASK, true,
112  eParam_NoThread, WGS_USE_AMBIGUITY_MASK);
113 
114 
115 static bool s_UseAmbiguityMask(void)
116 {
117  static bool v = NCBI_PARAM_TYPE(WGS, USE_AMBIGUITY_MASK)::GetDefault();
118  return v;
119 }
120 
121 
122 NCBI_PARAM_DECL(bool, WGS, USE_GAP_INFO);
123 NCBI_PARAM_DEF_EX(bool, WGS, USE_GAP_INFO, true,
124  eParam_NoThread, WGS_USE_GAP_INFO);
125 
126 
127 static bool s_UseGapInfo(void)
128 {
129  static bool v = NCBI_PARAM_TYPE(WGS, USE_GAP_INFO)::GetDefault();
130  return v;
131 }
132 
133 
134 NCBI_PARAM_DECL(bool, WGS, USE_AMBIGUITY_4NA);
135 NCBI_PARAM_DEF_EX(bool, WGS, USE_AMBIGUITY_4NA, true,
136  eParam_NoThread, WGS_USE_AMBIGUITY_4NA);
137 
138 
139 static bool s_UseAmbiguity4na(void)
140 {
141  static bool v = NCBI_PARAM_TYPE(WGS, USE_AMBIGUITY_4NA)::GetDefault();
142  return v;
143 }
144 
145 
146 NCBI_PARAM_DECL(bool, WGS, USE_FULL_4NA_BLOCKS);
147 NCBI_PARAM_DEF_EX(bool, WGS, USE_FULL_4NA_BLOCKS, false,
148  eParam_NoThread, WGS_USE_FULL_4NA_BLOCKS);
149 
150 
151 static bool s_UseFull4naBlocks(void)
152 {
153  static bool v = NCBI_PARAM_TYPE(WGS, USE_FULL_4NA_BLOCKS)::GetDefault();
154  return v;
155 }
156 
157 
158 NCBI_PARAM_DECL(string, WGS, AMBIGUITY_CACHE);
159 NCBI_PARAM_DEF_EX(string, WGS, AMBIGUITY_CACHE, DEFAULT_AMBIGUITY_CACHE_SIZE,
160  eParam_NoThread, WGS_AMBIGUITY_CACHE);
161 
162 
163 static size_t s_GetAmbiguityCacheSize(void)
164 {
165  static size_t v = NStr::StringToUInt8_DataSize(NCBI_PARAM_TYPE(WGS, AMBIGUITY_CACHE)::GetDefault());
166  return v;
167 }
168 
169 
170 #ifdef USE_TEST_PATH
171 NCBI_PARAM_DECL(string, WGS, TEST_PATH);
172 NCBI_PARAM_DEF_EX(string, WGS, TEST_PATH, "",
173  eParam_NoThread, WGS_TEST_PATH);
174 #endif
175 
176 
177 // fixed WGS VDB parameters
178 static const char kSeq_descrFirstByte = 49; // first byte of Seq-descr ASN.1
179 static const TSeqPos kAmbiguityBlockSize = 1024; // defined by WGS VDB schema
180 
181 // split parameters, turn on/off splitting of different pieces of information
182 static bool kEnableSplitQual = true;
183 static bool kEnableSplitData = true;
184 static bool kEnableSplitProd = true;
185 static bool kEnableSplitFeat = true;
186 
187 // split info fixed parameters
189 static int kMainEntryId = 1;
195  kChunkIdStep = 4
196 };
197 
198 // split configurable parameters
199 static const size_t kProdPerChunk = 64;
200 static const size_t kMinFeatCountToSplit = 64;
201 static const size_t kFeatPerChunk = 256;
202 static const TSeqPos kQualChunkSize = 64<<10; // 64KiB
203 static const TSeqPos kDataChunkSize = 256<<10; // 64KiB in 2na encoding
204 static const TSeqPos kMinDataSplitSize = 128<<10;
205 
206 #ifdef COLLECT_PROFILE
207 struct SProfiler
208 {
209  const char* name;
210  size_t count;
211  CStopWatch sw;
212  SProfiler() : name(0), count(0) {}
213  ~SProfiler() {
214  if ( name )
215  cout << name<<" calls: "<<count<<" time: "<<sw.Elapsed()<<endl;
216  }
217 };
218 struct SProfilerGuard
219 {
220  SProfiler& sw;
221  SProfilerGuard(SProfiler& sw, const char* name)
222  : sw(sw)
223  {
224  sw.name = name;
225  sw.count += 1;
226  sw.sw.Start();
227  }
228  ~SProfilerGuard()
229  {
230  sw.sw.Stop();
231  }
232 };
233 
234 static SProfiler sw_Serialize;
235 static SProfiler sw_Feat;
236 static SProfiler sw_GetAccSeq_id;
237 static SProfiler sw_GetBioseq;
238 static SProfiler sw_GetSeq_entry;
239 static SProfiler sw_GetSeq_entryData;
240 static SProfiler sw_GetSplitInfo;
241 static SProfiler sw_GetSplitInfoData;
242 static SProfiler sw_InitSplit;
243 static SProfiler sw_GetFeatLocIdTypeRange;
244 static SProfiler sw_GetFeatLocIdTypeFeat;
245 static SProfiler sw_GetFeatLocIdTypeFeatBytes;
246 static SProfiler sw_GetFeatBytes;
247 static SProfiler sw_GetChunk;
248 static SProfiler sw_CreateQualityChunk;
249 static SProfiler sw_CreateDataChunk;
250 static SProfiler sw_CreateProductsChunk;
251 static SProfiler sw_CreateFeaturesChunk;
252 static SProfiler sw__GetProtFeat;
253 static SProfiler sw___GetProtAnnot;
254 static SProfiler sw___GetProtInst;
255 static SProfiler sw___GetProtDescr;
256 static SProfiler sw____GetProtWGSAcc;
257 static SProfiler sw____GetProtAccVer;
258 static SProfiler sw____GetProtAcc;
259 static SProfiler sw____GetProtGI;
260 static SProfiler sw____GetProtGISeq_id;
261 static SProfiler sw____GetProtGnlSeq_id;
262 static SProfiler sw____GetProtAccSeq_id;
263 static SProfiler sw___GetProtIds;
264 static SProfiler sw__GetProtBioseq;
265 static SProfiler sw_GetProtEntry;
266 static SProfiler sw__GetScaffoldFeat;
267 static SProfiler sw___GetScaffoldQual;
268 static SProfiler sw___GetScaffoldAnnot;
269 static SProfiler sw___GetScaffoldInst;
270 static SProfiler sw___GetScaffoldDescr;
271 static SProfiler sw___GetScaffoldIds;
272 static SProfiler sw__GetScaffoldBioseq;
273 static SProfiler sw_GetScaffoldEntry;
274 static SProfiler sw__GetContigFeat;
275 static SProfiler sw___GetContigQual;
276 static SProfiler sw____GetContigQualSize;
277 static SProfiler sw____GetContigQualData;
278 static SProfiler sw____GetContigQualMinMax;
279 static SProfiler sw___GetContigAnnot;
280 static SProfiler sw____IsGap;
281 static SProfiler sw____Get2naLen;
282 static SProfiler sw____Get4naLen;
283 static SProfiler sw____GetGapLen;
284 static SProfiler sw____GetRaw2na;
285 static SProfiler sw____GetRaw4na;
286 static SProfiler sw____GetAmb2Mask;
287 static SProfiler sw____Get4na2Mask;
288 static SProfiler sw____Scan4na;
289 static SProfiler sw____GetCvt4na;
290 static SProfiler sw____GetAmb4na;
291 static SProfiler sw____GetBlk4na;
292 static SProfiler sw____SetGaps;
293 static SProfiler sw___GetContigInst;
294 static SProfiler sw___GetContigDescr;
295 static SProfiler sw___GetContigIds;
296 static SProfiler sw__GetContigBioseq;
297 static SProfiler sw_GetContigEntry;
298 static SProfiler sw_FeatIterator;
299 static SProfiler sw_ProtIterator;
300 static SProfiler sw_ScafIterator;
301 static SProfiler sw_SeqIterator;
302 static SProfiler sw_WGSOpen;
303 
304 # define PROFILE(var) SProfilerGuard guard(var, #var)
305 #else
306 # define PROFILE(var)
307 #endif
308 
309 /////////////////////////////////////////////////////////////////////////////
310 // CAsnBinData
311 /////////////////////////////////////////////////////////////////////////////
312 
313 
315  : m_MainObject(&obj)
316 {
317 }
318 
319 
321 {
322 }
323 
324 
326 {
327  out << *m_MainObject;
328 }
329 
330 
332 {
333 public:
335  : CAsnBinData(obj),
337  {
338  }
339  virtual ~CWGSAsnBinData(void)
340  {
341  }
342 
343  virtual void Serialize(CObjectOStreamAsnBinary& out) const;
344 
346  struct SFtableInfo {
347  vector<char> m_Bytes;
348 
350  {
351  m_Bytes.insert(m_Bytes.end(), data.begin(), data.end());
352  }
353  };
354  typedef vector<char> TDescrInfo;
355 
357  {
358  m_FtableMap[&ftable].AddFeature(data);
359  }
360  void AddDescr(CBioseq& seq, const CTempString& data)
361  {
362  seq.SetDescr(*m_EmptyDescr);
363  vector<char>& dst = m_DescrMap[&seq];
364  if ( data[0] == kSeq_descrFirstByte ) {
365  // test for DESCR variant with Seqdesc list instead of Seq-descr
366  dst.assign(data.begin()+2, data.end()-2);
367  }
368  else {
369  dst.assign(data.begin(), data.end());
370  }
371  }
372 
378 };
379 
381 {
382 public:
387  : info_map(info_map)
388  {
389  }
390 
392  const CConstObjectInfoCV& variant)
393  {
394  CConstObjectInfo var_info = variant.GetVariant();
395  TKey key = (TKey)var_info.GetObjectPtr();
397  if ( iter != info_map.end() ) {
398  COStreamContainer cont(out, var_info);
400  cont << **it;
401  }
402  const TInfo& info = iter->second;
403  out.Write(info.m_Bytes.data(), info.m_Bytes.size());
404  }
405  else {
406  DefaultWrite(out, variant);
407  }
408  }
409 
411 };
412 
413 
415 {
416 public:
417  typedef const CBioseq* TKey;
421  : info_map(info_map)
422  {
423  }
424 
426  const CConstObjectInfoMI& member)
427  {
428  TKey key = (TKey)member.GetClassObject().GetObjectPtr();
430  if ( iter != info_map.end() ) {
431  COStreamClassMember mem(out, member);
432  const TInfo& info = iter->second;
433  if ( info.data()[0] == kSeq_descrFirstByte ) {
434  // Seq-descr
435  out.Write(info.data(), info.size());
436  }
437  else {
438  CObjectTypeInfo cont = member.GetMemberType();
439  while ( cont.GetTypeFamily() == eTypeFamilyPointer ) {
440  cont = cont.GetPointedType();
441  }
442  COStreamContainer mem(out, cont);
443  out.Write(info.data(), info.size());
444  }
445  }
446  else {
447  DefaultWrite(out, member);
448  }
449  }
450 
452 };
453 
454 
456 {
457  PROFILE(sw_Serialize);
459  CObjectHookGuard<CSeq_annot::TData> guard1("ftable", hook1, &out);
461  CObjectHookGuard<CBioseq> guard2("descr", hook2, &out);
463 }
464 
465 
466 /////////////////////////////////////////////////////////////////////////////
467 // CWGSDb_Impl cursors
468 /////////////////////////////////////////////////////////////////////////////
469 
470 
471 // SSeq0TableCursor is helper accessor structure for SEQUENCE table
473  explicit SSeq0TableCursor(const CVDBTable& table);
474 
476 
479  DECLARE_VDB_COLUMN_AS_STRING(SEQID_GNL_PREFIX);
481  DECLARE_VDB_COLUMN_AS(NCBI_taxid, TAXID);
482 };
483 
484 
485 // SSeqTableCursor is helper accessor structure for SEQUENCE table
487  explicit SSeqTableCursor(const CVDBTable& table);
488 
490 
491  DECLARE_VDB_COLUMN_AS(NCBI_gi, GI);
512  DECLARE_VDB_COLUMN_AS(bool, CIRCULAR);
516  DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_PRODUCT_ROW_ID);
517  typedef pair<TVDBRowId, TVDBRowId> row_range_t;
518  DECLARE_VDB_COLUMN_AS(row_range_t, CONTIG_NAME_ROW_RANGE);
520  DECLARE_VDB_COLUMN_AS(Uint1, AMBIGUITY_MASK);
522  DECLARE_VDB_COLUMN_AS(INSDC_4na_bin, AMBIGUITY_4NA);
523 };
524 
525 
526 // SSeq4naTableCursor is helper accessor structure for SEQUENCE table
528  explicit SSeq4naTableCursor(const CVDBTable& table);
529 
531 
532  DECLARE_VDB_COLUMN_AS(INSDC_4na_bin, READ); // unpacked 4na, one base per byte
533 };
534 
535 
537  : m_Cursor(table),
538  INIT_VDB_COLUMN(ACC_PREFIX),
539  INIT_VDB_COLUMN(ACC_CONTIG_LEN),
540  INIT_OPTIONAL_VDB_COLUMN(SEQID_GNL_PREFIX),
543 {
544 }
545 
546 
548  : m_Cursor(table),
550  INIT_VDB_COLUMN(ACCESSION),
551  INIT_VDB_COLUMN(ACC_VERSION),
552  INIT_VDB_COLUMN(CONTIG_NAME),
554  INIT_VDB_COLUMN(TITLE),
556  INIT_VDB_COLUMN(READ_START),
557  INIT_VDB_COLUMN(READ_LEN),
558  INIT_VDB_COLUMN(TRIM_START),
559  INIT_VDB_COLUMN(TRIM_LEN),
561  INIT_OPTIONAL_VDB_COLUMN(NUC_PROT_DESCR),
563  INIT_OPTIONAL_VDB_COLUMN(GB_STATE),
564  INIT_OPTIONAL_VDB_COLUMN(PUBLIC_COMMENT),
565  INIT_OPTIONAL_VDB_COLUMN(GAP_START),
566  INIT_OPTIONAL_VDB_COLUMN(GAP_LEN),
567  INIT_OPTIONAL_VDB_COLUMN(GAP_PROPS),
568  INIT_OPTIONAL_VDB_COLUMN(GAP_LINKAGE),
569  INIT_OPTIONAL_VDB_COLUMN(QUALITY),
570  INIT_OPTIONAL_VDB_COLUMN(CIRCULAR),
572  INIT_OPTIONAL_VDB_COLUMN(FEAT_ROW_START),
573  INIT_OPTIONAL_VDB_COLUMN(FEAT_ROW_END),
574  INIT_OPTIONAL_VDB_COLUMN(FEAT_PRODUCT_ROW_ID),
575  INIT_OPTIONAL_VDB_COLUMN(CONTIG_NAME_ROW_RANGE),
576  m_READ_2na(m_Cursor, "(INSDC:2na:packed)READ",
577  NULL, CVDBColumn::eMissing_Allow), // packed 2na
578  INIT_OPTIONAL_VDB_COLUMN(AMBIGUITY_MASK),
579  INIT_OPTIONAL_VDB_COLUMN(AMBIGUITY_POS),
580  INIT_OPTIONAL_VDB_COLUMN(AMBIGUITY_4NA)
581 {
582  if ( !s_UseAmbiguityMask() ) {
583  m_AMBIGUITY_MASK = CVDBColumnBits<8>();
584  }
585  if ( !s_UseGapInfo() ) {
586  m_GAP_START = CVDBColumnBits<32>();
587  }
588  if ( s_UseAmbiguity4na() && m_GAP_START && m_GAP_LEN && m_AMBIGUITY_POS && m_AMBIGUITY_4NA ) {
589  // all fields to restore ambiguities are present
590  }
591  else {
592  // otherwise we need 4na data
593  m_AMBIGUITY_POS.Reset();
594  m_AMBIGUITY_4NA.Reset();
595  }
596 
597  // optimization - treat completely empty QUALITY column as inexistent - no quality graphs
598  m_QUALITY.ResetIfAlwaysEmpty(m_Cursor);
599 }
600 
601 
603  : m_Cursor(table),
604  INIT_VDB_COLUMN_AS(READ, INSDC:4na:bin)
605 {
606 }
607 
608 
609 // SScfTableCursor is helper accessor structure for optional SCAFFOLD table
612 
614 
617  DECLARE_VDB_COLUMN_AS_STRING(SEQID_GNL_PREFIX);
623  DECLARE_VDB_COLUMN_AS(bool, CIRCULAR);
626  DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_PRODUCT_ROW_ID);
628 };
629 
630 
632  : m_Cursor(table),
633  INIT_VDB_COLUMN(SCAFFOLD_NAME),
634  INIT_OPTIONAL_VDB_COLUMN(ACCESSION),
635  INIT_OPTIONAL_VDB_COLUMN(SEQID_GNL_PREFIX),
636  INIT_VDB_COLUMN(COMPONENT_ID),
637  INIT_VDB_COLUMN(COMPONENT_START),
638  INIT_VDB_COLUMN(COMPONENT_LEN),
639  INIT_VDB_COLUMN(COMPONENT_PROPS),
640  INIT_OPTIONAL_VDB_COLUMN(COMPONENT_LINKAGE),
641  INIT_OPTIONAL_VDB_COLUMN(CIRCULAR),
642  INIT_OPTIONAL_VDB_COLUMN(FEAT_ROW_START),
643  INIT_OPTIONAL_VDB_COLUMN(FEAT_ROW_END),
644  INIT_OPTIONAL_VDB_COLUMN(FEAT_PRODUCT_ROW_ID),
645  INIT_OPTIONAL_VDB_COLUMN(GB_STATE)
646 {
647 }
648 
649 
650 // SProt0TableCursor is helper accessor structure for optional PROTEIN table
652  explicit SProt0TableCursor(const CVDBTable& table);
653 
655 
656  DECLARE_VDB_COLUMN_AS(NCBI_gi, GI);
657  //DECLARE_VDB_COLUMN_AS_STRING(ACCESSION);
660  DECLARE_VDB_COLUMN_AS_STRING(SEQID_GNL_PREFIX);
662 };
663 
664 
665 // SProtTableCursor is helper accessor structure for optional PROTEIN table
667  explicit SProtTableCursor(const CVDBTable& table);
668 
670 
671  //DECLARE_VDB_COLUMN_AS(NCBI_gi, GI);
672  //DECLARE_VDB_COLUMN_AS_STRING(ACCESSION);
673  //DECLARE_VDB_COLUMN_AS_STRING(GB_ACCESSION);
674  //DECLARE_VDB_COLUMN_AS(uint32_t, ACC_VERSION);
675  //DECLARE_VDB_COLUMN_AS_STRING(SEQID_GNL_PREFIX);
682  //DECLARE_VDB_COLUMN_AS_STRING(PROTEIN_NAME);
684  DECLARE_VDB_COLUMN_AS(NCBI_taxid, TAXID);
689  DECLARE_VDB_COLUMN_AS(TVDBRowId, FEAT_PRODUCT_ROW_ID);
693 };
694 
695 
697  : m_Cursor(table),
699  //INIT_VDB_COLUMN(ACCESSION),
700  INIT_OPTIONAL_VDB_COLUMN(GB_ACCESSION),
701  INIT_VDB_COLUMN(ACC_VERSION),
702  INIT_OPTIONAL_VDB_COLUMN(SEQID_GNL_PREFIX),
703  INIT_VDB_COLUMN(PROTEIN_NAME)
704 {
705 }
706 
707 
709  : m_Cursor(table),
710  //INIT_OPTIONAL_VDB_COLUMN(GI),
711  //INIT_VDB_COLUMN(ACCESSION),
712  //INIT_OPTIONAL_VDB_COLUMN(GB_ACCESSION),
713  //INIT_VDB_COLUMN(ACC_VERSION),
714  //INIT_OPTIONAL_VDB_COLUMN(SEQID_GNL_PREFIX),
718  INIT_VDB_COLUMN(GB_STATE),
719  INIT_OPTIONAL_VDB_COLUMN(PUBLIC_COMMENT),
720  INIT_VDB_COLUMN(PROTEIN_LEN),
721  //INIT_VDB_COLUMN(PROTEIN_NAME),
722  INIT_OPTIONAL_VDB_COLUMN(PRODUCT_NAME),
724  INIT_OPTIONAL_VDB_COLUMN(REF_ACC),
726  INIT_OPTIONAL_VDB_COLUMN(FEAT_ROW_START),
727  INIT_OPTIONAL_VDB_COLUMN(FEAT_ROW_END),
728  INIT_OPTIONAL_VDB_COLUMN(FEAT_PRODUCT_ROW_ID),
729  INIT_OPTIONAL_VDB_COLUMN(PROTEIN),
730  INIT_OPTIONAL_VDB_COLUMN(REPLACED_BY),
731  INIT_OPTIONAL_VDB_COLUMN(REPLACES)
732 {
733 }
734 
735 
736 // SFeatTableCursor is helper accessor structure for optional FEATURE table
738  explicit SFeatTableCursor(const CVDBTable& table);
739 
749  DECLARE_VDB_COLUMN_AS_STRING(PRODUCT_ACCESSION);
754 
756 };
757 
758 
760  : m_Cursor(table),
761  INIT_VDB_COLUMN(FEAT_TYPE),
762  INIT_VDB_COLUMN(LOC_SEQ_TYPE),
763  INIT_VDB_COLUMN(LOC_ACCESSION),
764  INIT_VDB_COLUMN(LOC_ROW_ID),
765  INIT_VDB_COLUMN(LOC_START),
766  INIT_VDB_COLUMN(LOC_LEN),
767  INIT_VDB_COLUMN(LOC_STRAND),
768  INIT_OPTIONAL_VDB_COLUMN(PRODUCT_SEQ_TYPE),
769  INIT_OPTIONAL_VDB_COLUMN(PRODUCT_ACCESSION),
770  INIT_OPTIONAL_VDB_COLUMN(PRODUCT_ROW_ID),
771  INIT_OPTIONAL_VDB_COLUMN(PRODUCT_START),
772  INIT_OPTIONAL_VDB_COLUMN(PRODUCT_LEN),
773  INIT_VDB_COLUMN(SEQ_FEAT)
774 {
776  if ( 1 ) {
778  type.FindVariant("str")
779  .SetLocalReadHook(m_ObjStr, new CPackStringChoiceHook);
780  }
781  if ( 1 ) {
783  type.FindMember("key")
784  .SetLocalReadHook(m_ObjStr, new CPackStringClassHook(32, 128));
785  }
786  if ( 1 ) {
788  type.FindMember("db")
789  .SetLocalReadHook(m_ObjStr, new CPackStringClassHook);
790  }
791  if ( 1 ) {
792  type = CType<CGb_qual>();
793  type.FindMember("qual")
794  .SetLocalReadHook(m_ObjStr, new CPackStringClassHook);
795  }
796 }
797 
798 
799 // SGiIdxTableCursor is helper accessor structure for optional GI_IDX table
801  explicit SGiIdxTableCursor(const CVDBTable& table);
802 
804 
807 };
808 
809 
811  : m_Cursor(table),
812  INIT_OPTIONAL_VDB_COLUMN(NUC_ROW_ID),
813  INIT_OPTIONAL_VDB_COLUMN(PROT_ROW_ID)
814 {
815 }
816 
817 
818 // SProtIdxTableCursor is helper accessor structure for optional PROT_IDX table
820  explicit SProtIdxTableCursor(const CVDBTable& table);
821 
823 
824  typedef pair<TVDBRowId, TVDBRowId> row_range_t;
827 };
828 
829 
831  : m_Cursor(table),
832  INIT_VDB_COLUMN_BACKUP(NAME_ROW_RANGE, ACCESSION_ROW_RANGE),
833  INIT_VDB_COLUMN_BACKUP(ROW_ID, PROTEIN_ROW_ID)
834 {
835 }
836 
837 
838 /////////////////////////////////////////////////////////////////////////////
839 // CWGSDb_Impl
840 /////////////////////////////////////////////////////////////////////////////
841 
842 
844  CTempString path_or_acc,
845  CTempString vol_path)
846  : m_Mgr(mgr),
847  m_WGSPath(NormalizePathOrAccession(path_or_acc, vol_path)),
848  m_IdVersion(0),
867 {
869  PROFILE(sw_WGSOpen);
870  //static CVDBSchema schema(mgr, "wgs.schema");
871  m_Db = CVDB(mgr, m_WGSPath);
872  m_SeqTable = CVDBTable(m_Db, "SEQUENCE"); // SEQUENCE table must exist
873  x_InitIdParams();
874 }
875 
876 
878 {
879 }
880 
881 
882 inline
884 {
885  CRef<SSeq0TableCursor> curs = m_Seq0.Get(row);
886  if ( !curs ) {
888  curs = new SSeq0TableCursor(SeqTable());
889  }
890  return curs;
891 }
892 
893 
894 inline
896 {
897  CRef<SSeqTableCursor> curs = m_Seq.Get(row);
898  if ( !curs ) {
900  curs = new SSeqTableCursor(SeqTable());
901  }
902  return curs;
903 }
904 
905 
906 inline
908 {
909  CRef<SSeq4naTableCursor> curs; // = m_Seq.Get(row);
910  if ( !curs ) {
912  curs = new SSeq4naTableCursor(SeqTable());
913  }
914  return curs;
915 }
916 
917 
918 inline
920 {
921  CRef<SScfTableCursor> curs = m_Scf.Get(row);
922  if ( !curs ) {
924  if ( const CVDBTable& table = ScfTable() ) {
925  curs = new SScfTableCursor(table);
926  }
927  }
928  return curs;
929 }
930 
931 
932 inline
934 {
935  CRef<SProt0TableCursor> curs = m_Prot0.Get(row);
936  if ( !curs ) {
938  if ( const CVDBTable& table = ProtTable() ) {
939  curs = new SProt0TableCursor(table);
940  }
941  }
942  return curs;
943 }
944 
945 
946 inline
948 {
949  CRef<SProtTableCursor> curs = m_Prot.Get(row);
950  if ( !curs ) {
952  if ( const CVDBTable& table = ProtTable() ) {
953  curs = new SProtTableCursor(table);
954  }
955  }
956  return curs;
957 }
958 
959 
960 inline
962 {
963  CRef<SFeatTableCursor> curs = m_Feat.Get(row);
964  if ( !curs ) {
966  if ( const CVDBTable& table = FeatTable() ) {
967  curs = new SFeatTableCursor(table);
968  }
969  }
970  return curs;
971 }
972 
973 
974 inline
976 {
977  CRef<SGiIdxTableCursor> curs = m_GiIdx.Get(row);
978  if ( !curs ) {
980  if ( const CVDBTable& table = GiIdxTable() ) {
981  curs = new SGiIdxTableCursor(table);
982  }
983  }
984  return curs;
985 }
986 
987 
988 inline
990 {
992  if ( !curs ) {
994  if ( const CVDBTable& table = ProtIdxTable() ) {
995  curs = new SProtIdxTableCursor(table);
996  }
997  }
998  return curs;
999 }
1000 
1001 
1002 inline
1004 {
1005  m_Seq0.Put(curs, row);
1006 }
1007 
1008 
1009 inline
1011 {
1012  m_Seq.Put(curs, row);
1013 }
1014 
1015 
1016 inline
1018 {
1019  //m_Seq.Put(curs, row);
1020 }
1021 
1022 
1023 inline
1025 {
1026  m_Scf.Put(curs, row);
1027 }
1028 
1029 
1030 inline
1032 {
1033  m_Prot0.Put(curs, row);
1034 }
1035 
1036 
1037 inline
1039 {
1040  m_Prot.Put(curs, row);
1041 }
1042 
1043 
1044 inline
1046 {
1047  m_Feat.Put(curs, row);
1048 }
1049 
1050 
1051 inline
1053 {
1054  m_GiIdx.Put(curs, row);
1055 }
1056 
1057 
1058 inline
1060 {
1061  m_ProtIdx.Put(curs, row);
1062 }
1063 
1064 
1066 {
1068 
1070  ~SAmbiguityInfo();
1071 
1072  size_t GetUsedMemory() const;
1073 
1074  vector<Uint1> GetAmbiguityBytes(SSeqTableCursor& cur) {
1075  return m_AmbiguityMask;
1076  }
1077 
1079  TWGSContigGapInfo GetGapInfo() const;
1080 
1083 
1085  CWGSDb_Impl& db, SSeqTableCursor& cur) const;
1087  TSeqPos stop_2na_len, TSeqPos stop_gap_len,
1088  CWGSDb_Impl& db, SSeqTableCursor& cur) const;
1090  CWGSDb_Impl& db, SSeqTableCursor& cur) const;
1091 
1093  SSeqTableCursor& cur) const;
1095  CWGSDb_Impl& db, SSeqTableCursor& cur) const;
1096 
1097  bool x_AmbiguousBlock(size_t block_index) const
1098  {
1099  size_t byte_index = block_index/8;
1100  Uint1 byte_bit = 1<<(block_index%8);
1101  return byte_index < m_AmbiguityMask.size() && (m_AmbiguityMask[byte_index] & byte_bit);
1102  }
1103  void x_SetAmbiguousBlock(size_t block_index)
1104  {
1105  size_t byte_index = block_index/8;
1106  Uint1 byte_bit = 1<<(block_index%8);
1107  m_AmbiguityMask[byte_index] |= byte_bit;
1108  }
1110  void x_Calculate4na(CWGSDb_Impl& db) const;
1111  void x_Need4na(CWGSDb_Impl& db) const
1112  {
1113  if ( !m_HasAmbiguityPos && !m_Has4naBlocks ) {
1114  x_Calculate4na(db);
1115  }
1116  }
1117  bool x_AddAmbiguities(const Uint1* ptr, TSeqPos count,
1118  TSeqPos pos, TWGSContigGapInfo& gap_info) const;
1119  bool x_AddAmbiguousBlock(const Uint1* ptr, TSeqPos count,
1120  TSeqPos pos, TWGSContigGapInfo& gap_info) const;
1121 
1122  string m_Prefix;
1124 
1125  mutable CFastMutex m_Mutex; // for m_4naBlocks update
1126 
1129  mutable bool m_HasAmbiguityPos;
1130  mutable bool m_Has4naBlocks;
1131 
1132  vector<INSDC_coord_zero> m_GapStart;
1133  vector<INSDC_coord_len> m_GapLen;
1134  vector<NCBI_WGS_component_props> m_GapProps;
1135  vector<NCBI_WGS_gap_linkage> m_GapLinkage;
1136 
1137  vector<Uint1> m_AmbiguityMask;
1138  mutable vector<INSDC_coord_zero> m_AmbiguityPos;
1139  mutable vector<INSDC_4na_bin> m_Ambiguity4na;
1140 
1141  struct S4naBlock
1142  {
1143  char m_Packed4na[kAmbiguityBlockSize/2]; // packed 4na - two 4na bases per byte
1144  };
1145 
1147  mutable T4naBlocks m_4naBlocks; // ambiguous blocks
1148 
1149  struct S4naReader
1150  {
1154  };
1155  bool x_IsValid(const S4naReader& reader) const;
1157  enum EBaseType {
1160  eBase_Gap
1161  };
1162  EBaseType GetBaseType(const S4naReader& reader) const;
1163  void Advance(S4naReader& reader) const;
1164 };
1165 
1166 
1167 template<class Value>
1168 static void sx_Assign(vector<Value>& dst, const CVDBValueFor<Value>& src)
1169 {
1170  dst.resize(src.size());
1171  copy_n(src.begin(), src.size(), dst.data());
1172 }
1173 
1174 
1176  : m_Prefix(db.GetIdPrefixWithVersion()),
1177  m_RowId(row_id),
1178  m_HasGapInfo(false),
1179  m_HasAmbiguityMask(false),
1180  m_HasAmbiguityPos(false),
1181  m_Has4naBlocks(false)
1182 {
1183  if ( cur.m_GAP_START ) {
1184  sx_Assign(m_GapStart, cur.GAP_START(m_RowId));
1185  if ( m_GapStart.size() ) {
1186  sx_Assign(m_GapLen, cur.GAP_LEN(m_RowId));
1187  sx_Assign(m_GapProps, cur.GAP_PROPS(m_RowId));
1188  if ( cur.m_GAP_LINKAGE ) {
1189  sx_Assign(m_GapLinkage, cur.GAP_LINKAGE(m_RowId));
1190  }
1191  }
1192  m_HasGapInfo = true;
1193  }
1194  const bool kVerify4na = false;
1195  vector<Uint1> m_ExpectedAmbiguityMask;
1196  vector<INSDC_coord_zero> m_ExpectedAmbiguityPos;
1197  vector<INSDC_4na_bin> m_ExpectedAmbiguity4na;
1198 
1199  if ( kVerify4na ) {
1201  swap(m_ExpectedAmbiguityMask, m_AmbiguityMask);
1202  swap(m_ExpectedAmbiguityPos, m_AmbiguityPos);
1203  swap(m_ExpectedAmbiguity4na, m_Ambiguity4na);
1204  m_HasAmbiguityMask = false;
1205  m_HasAmbiguityPos = false;
1206  }
1207  if ( cur.m_AMBIGUITY_MASK ) {
1208  // number of blocks
1209  sx_Assign(m_AmbiguityMask, cur.AMBIGUITY_MASK(m_RowId));
1210  m_HasAmbiguityMask = true;
1211  }
1212  if ( cur.m_AMBIGUITY_POS && cur.m_AMBIGUITY_4NA ) {
1213  sx_Assign(m_AmbiguityPos, cur.AMBIGUITY_POS(m_RowId));
1214  sx_Assign(m_Ambiguity4na, cur.AMBIGUITY_4NA(m_RowId));
1215  m_HasAmbiguityPos = true;
1216  }
1217  if ( !m_HasAmbiguityMask ) {
1219  }
1220  if ( s_GetDebugLevel() >= 6 ) {
1221  size_t memory = GetUsedMemory();
1222  size_t mask_bit_count = 0;
1223  for ( auto bb : m_AmbiguityMask ) {
1224  while ( bb ) {
1225  ++mask_bit_count;
1226  bb &= bb-1;
1227  }
1228  }
1229  CFastMutexGuard guard(m_Mutex);
1230  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1232  <<NStr::NumericToString(m_AmbiguityMask.size(),NStr::fWithCommas)<<" mask bytes, "
1233  <<NStr::NumericToString(mask_bit_count,NStr::fWithCommas)<<" bits, "
1235  <<"size: "<<NStr::NumericToString(memory,NStr::fWithCommas));
1236  if ( s_GetDebugLevel() >= 7 ) {
1237  for ( size_t i = 0; i < 2 && i < m_AmbiguityPos.size(); ++i ) {
1238  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1239  <<"ambiguity at "<<m_AmbiguityPos[i]<<" - "<<m_Ambiguity4na[i]*1);
1240  }
1241  }
1242  }
1243  if ( kVerify4na ) {
1244  x_Need4na(db);
1245  for ( size_t block_index = 0;
1246  block_index < 8*max(m_AmbiguityMask.size(), m_ExpectedAmbiguityMask.size());
1247  ++block_index ) {
1248  bool bit = x_AmbiguousBlock(block_index);
1249  bool exp_bit;
1250  {{
1251  size_t byte_index = block_index/8;
1252  Uint1 byte_bit = 1<<(block_index%8);
1253  exp_bit = byte_index < m_ExpectedAmbiguityMask.size() &&
1254  (m_ExpectedAmbiguityMask[byte_index] & byte_bit);
1255  }}
1256  if ( bit != exp_bit ) {
1257  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "<<
1258  "mask["<<block_index<<" = "<<oct<<block_index<<dec<<"] "<<bit<<", expected "<<exp_bit);
1259  }
1260  }
1261  size_t index = 0, exp_index = 0;
1262  while ( index < m_AmbiguityPos.size() || exp_index < m_ExpectedAmbiguityPos.size() ) {
1263  TSeqPos pos = index < m_AmbiguityPos.size()? m_AmbiguityPos[index]: kInvalidSeqPos;
1264  int base = index < m_AmbiguityPos.size()? m_Ambiguity4na[index]: 0;
1265  TSeqPos exp_pos = exp_index < m_ExpectedAmbiguityPos.size()? m_ExpectedAmbiguityPos[exp_index]: kInvalidSeqPos;
1266  int exp_base = exp_index < m_ExpectedAmbiguityPos.size()? m_ExpectedAmbiguity4na[exp_index]: 0;
1267  if ( pos == exp_pos ) {
1268  if ( base != exp_base ) {
1269  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "<<
1270  "amb["<<pos<<" = "<<oct<<pos<<dec<<"] "<<base<<", expected "<<exp_base);
1271  }
1272  ++index;
1273  ++exp_index;
1274  }
1275  else if ( pos < exp_pos ) {
1276  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "<<
1277  "amb["<<pos<<" = "<<oct<<pos<<dec<<"] "<<base<<", expected -");
1278  ++index;
1279  }
1280  else {
1281  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "<<
1282  "amb["<<exp_pos<<" = "<<oct<<exp_pos<<dec<<"] -, expected "<<exp_base);
1283  ++exp_index;
1284  }
1285  }
1286  }
1287 }
1288 
1289 
1291 {
1292  if ( s_GetDebugLevel() >= 6 ) {
1293  size_t memory = GetUsedMemory();
1294  CFastMutexGuard guard(m_Mutex);
1295  LOG_POST("~SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1296  <<"final size: "<<NStr::NumericToString(memory,NStr::fWithCommas));
1297  }
1298 }
1299 
1300 
1302 {
1303  const size_t kAllocateGap = sizeof(void*)*2;
1304  size_t ret = kAllocateGap + sizeof(*this);
1305  ret += kAllocateGap + m_GapStart.size()*sizeof(m_GapStart.front());
1306  ret += kAllocateGap + m_GapLen.size()*sizeof(m_GapLen.front());
1307  ret += kAllocateGap + m_GapProps.size()*sizeof(m_GapProps.front());
1308  ret += kAllocateGap + m_GapLinkage.size()*sizeof(m_GapLinkage.front());
1309  ret += kAllocateGap + m_AmbiguityMask.size()*sizeof(m_AmbiguityMask.front());
1310  if ( m_HasAmbiguityPos || m_Has4naBlocks ) {
1311  CFastMutexGuard guard(m_Mutex);
1312  ret += kAllocateGap + m_AmbiguityPos.size()*sizeof(m_AmbiguityPos.front());
1313  ret += kAllocateGap + m_Ambiguity4na.size()*sizeof(m_Ambiguity4na.front());
1314  const size_t kBlockUsedMemory =
1315  kAllocateGap + 4*sizeof(void*) + sizeof(S4naBlock); // including map overhead
1316  ret += kBlockUsedMemory * m_4naBlocks.size();
1317  }
1318  return ret;
1319 }
1320 
1321 
1324 {
1325  TWGSContigGapInfo gap_info;
1326  gap_info.gaps_count = m_GapStart.size();
1327  gap_info.gaps_start = m_GapStart.data();
1328  gap_info.gaps_len = m_GapLen.data();
1329  gap_info.gaps_props = m_GapProps.data();
1330  gap_info.gaps_linkage = m_GapLinkage.data();
1331  return gap_info;
1332 }
1333 
1334 
1335 static const bool kRecoverGaps = true;
1336 
1337 
1338 // 2na encoding has values 0-3, occupying 2 bits
1339 // 4na encoding has values 0-15, occupying 4 bits
1340 // unpacked 4na values occupy one base per byte
1341 // packed 4na bases are stored two per byte, first base in highest bits
1342 // packed 2na bases are stored four per byte, first base in highest bits
1343 
1344 // return true if the 4na value is unambiguous
1345 static inline
1347 {
1348  return b && !(b&(b-1));
1349 }
1350 
1351 
1352 // return pointer to the first ambiguity in an unpacked 4na array
1353 static inline
1354 const Uint1* sx_FindAmbiguity(const Uint1* ptr, const Uint1* end)
1355 {
1356  for ( ; ptr != end; ++ptr ) {
1357  if ( !sx_Is2na(*ptr) ) {
1358  return ptr;
1359  }
1360  }
1361  return ptr;
1362 }
1363 
1364 
1365 static inline
1366 Uint1 sx_Get_4na(const char* ptr, size_t offset)
1367 {
1368  Uint1 b = ptr[offset/2];
1369  if ( offset%2 == 0 ) {
1370  b = b >> 4;
1371  }
1372  return b & 0xf;
1373 }
1374 
1375 
1376 // return pointer to the first ambiguity in an unpacked 4na array
1377 static inline
1378 size_t sx_Find_4na_Ambiguity(const char* ptr, size_t offset, size_t base_count)
1379 {
1380  for ( size_t i = offset; i < offset+base_count; ++i ) {
1381  if ( !sx_Is2na(sx_Get_4na(ptr, i)) ) {
1382  return i;
1383  }
1384  }
1385  return offset+base_count;
1386 }
1387 
1388 
1389 // check if unpacked 4na array has any ambiguity
1390 static inline
1391 bool sx_HasAmbiguity(const Uint1* ptr, const Uint1* end)
1392 {
1393  return sx_FindAmbiguity(ptr, end) != end;
1394 }
1395 
1396 
1397 // check if unpacked 4na array has any ambiguity beside explicit gaps
// Walks 'count' bases starting at sequence position 'pos': stretches that
// gap_info reports as gaps are skipped, the remaining data stretches are
// scanned with the two-pointer sx_HasAmbiguity() overload.
// Returns true as soon as a data stretch contains an ambiguous base.
// NOTE(review): one line of the parameter list is absent from this listing;
// 'pos' and 'gap_info' are used below and are presumably declared there.
1398 static inline
1399 bool sx_HasAmbiguity(const Uint1* ptr, TSeqPos count,
1401 {
1402  while ( count ) {
1403  gap_info.SetPos(pos);
1404  if ( gap_info.IsInGap(pos) ) {
1405  // skip gap
1406  TSeqPos gap_len = gap_info.GetGapLength(pos, count);
1407  ptr += gap_len;
1408  pos += gap_len;
1409  count -= gap_len;
1410  }
1411  else {
// data stretch up to the next gap (or up to the end of the requested range)
1412  TSeqPos na_len = gap_info.GetDataLength(pos, count);
1413  if ( sx_HasAmbiguity(ptr, ptr+na_len) ) {
1414  return true;
1415  }
1416  ptr += na_len;
1417  pos += na_len;
1418  count -= na_len;
1419  }
1420  }
1421  return false;
1422 }
1423 
1424 
// Expand the two 2na bases held in the low 4 bits of 'bits_2na' into one
// packed 4na byte. Each 2-bit code c becomes the one-hot nibble (1 << c):
// the base in bits 3-2 goes to the high nibble, the base in bits 1-0 to the
// low nibble. The upper 4 bits of the argument are ignored.
static
inline
char s_ConvertBits_2na_to_4na(char bits_2na)
{
    // indexed by (first_base << 2) | second_base
    static const unsigned char kPair2naTo4na[16] = {
        0x11, 0x12, 0x14, 0x18, // first base 0
        0x21, 0x22, 0x24, 0x28, // first base 1
        0x41, 0x42, 0x44, 0x48, // first base 2
        0x81, 0x82, 0x84, 0x88  // first base 3
    };
    const int pair_index = bits_2na & 0xf;
    return kPair2naTo4na[pair_index];
}
1438 
1439 
1440 // convert first 2 bases of packed 2na byte into packed 4na byte
1441 static
1442 inline
1443 char s_ConvertBits_2na_to_4na_1st(char bits_2na)
1444 {
1445  return s_ConvertBits_2na_to_4na(bits_2na >> 4);
1446 }
1447 
1448 
1449 // convert last 2 bases of packed 2na byte into packed 4na byte
1450 static
1451 inline
1452 char s_ConvertBits_2na_to_4na_2nd(char bits_2na)
1453 {
1454  return s_ConvertBits_2na_to_4na(bits_2na);
1455 }
1456 
1457 
1458 // convert packed 2na (4 bases per byte) array into packed 4na (2 bases per byte) array
1459 static
1460 void s_Convert_2na_to_4na(char* dst_4na, const char* src_2na, size_t base_count)
1461 {
1462  while ( base_count >= 4 ) {
1463  char bits_2na = src_2na[0];
1464  dst_4na[0] = s_ConvertBits_2na_to_4na_1st(bits_2na);
1465  dst_4na[1] = s_ConvertBits_2na_to_4na_2nd(bits_2na);
1466  base_count -= 4;
1467  src_2na += 1;
1468  dst_4na += 2;
1469  }
1470  if ( base_count ) {
1471  char bits_2na = src_2na[0] & (0xff00 >> base_count*2);
1472  {{
1473  char bits_4na = s_ConvertBits_2na_to_4na_1st(bits_2na);
1474  if ( base_count < 2 ) {
1475  bits_4na &= 0xf0;
1476  }
1477  dst_4na[0] = bits_4na;
1478  }}
1479  if ( base_count > 2 ) {
1480  dst_4na[1] = s_ConvertBits_2na_to_4na_2nd(bits_2na) & 0xf0;
1481  }
1482  }
1483 }
1484 
1485 
1486 // convert packed 2na (4 bases per byte) vector into packed 4na (2 bases per byte) vector
1487 static
1488 void s_Convert_2na_to_4na(vector<char>& dst_4na_vec,
1489  const vector<char>& src_2na_vec,
1490  size_t base_count)
1491 {
1492  size_t dst_4na_byte_count = (base_count+1)/2;
1493  // allocate 8-byte aligned memory to allow multi-byte operations at end
1494  dst_4na_vec.reserve((dst_4na_byte_count+7)/8*8);
1495  dst_4na_vec.resize(dst_4na_byte_count);
1496  s_Convert_2na_to_4na(dst_4na_vec.data(), src_2na_vec.data(), base_count);
1497 }
1498 
1499 
1500 // set 4na value into a packed 4na vector
1501 static
1502 inline
1503 void s_Set_4na(vector<char>& dst_4na_vec,
1504  size_t offset,
1505  INSDC_4na_bin amb)
1506 {
1507  char& dst = dst_4na_vec[offset/2];
1508  if ( offset%2 == 0 ) {
1509  dst = (dst & 0xf) | (amb << 4);
1510  }
1511  else {
1512  dst = (dst & 0xf0) | amb;
1513  }
1514 }
1515 
1516 
// Mark 'len' consecutive bases starting at base position 'offset' of a
// packed 4na vector as gap (4na value 0xf). Bases that share a byte with
// the gap boundaries are preserved.
static
inline
void s_Set_4na_gap(vector<char>& dst_4na_vec,
                   size_t offset,
                   size_t len)
{
    char* out = dst_4na_vec.data() + offset/2;
    size_t remaining = len;
    if ( remaining != 0 && offset%2 != 0 ) {
        // gap starts in the low nibble of a shared byte
        *out++ |= 0xf;
        --remaining;
    }
    // whole bytes: both bases are gap
    for ( ; remaining >= 2; remaining -= 2 ) {
        *out++ = char(0xff);
    }
    if ( remaining != 0 ) {
        // gap ends in the high nibble of a shared byte
        *out |= 0xf0;
    }
}
1541 
1542 
1543 // copy 4na bases with arbitrary offset
1544 static
1545 void s_Copy_4na(char* dst_4na, TSeqPos dst_offset,
1546  const char* src_4na, TSeqPos src_offset,
1547  size_t base_count)
1548 {
1549  if ( !base_count ) {
1550  return;
1551  }
1552  dst_4na += dst_offset/2;
1553  dst_offset %= 2;
1554  src_4na += src_offset/2;
1555  src_offset %= 2;
1556  // copy first odd dst base
1557  if ( dst_offset != 0 ) {
1558  Uint1 dst_b = dst_4na[0];
1559  Uint1 src_b = src_4na[0];
1560  src_4na += src_offset;
1561  if ( !src_offset ) {
1562  src_b = src_b >> 4;
1563  }
1564  src_offset ^= 1;
1565  dst_b = (dst_b & 0xf0) | (src_b & 0xf);
1566  dst_4na[0] = dst_b;
1567  ++dst_4na;
1568  dst_offset = 0;
1569  --base_count;
1570  }
1571  // copy pairs of bases
1572  if ( src_offset == 0 ) {
1573  size_t copy_bytes = base_count / 2;
1574  dst_4na = copy_n(src_4na, copy_bytes, dst_4na);
1575  src_4na += copy_bytes;
1576  base_count %= 2;
1577  }
1578  else {
1579  while ( base_count >= 2 ) {
1580  Uint1 src_b0 = src_4na[0];
1581  Uint1 src_b1 = src_4na[1];
1582  Uint1 dst_b = (src_b0 << 4) | (src_b1 >> 4);
1583  dst_4na[0] = dst_b;
1584  ++src_4na;
1585  ++dst_4na;
1586  base_count -= 2;
1587  }
1588  }
1589  // copy last odd base
1590  if ( base_count ) {
1591  Uint1 dst_b = dst_4na[0];
1592  Uint1 src_b = src_4na[0];
1593  if ( src_offset ) {
1594  src_b = src_b << 4;
1595  }
1596  dst_b = (dst_b & 0xf) | (src_b & 0xf0);
1597  dst_4na[0] = dst_b;
1598  }
1599 }
1600 
1601 
1602 // convert unpacked 4na (1 base per byte) array into packed 4na (2 bases per byte) array
1603 static
1604 void s_Pack_4na(char* dst_packed_4na,
1605  const Uint1* src_4na,
1606  size_t base_count)
1607 {
1608  while ( base_count >= 2 ) {
1609  auto b0 = src_4na[0];
1610  auto b1 = src_4na[1];
1611  auto packed_bb = (b0 << 4)+b1;
1612  *dst_packed_4na = packed_bb;
1613  base_count -= 2;
1614  src_4na += 2;
1615  ++dst_packed_4na;
1616  }
1617  if ( base_count ) {
1618  auto b0 = src_4na[0];
1619  auto packed_bb = (b0 << 4);
1620  *dst_packed_4na = packed_bb;
1621  }
1622 }
1623 
1624 
1625 static
1626 void s_SetAmbiguitiesPos(vector<char>& dst_4na_vec,
1627  TSeqPos pos, TSeqPos len,
1628  const vector<INSDC_coord_zero>& amb_pos,
1629  const vector<INSDC_4na_bin>& amb_4na)
1630 {
1631  auto iter_pos = lower_bound(amb_pos.begin(), amb_pos.end(), INSDC_coord_zero(pos));
1632  auto iter_4na = amb_4na.begin() + (iter_pos-amb_pos.begin());
1633  INSDC_coord_zero end = pos + len;
1634  for ( ; iter_pos != amb_pos.end() && *iter_pos < end; ++iter_pos, ++iter_4na ) {
1635  s_Set_4na(dst_4na_vec, *iter_pos-pos, *iter_4na);
1636  }
1637 }
1638 
1639 
// Overlays stored packed 4na blocks onto dst_4na_vec, which holds the range
// [pos, pos+len). Every block from 'blocks' whose start is below pos+len,
// beginning with the block aligned at or after pos rounded down to
// kAmbiguityBlockSize, is copied with s_Copy_4na() for its overlapping part.
// NOTE(review): the parameter line declaring 'blocks' is absent from this
// listing; it is used below as an ordered map keyed by block start position.
1640 static
1641 void s_SetAmbiguitiesBlocks(vector<char>& dst_4na_vec,
1642  TSeqPos pos, TSeqPos len,
1644 {
1645  TSeqPos end = pos+len;
1646  TSeqPos block_pos = pos - pos%kAmbiguityBlockSize;
1647  for ( auto iter = blocks.lower_bound(block_pos);
1648  iter != blocks.end() && iter->first < end;
1649  ++iter ) {
// note: this inner block_pos shadows the aligned start computed above
1650  TSeqPos block_pos = iter->first;
1651  TSeqPos dst_offset;
1652  TSeqPos src_offset;
1653  TSeqPos copy_len;
1654  if ( block_pos < pos ) {
// block starts before the requested range: copy its tail to the start of dst
1655  dst_offset = 0;
1656  src_offset = pos-block_pos;
1657  copy_len = min(len, kAmbiguityBlockSize-src_offset);
1658  }
1659  else {
// block starts inside the requested range: copy its head, clipped at 'end'
1660  dst_offset = block_pos-pos;
1661  src_offset = 0;
1662  copy_len = min(end-block_pos, kAmbiguityBlockSize);
1663  }
1664  s_Copy_4na(dst_4na_vec.data(), dst_offset, iter->second.m_Packed4na, src_offset, copy_len);
1665  }
1666 }
1667 
1668 
// Marks every gap base inside [pos, pos+len) as gap in dst_4na_vec (which
// holds that range) by calling s_Set_4na_gap() with offsets relative to pos.
// Data stretches between gaps are stepped over without touching dst_4na_vec.
// NOTE(review): the parameter line declaring 'gap_info' is absent from this
// listing; it is modified here (SetPos, operator++).
1669 static
1670 void s_SetGaps(vector<char>& dst_4na_vec,
1671  TSeqPos pos, TSeqPos len,
1673 {
// remember the range start to convert absolute positions into dst offsets
1674  TSeqPos pos0 = pos;
1675  gap_info.SetPos(pos);
1676  for ( ; len > 0; ) {
1677  if ( gap_info.IsInGap(pos) ) {
1678  // add gap
1679  TSeqPos gap_len = gap_info.GetGapLength(pos, len);
1680  _ASSERT(gap_len <= len);
1681  s_Set_4na_gap(dst_4na_vec, pos-pos0, gap_len);
1682  ++gap_info;
1683  len -= gap_len;
1684  pos += gap_len;
1685  _ASSERT(!gap_info || pos <= gap_info.GetFrom());
1686  }
1687  else {
1688  // data segment
1689  TSeqPos rem_len = gap_info.GetDataLength(pos, len);
1690  _ASSERT(rem_len <= len);
1691  len -= rem_len;
1692  pos += rem_len;
1693  }
1694  }
1695 }
1696 
1697 
// Checks one block of unpacked 4na bases for ambiguities outside gaps and,
// if any is found, stores the whole block in packed form in
// m_4naBlocks[pos].m_Packed4na. Returns whether the block was ambiguous.
// NOTE(review): the first signature line is absent from this listing; 'ptr'
// and 'count' are presumably declared there.
1699  TSeqPos pos, TWGSContigGapInfo& gap_info) const
1700 {
1701  bool ambiguous = sx_HasAmbiguity(ptr, count, pos, gap_info);
1702  if ( ambiguous ) {
1703  s_Pack_4na(m_4naBlocks[pos].m_Packed4na, ptr, count);
1704  }
1705  return ambiguous;
1706 }
1707 
1708 
// Scans 'count' unpacked 4na bases starting at sequence position 'pos' and
// appends every base outside gaps that fails sx_Is2na() to the parallel
// vectors m_AmbiguityPos (position) and m_Ambiguity4na (4na value).
// Returns true if at least one such base was recorded.
// NOTE(review): the first signature line is absent from this listing; 'ptr'
// and 'count' are presumably declared there.
1710  TSeqPos pos, TWGSContigGapInfo& gap_info) const
1711 {
1712  bool ambiguous = false;
1713  while ( count ) {
1714  gap_info.SetPos(pos);
1715  if ( gap_info.IsInGap(pos) ) {
1716  // skip gap
1717  TSeqPos gap_len = gap_info.GetGapLength(pos, count);
1718  ptr += gap_len;
1719  pos += gap_len;
1720  count -= gap_len;
1721  }
1722  else {
// data stretch: record each ambiguous base individually
1723  TSeqPos na_len = gap_info.GetDataLength(pos, count);
1724  for ( TSeqPos i = 0; i < na_len; ++i ) {
1725  auto b = ptr[i];
1726  if ( !sx_Is2na(b) ) {
1727  ambiguous = true;
1728  m_AmbiguityPos.push_back(pos+i);
1729  m_Ambiguity4na.push_back(b);
1730  }
1731  }
1732  ptr += na_len;
1733  pos += na_len;
1734  count -= na_len;
1735  }
1736  }
1737  return ambiguous;
1738 }
1739 
1740 
// Makes sure m_AmbiguityMask (one bit per kAmbiguityBlockSize bases) is
// filled; does nothing when m_HasAmbiguityMask is already set.
// - If the ambiguity position list is already present, the mask bits are
//   derived from the listed positions.
// - Otherwise the 4na read of row m_RowId is scanned block by block; each
//   ambiguous block is recorded either as a full packed block or as
//   individual positions (chosen by s_UseFull4naBlocks()), and the matching
//   m_Has4naBlocks / m_HasAmbiguityPos flag is set.
// NOTE(review): the signature line and the declaration of 'read4na' are
// absent from this listing; 'db' is presumably a parameter.
1742 {
1743  if ( m_HasAmbiguityMask ) {
1744  return;
1745  }
1746  // calculate ambiguity mask using 4na read
1747  if ( m_HasAmbiguityPos ) {
1748  PROFILE(sw____GetAmb2Mask);
1749  // it's faster to use ambiguity position list if present
1750  if ( size_t ambiguity_count = m_AmbiguityPos.size() ) {
// size the mask from the last listed position (m_AmbiguityPos.back())
1751  size_t last_block_index = m_AmbiguityPos.back() / kAmbiguityBlockSize;
1752  size_t last_byte_index = last_block_index/8;
1753  m_AmbiguityMask.resize(last_byte_index+1);
1754  for ( size_t i = 0; i < ambiguity_count; ++i ) {
1755  x_SetAmbiguousBlock(m_AmbiguityPos[i] / kAmbiguityBlockSize);
1756  }
1757  }
1758  if ( s_GetDebugLevel() >= 6 ) {
1759  size_t memory = GetUsedMemory();
1760  size_t mask_bit_count = 0;
// count set bits in the mask for the log message
1761  for ( auto bb : m_AmbiguityMask ) {
1762  while ( bb ) {
1763  ++mask_bit_count;
1764  bb &= bb-1;
1765  }
1766  }
1767  CFastMutexGuard guard(m_Mutex);
1768  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1769  <<"calculated mask from ambiguities, "
1770  <<NStr::NumericToString(m_AmbiguityMask.size(),NStr::fWithCommas)<<" mask bytes, "
1771  <<NStr::NumericToString(mask_bit_count,NStr::fWithCommas)<<" bits, "
1772  <<"size: "<<NStr::NumericToString(memory,NStr::fWithCommas));
1773  }
1774  }
1775  else {
1776  // we'll have to scan for ambiguities in 4na data
1777 
1778  // use full 4na blocks or individual 4na points
1779  bool use_full_4na_blocks = s_UseFull4naBlocks();
1780 
1781  CRef<SSeq4naTableCursor> cur4na;
1783  {{
1784  PROFILE(sw____GetRaw4na);
1785  cur4na = db.Seq4na(m_RowId);
1786  read4na = cur4na->READ(m_RowId);
1787  }}
1788 
1789  PROFILE(sw____Get4na2Mask);
1790  TSeqPos read_length = TSeqPos(read4na.size());
// number of blocks, rounding the last partial block up
1791  size_t block_count = (read_length+kAmbiguityBlockSize-1) / kAmbiguityBlockSize;
1792  size_t mask_bit_count = 0;
1793  m_AmbiguityMask.resize((block_count+7)/8);
1794  TWGSContigGapInfo gap_info = GetGapInfo();
1795  for ( size_t block_index = 0; block_index < block_count; ++block_index ) {
1796  TSeqPos block_pos = TSeqPos(block_index*kAmbiguityBlockSize);
1797  const Uint1* base_ptr = read4na.data() + block_pos;
// the last block may be shorter than kAmbiguityBlockSize
1798  TSeqPos base_count = min(kAmbiguityBlockSize, read_length-block_pos);
1799  bool ambiguous = false;
1800  if ( use_full_4na_blocks ) {
1801  ambiguous = x_AddAmbiguousBlock(base_ptr, base_count, block_pos, gap_info);
1802  }
1803  else {
1804  ambiguous = x_AddAmbiguities(base_ptr, base_count, block_pos, gap_info);
1805  }
1806  if ( ambiguous ) {
1807  x_SetAmbiguousBlock(block_index);
1808  ++mask_bit_count;
1809  }
1810  }
1811  if ( use_full_4na_blocks ) {
1812  m_Has4naBlocks = true;
1813  }
1814  else {
1815  m_HasAmbiguityPos = true;
1816  }
1817  // db.Put(cur4na, m_RowId); do not store 4na cursor in cache to free memory
1818  if ( s_GetDebugLevel() >= 6 ) {
1819  size_t memory = GetUsedMemory();
1820  CFastMutexGuard guard(m_Mutex);
1821  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1822  "calculated mask from read, "
1823  <<NStr::NumericToString(m_AmbiguityMask.size(),NStr::fWithCommas)<<" mask bytes, "
1824  <<NStr::NumericToString(mask_bit_count,NStr::fWithCommas)<<" bits, "
1825  <<NStr::NumericToString(m_Ambiguity4na.size(),NStr::fWithCommas)<<" ambig, "
1826  <<NStr::NumericToString(m_4naBlocks.size(),NStr::fWithCommas)<<" blocks, "
1827  <<"size: "<<NStr::NumericToString(memory,NStr::fWithCommas));
1828  }
1829  }
1830  m_HasAmbiguityMask = true;
1831 }
1832 
1833 
// Makes sure the detailed 4na information (individual ambiguity positions
// or full packed blocks) is loaded; runs under m_Mutex and returns at once
// if either m_HasAmbiguityPos or m_Has4naBlocks is already set.
// Only blocks whose bit is set in m_AmbiguityMask are rescanned; the 4na
// read is fetched lazily on the first set bit. Mask bits whose block turns
// out to have no ambiguity are counted in wrong_bit_count and logged at
// debug level >= 7.
// NOTE(review): the signature line and the declaration of 'read4na' are
// absent from this listing; 'db' is presumably a parameter.
1835 {
1836  CFastMutexGuard guard(m_Mutex);
1837  if ( m_HasAmbiguityPos || m_Has4naBlocks ) {
1838  return;
1839  }
1840 
1841  // use full 4na blocks or individual 4na points
1842  bool use_full_4na_blocks = s_UseFull4naBlocks();
1843 
1844  CRef<SSeq4naTableCursor> cur4na;
1846  TSeqPos read_length = 0;
1847  size_t bit_count = 0;
1848  size_t wrong_bit_count = 0;
1849  TWGSContigGapInfo gap_info = GetGapInfo();
1850  for ( size_t block_byte = 0; block_byte < m_AmbiguityMask.size(); ++block_byte ) {
1851  if ( auto bits = m_AmbiguityMask[block_byte] ) {
1852  if ( !cur4na ) {
// first mask byte with a set bit: read the 4na row now
1853  PROFILE(sw____GetRaw4na);
1854  cur4na = db.Seq4na(m_RowId);
1855  read4na = cur4na->READ(m_RowId);
1856  read_length = TSeqPos(read4na.size());
1857  }
1858  for ( size_t block_bit = 0; block_bit < 8; ++block_bit ) {
1859  if ( bits & (1<<block_bit) ) {
1860  PROFILE(sw____Scan4na);
// mask bit number -> block index -> first base position of the block
1861  size_t block_index = block_byte*8+block_bit;
1862  TSeqPos block_pos = TSeqPos(block_index * kAmbiguityBlockSize);
1863  const Uint1* base_ptr = read4na.data() + block_pos;
1864  TSeqPos base_count = min(kAmbiguityBlockSize, read_length-block_pos);
1865  bool ambiguous = false;
1866  gap_info.SetPos(block_pos);
1867  if ( use_full_4na_blocks ) {
1868  ambiguous = x_AddAmbiguousBlock(base_ptr, base_count, block_pos, gap_info);
1869  }
1870  else {
1871  ambiguous = x_AddAmbiguities(base_ptr, base_count, block_pos, gap_info);
1872  }
1873  if ( ambiguous ) {
1874  ++bit_count;
1875  }
1876  else {
// mask bit was set but the scan found nothing ambiguous in this block
1877  ++wrong_bit_count;
1878  if ( s_GetDebugLevel() >= 7 && wrong_bit_count <= 2 ) {
1879  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1880  <<"wrong bit set at "<<block_pos);
1881  }
1882  }
1883  }
1884  }
1885  }
1886  }
1887  if ( use_full_4na_blocks ) {
1888  m_Has4naBlocks = true;
1889  }
1890  else {
1891  m_HasAmbiguityPos = true;
1892  }
1893  // db.Put(cur4na, m_RowId); do not store 4na cursor in cache to free memory
1894  if ( s_GetDebugLevel() >= 6 ) {
// the outer guard is released before GetUsedMemory(), which takes m_Mutex
// itself once these flags are set; a new guard (shadowing the outer one)
// is taken for logging
1895  guard.Release();
1896  size_t memory = GetUsedMemory();
1897  CFastMutexGuard guard(m_Mutex);
1898  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1899  <<"calculated 4na, "
1900  <<NStr::NumericToString(read_length,NStr::fWithCommas)<<" bases, "
1901  <<NStr::NumericToString(bit_count,NStr::fWithCommas)<<" bits, "
1902  <<NStr::NumericToString(m_Ambiguity4na.size(),NStr::fWithCommas)<<" ambig, "
1903  <<NStr::NumericToString(m_4naBlocks.size(),NStr::fWithCommas)<<" blocks, "
1904  <<"size: "<<NStr::NumericToString(memory,NStr::fWithCommas));
1905  if ( s_GetDebugLevel() >= 7 ) {
1906  for ( size_t i = 0; i < 2 && i < m_AmbiguityPos.size(); ++i ) {
1907  LOG_POST("SAmbiguityInfo("<<m_Prefix<<"/"<<m_RowId<<") "
1908  <<"ambiguity at "<<m_AmbiguityPos[i]<<" - "<<m_Ambiguity4na[i]*1);
1909  }
1910  }
1911  }
1912 }
1913 
1914 
// Debug-only consistency check of a reader's cursor against this object's
// ambiguity data; all checks are _ASSERTs and the function always returns
// true.
// - position list mode: the index is at the end, or refers to an entry at
//   or after reader.m_Pos;
// - block mode: the iterator is at the end, or reader.m_Pos lies before the
//   end of the referenced block.
// NOTE(review): the signature line is absent from this listing.
1916 {
1917  if ( m_HasAmbiguityPos ) {
1918  // use explicit ambiguities list
1919  _ASSERT(reader.m_AmbiguityIndex == m_AmbiguityPos.size() ||
1920  (reader.m_AmbiguityIndex < m_AmbiguityPos.size() &&
1921  reader.m_Pos <= TSeqPos(m_AmbiguityPos[reader.m_AmbiguityIndex])));
1922  }
1923  else {
1924  // use 4na blocks
1925  _ASSERT(reader.m_4naBlocksIter == m_4naBlocks.end() ||
1926  (reader.m_Pos < reader.m_4naBlocksIter->first + kAmbiguityBlockSize));
1927  }
1928  return true;
1929 }
1930 
1931 
// Creates an S4naReader positioned at 'pos' after making sure the 4na data
// is loaded (x_Need4na). In position list mode the reader gets the index of
// the first listed ambiguity at or after pos; in block mode it gets an
// iterator to the first stored block starting at or after the block-aligned
// position of pos.
// NOTE(review): the first signature line is absent from this listing; 'cur'
// is not used in the visible body.
1934  CWGSDb_Impl& db, SSeqTableCursor& cur) const
1935 {
1936  x_Need4na(db);
1937  S4naReader reader;
1938  reader.m_Pos = pos;
1939  if ( m_HasAmbiguityPos ) {
1940  // use explicit ambiguities list
1941  reader.m_AmbiguityIndex =
1942  lower_bound(m_AmbiguityPos.begin(), m_AmbiguityPos.end(), INSDC_coord_zero(pos)) - m_AmbiguityPos.begin();
1943  }
1944  else {
1945  // use 4na blocks
1946  TSeqPos block_pos = pos - pos%kAmbiguityBlockSize;
1947  reader.m_4naBlocksIter = m_4naBlocks.lower_bound(block_pos);
1948  }
1949  return reader;
1950 }
1951 
1952 
// Classifies the base at the reader's current position:
//   eBase_Gap - the stored 4na value is 0xf,
//   eBase_2na - no ambiguity is stored for this position, or the stored
//               value has a single bit set (sx_Is2na),
//   eBase_4na - any other stored value.
// NOTE(review): the signature line is absent from this listing.
1955 {
1956  _ASSERT(x_IsValid(reader));
1957  Uint1 base;
1958  if ( m_HasAmbiguityPos ) {
1959  // use explicit ambiguities list
1960  if ( reader.m_AmbiguityIndex == m_AmbiguityPos.size() ) {
1961  // no more ambiguities
1962  return eBase_2na;
1963  }
1964  // check if next ambiguity is at current position
1965  if ( reader.m_Pos != TSeqPos(m_AmbiguityPos[reader.m_AmbiguityIndex]) ) {
1966  // not an ambiguity yet
1967  return eBase_2na;
1968  }
1969  base = m_Ambiguity4na[reader.m_AmbiguityIndex];
1970  }
1971  else {
1972  // use 4na blocks
1973  if ( reader.m_4naBlocksIter == m_4naBlocks.end() ) {
1974  // no more 4na block
1975  return eBase_2na;
1976  }
1977  if ( reader.m_4naBlocksIter->first > reader.m_Pos ) {
1978  // not in a 4na block yet
1979  return eBase_2na;
1980  }
1981  // check actual 4na base
1982  TSeqPos offset = reader.m_Pos - reader.m_4naBlocksIter->first;
1983  base = sx_Get_4na(reader.m_4naBlocksIter->second.m_Packed4na, offset);
1984  }
1985  return base == 0xf? eBase_Gap: sx_Is2na(base)? eBase_2na: eBase_4na;
1986 }
1987 
1988 
// Moves the reader one base forward and keeps its cursor in step:
// the ambiguity index is incremented once the position has passed the
// entry it refers to, and the block iterator is incremented once the
// position has left the block it refers to.
// NOTE(review): the signature line is absent from this listing.
1990 {
1991  _ASSERT(x_IsValid(reader));
1992  // advance
1993  ++reader.m_Pos;
1994  // update iterators
1995  if ( m_HasAmbiguityPos ) {
1996  // use explicit ambiguities list
1997  if ( reader.m_AmbiguityIndex == m_AmbiguityPos.size() ) {
1998  // no more ambiguities
1999  }
2000  else {
2001  // check if next ambiguity was at current position
2002  if ( reader.m_Pos > TSeqPos(m_AmbiguityPos[reader.m_AmbiguityIndex]) ) {
2003  // advance to next ambiguity
2004  ++reader.m_AmbiguityIndex;
2005  }
2006  }
2007  }
2008  else {
2009  // use 4na blocks
2010  if ( reader.m_4naBlocksIter == m_4naBlocks.end() ) {
2011  // no more 4na blocks
2012  }
2013  else {
2014  // check if we move out of current 4na block
2015  if ( reader.m_Pos >= reader.m_4naBlocksIter->first + kAmbiguityBlockSize ) {
2016  // advance to next 4na block
2017  ++reader.m_4naBlocksIter;
2018  }
2019  }
2020  }
2021  _ASSERT(x_IsValid(reader));
2022 }
2023 
2024 
2026  CWGSDb_Impl& db, SSeqTableCursor& cur) const
2027 {
2028  x_Need4na(db);
2029  PROFILE(sw____Get2naLen);
2030  TSeqPos end = pos+len;
2031  if ( m_HasAmbiguityPos ) {
2032  auto iter = lower_bound(m_AmbiguityPos.begin(), m_AmbiguityPos.end(), INSDC_coord_zero(pos));
2033  if ( iter == m_AmbiguityPos.end() || TSeqPos(*iter) >= end ) {
2034  return len;
2035  }
2036  return *iter - pos;
2037  }
2038  else {
2039  // use 4na blocks
2040  TSeqPos block_pos = pos - pos%kAmbiguityBlockSize;
2041  for ( auto block_iter = m_4naBlocks.lower_bound(block_pos);
2042  block_iter != m_4naBlocks.end() && block_iter->first < end;
2043  ++block_iter ) {
2044  size_t in_block_pos = pos <= block_iter->first? 0: pos-block_iter->first;
2045  size_t in_block_len = min(kAmbiguityBlockSize, end-block_iter->first);
2046  TSeqPos amb_pos = TSeqPos(sx_Find_4na_Ambiguity(block_iter->second.m_Packed4na,
2047  in_block_pos, in_block_len));
2048  if ( amb_pos < in_block_pos+in_block_len ) {
2049  return (block_iter->first+amb_pos) - pos;
2050  }
2051  }
2052  return len;
2053  }
2054 }
2055 
2056 
2057 // Calculate 4na length with gap recovering
// Scans up to 'len' bases from 'pos' and returns how many leading bases to
// keep as 4na. The scan stops when a run of stop_2na_len consecutive 2na
// bases is seen or, with kRecoverGaps, a run of stop_gap_len consecutive
// gap bases; the returned length then ends where that run starts.
// Returns 'len' if no such run is found, or at once if len < stop_2na_len.
// NOTE(review): the first signature line is absent from this listing; 'pos'
// and 'len' are presumably declared there.
2059  TSeqPos stop_2na_len,
2060  TSeqPos stop_gap_len,
2061  CWGSDb_Impl& db, SSeqTableCursor& cur) const
2062 {
2063  PROFILE(sw____Get4naLen);
2064  if ( len < stop_2na_len ) {
2065  return len;
2066  }
2067  S4naReader reader = Get4naReader(pos, db, cur);
2068  TSeqPos rem_len = len, len2na = 0, gap_len = 0;
2069  // |-------------------- len -----------------|
2070  // |- 4na -|- len2na -|- gap_len -$- rem_len -|
2071  // $ is current position
2072  // only one of len2na and gap_len can be above zero
2073 
2074  for ( ; rem_len; --rem_len, Advance(reader) ) {
2075  auto base_type = GetBaseType(reader);
2076  if ( base_type == eBase_2na ) {
2077  if ( len2na == stop_2na_len-1 ) { // 1 more 2na is enough
// len-rem_len is the current offset; subtracting len2na gives the run start
2078  return len-(rem_len+len2na);
2079  }
2080  ++len2na;
2081  if ( kRecoverGaps ) {
2082  gap_len = 0;
2083  }
2084  }
2085  else {
2086  if ( kRecoverGaps && (base_type == eBase_Gap) ) {
2087  if ( gap_len == stop_gap_len-1 ) { // 1 more gap is enough
2088  return len-(rem_len+gap_len);
2089  }
2090  ++gap_len;
2091  }
// any non-2na base (gap or ambiguity) breaks the current 2na run
2092  len2na = 0;
2093  }
2094  }
2095  _ASSERT(len2na < stop_2na_len);
2096  _ASSERT(!kRecoverGaps || gap_len < stop_gap_len);
2097  return len;
2098 }
2099 
2100 
// Returns the number of leading bases of [pos, pos+len) that are gap bases
// (GetBaseType() == eBase_Gap); returns 'len' if the whole range is gap.
// NOTE(review): the first signature line is absent from this listing; 'pos'
// and 'len' are presumably declared there.
2102  CWGSDb_Impl& db, SSeqTableCursor& cur) const
2103 {
2104  PROFILE(sw____GetGapLen);
2105  S4naReader reader = Get4naReader(pos, db, cur);
2106  TSeqPos rem_len = len;
2107  for ( ; rem_len; --rem_len, Advance(reader) ) {
2108  // stop at the first base that is not a gap
2109  auto base_type = GetBaseType(reader);
2110  if ( base_type != eBase_Gap ) {
2111  return len-rem_len;
2112  }
2113  }
2114  return len;
2115 }
2116 
2117 
2118 // Return 2na Seq-data for specified range.
2119 // The data mustn't have ambiguities.
// Allocates a new CSeq_data, sizes its ncbi2na byte vector to (len+3)/4
// bytes (4 bases per byte) and fills it with cur.m_Cursor.ReadElements().
// NOTE(review): the first signature line is absent from this listing; the
// literal 2 passed to ReadElements is presumably the element size in bits.
2121  SSeqTableCursor& cur) const
2122 {
2123  PROFILE(sw____GetRaw2na);
2124  CRef<CSeq_data> ret(new CSeq_data);
2125  vector<char>& data = ret->SetNcbi2na().Set();
2126  size_t bytes = (len+3)/4;
2127  // allocate 8-byte aligned memory to allow multi-byte operations at end
2128  data.reserve((bytes+7)/8*8);
2129  data.resize(bytes);
2130  cur.m_Cursor.ReadElements(m_RowId, cur.m_READ_2na, 2, pos, len,
2131  data.data());
2132  return ret;
2133 }
2134 
2135 
2136 // return 4na Seq-data for specified range
// Steps: (1) read the range as 2na and expand it to packed 4na,
// (2) overlay the stored ambiguities (position list or packed blocks),
// (3) mark gap bases from the gap info.
// NOTE(review): the first signature line is absent from this listing; 'pos'
// and 'len' are presumably declared there.
2138  CWGSDb_Impl& db, SSeqTableCursor& cur) const
2139 {
2140  x_Need4na(db);
2141  CRef<CSeq_data> ret(new CSeq_data);
2142  vector<char>& data = ret->SetNcbi4na().Set();
2143  {{
2144  auto seq_2na = Get2na(pos, len, cur);
2145  PROFILE(sw____GetCvt4na);
2146  s_Convert_2na_to_4na(data, seq_2na->GetNcbi2na().Get(), len);
2147  }}
2148  if ( m_HasAmbiguityPos ) {
2149  // restore 4na by adding ambiguous bases to 2na
2150  PROFILE(sw____GetAmb4na);
2151  // set ambiguities
2152  s_SetAmbiguitiesPos(data, pos, len, m_AmbiguityPos, m_Ambiguity4na);
2153  }
2154  else {
2155  // restore 4na by adding ambiguous blocks to 2na
2156  PROFILE(sw____GetBlk4na);
2157  s_SetAmbiguitiesBlocks(data, pos, len, m_4naBlocks);
2158  }
2159  {{
2160  PROFILE(sw____SetGaps);
2161  s_SetGaps(data, pos, len, GetGapInfo());
2162  }}
2163  return ret;
2164 }
2165 
2166 
// Block-granular 2na length: returns how many leading bases of
// [pos, pos+len) lie in blocks that x_AmbiguousBlock() reports as not
// ambiguous. The scan advances one block boundary at a time.
// NOTE(review): the signature line is absent from this listing.
2168 {
2169  TSeqPos pos0 = pos;
2170  TSeqPos end = pos+len;
2171  while ( pos != end ) {
2172  TSeqPos block_index = pos/kAmbiguityBlockSize;
2173  if ( x_AmbiguousBlock(block_index) ) {
2174  // 4na
2175  break;
2176  }
// jump to the start of the next block, but not beyond the range end
2177  pos = min(end, (block_index+1)*kAmbiguityBlockSize);
2178  }
2179  return pos-pos0;
2180 }
2181 
2182 
// Block-granular 4na length: returns how many leading bases of
// [pos, pos+len) lie in blocks that x_AmbiguousBlock() reports as
// ambiguous. The scan advances one block boundary at a time.
// NOTE(review): the signature line is absent from this listing.
2184 {
2185  TSeqPos pos0 = pos;
2186  TSeqPos end = pos+len;
2187  while ( pos != end ) {
2188  TSeqPos block_index = pos/kAmbiguityBlockSize;
2189  if ( !x_AmbiguousBlock(block_index) ) {
2190  // 2na
2191  break;
2192  }
// jump to the start of the next block, but not beyond the range end
2193  pos = min(end, (block_index+1)*kAmbiguityBlockSize);
2194  }
2195  return pos-pos0;
2196 }
2197 
2198 
2199 #ifdef USE_GLOBAL_AMBIGUITY_CACHE
2200 
2201 DEFINE_STATIC_FAST_MUTEX(s_GlobalAmbiguityCacheMutex);
2204 public:
2207 };
2209 #endif
2210 
// Looks up cached ambiguity info for 'row'. With USE_GLOBAL_AMBIGUITY_CACHE
// the process-wide cache is used under its mutex, keyed by
// (GetWGSPath(), row); otherwise the per-object m_AmbiguityCache is used.
// NOTE(review): the signature line and one line of the #else branch are
// absent from this listing.
2212 {
2213 #ifdef USE_GLOBAL_AMBIGUITY_CACHE
2214  CFastMutexGuard guard(s_GlobalAmbiguityCacheMutex);
2215  return s_GlobalAmbiguityCache->get(make_pair(GetWGSPath(), row));
2216 #else
2218  return m_AmbiguityCache.get(row);
2219 #endif
2220 }
2221 
2222 
// Stores ambiguity info in the cache; a null 'info' is ignored.
// GetUsedMemory() is evaluated before the cache is touched and passed to
// put() as the third argument.
// NOTE(review): the signature line and one line of the #else branch are
// absent from this listing.
2224 {
2225  if ( !info ) {
2226  return;
2227  }
2228  size_t used_memory = info->GetUsedMemory();
2229 #ifdef USE_GLOBAL_AMBIGUITY_CACHE
2230  CFastMutexGuard guard(s_GlobalAmbiguityCacheMutex);
2231  s_GlobalAmbiguityCache->put(make_pair(GetWGSPath(), info->m_RowId), info, used_memory);
2232 #else
2234  m_AmbiguityCache.put(info->m_RowId, info, used_memory);
2235 #endif
2236 }
2237 
2238 
// Initialises the accession-related members from row 1 of the sequence
// table and from the table metadata:
// - if row 1 cannot be opened the id members are reset to empty/defaults;
// - ACC_PREFIX is split at its first digit into prefix and 2-digit version;
// - ACC_CONTIG_LEN must be within 6..8, otherwise CSraException is thrown;
// - metadata nodes GB_STATE, REPLACED_BY, SEQ_ID_TYPE and TAXID are read
//   when present (TAXID is skipped when EXTRA_TAXIDS exists).
// NOTE(review): the signature line, the statements inside the switch cases
// and one line in the TAXID branch are absent from this listing.
2240 {
2241  CRef<SSeq0TableCursor> seq = Seq0();
2242  if ( !seq->m_Cursor.TryOpenRow(1) ) {
2243  m_IdPrefixWithVersion.erase();
2244  m_IdPrefix.erase();
2245  m_IdVersion = 1;
2246  m_IdRowDigits = 0;
2247  return;
2248  }
2249  CTempString acc = *seq->ACC_PREFIX(1);
// the letters before the first digit form the prefix
2250  const SIZE_TYPE prefix_len = acc.find_first_of("0123456789");
2251  m_IdRowDigits = *seq->ACC_CONTIG_LEN(1);
2252  if ( m_IdRowDigits < 6 || m_IdRowDigits > 8 ) {
2253  NCBI_THROW_FMT(CSraException, eInitFailed,
2254  "CWGSDb: bad WGS accession format: "<<acc);
2255  }
2256  m_IdPrefixWithVersion = acc.substr(0, prefix_len+2);
2257  m_IdPrefix = acc.substr(0, prefix_len);
2258  m_IdVersion = NStr::StringToNumeric<int>(acc.substr(prefix_len, 2));
2259  if ( seq->m_MOL ) {
2260  // explicit contig type
2261  m_ContigMolType = CSeq_inst::TMol(*seq->MOL(1));
2262  }
2263  else {
2264  // deduce contig type from accession prefix
2265  switch ( acc[0] ) {
2266  case 'G':
2267  case 'H':
2268  case 'I':
2270  break;
2271  default:
2273  break;
2274  }
2275  }
2276  m_IdPrefixDbWithVersion = (IsTSA()? "TSA:": "WGS:")+m_IdPrefixWithVersion;
2277  m_IdPrefixDb = (IsTSA()? "TSA:": "WGS:")+m_IdPrefix;
2278  m_HasNoDefaultGnlId = seq->m_SEQID_GNL_PREFIX && seq->SEQID_GNL_PREFIX(1).empty();
// remember a static TAXID column value to cross-check the metadata TAXID below
2279  bool has_static_taxid = seq->m_TAXID && seq->m_TAXID.IsStatic(seq->m_Cursor);
2280  TTaxId static_taxid = ZERO_TAX_ID;
2281  if ( has_static_taxid ) {
2282  auto value = seq->TAXID(1);
2283  if ( value.size() != 1 ) {
2284  has_static_taxid = false;
2285  }
2286  else {
2287  static_taxid = value[0];
2288  }
2289  }
2290  Put(seq);
2291 
2292  if ( CKMetadata meta = CKMetadata(SeqTable()) ) {
2293  if ( CKMDataNode node = CKMDataNode(meta, "GB_STATE", CKMDataNode::eMissing_Allow) ) {
2294  m_ProjectGBState = NCBI_gb_state(node.GetUint8());
2295  }
2296  if ( CKMDataNode node = CKMDataNode(meta, "REPLACED_BY", CKMDataNode::eMissing_Allow) ) {
2297  size_t size = node.GetSize();
2298  m_ReplacedBy.resize(size);
2299  node.GetData(&m_ReplacedBy[0], size);
2300  }
2301  if ( CKMDataNode node = CKMDataNode(meta, "SEQ_ID_TYPE", CKMDataNode::eMissing_Allow) ) {
2302  m_SeqIdType = CSeq_id::E_Choice(node.GetUint8());
2303  }
2304  if ( CKMDataNode node = CKMDataNode(meta, "EXTRA_TAXIDS", CKMDataNode::eMissing_Allow) ) {
2305  // all tax ids are separate
2306  }
2307  else if ( CKMDataNode node = CKMDataNode(meta, "TAXID", CKMDataNode::eMissing_Allow) ) {
2308  // common taxid
2309  if ( node.GetSize() != 0 ) {
2310  m_CommonTaxId = node.GetUint4();
2311  m_HasCommonTaxId = true;
// a static TAXID column that disagrees with the metadata value cancels the common tax id
2312  if ( has_static_taxid && static_taxid != m_CommonTaxId ) {
2314  m_HasCommonTaxId = false;
2315  }
2316  }
2317  }
2318  }
2319 }
2320 
2321 
// Resolves a user-supplied path or accession to the string used to open
// the WGS data, trying in order:
// 1. (USE_TEST_PATH builds only) a file with that name in the test path;
// 2. if vol_path is not empty: each ':'-separated directory of vol_path,
//    then vol_path taken as a single directory;
// 3. a plain accession without '.': a 4- or 6-letter prefix without digits
//    gets "00" appended; a prefix followed by more than two characters is
//    cut right after the two version digits;
// 4. otherwise the input is returned unchanged.
// NOTE(review): the first signature line is absent from this listing;
// 'path_or_acc' is presumably declared there.
2323  CTempString vol_path)
2324 {
2325 #ifdef USE_TEST_PATH
2326  {
2327  string test_path = NCBI_PARAM_TYPE(WGS, TEST_PATH)::GetDefault();
2328  if ( !test_path.empty() ) {
2329  string file_path = CDirEntry::MakePath(test_path, path_or_acc);
2330  if ( CDirEntry(file_path).Exists() ) {
2331  LOG_POST(Warning<<"Using local test file: "<<file_path);
2332  return file_path;
2333  }
2334  }
2335  }
2336 #endif
2337  if ( !vol_path.empty() ) {
2338  vector<CTempString> dirs;
2339  NStr::Split(vol_path, ":", dirs);
2340  ITERATE ( vector<CTempString>, it, dirs ) {
2341  string path = CDirEntry::MakePath(*it, path_or_acc);
2342  if ( CDirEntry(path).Exists() ) {
2343  return path;
2344  }
2345  }
// also try the unsplit vol_path as one directory
2346  string path = CDirEntry::MakePath(vol_path, path_or_acc);
2347  if ( CDirEntry(path).Exists() ) {
2348  return path;
2349  }
2350  }
2351  if ( CVPath::IsPlainAccession(path_or_acc) &&
2352  path_or_acc.find('.') == string::npos ) { // no WGS accession sub-version
2353  // parse WGS accession
2354  const SIZE_TYPE start = 0;
2355  // ID-5322 : WGS prefix can consist of 4 or 6 characters, with optional
2356  // 2-digit version.
2357  // If no version is specified, set it to a default value 00, which is
2358  // resolved to a real version via a symlink on the file system.
2359  string acc = path_or_acc.substr(start);
2360  size_t acclen = acc.size();
2361  size_t digit_pos = acc.find_first_of("0123456789");
2362  if (digit_pos == string::npos && (acclen == 4 || acclen == 6)) {
2363  return string(path_or_acc) + "00";
2364  } else if ((digit_pos == 4 || digit_pos == 6) &&
2365  acclen > digit_pos + 2) {
2366  // remove contig/scaffold id
2367  return path_or_acc.substr(0, start+digit_pos+2);
2368  }
2369  }
2370  return path_or_acc;
2371 }
2372 
2373 
// Helper that opens a table at most once, guarded by the atomic flag
// 'table_is_opened' (acquire load, release store after opening).
// NOTE(review): the signature's first line and two body lines are absent
// from this listing - presumably a lock and the actual table open; confirm
// against the real file.
2374 inline
2376  atomic<bool>& table_is_opened,
2377  const char* table_name)
2378 {
2380  if ( !table_is_opened.load(memory_order_acquire) ) {
2382  table_is_opened.store(true, memory_order_release);
2383  }
2384 }
2385 
2386 
// Helper that opens a table index at most once and records the outcome in
// 'index_is_opened':
//    1 - opened under index_name,
//    2 - opened under backup_index_name,
//   -1 - no usable index (also stored when 'table' is not valid).
// NOTE(review): the signature's first line, the remaining CVDBTableIndex
// constructor arguments and one body line are absent from this listing.
2387 inline
2389  CVDBTableIndex& index,
2390  atomic<Int1>& index_is_opened,
2391  const char* index_name,
2392  const char* backup_index_name)
2393 {
2394  if ( table ) {
2396  if ( !index_is_opened.load(memory_order_acquire) ) {
2397  Int1 type = -1;
2398  index = CVDBTableIndex(table, index_name,
2400  if ( index ) {
2401  type = 1;
2402  }
2403  else if ( backup_index_name ) {
// primary index not available: fall back to the backup name
2404  index = CVDBTableIndex(table, backup_index_name,
2406  if ( index ) {
2407  type = 2;
2408  }
2409  }
2410  index_is_opened.store(type, memory_order_release);
2411  }
2412  }
2413  else {
2414  index_is_opened.store(-1, memory_order_release);
2415  }
2416 }
2417 
2418 
// Opens the "SCAFFOLD" table into m_ScfTable via OpenTable(), using
// m_ScfTableIsOpened as the opened flag.
// NOTE(review): the signature line is absent from this listing.
2420 {
2421  OpenTable(m_ScfTable, m_ScfTableIsOpened, "SCAFFOLD");
2422 }
2423 
2424 
2426 {
2428 }
2429 
2430 
2432 {
2434 }
2435 
2436 
2438 {
2440 }
2441 
2442 
2444 {
2446 }
2447 
2448 
2450 {
2452  "contig_name_uc", "contig_name");
2453 }
2454 
2455 
2457 {
2459  "scaffold_name_uc", "scaffold_name");
2460 }
2461 
2462 
2464 {
2466  "protein_name_uc", "protein_name");
2467 }
2468 
2469 
2471 {
2473  "product_name_uc", "product_name");
2474 }
2475 
2476 
2478 {
2480  "gb_accession");
2481 }
2482 
2483 
2484 pair<TVDBRowId, CWGSDb_Impl::ERowType>
2486  TAllowRowType allow_type)
2487 {
2488  pair<TVDBRowId, ERowType> ret(0, eRowType_contig);
2489  const SIZE_TYPE start = 0;
2490  SIZE_TYPE prefix_len = acc.find_first_of("0123456789");
2491  if (prefix_len == NPOS || prefix_len >= acc.size() - 2)
2492  return ret;
2493  else prefix_len += 2;
2494 
2495  CTempString row = acc.substr(start+prefix_len);
2496  if ( row[0] == 'S' ) {
2497  if ( !(allow_type & fAllowRowType_scaffold) ) {
2498  return ret;
2499  }
2500  ret.second = eRowType_scaffold;
2501  row = row.substr(1); // skip scaffold prefix
2502  }
2503  else if ( row[0] == 'P' ) {
2504  if ( !(allow_type & fAllowRowType_protein) ) {
2505  return ret;
2506  }
2507  ret.second = eRowType_protein;
2508  row = row.substr(1); // skip scaffold prefix
2509  }
2510  else {
2511  if ( !(allow_type & fAllowRowType_contig) ) {
2512  return ret;
2513  }
2514  }
2515  ret.first = NStr::StringToNumeric<TVDBRowId>(row, NStr::fConvErr_NoThrow);
2516  if ( ret.first < 0 ) {
2517  ret.first = 0;
2518  }
2519  return ret;
2520 }
2521 
2522 
2523 TVDBRowId CWGSDb_Impl::ParseRow(CTempString acc, bool* is_scaffold) const
2524 {
2525  TAllowRowType allow_type = fAllowRowType_contig;
2526  if ( is_scaffold ) {
2527  allow_type |= fAllowRowType_scaffold;
2528  }
2529  pair<TVDBRowId, TRowType> rt = ParseRowType(acc, allow_type);
2530  if ( is_scaffold ) {
2531  *is_scaffold = rt.second == eRowType_scaffold;
2532  }
2533  return rt.first;
2534 }
2535 
2536 
2538 
// Store `version` in the text seq-id held by `id`, if it has one.
// Returns true when a text seq-id was found and updated, false otherwise.
// NOTE(review): the signature line is missing from this listing; the body
// matches the sx_SetVersion() helper called elsewhere in this file - confirm.
2540 {
2541  if ( const CTextseq_id* text_id = id.GetTextseq_Id() ) {
      // the pointer is obtained as const, so constness is cast away to modify it
2542  const_cast<CTextseq_id*>(text_id)->SetVersion(version);
2543  return true;
2544  }
2545  return false;
2546 }
2547 
2548 
// Store `accession` in the text seq-id held by `id`, if it has one.
// Returns true when a text seq-id was found and updated, false otherwise.
// NOTE(review): the signature line is missing from this listing; the body
// matches the sx_SetAccession() helper called elsewhere in this file - confirm.
2550 {
2551  if ( const CTextseq_id* text_id = id.GetTextseq_Id() ) {
      // the pointer is obtained as const, so constness is cast away to modify it
2552  const_cast<CTextseq_id*>(text_id)->SetAccession(accession);
2553  return true;
2554  }
2555  return false;
2556 }
2557 
2558 
2560 
2561 
2562 inline
2564 {
2566 }
2567 
// Convert a string made only of decimal digits into a non-negative int.
// Returns the value, or -1 when `str` is empty, contains a character that is
// not a decimal digit, or denotes a value above INT_MAX.
// NOTE(review): the signature line and the statement guarded by kSetNcbiError
// are missing from this listing; presumably this is sx_StringToNonNegativeInt().
2569 {
      // both reporting switches are off, so errno is not written here
2570  const bool kSetErrno = 0;
2571  const bool kSetNcbiError = 0;
2572 
2573  int error = 0, ret = -1;
2574  size_t len = str.size();
2575  if ( !len ) {
2576  error = EINVAL;
2577  }
2578  else {
      // unsigned arithmetic: any character other than '0'..'9' yields a value above 9
2579  unsigned v = str.data()[0] - '0';
2580  if (v > 9) {
2581  error = EINVAL;
2582  }
2583  else {
2584  for (size_t i = 1; i < len; ++i) {
2585  unsigned d = str.data()[i] - '0';
2586  if (d > 9) {
2587  error = EINVAL;
2588  break;
2589  }
2590  unsigned nv = v * 10 + d;
      // while v is below this limit, v*10 + d cannot exceed INT_MAX for any digit d
2591  const unsigned kOverflowLimit = (INT_MAX - 9) / 10 + 1;
2592  if ( v >= kOverflowLimit ) {
2593  // possible overflow
2594  if ( v > kOverflowLimit || nv > INT_MAX) {
2595  error = ERANGE;
2596  break;
2597  }
2598  }
2599  v = nv;
2600  }
      // ret keeps its initial -1 unless the whole string was consumed without error
2601  if (!error) {
2602  ret = static_cast<int>(v);
2603  }
2604  }
2605  }
2606  if (kSetErrno) {
2607  errno = error;
2608  }
2609  if (kSetNcbiError && error) {
2611  }
2612  return ret;
2613 }
2614 
2615 
2616 // Return the non-negative integer denoted by the string, but only when the
2617 // string is the canonical representation of that integer (decimal digits
2618 // only, no leading zeros or spaces); otherwise return -1.
// NOTE(review): the signature line is missing from this listing.
2620 {
2621  int id = sx_StringToNonNegativeInt(str);
2622  if ( id >= 0 ) {
      // a lone "0" is canonical; any longer string starting with '0' is not
2623  if ( str.size() == 1 || str.data()[0] != '0' ) { // no leading zeroes
2624  return id;
2625  }
2626  }
2627  return -1;
2628 }
2629 
2630 /*
2631 CRange<int> sx_GetPatentRange(const CUser_object& obj, CTempString prefix)
2632 {
2633  int from = -1;
2634  int to = -1;
2635  if ( auto field = obj.GetFieldRef("Patent_accession_first") ) {
2636  if ( field->GetData().IsStr() ) {
2637  CTempString str = field->GetData().GetStr();
2638  if ( NStr::StartsWith(str, prefix) ) {
2639  from = sx_StringToNonNegativeInt(str.substr(prefix.size()));
2640  }
2641  }
2642  }
2643  if ( auto field = obj.GetFieldRef("Patent_accession_last") ) {
2644  if ( field->GetData().IsStr() ) {
2645  CTempString str = field->GetData().GetStr();
2646  if ( NStr::StartsWith(str, prefix) ) {
2647  to = sx_StringToNonNegativeInt(str.substr(prefix.size()));
2648  }
2649  }
2650  }
2651  if ( from >= 0 && to >= from ) {
2652  return CRange<int>(from, to);
2653  }
2654  else {
2655  return CRange<int>::GetEmpty();
2656  }
2657 }
2658 */
2659 
// Fill the tag of `tag` from `str`: the integer form is used when `str` is a
// canonical non-negative integer (see sx_GetStringId), the string form otherwise.
// NOTE(review): the signature line is missing from this listing; the body
// matches the sx_SetTag() helper called elsewhere in this file - confirm.
2661 {
2662  CObject_id& oid = tag.SetTag();
2663  int id = sx_GetStringId(str);
2664  if ( id >= 0 ) {
2665  oid.SetId(id);
2666  }
2667  else {
2668  oid.SetStr(str);
2669  }
2670 }
2671 
2672 
2674 {
2675  if ( !bytes.empty() ) {
2676  CObjectIStreamAsnBinary in(bytes.data(), bytes.size());
2677  // hack to determine if the data
2678  // is of type Seq-descr (starts with byte 49)
2679  // or of type Seqdesc (starts with byte >= 160)
2680  if ( bytes[0] == kSeq_descrFirstByte ) {
2681  CSeq_descr tmp;
2682  in >> tmp;
2683  for ( auto& desc : tmp.Set() ) {
2684  descr.Set().push_back(desc);
2685  }
2686  }
2687  else {
2688  while ( in.HaveMoreData() ) {
2689  CRef<CSeqdesc> desc(new CSeqdesc);
2690  in >> *desc;
2691  descr.Set().push_back(desc);
2692  }
2693  }
2694  }
2695 }
2696 
2697 
2699 {
2700  if ( !bytes.empty() ) {
2701  CObjectIStreamAsnBinary in(bytes.data(), bytes.size());
2702  while ( in.HaveMoreData() ) {
2703  CRef<CSeq_annot> annot(new CSeq_annot);
2704  in >> *annot;
2705  annot_set.push_back(annot);
2706  }
2707  }
2708 }
2709 
2710 
2712 {
2713  if ( id.IsGi() ) {
2714  split_id.SetGi(id.GetGi());
2715  }
2716  else {
2717  split_id.SetSeq_id(id);
2718  }
2719 }
2720 
2721 
2723 {
2724  if ( id.IsGi() ) {
2725  split_id.SetGi(id.GetGi());
2726  }
2727  else {
2728  split_id.SetSeq_id(id);
2729  }
2730 }
2731 
2732 
2734 {
2736  sx_SetSplitId(*split_id, id);
2737  split_ids.push_back(split_id);
2738 }
2739 
2740 
2742  const CBioseq::TId& ids)
2743 {
2744  ITERATE ( CBioseq::TId, it, ids ) {
2745  sx_AddSplitId(split_ids, it->GetNCObject());
2746  }
2747 }
2748 
2749 
2751  TSeqPos pos, TSeqPos end)
2752 {
2753  if ( id.IsGi() ) {
2754  CID2S_Gi_Interval& loc_gi = split_loc.SetGi_interval();
2755  loc_gi.SetGi(id.GetGi());
2756  loc_gi.SetStart(pos);
2757  loc_gi.SetLength(end-pos);
2758  }
2759  else {
2760  CID2S_Seq_id_Interval& loc_id = split_loc.SetSeq_id_interval();
2761  loc_id.SetSeq_id(id);
2762  loc_id.SetStart(pos);
2763  loc_id.SetLength(end-pos);
2764  }
2765 }
2766 
2767 
2769 
2770 
2771 bool CWGSDb_Impl::IsTSA(void) const
2772 {
2774 }
2775 
2776 
2778  CTempString tag) const
2779 {
2780  if ( prefix.empty() ) {
2781  return null;
2782  }
2783  else {
2784  CRef<CSeq_id> id(new CSeq_id);
2785  CDbtag& dbtag = id->SetGeneral();
2786  dbtag.SetDb(prefix);
2787  sx_SetTag(dbtag, tag);
2788  return id;
2789  }
2790 }
2791 
2792 
2794  TGnlIdFlags gnl_id_flags) const
2795 {
      // Build a general (db + tag) Seq-id from `tag`.
      // Returns a null reference when m_HasNoDefaultGnlId is set.
      // NOTE(review): the first signature line and the expression that
      // initializes `db` are missing from this listing.
2796  CRef<CSeq_id> id;
2797  if ( m_HasNoDefaultGnlId ) {
2798  return id;
2799  }
2800  id = new CSeq_id;
2801  CDbtag& dbtag = id->SetGeneral();
      // "db:tag" form: text before the last ':' is the db, the rest is the tag
2802  SIZE_TYPE colon = tag.rfind(':');
2803  if ( colon != NPOS ) {
2804  dbtag.SetDb(tag.substr(0, colon));
2805  tag = tag.substr(colon+1);
2806  }
2807  else {
2808  const string& db =
2810  dbtag.SetDb(db);
      // strip a leading "<db>:" from the tag
      // NOTE(review): this branch runs only when tag contains no ':',
      // so the test below looks unreachable - confirm.
2811  if ( NStr::StartsWith(tag, db) &&
2812  tag[db.size()] == ':' ) {
2813  tag = tag.substr(db.size()+1);
2814  }
2815  }
2816  sx_SetTag(dbtag, tag);
2817  return id;
2818 }
2819 
2820 
2822 {
2823  CRef<CSeq_id> seq_id(new CSeq_id);
2824  CPatent_seq_id& pat_id = seq_id->SetPatent();
2826  pat_id.SetSeqid(id);
2827  return seq_id;
2828 }
2829 
2830 
2833  TVDBRowId row,
2834  TGnlIdFlags gnl_id_flags) const
2835 {
2836  if ( str.empty() ) {
2837  return null;
2838  }
2839  int id = sx_GetStringId(str);
2840  if ( id >= 0 && HasPatentId() ) {
2841  return GetPatentSeq_id(id);
2842  }
2843  if ( gnl_id_flags & fGnlId_NoWGSId ) {
2844  return null;
2845  }
2846  return GetGeneralSeq_id(str, gnl_id_flags);
2847 }
2848 
2849 
2852  const SSeq0TableCursor& cur,
2853  TVDBRowId row) const
2854 {
2855  if ( str.empty() ) {
2856  return null;
2857  }
2858  int id = sx_GetStringId(str);
2859  if ( id >= 0 && HasPatentId() ) {
2860  return GetPatentSeq_id(id);
2861  }
2862  if ( cur.m_SEQID_GNL_PREFIX ) {
2863  return GetGeneralSeq_id(cur.SEQID_GNL_PREFIX(1), str);
2864  }
2865  else {
2866  return GetGeneralSeq_id(str);
2867  }
2868 }
2869 
2870 
2873  const SScfTableCursor& cur,
2874  TVDBRowId row) const
2875 {
2876  if ( str.empty() ) {
2877  return null;
2878  }
2879  int id = sx_GetStringId(str);
2880  if ( id >= 0 && HasPatentId() ) {
2881  return GetPatentSeq_id(id);
2882  }
2883  if ( cur.m_SEQID_GNL_PREFIX ) {
2884  return GetGeneralSeq_id(cur.SEQID_GNL_PREFIX(1), str);
2885  }
2886  else {
2887  return GetGeneralSeq_id(str);
2888  }
2889 }
2890 
2891 
2894  const SProt0TableCursor& cur,
2895  TVDBRowId row) const
2896 {
2897  if ( str.empty() ) {
2898  return null;
2899  }
2900  int id = sx_GetStringId(str);
2901  if ( id >= 0 && HasPatentId() ) {
2902  return GetPatentSeq_id(id);
2903  }
2904  if ( cur.m_SEQID_GNL_PREFIX ) {
2905  return GetGeneralSeq_id(cur.SEQID_GNL_PREFIX(1), str);
2906  }
2907  else {
2909  }
2910 }
2911 
2912 
2914 {
2915  PROFILE(sw_GetAccSeq_id);
2916  CRef<CSeq_id> id;
2917  if ( !acc.empty() ) {
2918  if ( m_SeqIdType != CSeq_id::e_not_set ) {
2919  id = new CSeq_id();
2920  id->Select(m_SeqIdType);
2921  sx_SetAccession(*id, acc);
2922  }
2923  else {
2924  id = new CSeq_id(acc);
2925  }
2926  sx_SetVersion(*id, version);
2927  }
2928  return id;
2929 }
2930 
2931 
2933  TVDBRowId row_id,
2934  int version) const
2935 {
2936  CRef<CSeq_id> id;
2937  if ( m_IdPrefixWithVersion.empty() ) {
2938  return id;
2939  }
2942  if ( type != eRowType_contig ) {
2943  str << char(type);
2944  }
2945  str << setfill('0') << setw(m_IdRowDigits) << row_id;
2946  string id_str = CNcbiOstrstreamToString(str);
2947  id = new CSeq_id(id_str);
2948  sx_SetVersion(*id, version);
2949  return id;
2950 }
2951 
2952 
// Build a Seq-id from m_IdPrefix padded with zeros, carrying m_IdVersion.
// Returns a null reference when m_IdPrefix is empty or when the version
// cannot be set on the created id.
// NOTE(review): the signature line is missing from this listing; presumably
// this is GetMasterSeq_id(), which is called elsewhere in this file - confirm.
2954 {
2955  CRef<CSeq_id> id;
2956  if ( m_IdPrefix.empty() ) {
2957  return id;
2958  }
2959  string master_acc = m_IdPrefix;
      // append '0' characters: two more than m_IdRowDigits
2960  master_acc.resize(master_acc.size() + 2 + m_IdRowDigits, '0');
2961  id = new CSeq_id(master_acc);
      // sx_SetVersion() reports failure when the id holds no text seq-id
2962  if ( !sx_SetVersion(*id, m_IdVersion) ) {
2963  id = null;
2964  }
2965  return id;
2966 }
2967 
2968 
2970 {
2971  return GetAccSeq_id(eRowType_contig, row_id, 1);
2972 }
2973 
2974 
2976 {
2977  return GetAccSeq_id(eRowType_scaffold, row_id, 1);
2978 }
2979 
2980 
2982 {
2983  return GetAccSeq_id(eRowType_protein, row_id, 1);
2984 }
2985 
2986 
2988 {
2989  return m_ContigMolType;
2990 }
2991 
2992 
2994 {
2995  return CSeq_inst::eMol_dna;
2996 }
2997 
2998 
3000 {
3001  return CSeq_inst::eMol_aa;
3002 }
3003 
3004 
3006 {
3007  m_MasterDescr.clear();
3008  m_IsSetMasterDescr = false;
3009 }
3010 
3011 
3013 {
3014  if ( !IsSetMasterDescr() &&
3015  NCBI_PARAM_TYPE(WGS, MASTER_DESCR)::GetDefault() ) {
3016  x_LoadMasterDescr(filter);
3017  }
3018  return IsSetMasterDescr();
3019 }
3020 
3021 
3023 {
3024  buffer.clear();
3025  CKMetadata meta(SeqTable());
3026  if ( !meta ) {
3027  return 0;
3028  }
3029  CKMDataNode node(meta, "MASTER", CKMDataNode::eMissing_Allow);
3030  if ( !node ) {
3031  return 0;
3032  }
3033  size_t size = node.GetSize();
3034  if ( !size ) {
3035  return 0;
3036  }
3037  buffer.resize_mem(size);
3038  node.GetData(buffer.data(), size);
3039  return size;
3040 }
3041 
3042 
3044 {
3045  if ( !m_MasterEntry ) {
3047  if ( !m_MasterEntry ) {
3049  if ( !GetMasterDescrBytes(buffer) ) {
3050  return null;
3051  }
3052 
3053  CObjectIStreamAsnBinary str(buffer.data(), buffer.size());
3054  CRef<CSeq_entry> master_entry(new CSeq_entry());
3055  str >> *master_entry;
3056  m_MasterEntry = master_entry;
3057  }
3058  if ( m_MasterEntry->IsSeq() ) {
3059  for ( auto& id : m_MasterEntry->GetSeq().GetId() ) {
3060  if ( id->IsPatent() ) {
3061  SetPatentId(id);
3062  break;
3063  }
3064  }
3065  /*
3066  m_PatentSeqIdRangeNuc = CRange<int>::GetEmpty();
3067  m_PatentSeqIdRangeProt = CRange<int>::GetEmpty();
3068  if ( HasPatentId() && m_MasterEntry->GetSeq().IsSetDescr() ) {
3069  for ( auto& d : m_MasterEntry->GetSeq().GetDescr().Get() ) {
3070  const CSeqdesc& desc = *d;
3071  if ( desc.IsUser() ) {
3072  const CUser_object& obj = desc.GetUser();
3073  const CObject_id& type = obj.GetType();
3074  if ( type.IsStr() &&
3075  type.GetStr() == "PatentProjects" ) {
3076  m_PatentSeqIdRangeNuc = sx_GetPatentRange(obj, GetIdPrefixWithVersion());
3077  m_PatentSeqIdRangeProt = m_PatentSeqIdRangeNuc;
3078  }
3079  if ( type.IsStr() &&
3080  type.GetStr() == "PatentProjectsNucleotide" ) {
3081  m_PatentSeqIdRangeNuc = sx_GetPatentRange(obj, GetIdPrefixWithVersion());
3082  }
3083  if ( type.IsStr() &&
3084  type.GetStr() == "PatentProjectsProtein" ) {
3085  m_PatentSeqIdRangeProt = sx_GetPatentRange(obj, GetIdPrefixWithVersion());
3086  }
3087  }
3088  }
3089  }
3090  */
3091  }
3092  }
3093  return m_MasterEntry;
3094 }
3095 
3096 
3098 {
3099  if ( CRef<CSeq_entry> master_entry = GetMasterDescrEntry() ) {
3100  if ( master_entry->IsSetDescr() ) {
3101  SetMasterDescr(master_entry->GetDescr().Get(), filter);
3102  }
3103  }
3104 }
3105 
3106 
3109 {
3110  switch ( desc.Which() ) {
3111  case CSeqdesc::e_Pub:
3112  case CSeqdesc::e_Comment:
3113  return eDescr_force;
3114  case CSeqdesc::e_Source:
3115  case CSeqdesc::e_Molinfo:
3118  case CSeqdesc::e_Genbank:
3119  case CSeqdesc::e_Embl:
3120  return eDescr_default;
3121  case CSeqdesc::e_User:
3122  if ( desc.GetUser().GetType().IsStr() ) {
3123  // only specific user objects are passed from WGS master
3124  const string& name = desc.GetUser().GetType().GetStr();
3125  if ( name == "DBLink" ||
3126  name == "GenomeProjectsDB" ||
3127  name == "StructuredComment" ||
3128  name == "FeatureFetchPolicy" ||
3129  name == "Unverified") {
3130  return eDescr_default;
3131  }
3132  }
3133  return eDescr_skip;
3134  default:
3135  return eDescr_skip;
3136  }
3137 }
3138 
3139 
3141  int filter)
3142 {
3143  if ( filter == CWGSDb::eDescrDefaultFilter ) {
3144  TMasterDescr descr2;
3145  ITERATE ( CSeq_descr::Tdata, it, descr ) {
3147  descr2.push_back(Ref(SerialClone(**it)));
3148  }
3149  }
3151  return;
3152  }
3153  m_MasterDescr = descr;
3154  m_IsSetMasterDescr = true;
3155 }
3156 
// Compute a key describing the user-object type of a descriptor.
// Returns an empty string unless `desc` is a user object whose type is a string.
// For "StructuredComment" objects the value of the first field labelled
// "StructuredCommentPrefix" is appended after a '|' separator.
// NOTE(review): the line with the function name and parameter list is
// missing from this listing.
3157 static string
3159 {
3160  string uo_type;
3161 
3162  if (desc.IsUser() && desc.GetUser().GetType().IsStr()) {
3163  uo_type = desc.GetUser().GetType().GetStr();
3164  if (uo_type == "StructuredComment") {
3165  ITERATE (CUser_object::TData, it, desc.GetUser().GetData()) {
3166  if ((*it)->GetLabel().IsStr() &&
3167  (*it)->GetLabel().GetStr() == "StructuredCommentPrefix") {
      // NOTE(review): any non-string value is read with GetInt() -
      // confirm that no other data kinds can occur in this field
3168  string data = ((*it)->GetData().IsStr() ?
3169  (string) (*it)->GetData().GetStr() :
3170  NStr::IntToString((*it)->GetData().GetInt()));
3171  uo_type += "|" + data;
      // only the first matching field is used
3172  break;
3173  }
3174  }
3175  }
3176  }
3177 
3178  return uo_type;
3179 }
3180 
3181 static void
3182 s_AddUserObjectType(const CSeqdesc& desc, set<string>& existing_uo_types)
3183 {
3184  string uo_type = s_GetUserObjectType(desc);
3185 
3186  if (!uo_type.empty() && existing_uo_types.count(uo_type) == 0) {
3187  existing_uo_types.insert(uo_type);
3188  }
3189 }
3190 
// Append descriptors from the master record to `descr`.
// Nothing is added when the master descriptor list is empty or when a user
// object of type "WithMasterDescr" is already present in `descr` or in
// `main_seq` (if given). When `flags` contains fMasterDescrMark, such a
// marker user object is appended after the master descriptors.
3191 void CWGSDb_Impl::AddMasterDescr(CSeq_descr& descr, const CBioseq* main_seq, TFlags flags) const
3192 {
3193  if ( !GetMasterDescr().empty() ) {
      // one bit per descriptor kind already present, plus the user-object type keys
3194  unsigned type_mask = 0;
3195  set<string> existing_uo_types;
3196 
3197  ITERATE ( CSeq_descr::Tdata, it, descr.Get() ) {
3198  const CSeqdesc& desc = **it;
3199  type_mask |= 1 << desc.Which();
3200  s_AddUserObjectType(desc, existing_uo_types);
3201  }
3202 
3203  if (main_seq && main_seq->IsSetDescr()) {
3204  for (auto& desc : main_seq->GetDescr().Get()) {
3205  type_mask |= 1 << desc->Which();
3206  s_AddUserObjectType(*desc, existing_uo_types);
3207  }
3208  }
3209 
3210  string kMasterDescrMark = "WithMasterDescr";
3211  if ( existing_uo_types.find(kMasterDescrMark) == existing_uo_types.end() ) {
3212  ITERATE ( TMasterDescr, it, GetMasterDescr() ) {
3213  const CSeqdesc& desc = **it;
      // NOTE(review): the first line of this condition is missing from the listing
3215  (type_mask & (1 << desc.Which())) ) {
      // a descriptor kind that is already present is still added when it is
      // a user object whose type key has not been seen yet
3216  bool skip = true;
3217  string uo_type = s_GetUserObjectType(desc);
3218  if (!uo_type.empty() && existing_uo_types.count(uo_type) == 0)
3219  skip = false;
3220  // omit master descr if contig already has one of that type
3221  if (skip)
3222  continue;
3223  }
3224  descr.Set().push_back(*it);
3225  }
3226  if ( flags & fMasterDescrMark ) {
      // marker user object with the type set and an empty data list
3227  CRef<CSeqdesc> desc(new CSeqdesc);
3228  auto& user_object = desc->SetUser();
3229  user_object.SetType().SetStr(kMasterDescrMark);
3230  user_object.SetData();
3231  descr.Set().push_back(desc);
3232  }
3233  }
3234  }
3235 }
3236 
3237 
3239 {
3240  if ( m_MasterEntry ) {
3241  return m_MasterEntry;
3242  }
3243 
3244  // generate one
3245  CRef<CSeq_entry> entry(new CSeq_entry);
3246  CBioseq& seq = entry->SetSeq();
3247  seq.SetId().push_back(GetMasterSeq_id());
3248  if ( !m_MasterDescr.empty() ) {
3249  seq.SetDescr().Set() = m_MasterDescr;
3250  }
3251  CSeq_inst& inst = seq.SetInst();
3254  return entry;
3255 }
3256 
3257 
3259 {
3260  m_PatentId = id;
3261 }
3262 
3263 /*
3264 int CWGSDb_Impl::GetPatentSeqIdNuc(CTempString str_id) const
3265 {
3266  if ( !HasPatentId() ) {
3267  return 0;
3268  }
3269  int id = sx_GetStringId(str_id);
3270  return id >= 0 && IsValidPatentSeqIdNuc(id)? id: 0;
3271 }
3272 
3273 
3274 int CWGSDb_Impl::GetPatentSeqIdProt(CTempString str_id) const
3275 {
3276  if ( !HasPatentId() ) {
3277  return 0;
3278  }
3279  int id = sx_GetStringId(str_id);
3280  return id >= 0 && IsValidPatentSeqIdProt(id)? id: 0;
3281 }
3282 */
3283 
3285 {
3286  if ( m_MasterEntry ) {
3287  const CBioseq::TId& ids = m_MasterEntry->GetSeq().GetId();
3288  ITERATE ( CBioseq::TId, it, ids ) {
3289  const CSeq_id& id = **it;
3290  if ( id.IsGi() ) {
3291  return id.GetGi();
3292  }
3293  }
3294  }
3295  return ZERO_GI;
3296 }
3297 
3298 
3299 static inline TGi s_ToGi(TVDBRowId gi, const char* method)
3300 {
3301  if ( gi < 0 ||
3302  (sizeof(TIntId) != sizeof(gi) && TVDBRowId(TIntId(gi)) != gi) ) {
3303  NCBI_THROW_FMT(CSraException, eDataError,
3304  method<<": GI is too big: "<<gi);
3305  }
3306  return GI_FROM(TVDBRowId, gi);
3307 }
3308 
3309 
3310 pair<TGi, TGi> CWGSDb_Impl::GetNucGiRange(void)
3311 {
3312  pair<TGi, TGi> ret;
3313  if ( CRef<SGiIdxTableCursor> idx = GiIdx() ) {
3314  if ( idx->m_NUC_ROW_ID ) {
3315  TVDBRowIdRange row_range =
3316  idx->m_NUC_ROW_ID.GetRowIdRange(idx->m_Cursor);
3317  if ( row_range.second ) {
3318  ret.first = s_ToGi(row_range.first,
3319  "CWGSDb::GetNucGiRange()");
3320  ret.second = s_ToGi(row_range.first + row_range.second - 1,
3321  "CWGSDb::GetNucGiRange()");
3322  }
3323  }
3324  Put(idx);
3325  }
3326  return ret;
3327 }
3328 
3329 
3330 pair<TGi, TGi> CWGSDb_Impl::GetProtGiRange(void)
3331 {
3332  pair<TGi, TGi> ret;
3333  if ( CRef<SGiIdxTableCursor> idx = GiIdx() ) {
3334  if ( idx->m_PROT_ROW_ID ) {
3335  TVDBRowIdRange row_range =
3336  idx->m_PROT_ROW_ID.GetRowIdRange(idx->m_Cursor);
3337  if ( row_range.second ) {
3338  ret.first = s_ToGi(row_range.first,
3339  "CWGSDb::GetProtGiRange()");
3340  ret.second = s_ToGi(row_range.first + row_range.second - 1,
3341  "CWGSDb::GetProtGiRange()");
3342  }
3343  }
3344  Put(idx);
3345  }
3346  return ret;
3347 }
3348 
3349 
// Sort `ranges` and merge, in place, every range that starts exactly at the
// open end of the range before it. An empty container is left untouched.
// NOTE(review): the signature line is missing from this listing; ranges that
// overlap (rather than touch) are kept as separate entries.
3351 {
3352  if ( ranges.empty() ) {
3353  return;
3354  }
3355  sort(ranges.begin(), ranges.end());
      // dst is the last range kept so far; i scans the remaining ranges
3356  TGiRanges::iterator dst = ranges.begin();
3357  for ( TGiRanges::iterator i = dst+1; i != ranges.end(); ++i ) {
3358  if ( i->GetFrom() == dst->GetToOpen() ) {
      // adjacent: extend the kept range
3359  dst->SetToOpen(i->GetToOpen());
3360  }
3361  else {
3362  *++dst = *i;
3363  }
3364  }
      // drop the slots left over after merging
3365  ranges.erase(dst+1, ranges.end());
3366 }
3367 
3368 
// Collect the GIs stored in the GI column of the sequence table as a sorted
// list of ranges; runs of consecutive GI values in consecutive rows form one
// range. Rows whose GI is zero are skipped. The list is empty when the GI
// column is not available.
// NOTE(review): the signature line is missing from this listing.
3370 {
3371  TGiRanges ranges;
3372  TVDBRowId row_id = 0;
3373  CRef<SSeqTableCursor> seq = Seq();
3374  if ( seq->m_GI ) {
      // current run is [gi_start, gi_end); both are -1 until the first GI is seen
3375  TIntId gi_start = -1, gi_end = -1;
3376  TVDBRowIdRange row_range = seq->m_GI.GetRowIdRange(seq->m_Cursor);
3377  for ( TVDBRowCount i = 0; i < row_range.second; ++i ) {
3378  row_id = row_range.first+i;
3379  TIntId gi = GI_TO(TIntId, s_ToGi(*seq->GI(row_id), "CWGSDb::GetNucGiRanges()"));
3380  if ( !gi ) {
3381  continue;
3382  }
      // the GI does not continue the current run: flush the run and start a new one
3383  if ( gi != gi_end ) {
3384  if ( gi_end != gi_start ) {
3385  ranges.push_back(TGiRange(gi_start, gi_end));
3386  }
3387  gi_start = gi;
3388  }
3389  gi_end = gi+1;
3390  }
      // flush the last run, if any
3391  if ( gi_end != gi_start ) {
3392  ranges.push_back(TGiRange(gi_start, gi_end));
3393  }
3394  x_SortGiRanges(ranges);
3395  }
3396  Put(seq, row_id);
3397  return ranges;
3398 }
3399 
3400 
3402 {
3403  TGiRanges ranges;
3404  return ranges;
3405 }
3406 
3407 
3409  : m_IdLength(0)
3410 {
      // Split `acc` into a leading run of letters and a trailing decimal number.
      // On success the number is stored in `id`, the letters in m_AccPrefix and
      // the total accession length in m_IdLength. On failure the constructor
      // returns early: m_IdLength stays 0 and `id` is not assigned.
      // NOTE(review): the signature line and the statement numbered 3429 are
      // missing from this listing.
3411  SIZE_TYPE prefix = 0;
3412  while ( prefix < acc.size() && isalpha(acc[prefix]&0xff) ) {
3413  ++prefix;
3414  }
3415  if ( prefix == acc.size() || prefix == 0 || acc.size()-prefix > 9 ) {
3416  // no digits, or no prefix, or too many digits
3417  return;
3418  }
      // at most 9 digits are accepted above, so the value fits into Uint4
3419  Uint4 v = 0;
3420  for ( SIZE_TYPE i = prefix; i < acc.size(); ++i ) {
3421  char c = acc[i];
3422  if ( c < '0' || c > '9' ) {
3423  return;
3424  }
3425  v = v*10 + (c-'0');
3426  }
3427  id = v;
3428  m_AccPrefix = acc.substr(0, prefix);
3430  m_IdLength = Uint4(acc.size());
3431 }
3432 
3433 
3435 {
3436  string acc = m_AccPrefix;
3437  acc.resize(m_IdLength, '0');
3438  for ( SIZE_TYPE i = m_IdLength; id; id /= 10 ) {
3439  acc[--i] += id % 10;
3440  }
3441  return acc;
3442 }
3443 
3444 
3446 {
3447  TProtAccRanges ranges;
3448  if ( CRef<SProt0TableCursor> seq = Prot0() ) {
3449  TVDBRowId row_id = 0;
3450  TVDBRowIdRange row_range = seq->m_GB_ACCESSION.GetRowIdRange(seq->m_Cursor);
3451  for ( TVDBRowCount i = 0; i < row_range.second; ++i ) {
3452  row_id = row_range.first+i;
3453  CTempString acc = *seq->GB_ACCESSION(row_id);
3454  if ( acc.empty() ) {
3455  continue;
3456  }
3457  Uint4 id;
3458  SProtAccInfo info(acc, id);
3459  if ( !info ) {
3460  continue;
3461  }
3463  if ( it == ranges.end() || it->first != info ) {
3464  TIdRange range(id, id+1);
3466  }
3467  else {
3468  if ( id < it->second.GetFrom() ) {
3469  it->second.SetFrom(id);
3470  }
3471  else if ( id >= it->second.GetToOpen() ) {
3472  it->second.SetTo(id);
3473  }
3474  }
3475  }
3476  Put(seq, row_id);
3477  }
3478  return ranges;
3479 }
3480 
3481 
3482 pair<TVDBRowId, bool> CWGSDb_Impl::GetGiRowId(TGi gi)
3483 {
3484  pair<TVDBRowId, bool> ret;
3485  TIntId row_id = GI_TO(TIntId, gi);
3486  if ( CRef<SGiIdxTableCursor> idx = GiIdx(row_id) ) {
3487  if ( idx->m_NUC_ROW_ID ) {
3489  idx->NUC_ROW_ID(row_id, CVDBValue::eMissing_Allow);
3490  if ( !value.empty() ) {
3491  ret.first = *value;
3492  }
3493  }
3494  if ( !ret.first && idx->m_PROT_ROW_ID ) {
3496  idx->PROT_ROW_ID(row_id, CVDBValue::eMissing_Allow);
3497  if ( !value.empty() ) {
3498  ret.first = *value;
3499  }
3500  }
3501  Put(idx, row_id);
3502  }
3503  return ret;
3504 }
3505 
3506 
3508 {
3509  TVDBRowId ret = 0;
3510  TIntId row_id = GI_TO(TIntId, gi);
3511  if ( CRef<SGiIdxTableCursor> idx = GiIdx(row_id) ) {
3512  if ( idx->m_NUC_ROW_ID ) {
3514  idx->NUC_ROW_ID(row_id, CVDBValue::eMissing_Allow);
3515  if ( !value.empty() ) {
3516  ret = *value;
3517  }
3518  }
3519  Put(idx, row_id);
3520  }
3521  return ret;
3522 }
3523 
3524 
3526 {
3527  TVDBRowId ret = 0;
3528  TIntId row_id = GI_TO(TIntId, gi);
3529  if ( CRef<SGiIdxTableCursor> idx = GiIdx(row_id) ) {
3530  if ( idx->m_PROT_ROW_ID ) {
3532  idx->PROT_ROW_ID(row_id, CVDBValue::eMissing_Allow);
3533  if ( !value.empty() ) {
3534  ret = *value;
3535  }
3536  }
3537  Put(idx, row_id);
3538  }
3539  return ret;
3540 }
3541 
3542 
// Find `name` in `index` and return the first matching row id.
// Returns 0 when the index is not available or nothing is found.
// When `upcase` is set and `name` is not already upper case, the lookup goes
// through a copy of the name instead of the original string.
// NOTE(review): the statement numbered 3554, which declares `range` in the
// first branch, is missing from this listing.
3543 inline
3544 TVDBRowId CWGSDb_Impl::Lookup(const string& name,
3545  const CVDBTableIndex& index,
3546  bool upcase)
3547 {
3548  if ( !index ) {
3549  return 0;
3550  }
3551  if ( upcase && !NStr::IsUpper(name) ) {
3552  // upcase
3553  string tmp = name;
3555  return range.second? range.first: 0;
3556  }
3557  else {
3558  TVDBRowIdRange range = index.Find(name);
      // range.second is the number of rows found
3559  return range.second? range.first: 0;
3560  }
3561 }
3562 
3563 
3565 {
3566  if ( 1 ) {
3567  CRef<SSeqTableCursor> seq = Seq();
3568  if ( seq->m_CONTIG_NAME_ROW_RANGE ) {
3569  seq->m_Cursor.SetParam("CONTIG_NAME_QUERY", name);
3572  seq->CONTIG_NAME_ROW_RANGE(0, CVDBValue::eMissing_Allow);
3573  if ( !value.empty() ) {
3574  range = *value;
3575  }
3576  Put(seq);
3577  return range.first;
3578  }
3579  Put(seq);
3580  }
3581  const CVDBTableIndex& index = ContigNameIndex();
3582  return Lookup(name, index, m_ContigNameIndexIsOpened.load(memory_order_relaxed) == 2);
3583 }
3584 
3585 
3587 {
3588  const CVDBTableIndex& index = ScaffoldNameIndex();
3589  return Lookup(name, index, m_ScaffoldNameIndexIsOpened.load(memory_order_relaxed) == 2);
3590 }
3591 
3592 
3594 {
3595  const CVDBTableIndex& index = ProteinNameIndex();
3596  return Lookup(name, index, m_ProteinNameIndexIsOpened.load(memory_order_relaxed) == 2);
3597 }
3598 
3599 
3601 {
3602  const CVDBTableIndex& index = ProductNameIndex();
3603  return Lookup(name, index, m_ProductNameIndexIsOpened.load(memory_order_relaxed) == 2);
3604 }
3605 
3606 
// Look up the protein row id for accession `acc` through the protein index table.
// ask_version > 0 requests that exact version; ask_version == -1 requests the
// last row listed for the accession. Returns 0 when the index table is not
// available, the accession is not found, or the requested version does not match.
// NOTE(review): the statements numbered 3612 and 3622 (which declare `range`
// and `value`) are missing from this listing.
3607 TVDBRowId CWGSDb_Impl::GetProtAccRowId(const string& acc, int ask_version)
3608 {
3609  TVDBRowId prot_row_id = 0;
3610  if ( CRef<SProtIdxTableCursor> idx = ProtIdx() ) {
3611  CVDBMgr::CRequestContextUpdater ctx_updater;
3613  string tmp = acc;
      // the query parameter name depends on the name of the column bound to m_ROW_ID
3614  const char* query_param_name;
3615  if ( NStr::Equal(idx->m_ROW_ID.GetName(), "ROW_ID") ) {
3616  query_param_name = "NAME_QUERY";
3617  }
3618  else {
3619  query_param_name = "ACCESSION_QUERY";
3620  }
      // the query is made with the upper-cased accession
3621  idx->m_Cursor.SetParam(query_param_name, NStr::ToUpper(tmp));
3623  idx->NAME_ROW_RANGE(0, CVDBValue::eMissing_Allow);
3624  if ( !value.empty() ) {
3625  range = *value;
3626  }
3627  if ( range.first && range.first <= range.second ) {
3628  CVDBValueFor<TVDBRowId> prot_rows = idx->ROW_ID(range.first);
3629  if ( !prot_rows.empty() ) {
3630  if ( ask_version > 0 ) {
3631  // check if version exists
      // a single listed row is tried for any version; otherwise the
      // row at position ask_version-1 is used
3632  size_t version_index = size_t(prot_rows.size() == 1? 0: ask_version-1);
3633  if ( version_index < prot_rows.size() ) {
3634  // check if version matches
3635  prot_row_id = prot_rows[version_index];
3636  if ( prot_row_id ) {
3637  CRef<SProt0TableCursor> prot = Prot0(prot_row_id);
3638  int actual_version = *prot->ACC_VERSION(prot_row_id);
3639  Put(prot, prot_row_id);
3640  if ( actual_version != ask_version ) {
3641  // version mismatch
3642  prot_row_id = 0;
3643  }
3644  }
3645  }
3646  }
3647  else if ( ask_version == -1 ) {
3648  // last version
3649  prot_row_id = prot_rows[prot_rows.size()-1];
3650  }
3651  }
3652  }
3653  Put(idx);
3654  }
3655  return prot_row_id;
3656 }
3657 
3658 
// Report whether the sequence table may contain GIs.
// Only the first row of the GI column is examined: the result is true when
// that row holds exactly one value and the value is non-zero.
// NOTE(review): the signature line is missing from this listing; presumably
// this is CanHaveGis(), which is called elsewhere in this file - confirm.
3660 {
3661  bool can_have_gis = false;
3662  auto cur = Seq();
3663  if (cur->m_GI) {
3664  auto gi_range = cur->m_Cursor.GetRowIdRange(cur->m_GI.GetIndex());
3665  if ( gi_range.second ) {
      // a missing value is allowed and then fails the size check below
3666  auto value = cur->GI(gi_range.first, CVDBValue::eMissing_Allow);
3667  if (value.size() == 1 && *value != 0) {
3668  can_have_gis = true;
3669  }
3670  }
3671  }
3672  Put(cur);
3673  return can_have_gis;
3674 }
3675 
3676 
// Return the number of rows in the feature table cursor, or 0 when Feat()
// yields no cursor.
// NOTE(review): the signature line is missing from this listing; presumably
// this is GetTotalFeatureCount(), which is called elsewhere in this file - confirm.
3678 {
3679  TVDBRowCount feature_count = 0;
3680  if ( auto cur = Feat() ) {
      // second member of the row-id range is the row count
3681  feature_count = cur->m_Cursor.GetRowIdRange().second;
3682  Put(cur);
3683  }
3684  return feature_count;
3685 }
3686 
3687 
3689 {
3690  return GetTotalFeatureCount() > 0;
3691 }
3692 
3693 
// Decide whether this project qualifies for the shortcut taken by
// DetermineFeatLocIdType(). Returns false when the WGS path contains any of
// the characters \ / . : or when the project is replaced; a third check is
// only partly visible here.
// NOTE(review): the signature line and the condition numbered 3704 are
// missing from this listing.
3695 {
3696  if (GetWGSPath().find_first_of("\\/.:") != NPOS) {
3697  // non-standard path
3698  return false;
3699  }
3700  if (IsReplaced()) {
3701  // old or replaced WGS project
3702  return false;
3703  }
3705  // disabled WGS project
3706  return false;
3707  }
3708  return true;
3709 }
3710 
3711 
// Work out which kind of Seq-id feature locations use.
// The default answer is eFeatLocIdGi. For projects accepted by
// HasStandardFeatLocIdType() the answer is eFeatLocIdAccVer when CanHaveGis()
// is false. Otherwise one feature is decoded and a text seq-id taken from it
// selects eFeatLocIdAccVer (version set) or eFeatLocIdAccNoVer (no version).
// Any exception leaves the default answer in place.
// NOTE(review): the signature line and the statement numbered 3753, which
// obtains `seq_id`, are missing from this listing.
3713 {
3714  // assume no feature id correction
3715  EFeatLocIdType loc_id_type = eFeatLocIdGi;
3716  if (HasStandardFeatLocIdType()) {
3717  // shortcut for regular VDB files
3718  if (!CanHaveGis()) {
3719  loc_id_type = eFeatLocIdAccVer;
3720  }
3721  return loc_id_type;
3722  }
3723  try {
3724  if ( CRef<SFeatTableCursor> cur = Feat() ) {
      // feature row 1 is used unless a sequence row supplies FEAT_ROW_START
3725  TVDBRowId feat_row_id = 1;
3726  try {
3727  PROFILE(sw_GetFeatLocIdTypeRange);
3728  CRef<SSeqTableCursor> seq = Seq();
3729  auto row_range = seq->m_Cursor.GetRowIdRange(seq->m_FEAT_ROW_START.GetIndex());
3730  for ( TVDBRowCount i = 0; i < row_range.second; ++i ) {
3731  auto seq_row_id = row_range.first+i;
3732  auto row_start = seq->FEAT_ROW_START(seq_row_id);
      // take the first sequence row that has a value
3733  if ( !row_start.empty() ) {
3734  feat_row_id = *row_start;
3735  break;
3736  }
3737  }
3738  Put(seq);
3739  }
3740  catch ( exception& /*ignored*/ ) {
3741  // use first feature in the file
3742  }
3743  PROFILE(sw_GetFeatLocIdTypeFeat);
3744  CRef<CSeq_feat> feat(new CSeq_feat);
3745  CTempString bytes;
3746  {{
3747  PROFILE(sw_GetFeatLocIdTypeFeatBytes);
3748  bytes = *cur->SEQ_FEAT(feat_row_id);
3749  }}
      // decode the serialized feature with the cursor's object stream
3750  cur.GetNCObject().m_ObjStr.OpenFromBuffer(bytes.data(), bytes.size());
3751  cur.GetNCObject().m_ObjStr >> *feat;
3752  Put(cur);
      // ids that are not text seq-ids keep the default answer
3754  if ( const CTextseq_id* id = !seq_id? 0: seq_id->GetTextseq_Id() ) {
3755  if ( id->IsSetVersion() ) {
3756  loc_id_type = eFeatLocIdAccVer;
3757  }
3758  else {
3759  loc_id_type = eFeatLocIdAccNoVer;
3760  }
3761  }
3762  }
3763  }
3764  catch ( exception& /*ignored*/ ) {
3765  // assume no feature id correction
3766  }
3767  return loc_id_type;
3768 }
3769 
3770 
// Return the feature location id type, computing it with
// DetermineFeatLocIdType() on first use and caching it in m_FeatLocIdType.
// NOTE(review): the signature line is missing from this listing. The cache is
// read and written with relaxed atomics and no lock, so concurrent first
// callers may each run DetermineFeatLocIdType() - confirm this is intended.
3772 {
3773  auto loc_id_type = m_FeatLocIdType.load(memory_order_relaxed);
3774  if ( loc_id_type == eFeatLocIdUninitialized ) {
3775  // determine and cache for the future
3776  loc_id_type = DetermineFeatLocIdType();
3777  m_FeatLocIdType.store(loc_id_type, memory_order_relaxed);
3778  }
3779  return loc_id_type;
3780 }
3781 
3782 
3783 /////////////////////////////////////////////////////////////////////////////
3784 // CWGSSeqIterator
3785 /////////////////////////////////////////////////////////////////////////////
3786 
3787 
// Advance this object until it is exhausted or its current element ends
// after `pos`.
// NOTE(review): the signature line is missing from this listing.
3789 {
3790  // skip gaps that end at or before the requested position
3791  while ( *this && GetToOpen() <= pos ) {
3792  ++*this;
3793  }
3794 }
3795 
3796 
3798  eFromFlags
3799 };
3802 };
3803 
3804 
3806 {
3807  explicit
3809  : db(db),
3811  split_prod(false),
3812  split_data(false),
3813  split_feat(false),
3814  split_qual(false),
3815  split_version(kAssignedDefaultSplitVersion)
3816  {
3817  }
3818 
3820  : SWGSCreateInfo(db)
3821  {
3822  if ( flags != fDefaultFlags ) {
3823  x_SetFlags(flags);
3824  }
3825  }
3826 
3828  : SWGSCreateInfo(db)
3829  {
3830  if ( split_version != kDefaultSplitVersion ) {
3831  x_SetSplitVersion(split_version);
3832  }
3833  }
3834 
3835  // set flags and corresponding split_version
3836  void x_SetFlags(TFlags flags);
3837 
3838  // set split_version and corresponding flags
3839  void x_SetSplitVersion(TSplitVersion split_version);
3840 
3842  TFlags flags;
3843  bool split_prod, split_data, split_feat, split_qual;
3852 
3853  template<class Iter>
3854  void x_SetId(Iter& it)
3855  {
3856  main_id = it.GetId(flags);
3857  feat_id = main_id;
3858  // fix feature ids
3859  // it can be accession.version and accession
3860  if ( feat_id->IsGi() ) {
3861  EFeatLocIdType feat_loc_id_type = db->GetFeatLocIdType();
3862  if ( feat_loc_id_type != eFeatLocIdGi ) {
3863  feat_id = it.GetId(flags & ~fIds_gi);
3864  }
3865  }
3866  }
3867  void x_ResetId()
3868  {
3869  main_id = null;
3870  feat_id = null;
3871  }
3872  template<class Iter>
3873  void x_SetSeq(Iter& it)
3874  {
3875  main_seq = new CBioseq();
3876  x_SetId(it);
3877  }
3879  {
3880  main_seq = new CBioseq();
3881  x_SetId(it);
3882  }
3883  void x_ResetSeq()
3884  {
3885  main_seq = null;
3886  x_ResetId();
3887  }
3888 
3889  void x_AddDescr(CTempString bytes);
3890  void x_AddFeature(const CWGSFeatureIterator& it,
3892  void x_AddFeaturesDirect(TVDBRowIdRange range,
3893  vector<TVDBRowId>& product_row_ids);
3894  void x_AddFeaturesSplit(TVDBRowIdRange range,
3895  vector<TVDBRowId>& product_row_ids);
3896  void x_AddFeatures(TVDBRowIdRange range,
3897  vector<TVDBRowId>& product_row_ids);
3898  void x_AddFeatures(TVDBRowIdRange range);
3899  CBioseq_set& x_GetProtSet(void);
3900  void x_CreateProtSet(TVDBRowIdRange range);
3901  void x_AddProducts(const vector<TVDBRowId>& product_row_ids);
3902 };
3903 
3904 
3906 {
3907  if ( *this ) {
3908  auto state = GetGBState();
3909  // skip artificial entries with 'missing' state
3911  return true;
3912  }
3913  // skip not included entries
3914  if ( !(m_IncludeFlags & TIncludeFlags(1 << state)) ) {
3915  return true;
3916  }
3917  }
3918  return false;
3919 }
3920 
3921 
// Release the cursors and return the object to its empty state.
// With a database attached, the cursors are handed back to it through Put();
// without one, the cursor references are simply dropped. Afterwards the
// database reference is cleared, the row ids are zeroed and m_AccVersion is
// set to eLatest.
// NOTE(review): the signature line is missing from this listing.
3923 {
3924  if ( m_Cur0 ) {
3925  if ( m_Db ) {
3926  GetDb().Put(m_Cur0, m_CurrId);
3927  if ( m_Cur ) {
3928  GetDb().Put(m_Cur, m_CurrId);
3929  }
3930  }
3931  else {
3932  m_Cur.Reset();
3933  m_Cur0.Reset();
3934  }
3935  }
3936  m_Db.Reset();
3937  m_CurrId = m_FirstGoodId = m_FirstBadId = 0;
3938  m_AccVersion = eLatest;
3939 }
3940 
3941 
3943  : m_AccVersion(eLatest)
3944 {
3945  *this = iter;
3946 }
3947 
3948 
3950 {
3951  if ( this != &iter ) {
3952  Reset();
3953  m_Db = iter.m_Db;
3954  m_Cur0 = iter.m_Cur0;
3955  m_Cur = iter.m_Cur;
3956  m_CurrId = iter.m_CurrId;
3957  m_AccVersion = iter.m_AccVersion;
3959  m_FirstBadId = iter.m_FirstBadId;
3962  }
3963  return *this;
3964 }
3965 
3966 
3968  : m_CurrId(0),
3969  m_FirstGoodId(0),
3970  m_FirstBadId(0),
3971  m_AccVersion(eLatest),
3972  m_IncludeFlags(fIncludeDefault),
3973  m_ClipByQuality(true)
3974 {
3975 }
3976 
3977 
// Constructor overload taking EIncludeFlags and no row argument (first
// signature line missing from this listing); forwards to x_Select().
3979  EIncludeFlags include_flags,
3980  EClipType clip_type)
3981  : m_AccVersion(eLatest)
3982 {
3983  x_Select(wgs_db, include_flags, clip_type);
3984 }
3985 
3986 
// Constructor overload taking a single row id and EIncludeFlags (first
// signature line missing from this listing); forwards to x_Select().
3988  TVDBRowId row,
3989  EIncludeFlags include_flags,
3990  EClipType clip_type)
3991  : m_AccVersion(eLatest)
3992 {
3993  x_Select(wgs_db, include_flags, clip_type, row);
3994 }
3995 
3996 
// Constructor overload taking a row range [first_row, last_row] and
// EIncludeFlags (first signature line missing from this listing);
// forwards to x_Select().
3998  TVDBRowId first_row,
3999  TVDBRowId last_row,
4000  EIncludeFlags include_flags,
4001  EClipType clip_type)
4002  : m_AccVersion(eLatest)
4003 {
4004  x_Select(wgs_db, include_flags, clip_type, first_row, last_row);
4005 }
4006 
4007 
// Constructor overload taking an accession string and EIncludeFlags (first
// signature line missing from this listing); forwards to x_Select().
4009  CTempString acc,
4010  EIncludeFlags include_flags,
4011  EClipType clip_type)
4012  : m_AccVersion(eLatest)
4013 {
4014  x_Select(wgs_db, include_flags, clip_type, acc);
4015 }
4016 
4017 
// Constructor overload taking a TIncludeFlags mask and no row argument (first
// signature line missing from this listing); forwards to x_Select().
4019  TIncludeFlags include_flags,
4020  EClipType clip_type)
4021  : m_AccVersion(eLatest)
4022 {
4023  x_Select(wgs_db, include_flags, clip_type);
4024 }
4025 
4026 
// Constructor overload taking a single row id and a TIncludeFlags mask (first
// signature line missing from this listing); forwards to x_Select().
4028  TVDBRowId row,
4029  TIncludeFlags include_flags,
4030  EClipType clip_type)
4031  : m_AccVersion(eLatest)
4032 {
4033  x_Select(wgs_db, include_flags, clip_type, row);
4034 }
4035 
4036 
// Constructor overload taking a row range [first_row, last_row] and a
// TIncludeFlags mask (first signature line missing from this listing);
// forwards to x_Select().
4038  TVDBRowId first_row,
4039  TVDBRowId last_row,
4040  TIncludeFlags include_flags,
4041  EClipType clip_type)
4042  : m_AccVersion(eLatest)
4043 {
4044  x_Select(wgs_db, include_flags, clip_type, first_row, last_row);
4045 }
4046 
4047 
// Constructor overload taking an accession string and a TIncludeFlags mask
// (first signature line missing from this listing); forwards to x_Select().
4049  CTempString acc,
4050  TIncludeFlags include_flags,
4051  EClipType clip_type)
4052  : m_AccVersion(eLatest)
4053 {
4054  x_Select(wgs_db, include_flags, clip_type, acc);
4055 }
4056 
4057 
// Converts the EWithdrawn selector into a TIncludeFlags mask; used by the
// EWithdrawn-based constructors below.
// NOTE(review): both return statements (original lines 4062 and 4065) are
// missing from this listing, so the exact masks cannot be seen here.
4058 static inline
4059 CWGSSeqIterator::TIncludeFlags s_ToFlags(CWGSSeqIterator::EWithdrawn withdrawn)
4060 {
4061  if ( withdrawn == CWGSSeqIterator::eIncludeWithdrawn ) {
4063  }
4064  else {
4066  }
4067 }
4068 
4069 
// Constructor overload taking the EWithdrawn selector and no row argument
// (first signature line missing from this listing); the selector is converted
// with s_ToFlags() before forwarding to x_Select().
4071  EWithdrawn withdrawn,
4072  EClipType clip_type)
4073  : m_AccVersion(eLatest)
4074 {
4075  x_Select(wgs_db, s_ToFlags(withdrawn), clip_type);
4076 }
4077 
4078 
// Constructor overload taking a single row id and the EWithdrawn selector
// (first signature line missing from this listing); the selector is converted
// with s_ToFlags() before forwarding to x_Select().
4080  TVDBRowId row,
4081  EWithdrawn withdrawn,
4082  EClipType clip_type)
4083  : m_AccVersion(eLatest)
4084 {
4085  x_Select(wgs_db, s_ToFlags(withdrawn), clip_type, row);
4086 }
4087 
4088 
// Constructor overload taking a row range [first_row, last_row] and the
// EWithdrawn selector (first signature line missing from this listing);
// the selector is converted with s_ToFlags() before forwarding to x_Select().
4090  TVDBRowId first_row,
4091  TVDBRowId last_row,
4092  EWithdrawn withdrawn,
4093  EClipType clip_type)
4094  : m_AccVersion(eLatest)
4095 {
4096  x_Select(wgs_db, s_ToFlags(withdrawn), clip_type, first_row, last_row);
4097 }
4098 
4099 
// Constructor overload taking an accession string and the EWithdrawn selector
// (first signature line missing from this listing); the selector is converted
// with s_ToFlags() before forwarding to x_Select().
4101  CTempString acc,
4102  EWithdrawn withdrawn,
4103  EClipType clip_type)
4104  : m_AccVersion(eLatest)
4105 {
4106  x_Select(wgs_db, s_ToFlags(withdrawn), clip_type, acc);
4107 }
4108 
4109 
// Body that only calls Reset() (signature line missing from this listing;
// presumably the destructor -- confirm).
4111 {
4112  Reset();
4113 }
4114 
4115 
4116 
// Selection without a row argument (first signature line missing from this
// listing): initializes via x_Init() with row 0 and then calls x_Settle().
4118  TIncludeFlags include_flags,
4119  EClipType clip_type)
4120 {
4121  x_Init(wgs_db, include_flags, clip_type, 0);
4122  x_Settle();
4123 }
4124 
4125 
// Selection of a single row (first signature line missing from this listing):
// initializes via x_Init() and positions the iterator with SelectRow(row).
4127  TIncludeFlags include_flags,
4128  EClipType clip_type,
4129  TVDBRowId row)
4130 {
4131  CVDBMgr::CRequestContextUpdater ctx_updater;
4132  x_Init(wgs_db, include_flags, clip_type, row);
4133  SelectRow(row);
4134 }
4135 
4136 
// Selection of a row range (first signature line missing from this listing).
// After x_Init() the window [m_FirstGoodId, m_FirstBadId) is narrowed to
// [first_row, last_row] and x_Settle() is called.
4138  TIncludeFlags include_flags,
4139  EClipType clip_type,
4140  TVDBRowId first_row,
4141  TVDBRowId last_row)
4142 {
4143  CVDBMgr::CRequestContextUpdater ctx_updater;
4144  x_Init(wgs_db, include_flags, clip_type, first_row);
 // nothing to iterate over when the end-of-range id is still zero
4145  if ( m_FirstBadId == 0 ) {
4146  return;
4147  }
 // raise the lower bound of the window
4148  if ( first_row > m_FirstGoodId ) {
4149  m_CurrId = m_FirstGoodId = first_row;
 // NOTE(review): original line 4150 is missing from this listing.
4151  }
 // lower the upper bound; m_FirstBadId is one past the last usable row
4152  if ( last_row < m_FirstBadId-1 ) {
4153  m_FirstBadId = last_row+1;
4154  }
4155  x_Settle();
4156 }
4157 
4158 
// Selection by accession string (first signature line missing from this
// listing). The accession is converted to a row id with
// wgs_db.ParseContigRow(); a non-zero row is initialized and selected.
4160  TIncludeFlags include_flags,
4161  EClipType clip_type,
4162  CTempString acc)
4163 {
4164  CVDBMgr::CRequestContextUpdater ctx_updater;
4165  if ( TVDBRowId row = wgs_db.ParseContigRow(acc) ) {
4166  x_Init(wgs_db, include_flags, clip_type, row);
4167  SelectRow(row);
4168  }
4169  else {
4170  // bad format
 // NOTE(review): the statements of this branch (original lines 4171-4172)
 // are missing from this listing.
4173  }
4174 }
4175 
4176 
// Common initialization used by the x_Select() overloads.
//   wgs_db        - database to iterate; a null db leaves the iterator unattached
//   include_flags - stored into m_IncludeFlags
//   clip_type     - eNoClip / eClipByQuality set m_ClipByQuality explicitly
//   get_row       - row id passed to the Seq0()/Seq() cursor getters
// Returns early, without attaching m_Db, when wgs_db is null or no Seq cursor
// is obtained. Otherwise sets the row window from the CONTIG_NAME column.
// NOTE(review): original lines 4183-4184 and 4204 are missing from this listing.
4177 void CWGSSeqIterator::x_Init(const CWGSDb& wgs_db,
4178  TIncludeFlags include_flags,
4179  EClipType clip_type,
4180  TVDBRowId get_row)
4181 {
4182  PROFILE(sw_SeqIterator);
4185  m_ClipByQuality = false;
4186  if ( !wgs_db ) {
4187  return;
4188  }
4189  m_Cur0 = wgs_db.GetNCObject().Seq0(get_row);
4190  m_Cur = wgs_db.GetNCObject().Seq(get_row);
4191  if ( !m_Cur ) {
4192  return;
4193  }
4194  m_Db = wgs_db;
4195  m_IncludeFlags = include_flags;
4196  switch ( clip_type ) {
4197  case eNoClip:
4198  m_ClipByQuality = false;
4199  break;
4200  case eClipByQuality:
4201  m_ClipByQuality = true;
4202  break;
4203  default:
4205  break;
4206  }
 // range is (first row id, row count); m_FirstBadId is one past the last row
4207  TVDBRowIdRange range = m_Cur->m_CONTIG_NAME.GetRowIdRange(m_Cur->m_Cursor);
4208  m_FirstGoodId = m_CurrId = range.first;
4209  m_FirstBadId = range.first+range.second;
4210 }
4211 
4212 
// Positions the iterator on the given row and returns *this (signature line
// missing from this listing; presumably SelectRow() -- confirm).
// NOTE(review): original lines 4217, 4222 and 4225 are missing from this
// listing, so the handling of an out-of-range or excluded row cannot be seen.
4214 {
4215  if ( row < m_FirstGoodId ) {
4216  // before the first id
4218  }
4219  else {
4220  m_CurrId = row;
4221  if ( x_Excluded() ) {
4223  }
4224  }
4226  return *this;
4227 }
4228 
4229 
// Increment body (signature line missing from this listing; the validity
// check names it CWGSSeqIterator::operator++).
// Drops m_AmbiguityInfo, advances to the next row id and lets x_Settle() step
// over excluded rows.
4231 {
4232  x_CheckValid("CWGSSeqIterator::operator++");
4233  m_AmbiguityInfo = null;
4234  ++m_CurrId;
4235  x_Settle();
4236  return *this;
4237 }
4238 
4239 
// Advances m_CurrId while the iterator is valid and the current row is
// excluded (signature line missing from this listing; presumably x_Settle()).
// NOTE(review): original line 4244 inside the loop is missing from this listing.
4241 {
4242  while ( *this && x_Excluded() ) {
4243  ++m_CurrId;
4245  }
4246 }
4247 
4248 
4249 void CWGSSeqIterator::x_ReportInvalid(const char* method) const
4250 {
4251  NCBI_THROW_FMT(CSraException, eInvalidState,
4252  "CWGSSeqIterator::"<<method<<"(): Invalid iterator state");
4253 }
4254 
4255 
4256 bool CWGSSeqIterator::HasGi(void) const
4257 {
4258  return m_Cur->m_GI && GetGi() != ZERO_GI;
4259 }
4260 
4261 
// Returns the GI of the current row (signature line missing from this
// listing; the validity check names it CWGSSeqIterator::GetGi).
// ZERO_GI is returned when there is no GI column, when a non-latest version
// is selected (m_AccVersion.m_Offset != 0), or when the cell is empty.
4263 {
4264  x_CheckValid("CWGSSeqIterator::GetGi");
4265  if ( !m_Cur->m_GI || m_AccVersion.m_Offset != 0 ) {
4266  return ZERO_GI;
4267  }
 // NOTE(review): the declaration of `gi` (original line 4268) is missing
 // from this listing.
4269  return gi.empty()? ZERO_GI: s_ToGi(*gi, "CWGSSeqIterator::GetGi()");
4270 }
4271 
4272 
// Returns the ACCESSION column value of the current row as a string value
// (signature line missing from this listing).
4274 {
4275  x_CheckValid("CWGSSeqIterator::GetAccession");
4276  return *CVDBStringValue(m_Cur->ACCESSION(m_CurrId));
4277 }
4278 
4279 
// Returns the ACC_VERSION column value of the current row
// (signature line missing from this listing).
4281 {
4282  CVDBMgr::CRequestContextUpdater ctx_updater;
4283  x_CheckValid("CWGSSeqIterator::GetLatestAccVersion");
4284  return *m_Cur->ACC_VERSION(m_CurrId);
4285 }
4286 
4287 
// Returns the number of versions stored for the current row, taken as the
// element count of the TRIM_START cell (signature line missing from this
// listing).
// With TEST_ACC_VERSION defined, a row with a single stored entry whose
// latest version is above 1 reports 2 versions.
4289 {
4290  x_CheckValid("CWGSSeqIterator::GetAccVersionCount");
4291 #ifdef TEST_ACC_VERSION
4292  if ( GetLatestAccVersion() > 1 &&
4293  m_Cur->TRIM_START(m_CurrId).size() == 1 ) {
4294  return 2;
4295  }
4296 #endif
4297  return unsigned(m_Cur->TRIM_START(m_CurrId).size());
4298 }
4299 
4300 
// Tells whether the requested version is available for the current row
// (signature line missing from this listing; presumably HasAccVersion()).
// -1 stands for the latest version and is always available. Any other value
// must lie in (latest - count, latest].
4302 {
4303  CVDBMgr::CRequestContextUpdater ctx_updater;
4304  if ( version == -1 ) {
4305  // latest version
4306  return true;
4307  }
4308  int latest_version = GetLatestAccVersion();
4309  return version <= latest_version &&
4310  version > int(latest_version - GetAccVersionCount());
4311 }
4312 
4313 
// Builds the version selector for the requested version (signature lines
// missing from this listing; name not visible here).
// For version -1 `ret` is returned as initialized. Otherwise the version must
// lie within [oldest, latest]; an out-of-range value throws
// CSraException(eDataError). ret.m_Offset is set to version - latest, which is
// zero or negative.
// NOTE(review): the declaration of `ret` (original line 4317) is missing from
// this listing.
4316 {
4318  if ( version != -1 ) {
4319  int latest_version = GetLatestAccVersion();
4320  int oldest_version = latest_version - GetAccVersionCount() + 1;
4321  if ( version > latest_version || version < oldest_version ) {
4322  NCBI_THROW_FMT(CSraException, eDataError,
4323  "CWGSSeqIterator: "<<
4324  GetDb().m_IdPrefixWithVersion<<"/"<<m_CurrId<<
4325  " version "<<version<<
4326  " is out of VDB version range: "<<
4327  oldest_version<<"-"<<latest_version);
4328  }
4329  ret.m_Offset = version - latest_version;
4330  }
4331  return ret;
4332 }
4333 
4334 
// NOTE(review): signature (original line 4335) and the statement at original
// line 4338 are missing from this listing; only the request-context updater
// is visible.
4336 {
4337  CVDBMgr::CRequestContextUpdater ctx_updater;
4339 }
4340 
4341 
// NOTE(review): signature (original line 4342) and the statement at original
// line 4345 are missing from this listing; only the request-context updater
// is visible.
4343 {
4344  CVDBMgr::CRequestContextUpdater ctx_updater;
4346 }
4347 
4348 
// Returns a Seq-id holding the GI of the current row, or a null CRef when
// there is no GI column or the GI is ZERO_GI (signature line missing from
// this listing; presumably GetGiSeq_id()).
4350 {
4351  CRef<CSeq_id> id;
4352  if ( m_Cur->m_GI ) {
4353  CSeq_id::TGi gi = GetGi();
4354  if ( gi != ZERO_GI ) {
4355  id = new CSeq_id;
4356  id->SetGi(gi);
4357  }
4358  }
4359  return id;
4360 }
4361 
4362 
// NOTE(review): signature (original line 4363) and the only statement
// (original line 4365) are missing from this listing.
4364 {
4366 }
4367 
4368 
// Forwards to GetGeneralOrPatentSeq_id() (signature line missing from this
// listing; name not visible here).
4370 {
4371  return GetGeneralOrPatentSeq_id();
4372 }
4373 
4374 
// Returns the CONTIG_NAME column value of the current row
// (signature line missing from this listing).
4376 {
4377  x_CheckValid("CWGSSeqIterator::GetContigName");
4378  return *m_Cur->CONTIG_NAME(m_CurrId);
4379 }
4380 
// True when the TITLE column exists and its cell for the current row is not
// empty (signature line missing from this listing).
4382 {
4383  x_CheckValid("CWGSSeqIterator::HasTitle");
4384  return m_Cur->m_TITLE && !m_Cur->TITLE(m_CurrId).empty();
4385 }
4386 
// Returns the TITLE column value of the current row (signature line missing
// from this listing). Unlike HasTitle(), the presence of the column is not
// checked here.
4388 {
4389  x_CheckValid("CWGSSeqIterator::GetTitle");
4390  return *m_Cur->TITLE(m_CurrId);
4391 }
4392 
// Converts a tax-id cell into a tax id: ZERO_TAX_ID for an empty cell,
// otherwise the first element (signature line missing from this listing;
// presumably the s_GetTaxId() helper called by GetTaxId -- confirm).
4394 {
4395  return value.empty()? ZERO_TAX_ID: TAX_ID_FROM(int, value[0]);
4396 }
4397 
4398 
// True when the database has a common tax id or the m_Cur0 cursor has a TAXID
// column (signature line missing from this listing; presumably HasTaxId()).
4400 {
4401  return GetDb().HasCommonTaxId() || m_Cur0->m_TAXID;
4402 }
4403 
4404 
// Returns the tax id of the current row (signature line missing from this
// listing). The database-wide common tax id takes precedence over the
// per-row TAXID cell.
4406 {
4407  x_CheckValid("CWGSSeqIterator::GetTaxId");
4408  if ( GetDb().HasCommonTaxId() ) {
4409  return GetDb().GetCommonTaxId();
4410  }
4411  return s_GetTaxId(m_Cur0->TAXID(m_CurrId));
4412 }
4413 
4414 
// Returns the HASH column handle, i.e. whether a HASH column is present
// (signature line missing from this listing).
// NOTE(review): the validity-check label says "GetSeqHash", but this body
// only reports column presence and the next body calls HasSeqHash() before
// reading HASH; the label looks like a copy/paste slip -- confirm.
4416 {
4417  x_CheckValid("CWGSSeqIterator::GetSeqHash");
4418  return m_Cur->m_HASH;
4419 }
4420 
4421 
// Returns the HASH cell of the current row, or 0 when HasSeqHash() is false
// (signature line missing from this listing; presumably GetSeqHash()).
4423 {
4424  return HasSeqHash()? *m_Cur->HASH(m_CurrId): 0;
4425 }
4426 
4427 
// Returns the READ_LEN cell of the current row (signature line missing from
// this listing; presumably GetRawSeqLength()). No validity check is made here.
4429 {
4430  return *m_Cur->READ_LEN(m_CurrId);
4431 }
4432 
4433 
// Returns the entry of `arr` that belongs to the selected version (signature
// line missing from this listing; presumably GetClipQualityLeft()).
// The latest version is the last element; m_AccVersion.m_Offset (zero or
// negative) steps back from it.
// NOTE(review): the declaration of `arr` (original line 4436) is missing from
// this listing.
4435 {
4437 #ifdef TEST_ACC_VERSION
4438  if ( GetLatestAccVersion() > 1 && m_AccVersion.m_Offset != 0 && arr.size() == 1 ) {
4439  return *arr - 5*m_AccVersion.m_Offset;
4440  }
4441 #endif
4442  return arr[arr.size()-1+m_AccVersion.m_Offset];
4443 }
4444 
4445 
// Returns the entry of `arr` that belongs to the selected version (signature
// line missing from this listing; presumably GetClipQualityLength()).
// The latest version is the last element; m_AccVersion.m_Offset (zero or
// negative) steps back from it.
// NOTE(review): the declaration of `arr` (original line 4448) and the
// definition of `len` (original line 4451) are missing from this listing.
4447 {
4449 #ifdef TEST_ACC_VERSION
4450  if ( GetLatestAccVersion() > 1 && m_AccVersion.m_Offset != 0 && arr.size() == 1 ) {
4452  return len < *arr? len: 0;
4453  }
4454 #endif
4455  return arr[arr.size()-1+m_AccVersion.m_Offset];
4456 }
4457 
4458 
// True when quality clipping would actually trim the sequence: either the
// left clip position is non-zero or the clipped length differs from the raw
// length (signature line missing from this listing; name not visible here).
4460 {
4461  if ( GetClipQualityLeft() != 0 ) {
4462  return true;
4463  }
4464  if ( GetClipQualityLength() != GetRawSeqLength() ) {
4465  return true;
4466  }
4467  return false;
4468 }
4469 
4470 
// Returns the left clip position when clipping by quality applies for
// clip_type, otherwise 0 (signature line missing from this listing;
// presumably GetSeqOffset()).
4472 {
4473  return GetClipByQualityFlag(clip_type)?
4474  GetClipQualityLeft(): 0;
4475 }
4476 
4477 
// Chooses a value depending on whether clipping by quality applies for
// clip_type (signature line missing from this listing).
// NOTE(review): the two alternatives of the conditional (original line 4481)
// are missing from this listing.
4479 {
4480  return GetClipByQualityFlag(clip_type)?
4482 }
4483 
4484 
// Returns the single preferred Seq-id of the current row (signature line
// missing from this listing; the error text names it CWGSSeqIterator::GetId).
// Among the kinds enabled in `flags`, the first non-null id is returned in
// the order gi, acc.ver, general. Throws CSraException(eDataError) when none
// of the enabled kinds yields an id.
4486 {
4487  if ( flags & fIds_gi ) {
4488  // gi
4489  if ( CRef<CSeq_id> id = GetGiSeq_id() ) {
4490  return id;
4491  }
4492  }
4493 
4494  if ( flags & fIds_acc ) {
4495  // acc.ver
4496  if ( CRef<CSeq_id> id = GetAccSeq_id() ) {
4497  return id;
4498  }
4499  }
4500 
4501  if ( flags & fIds_gnl ) {
4502  // gnl
4503  if ( CRef<CSeq_id> id = GetGeneralOrPatentSeq_id() ) {
4504  return id;
4505  }
4506  }
4507 
4508  NCBI_THROW_FMT(CSraException, eDataError,
4509  "CWGSSeqIterator::GetId("<<flags<<"): "
4510  "no valid id found: "<<
4511  GetDb().m_IdPrefixWithVersion<<"/"<<m_CurrId);
4512 }
4513 
4514 
// Appends every available Seq-id of the current row to `ids` (signature line
// missing from this listing; presumably GetIds()).
// Only the kinds enabled in `flags` are considered and null ids are skipped.
// The append order is acc.ver, general, gi, which differs from the
// preference order used when a single id is chosen.
4516 {
4517  CVDBMgr::CRequestContextUpdater ctx_updater;
4518  PROFILE(sw___GetContigIds);
4519  if ( flags & fIds_acc ) {
4520  // acc.ver
4521  if ( CRef<CSeq_id> id = GetAccSeq_id() ) {
4522  ids.push_back(id);
4523  }
4524  }
4525 
4526  if ( flags & fIds_gnl ) {
4527  // gnl
4528  if ( CRef<CSeq_id> id = GetGeneralOrPatentSeq_id() ) {
4529  ids.push_back(id);
4530  }
4531  }
4532 
4533  if ( flags & fIds_gi ) {
4534  // gi
4535  if ( CRef<CSeq_id> id = GetGiSeq_id() ) {
4536  ids.push_back(id);
4537  }
4538  }
4539 }
4540 
4541 
// True when the DESCR column exists and its cell for the current row is not
// empty (signature line missing from this listing).
4543 {
4544  x_CheckValid("CWGSSeqIterator::HasSeqDescrBytes");
4545  return m_Cur->m_DESCR && !m_Cur->DESCR(m_CurrId).empty();
4546 }
4547 
4548 
// Returns the DESCR cell of the current row, or an empty CTempString when the
// column is absent (signature line missing from this listing).
4550 {
4551  x_CheckValid("CWGSSeqIterator::GetSeqDescrBytes");
4552  CTempString descr_bytes;
4553  if ( m_Cur->m_DESCR ) {
4554  descr_bytes = m_Cur->DESCR(m_CurrId);
4555  }
4556  return descr_bytes;
4557 }
4558 
4559 
// True when the NUC_PROT_DESCR column exists and its cell for the current row
// is not empty (signature line missing from this listing).
4561 {
4562  x_CheckValid("CWGSSeqIterator::HasNucProtDescrBytes");
4563  return m_Cur->m_NUC_PROT_DESCR && !m_Cur->NUC_PROT_DESCR(m_CurrId).empty();
4564 }
4565 
4566 
// Returns the NUC_PROT_DESCR cell of the current row, or an empty CTempString
// when the column is absent (signature line missing from this listing).
4568 {
4569  x_CheckValid("CWGSSeqIterator::GetNucProtDescrBytes");
4570  CTempString descr_bytes;
4571  if ( m_Cur->m_NUC_PROT_DESCR ) {
4572  descr_bytes = m_Cur->NUC_PROT_DESCR(m_CurrId);
4573  }
4574  return descr_bytes;
4575 }
4576 
4577 
// True when at least one descriptor source enabled in `flags` has data
// (signature line missing from this listing). The sources are the row's
// DESCR bytes (fSeqDescr), its NUC_PROT_DESCR bytes (fNucProtDescr) and the
// database master descriptors (fMasterDescr).
4579 {
4580  x_CheckValid("CWGSSeqIterator::HasSeq_descr");
4581  if ( flags & fSeqDescr ) {
4582  if ( HasSeqDescrBytes() ) {
4583  return true;
4584  }
4585  }
4586  if ( flags & fNucProtDescr ) {
4587  if ( HasNucProtDescrBytes() ) {
4588  return true;
4589  }
4590  }
4591  if ( flags & fMasterDescr ) {
4592  if ( !GetDb().GetMasterDescr().empty() ) {
4593  return true;
4594  }
4595  }
4596  return false;
4597 }
4598 
4599 
// Builds a CSeq_descr from the sources enabled in `flags`: row DESCR bytes,
// row NUC_PROT_DESCR bytes and master descriptors, in that order (signature
// line missing from this listing).
// Returns a null CRef when no descriptor was collected.
4601 {
4602  x_CheckValid("CWGSSeqIterator::GetSeq_descr");
4603  CRef<CSeq_descr> ret(new CSeq_descr);
4604  if ( (flags & fSeqDescr) && m_Cur->m_DESCR ) {
4605  sx_AddDescrBytes(*ret, *m_Cur->DESCR(m_CurrId));
4606  }
4607  if ( (flags & fNucProtDescr) && m_Cur->m_NUC_PROT_DESCR ) {
4608  sx_AddDescrBytes(*ret, *m_Cur->NUC_PROT_DESCR(m_CurrId));
4609  }
4610  if ( flags & fMasterDescr ) {
4611  GetDb().AddMasterDescr(*ret, nullptr, flags);
4612  }
 // an empty descriptor set is reported as a null reference
4613  if ( ret->Get().empty() ) {
4614  ret.Reset();
4615  }
4616  return ret;
4617 }
4618 
4619 
// Returns the feature rows attached to the current row as (first row id,
// row count) (signature line missing from this listing).
// (0, 0) is returned when the FEAT_ROW_START column is absent or its cell is
// empty. Throws CSraException(eDataError) when the stored end precedes the
// start.
4621 {
4622  x_CheckValid("CWGSSeqIterator::GetLocFeatRowIdRange");
4623 
4624  if ( !m_Cur->m_FEAT_ROW_START ) {
4625  return TVDBRowIdRange(0, 0);
4626  }
4627  CVDBValueFor<TVDBRowId> start_val = m_Cur->FEAT_ROW_START(m_CurrId);
4628  if ( start_val.empty() ) {
4629  return TVDBRowIdRange(0, 0);
4630  }
4631  TVDBRowId start = *start_val;
4632  TVDBRowId end = *m_Cur->FEAT_ROW_END(m_CurrId);
4633  if ( end < start ) {
4634  NCBI_THROW_FMT(CSraException, eDataError,
4635  "CWGSSeqIterator::GetLocFeatRowIdRange: "
4636  "feature row range is invalid: "<<start<<","<<end);
4637  }
 // the stored end is inclusive, hence the +1 when converting to a count
4638  return TVDBRowIdRange(start, end-start+1);
4639 }
4640 
4641 
// True when the ANNOT column exists and its cell for the current row is not
// empty (signature line missing from this listing).
4643 {
4644  x_CheckValid("CWGSSeqIterator::HasAnnotSet");
4645  return m_Cur->m_ANNOT && !m_Cur->ANNOT(m_CurrId).empty();
4646 }
4647 
4648 
// Returns the ANNOT cell of the current row (signature line missing from this
// listing). The presence of the column is not checked here.
4650 {
4651  x_CheckValid("CWGSSeqIterator::GetAnnotBytes");
4652  return *m_Cur->ANNOT(m_CurrId);
4653 }
4654 
4655 
4656 void CWGSSeqIterator::GetAnnotSet(TAnnotSet& annot_set, TFlags flags) const
4657 {
4658  x_CheckValid("CWGSSeqIterator::GetAnnotSet");
4659  if ( (flags & fSeqAnnot) && m_Cur->m_ANNOT ) {
4660  sx_AddAnnotBytes(annot_set, *m_Cur->ANNOT(m_CurrId));
4661  }
4662 }
4663 
4664 
// Returns the QUALITY column handle, i.e. whether a QUALITY column is present
// (signature line missing from this listing).
4666 {
4667  x_CheckValid("CWGSSeqIterator::CanHaveQualityGraph");
4668  return m_Cur->m_QUALITY;
4669 }
4670 
4671 
// True when the QUALITY column exists and its cell for the current row is not
// empty (signature line missing from this listing).
4673 {
4674  x_CheckValid("CWGSSeqIterator::HasQualityGraph");
4675  return m_Cur->m_QUALITY && !m_Cur->QUALITY(m_CurrId).empty();
4676 }
4677 
4678 
// Returns the element count of the QUALITY cell of the current row (signature
// line missing from this listing; presumably x_GetQualityArraySize()).
// NOTE(review): the literal 8 passed to GetElementCount() is presumably the
// element size in bits -- confirm.
4679 inline
4681 {
4682  PROFILE(sw____GetContigQualSize);
4683  return m_Cur->m_Cursor.GetElementCount(m_CurrId, m_Cur->m_QUALITY, 8);
4684 }
4685 
4686 
// Fills quality_vec with the quality values of the current row, starting at
// GetSeqOffset(). The vector is cleared when the range is empty.
// NOTE(review): the validity-check label says "GetQualityArray" although the
// method is GetQualityVec.
// NOTE(review): the definition of `end` (original line 4693) is missing from
// this listing.
4687 void
4688 CWGSSeqIterator::GetQualityVec(vector<INSDC_quality_phred>& quality_vec) const
4689 {
4690  x_CheckValid("CWGSSeqIterator::GetQualityArray");
4691 
4692  TSeqPos pos = GetSeqOffset();
4694  if ( end <= pos ) {
4695  quality_vec.clear();
4696  return;
4697  }
4698  TSeqPos size = end-pos;
 // capacity is rounded up to a multiple of 8 elements, the size is exact
4699  quality_vec.reserve((size+7)/8*8);
4700  quality_vec.resize(size);
4701  m_Cur->m_Cursor.ReadElements(m_CurrId, m_Cur->m_QUALITY, 8, pos, size,
4702  quality_vec.data());
4703 }
4704 
4705 
// Returns the fixed annotation name "Phrap Graph" (signature line missing
// from this listing; presumably GetQualityAnnotName()).
4707 {
4708  return "Phrap Graph";
4709 }
4710 
4711 
// Finds the smallest and the largest byte of arr[0..size) in one pass.
//   arr, size - input bytes
//   min_v     - receives the minimum (0xff for empty input)
//   max_v     - receives the maximum (0 for empty input)
// Four independent accumulator lanes are kept for the bulk of the data and
// merged at the end; leftover bytes are folded into lane 0.
static inline void s_GetMinMax(const Uint1* arr, size_t size,
                               Uint1& min_v, Uint1& max_v)
{
    Uint1 lo[4] = { 0xff, 0xff, 0xff, 0xff };
    Uint1 hi[4] = { 0, 0, 0, 0 };
    const Uint1* const stop = arr + size;
    const Uint1* const stop4 = arr + (size & ~size_t(3));
    const Uint1* ptr = arr;
    // whole groups of four bytes, one byte per lane
    while ( ptr != stop4 ) {
        for ( int lane = 0; lane < 4; ++lane ) {
            const Uint1 cur = ptr[lane];
            lo[lane] = min(lo[lane], cur);
            hi[lane] = max(hi[lane], cur);
        }
        ptr += 4;
    }
    // up to three trailing bytes
    while ( ptr != stop ) {
        const Uint1 cur = *ptr++;
        lo[0] = min(lo[0], cur);
        hi[0] = max(hi[0], cur);
    }
    min_v = min(min(lo[0], lo[2]), min(lo[1], lo[3]));
    max_v = max(max(hi[0], hi[2]), max(hi[1], hi[3]));
}
4745 
4746 
// Public entry that prepares `info`, sets its id from this iterator and
// delegates to x_GetQualityAnnot() (first signature line missing from this
// listing; presumably GetQualityAnnot()).
// NOTE(review): the declaration of `info` (original line 4750) is missing
// from this listing.
4748  TFlags flags) const
4749 {
4751  info.x_SetId(*this);
4752  x_GetQualityAnnot(annot_set, info);
4753 }
4754 
4755 
// Builds a byte-graph annotation with the quality values of [pos, pos+len)
// and appends it to annot_set (first signature lines missing from this
// listing; presumably x_GetQualityAnnot()).
// Nothing is appended when fQualityGraph is not set in info.flags, when the
// QUALITY column is absent, when the clamped range is empty, or when every
// quality value is zero.
4758  TSeqPos pos,
4759  TSeqPos len) const
4760 {
4761  x_CheckValid("CWGSSeqIterator::GetQualityAnnot");
4762  if ( !(info.flags & fQualityGraph) || !m_Cur->m_QUALITY ) {
4763  return;
4764  }
4765 
4766  PROFILE(sw___GetContigQual);
 // kInvalidSeqPos as len means "up to the end"; then clamp the range to
 // [GetSeqOffset(), quality array size)
4767  TSeqPos end = len == kInvalidSeqPos? kInvalidSeqPos: pos + len;
4768  pos = max(pos, GetSeqOffset());
4769  end = min(end, x_GetQualityArraySize());
4770  if ( end <= pos ) {
4771  return;
4772  }
4773  TSeqPos size = end-pos;
4774  CByte_graph::TValues values;
4775  {
4776  PROFILE(sw____GetContigQualData);
 // capacity is rounded up to a multiple of 8 elements, the size is exact
4777  values.reserve((size+7)/8*8);
4778  values.resize(size);
4779  m_Cur->m_Cursor.ReadElements(m_CurrId, m_Cur->m_QUALITY, 8, pos, size,
4780  values.data());
4781  }
4782 
4783  Uint1 min_q = 0, max_q = 0;
4784  {
4785  PROFILE(sw____GetContigQualMinMax);
4786  s_GetMinMax((const Uint1*)values.data(), values.size(), min_q, max_q);
4787  }
 // an all-zero quality array produces no annotation
4788  if ( max_q == 0 ) {
4789  return;
4790  }
4791 
4792  CRef<CSeq_annot> annot(new CSeq_annot);
4793  CRef<CAnnotdesc> name(new CAnnotdesc);
4794  name->SetName(GetQualityAnnotName());
4795  annot->SetDesc().Set().push_back(name);
4796  CRef<CSeq_graph> graph(new CSeq_graph);
4797  graph->SetTitle("Phrap Quality");
4798  CSeq_interval& loc = graph->SetLoc().SetInt();
4799  loc.SetId(*info.main_id);
4800  loc.SetFrom(pos);
4801  loc.SetTo(end-1);
4802  graph->SetNumval(TSeqPos(size));
4803  CByte_graph& bytes = graph->SetGraph().SetByte();
 // hand the buffer over to the graph without copying
4804  bytes.SetValues().swap(values);
4805  bytes.SetAxis(0);
4806  bytes.SetMin(min_q);
4807  bytes.SetMax(max_q);
4808  annot->SetData().SetGraph().push_back(graph);
4809  annot_set.push_back(annot);
4810 }
4811 
4812 
// Returns the GenBank state of the current row (signature line missing from
// this listing). For the latest version this is the GB_STATE cell when that
// column exists, otherwise the project-wide state from the database.
// NOTE(review): the value returned for a non-latest version (original line
// 4819) is missing from this listing.
4814 {
4815  x_CheckValid("CWGSSeqIterator::GetGBState");
4816 
4817  if ( m_AccVersion.m_Offset != 0 ) {
4818  // not the last version of sequence
4820  }
4821  CVDBMgr::CRequestContextUpdater ctx_updater;
4822  return m_Cur->m_GB_STATE? *m_Cur->GB_STATE(m_CurrId): m_Db->GetProjectGBState();
4823 }
4824 
4825 
// True when the PUBLIC_COMMENT column exists and its cell for the current row
// is not empty (signature line missing from this listing).
4827 {
4828  x_CheckValid("CWGSSeqIterator::HasPublicComment");
4829 
4830  if ( !m_Cur->m_PUBLIC_COMMENT ) {
4831  return false;
4832  }
4833  return !m_Cur->PUBLIC_COMMENT(m_CurrId).empty();
4834 }
4835 
4836 
// Returns the PUBLIC_COMMENT cell of the current row, or an empty string when
// the column is absent (signature line missing from this listing).
4838 {
4839  x_CheckValid("CWGSSeqIterator::GetPublicComment");
4840 
4841  if ( !m_Cur->m_PUBLIC_COMMENT ) {
4842  return string();
4843  }
4844  return *m_Cur->PUBLIC_COMMENT(m_CurrId);
4845 }
4846 
4847 
// True when the CIRCULAR column exists and its cell for the current row is
// non-zero (signature line missing from this listing).
4849 {
4850  x_CheckValid("CWGSSeqIterator::IsCircular");
4851 
4852  return m_Cur->m_CIRCULAR && *m_Cur->CIRCULAR(m_CurrId);
4853 }
4854 
4855 
// Returns the GAP_START column handle, i.e. whether a GAP_START column is
// present (signature line missing from this listing; presumably HasGapInfo()).
// No validity check is made here.
4857 {
4858  return m_Cur->m_GAP_START;
4859 }
4860 
4861 
// Interior of a helper type that gives access to per-row ambiguity
// information (its opening lines are missing from this listing; the deleted
// assignment operator below names it SAmbiguityAccess).
// The constructor fetches the ambiguity info from the database when none is
// present yet; the small forwarding methods call the corresponding method of
// the ambiguity info object through operator->().
// NOTE(review): many lines of this type are missing from the listing (among
// them original lines 4865, 4869, 4874-4876, 4879, 4882, 4886, 4893 and the
// member declarations at 4928-4930), so the visible text is not the full type.
4864  CWGSDb_Impl& db,
4866  TVDBRowId row_id)
4867  : m_Db(&db),
4868  m_Seq(seq),
4870  {
4871  if ( !m_AmbiguityInfo ) {
4872  m_AmbiguityInfo = db.GetAmbiguityInfo(row_id);
4873  if ( !m_AmbiguityInfo ) {
4875  }
4877  }
4878  }
4880  {
4881  if ( m_AmbiguityInfo ) {
4883  }
4884  }
4885 
4887  {
4888  return m_AmbiguityInfo.GetNCPointer();
4889  }
4890 
4891  vector<Uint1> GetAmbiguityBytes() const
4892  {
4894  }
4895 
4897  {
4898  return operator->()->Get2naLengthExact(pos, len,
4900  }
4902  TSeqPos stop_2na_len, TSeqPos stop_gap_len) const
4903  {
4904  return operator->()->Get4naLengthExact(pos, len, stop_2na_len, stop_gap_len,
4906  }
4908  {
4909  return operator->()->GetGapLengthExact(pos, len,
4911  }
4912 
4914  {
4915  return operator->()->Get2na(pos, len,
4916  m_Seq.GetNCObject());
4917  }
4919  {
4920  return operator->()->Get4na(pos, len,
4922  }
4923 
4924 private:
 // copying by assignment is disabled
4926  void operator=(const SAmbiguityAccess&) = delete;
4927 
4931 };
4932 
4933 
// NOTE(review): signature (original line 4934) and the only statement
// (original line 4936) are missing from this listing; presumably this is the
// GetAmbiguity() used by the accessors below -- confirm.
4935 {
4937 }
4938 
4939 
// Fills gap_info with the gap information of the current row (signature line
// missing from this listing). Without gap information a default-constructed
// TWGSContigGapInfo is stored.
4941 {
4942  x_CheckValid("CWGSSeqIterator::GetGapInfo");
4943 
4944  if ( HasGapInfo() ) {
4945  gap_info = GetAmbiguity()->GetGapInfo();
4946  }
4947  else {
4948  gap_info = TWGSContigGapInfo();
4949  }
4950 }
4951 
// Appends one linkage-evidence entry of the given type to `gap` (signature
// line missing from this listing; presumably the sx_AddEvidence() helper
// called when gap literals are built).
// NOTE(review): the creation of `evidence` (original line 4955) is missing
// from this listing.
4952 static
4954 {
4956  evidence->SetType(type);
4957  gap.SetLinkage_evidence().push_back(evidence);
4958 }
4959 
4960 
// Builds the gap literal for a gap of length `len` from the packed component
// properties and the linkage bits (signature lines missing from this listing;
// presumably the sx_MakeGapLiteral() helper used when gap segments are added).
// `props` is expected to be negative; the length type and the gap type are
// decoded from -props with two masks. An unknown length adds an "unknown"
// fuzz limit. A gap type or any linkage bit produces explicit Seq-gap data.
// NOTE(review): many lines are missing from this listing: the creation of
// `literal`, both mask values, every statement setting the gap type, several
// case labels, and the declarations of `bit` and `type`.
4961 static
4964  NCBI_WGS_gap_linkage gap_linkage)
4965 {
4967  static const int kLenTypeMask =
4970  static const int kGapTypeMask =
4979  _ASSERT(props < 0);
4980  int len_type = -(-props & kLenTypeMask);
4981  int gap_type = -(-props & kGapTypeMask);
4982  literal->SetLength(len);
4983  if ( len_type == NCBI_WGS_gap_unknown ) {
4984  literal->SetFuzz().SetLim(CInt_fuzz::eLim_unk);
4985  }
4986  if ( gap_type || gap_linkage ) {
4987  CSeq_gap& gap = literal->SetSeq_data().SetGap();
4988  switch ( gap_type ) {
4989  case 0:
4991  break;
4992  case NCBI_WGS_gap_scaffold:
4994  break;
4995  case NCBI_WGS_gap_contig:
4997  break;
5000  break;
5003  break;
5006  break;
5007  case NCBI_WGS_gap_telomere:
5009  break;
5010  case NCBI_WGS_gap_repeat:
5012  break;
5015  break;
5016  default:
5017  break;
5018  }
5019  // linkage-evidence bits should be in order of ASN.1 specification
5020  if ( gap_linkage & NCBI_WGS_gap_linkage_linked ) {
5021  gap.SetLinkage(gap.eLinkage_linked);
5022  }
 // walk the bits upwards; each set bit adds one evidence entry
5025  for ( ; bit && bit <= gap_linkage; bit<<=1, ++type ) {
5026  if ( gap_linkage & bit ) {
5027  sx_AddEvidence(gap, type);
5028  }
5029  }
5030  }
5031  return literal;
5032 }
5033 
5034 
// Forwards to GetAmbiguity().GetAmbiguityBytes() (signature line missing from
// this listing).
5036 {
5037  return GetAmbiguity().GetAmbiguityBytes();
5038 }
5039 
5040 
// Forwards to GetAmbiguity().Get2na(pos, len) (signature line missing from
// this listing).
5042 {
5043  return GetAmbiguity().Get2na(pos, len);
5044 }
5045 
5046 
// Forwards to GetAmbiguity().Get4na(pos, len) (signature line missing from
// this listing).
5048 {
5049  return GetAmbiguity().Get4na(pos, len);
5050 }
5051 
5052 
5053 /////////////////////////////////////////////////////////////////////////////
5054 // delta control constants
5055 
5056 // kMin2naSize is the minimal size of 2na segment that will
5057 // save memory if inserted in between 4na segments.
5058 // It's determined by the formula MinLen = 8*MemoryOverheadOfSegment.
5059 // The memory overhead of a segment in total is
5060 // (assuming allocation overhead equal to 2 pointers):
5061 // 18*sizeof(void*)+7*sizeof(int)
5062 // (+sizeof(int) on some 64-bit platforms due to alignment).
5063 // So one segment memory overhead is 100 bytes on 32-bit platform,
5064 // and 176 bytes on 64-bit platform.
5065 // This leads to threshold size of 800 bases on 32-bit platforms and
5066 // 1408 bases on most 64-bit platforms.
5067 // We'll use slightly bigger threshold to take into account
5068 // possible CPU overhead for 2na operations.
5069 // static const TSeqPos kMin2naSize = 2048;
5070 // Actually use kAmbiguityBlockSize (=1024), it's optimal enough
5071 // and allows to use precomputed ambiguity info directly
5073 
5074 // size of chunks if the segment is split
5075 static const TSeqPos kChunk4naSize = 1<<16; // 64Ki bases or 32KiB
5076 static const TSeqPos kChunk2naSize = 1<<17; // 128Ki bases or 32KiB
5077 
5078 // min size of segment to split
5080 static const TSeqPos kSplit2naSize = kChunk2naSize; //+kChunk2naSize/4;
5081 
5082 // end of delta control constants
5083 /////////////////////////////////////////////////////////////////////////////
5084 
// Tail of an enumeration whose opening lines (original lines 5085-5086) are
// missing from this listing; only its last enumerator is visible.
5087  eDelta_split
5088 };
5089 
5090 
// Appends one gap segment covering [pos, pos+len) to `segments` (first
// signature line missing from this listing; presumably x_AddGap()).
// The gap literal is built from the properties at gap_info's current
// position; the linkage defaults to 0 when no linkage array is available.
5091 inline
5093  TSeqPos pos, TSeqPos len,
5094  const TWGSContigGapInfo& gap_info) const
5095 {
5096  SSegment seg;
5097  seg.range.SetFrom(pos);
5098  seg.range.SetLength(len);
5099  seg.is_gap = true;
5100  NCBI_WGS_component_props props = *gap_info.gaps_props;
5101  NCBI_WGS_gap_linkage linkage = 0;
5102  if ( gap_info.gaps_linkage ) {
5103  linkage = *gap_info.gaps_linkage;
5104  }
5105  seg.literal = sx_MakeGapLiteral(len, props, linkage);
5106  segments.push_back(seg);
5107 }
5108 
5109 
// Clamps the open end of `range` to GetSeqLength() and returns the range
// (signature lines missing from this listing; name not visible here).
5110 inline
5113 {
5114  range.SetToOpen(min(range.GetToOpen(), GetSeqLength()));
5115  return range;
5116 }
5117 
5118 
5119 // add raw data as delta segments with explicit gap info
5122  TWGSContigGapInfo gap_info,
5123  TInstSegmentFlags flags) const
5124 {
5126  TSeqPos raw_offset = GetSeqOffset();
5127  TSeqPos pos = range.GetFrom() + raw_offset;
5128  TSeqPos len = range.GetLength();
5129 
5130  gap_info.SetPos(pos);
5131  auto ambiguity = GetAmbiguity();
5132 
5133  for ( ; len > 0; ) {
5134  if ( gap_info.IsInGap(pos) ) {
5135  // add gap
5136  TSeqPos gap_len = gap_info.GetGapLength(pos, len);
5137  _ASSERT(gap_len <= len);
5138  if ( flags & fInst_MakeGaps) {
5139  x_AddGap(segments, pos - raw_offset, gap_len, gap_info);
5140  }
5141  ++gap_info;
5142  len -= gap_len;
5143  pos += gap_len;
5144  _ASSERT(!gap_info || pos <= gap_info.GetFrom());
5145  continue;
5146  }
5147 
5148  // data segment
5149  TSeqPos rem_len = gap_info.GetDataLength(pos, len);
5150  _ASSERT(rem_len <= len);
5151 
5152  if ( flags & fInst_Split ) {
5153  // break data at the next chunk boundary
5154  TSeqPos chunk_start =
5155  (pos-raw_offset)/kDataChunkSize*kDataChunkSize;
5156