NCBI C++ ToolKit
gff3_location_merger.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2  * $Id: gff3_location_merger.cpp 100629 2023-08-21 15:12:00Z foleyjp $
3  *
4  * ===========================================================================
5  *
6  * PUBLIC DOMAIN NOTICE
7  * National Center for Biotechnology Information
8  *
9  * This software/database is a "United States Government Work" under the
10  * terms of the United States Copyright Act. It was written as part of
11  * the author's official duties as a United States Government employee and
12  * thus cannot be copyrighted. This software/database is freely available
13  * to the public for use. The National Library of Medicine and the U.S.
14  * Government have not placed any restriction on its use or reproduction.
15  *
16  * Although all reasonable efforts have been taken to ensure the accuracy
17  * and reliability of the software and data, the NLM and the U.S.
18  * Government do not and cannot warrant the performance or results that
19  * may be obtained by using this software or data. The NLM and the U.S.
20  * Government disclaim all warranties, express or implied, including
21  * warranties of performance, merchantability or fitness for any particular
22  * purpose.
23  *
24  * Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  * Authors: Frank Ludwig
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistr.hpp>
38 
41 
42 // ----------------------------------------------------------------------------
44  const CGff2Record& record,
45  unsigned int flags,
46  CGff3ReadRecord::SeqIdResolver seqIdResolve)
// Populate a location record from a parsed GFF record.
//   record       - supplies seq-id, start/stop, strand, normalized type, the
//                  optional "part" attribute, phase and record id
//   flags        - passed through to record.GetSeqId()
//   seqIdResolve - passed through to record.GetSeqId()
// NOTE(review): the declarator line naming this constructor is absent from
// this listing.
47 // ----------------------------------------------------------------------------
48 {
49  mGffId.Assign(*record.GetSeqId(flags, seqIdResolve));
50  mStart = static_cast<TSeqPos>(record.SeqStart());
51  mStop = static_cast<TSeqPos>(record.SeqStop());
// a record without strand information is treated as plus strand
52  mStrand = (record.IsSetStrand() ? record.Strand() : eNa_strand_plus);
53  mType = record.NormalizedType();
// part number stays 0 unless a "part" attribute exists and parses as an int
54  mPartNum = 0;
55  string recordPart;
56  if (record.GetAttribute("part", recordPart)) {
57  try {
58  mPartNum = NStr::StringToInt(recordPart);
59  }
60  catch (CStringException&) {
61  // conversion failed: deliberately ignored (mPartNum = 0 assignment is disabled)
62  }
63  }
// only records of type "cds" take their frame from the record's phase
64  mFrame = (mType == "cds" ? record.Phase() : CCdregion::eFrame_not_set);
65  mSeqId = record.Id();
66 }
67 
68 // ----------------------------------------------------------------------------
70  const CGff3LocationRecord& other)
// Copy constructor: duplicates every field of `other`. The seq-id is copied
// with Assign(); all remaining members are copied by plain assignment.
// NOTE(review): the declarator line is absent from this listing.
71 // ----------------------------------------------------------------------------
72 {
73  mGffId.Assign(other.mGffId);
74  mStart = other.mStart;
75  mStop = other.mStop;
76  mStrand = other.mStrand;
77  mType = other.mType;
78  mPartNum = other.mPartNum;
79  mFrame = other.mFrame;
80  mSeqId = other.mSeqId;
81 }
82 
83 
84 // ============================================================================
85 bool
87  const CGff3LocationRecord& lhs,
88  const CGff3LocationRecord& rhs)
// Ordering predicate on start position.
// Returns true when lhs should come before rhs: larger start first if lhs is
// on the minus strand, smaller start first otherwise.
// Only lhs.mStrand is consulted; rhs.mStrand is not examined.
// NOTE(review): the function-name line is absent from this listing.
89 // ============================================================================
90 {
91  if (lhs.mStrand == eNa_strand_minus) {
92  return (lhs.mStart > rhs.mStart);
93  }
94  return (lhs.mStart < rhs.mStart);
95 }
96 
// Constructor: only stores the listener pointer that later receives error
// messages; the body is empty.
// NOTE(review): the declarator line is absent from this listing.
98  : m_pMessageListener(pListener)
99 {
100 }
101 
102 
103 // ============================================================================
105  const CGff2Record& record)
// Convenience overload: looks up the record's "ID" attribute and forwards to
// the (id, record) overload.
// The result of GetAttribute() is not checked; presumably `id` stays empty
// when the attribute is missing - TODO confirm against CGff2Record.
// NOTE(review): the function-name line is absent from this listing.
106 // ============================================================================
107 {
108  string id;
109  record.GetAttribute("ID", id);
110  CheckAndIndexRecord(id, record);
111 }
112 
113 
114 // ============================================================================
116  string id,
117  const CGff2Record& record)
118 // Add the given record to an index used for checking the validity of GFF3 IDs.
119 // Will do an immediate check against the records that are indexed already (in
120 // the spirit of catching obvious problems early).
// Problems are reported through m_pMessageListener when one is set; otherwise
// the CReaderMessage is thrown.
// NOTE(review): the function-name line is absent from this listing.
121 // ============================================================================
122 {
// built up front; used at both mismatch sites below
123  CReaderMessage errorDuplicateId(
124  eDiag_Error,
125  0,
126  string("Bad data line: record ID \"") + id + "\" is used multiple times");
127 
128  CGffIdTrackRecord trackRecord(record);
129  string parentId;
130  record.GetAttribute("Parent", parentId);
131  auto mapIt = mIds.find(id);
// first record seen for this id: create its list, remember the parent, done
132  if (mapIt == mIds.end()) {
133  mapIt = mIds.emplace(id, list<CGffIdTrackRecord>()).first;
134  mapIt->second.push_back(trackRecord);
135  if (!parentId.empty()) {
136  mParentIds.emplace(parentId);
137  }
138  return;
139  }
140  auto& recordList = mapIt->second;
141  auto pendingType = record.NormalizedType();
// exon records may reuse an id: indexed without the type / seq-id checks
142  if (pendingType == "exon") {
143  recordList.push_back(trackRecord);
144  if (!parentId.empty()) {
145  mParentIds.emplace(parentId);
146  }
147  return;
148  }
149 
// records with an empty id skip the consistency checks entirely
150  if (!id.empty()) {
151  _ASSERT(!recordList.empty());
// a reused id must have the same type as the first record indexed under it
152  auto expectedType = recordList.front().mSeqType;
153  if (pendingType != expectedType) {
154  if (m_pMessageListener) {
155  m_pMessageListener->PutMessage(errorDuplicateId);
156  } else {
157  throw errorDuplicateId;
158  }
159  }
// ... and must be on the same sequence as that first record
160  auto pendingSeqId = record.Id();
161  auto expectedSeqId = recordList.front().mSeqId;
162  if (pendingSeqId != expectedSeqId) {
163  if (m_pMessageListener) {
164  m_pMessageListener->PutMessage(errorDuplicateId);
165  } else {
166  throw errorDuplicateId;
167  }
168  }
169  }
// when a listener took the message, execution continues and the record is
// still indexed
170  if (!parentId.empty()) {
171  mParentIds.emplace(parentId);
172  }
173  recordList.push_back(trackRecord);
174 }
175 
176 // ============================================================================
178 // Check validity of GFF3 feature and parent IDs based on the assumption that
179 // all GFF3 records in the input have been seen and indexed.
// Every entry of mParentIds must also be a key of mIds. Each violation is
// sent to m_pMessageListener if one is set (the loop then continues with the
// next parent id); without a listener the first violation is thrown.
// NOTE(review): the function-name line is absent from this listing.
180 // =============================================================================
181 {
182  // make sure there is an ID for every ParentID:
183  for (const auto& parentId: mParentIds) {
184  if (mIds.find(parentId) == mIds.end()) {
185  CReaderMessage errorBadParentId(
186  eDiag_Error,
187  0,
188  string("Bad data line: Parent \"" + parentId +
189  "\" does not refer to a GFF3 record ID"));
190  if (m_pMessageListener) {
191  m_pMessageListener->PutMessage(errorBadParentId);
192  }
193  else {
194  throw errorBadParentId;
195  }
196  }
197  }
198 }
199 
200 
201 // ============================================================================
203  unsigned int flags,
205  TSeqPos sequenceSize,
206  CReaderListener* pListener):
// Constructor: stores the flags and id resolver, and hands the same listener
// to both the embedded id tracker and this object. The body is empty.
// NOTE(review): the declarator line and the `idResolver` parameter line are
// absent from this listing. `sequenceSize` is not referenced by any visible
// initializer - confirm whether it is intentionally unused.
207 // ============================================================================
208  mFlags(flags),
209  mIdResolver(idResolver),
210  mIdTracker(pListener),
211  m_pMessageListener(pListener)
212 {
213 }
214 
215 // ============================================================================
216 void
218  const CGff2Record& record)
// Sanity-check a record's coordinates against the known size of its sequence.
// Does nothing when the sequence id has no entry in mSequenceSizes or the
// recorded size is 0.
// NOTE(review): the function-name line and four body lines (the two
// declarations of `error` and the two listener branches) are absent from this
// listing; presumably `error` is a CReaderMessage handed to
// m_pMessageListener->PutMessage(), as in the sibling functions - TODO confirm.
219 // ============================================================================
220 {
221  auto seqSizeIt = mSequenceSizes.find(record.Id());
222  if (seqSizeIt == mSequenceSizes.end()) {
223  return;
224  }
225  auto seqSize = seqSizeIt->second;
226  if (seqSize == 0) {
227  return; //pragma just gave ID, no size, hence useless here
228  }
229 
230  // (1) in point better be less than seqSize:
231  if (record.SeqStart() >= seqSize) {
232  string message = "Bad data line: ";
233  message += "feature in-point is outside the containing sequence.";
235  eDiag_Error,
236  0,
237  message);
238  if (m_pMessageListener) {
240  } else {
241  throw error;
242  }
243  }
244  // (2) no longer than sequence itself:
// stop - start is (length - 1) for inclusive coordinates, so this fires only
// when the feature is strictly longer than the sequence
245  if (record.SeqStop() - record.SeqStart() >= seqSize) {
246  string message = "Bad data line: ";
247  message += "feature is longer than the entire containing sequence.";
249  eDiag_Error,
250  0,
251  message);
252  if (m_pMessageListener) {
254  } else {
255  throw error;
256  }
257  }
258 }
259 
260 // ============================================================================
261 bool
263  const CGff2Record& record)
// Register a record's location under every id that xGetLocationIds() yields.
// Returns false when xGetLocationIds() produces no ids, true otherwise.
// Records of type "cds" are only location-verified and reported as accepted;
// they are not added to the id-to-locations map here.
// NOTE(review): the function-name line and the first body statement are
// absent from this listing.
264 // ============================================================================
265 {
267 
268  if (record.NormalizedType() == "cds") {
269  VerifyRecordLocation(record);
270  return true;
271  }
272 
273  list<string> ids;
274  if (!CGff3LocationMerger::xGetLocationIds(record, ids)) {
275  return false;
276  }
277 
278  for (const auto& id: ids) {
279  AddRecordForId(id, record);
280  }
281  return true;
282 }
283 
284 // ============================================================================
285 void
287  const string& id,
288  const CGff2Record& record)
// Verify the record's location, then prepend a location entry to the list
// stored under `id` in mMapIdToLocations (the list is created on first use).
// NOTE(review): the function-name line and the statement that defines
// `location` are absent from this listing; presumably it is a
// CGff3LocationRecord built from `record` - TODO confirm.
289 // ============================================================================
290 {
291  VerifyRecordLocation(record);
292 
293  auto existingEntry = mMapIdToLocations.find(id);
294  if (existingEntry == mMapIdToLocations.end()) {
295  existingEntry = mMapIdToLocations.emplace(id, LOCATIONS()).first;
296  }
297  LOCATIONS& locations = existingEntry->second;
298  // special case: gene
// an id that already holds exactly one "gene" location accepts no more
299  if (locations.size() == 1 && locations.front().mType == "gene") {
300  return;
301  }
// new entries go to the front of the list
303  existingEntry->second.push_front(location);
304 }
305 
306 
307 // ============================================================================
309  const CGff2Record& record,
310  list<string>& ids)
// Determine the id(s) under which a record's location should be filed.
//   record - record being examined
//   ids    - output; filled by record.GetAttribute()
// Returns false when the record contributes no location: its normalized type
// ends in "rna", "transcript" or "_gene_segment", or the needed attribute is
// missing. Returns true when `ids` has been filled.
// NOTE(review): the function-name line is absent from this listing.
311 // ============================================================================
312 {
313  string recordType = record.NormalizedType();
314 
315  if (NStr::EndsWith(recordType, "rna")) {
316  return false;
317  }
318  if (NStr::EndsWith(recordType, "transcript")) {
319  return false;
320  }
321  // prevent extra interval from getting into VDJC gene segment
322  if (NStr::EndsWith(recordType, "_gene_segment")) {
323  return false;
324  }
// exons are filed under their parent id(s), not their own id
325  if (recordType == "exon") {
326  return record.GetAttribute("Parent", ids);
327  }
328  if (record.GetAttribute("ID", ids)) {
329  return true;
330  }
331  // create a temporary ID:
332  if (!record.GetAttribute("Parent", ids)) {
333  return false;
334  }
// temporary id is "<Type>:<parent id>", built from Type(), not NormalizedType()
335  for (auto& id: ids) {
336  id = record.Type() + ":" + id;
337  }
338  return true;
339 }
340 
341 // ============================================================================
342 void
344  const string& id,
345  CRef<CSeq_loc>& pSeqLoc,
346  CCdregion::EFrame& frame)
// Produce the merged location (and frame) for the given id.
// When `id` has no entry in mMapIdToLocations, the object behind pSeqLoc is
// Reset() and `frame` is left unchanged; otherwise the work is delegated to
// MergeLocation().
// pSeqLoc is dereferenced on the not-found path, so callers presumably pass
// a non-null CRef - TODO confirm.
// NOTE(review): the function-name line is absent from this listing.
347 // ============================================================================
348 {
349  auto it = mMapIdToLocations.find(id);
350  if (it == mMapIdToLocations.end()) {
351  pSeqLoc->Reset();
352  return;
353  }
354  MergeLocation(pSeqLoc, frame, it->second);
355 }
356 
357 // ============================================================================
358 TSeqPos
360  const string& seqId) const
// Look up the recorded size of a sequence in mSequenceSizes.
// Returns 0 when the sequence id is not present in the map.
// NOTE(review): the function-name line is absent from this listing.
361 // ============================================================================
362 {
363  auto sizeIt = mSequenceSizes.find(seqId);
364  if (sizeIt == mSequenceSizes.end()) {
365  return 0;
366  }
367  return sizeIt->second;
368 }
369 
370 // ============================================================================
373  const CGff3LocationRecord& locRecord)
// Convert one location record into a CSeq_loc.
//  - Sequence size 0 (as returned by GetSequenceSize): a single interval
//    using the raw start/stop.
//  - Start >= size, or stop < size: a single interval with both coordinates
//    taken modulo the sequence size.
//  - Otherwise (start < size <= stop): two intervals, [start % size, size-1]
//    and [0, stop % size], added to a packed-int and then converted with
//    ChangeToMix(). Minus strand adds the [0, ...] piece first, every other
//    strand adds the [start, ...] piece first.
// NOTE(review): the return-type / function-name lines and both declarations
// of `pTop` are absent from this listing; presumably `pTop` is a
// CRef<CSeq_interval> created like `pBottom` - TODO confirm.
374 // ============================================================================
375 {
376  CRef<CSeq_loc> pLocation(new CSeq_loc);
377  TSeqPos sequenceSize = GetSequenceSize(locRecord.mSeqId);
378  if (sequenceSize == 0) {
379  CRef<CSeq_interval> pInterval(new CSeq_interval);
380  pInterval->SetId().Assign(locRecord.mGffId);
381  pInterval->SetFrom(locRecord.mStart);
382  pInterval->SetTo(locRecord.mStop);
383  pInterval->SetStrand(locRecord.mStrand);
384  pLocation->SetInt(*pInterval);
385  return pLocation;
386  }
387 
388  if (locRecord.mStrand == eNa_strand_minus) {
389  if (locRecord.mStart >= sequenceSize || locRecord. mStop < sequenceSize) {
390  CRef<CSeq_interval> pInterval(new CSeq_interval);
391  pInterval->SetId().Assign(locRecord.mGffId);
392  pInterval->SetFrom(locRecord.mStart % sequenceSize);
393  pInterval->SetTo(locRecord.mStop % sequenceSize);
394  pInterval->SetStrand(locRecord.mStrand);
395  pLocation->SetInt(*pInterval);
396  }
397  else {
// minus strand: the piece starting at 0 is added before the piece ending at
// size-1
399  pTop->SetId().Assign(locRecord.mGffId);
400  pTop->SetFrom(0);
401  pTop->SetTo(locRecord.mStop % sequenceSize);
402  pTop->SetStrand(locRecord.mStrand);
403  pLocation->SetPacked_int().AddInterval(*pTop);
404  CRef<CSeq_interval> pBottom(new CSeq_interval);
405  pBottom->SetId().Assign(locRecord.mGffId);
406  pBottom->SetFrom(locRecord.mStart % sequenceSize);
407  pBottom->SetTo(sequenceSize - 1);
408  pBottom->SetStrand(locRecord.mStrand);
409  pLocation->SetPacked_int().AddInterval(*pBottom);
410  pLocation->ChangeToMix();
411  }
412  }
413  else {
414  if (locRecord.mStart >= sequenceSize || locRecord.mStop < sequenceSize) {
415  CRef<CSeq_interval> pInterval(new CSeq_interval);
416  pInterval->SetId().Assign(locRecord.mGffId);
417  pInterval->SetFrom(locRecord.mStart % sequenceSize);
418  pInterval->SetTo(locRecord.mStop % sequenceSize);
419  pInterval->SetStrand(locRecord.mStrand);
420  pLocation->SetInt(*pInterval);
421  }
422  else {
// non-minus strand: the piece ending at size-1 is added before the piece
// starting at 0
423  CRef<CSeq_interval> pBottom(new CSeq_interval);
424  pBottom->SetId().Assign(locRecord.mGffId);
425  pBottom->SetFrom(locRecord.mStart % sequenceSize);
426  pBottom->SetTo(sequenceSize - 1);
427  pBottom->SetStrand(locRecord.mStrand);
428  pLocation->SetPacked_int().AddInterval(*pBottom);
430  pTop->SetId().Assign(locRecord.mGffId);
431  pTop->SetFrom(0);
432  pTop->SetTo(locRecord.mStop % sequenceSize);
433  pTop->SetStrand(locRecord.mStrand);
434  pLocation->SetPacked_int().AddInterval(*pTop);
435  pLocation->ChangeToMix();
436  }
437  }
438  return pLocation;
439 }
440 
441 
442 // ============================================================================
443 void
445  CRef<CSeq_loc>& pSeqLoc,
446  CCdregion::EFrame& frame,
447  LOCATIONS& locations)
// Combine a list of location records into one CSeq_loc and report a frame.
//   pSeqLoc   - in/out; set to null, replaced by a single record's location,
//               or turned into a mix of all record locations
//   frame     - out; frame of the first record in `locations`
//   locations - records to merge (non-const reference)
// NOTE(review): the function-name line and two body statements (one in the
// empty-list branch, one just before the mix is built) are absent from this
// listing, so what happens to `frame` for an empty list and whether the list
// is reordered before merging cannot be confirmed from here.
448 // ============================================================================
449 {
450  if (locations.empty()) {
451  pSeqLoc->SetNull();
453  return;
454  }
// a single record needs no mix: use its location directly
455  if (locations.size() == 1) {
456  auto& onlyOne = locations.front();
457  pSeqLoc = xGetRecordLocation(onlyOne);
458  frame = onlyOne.mFrame;
459  return;
460  }
462  auto& mix = pSeqLoc->SetMix();
463  for (auto& location: locations) {
464  mix.AddSeqLoc(*xGetRecordLocation(location));
465  }
// frame comes from whichever record is first in the list at this point
466  const auto& front = locations.front();
467  frame = front.mFrame;
468 }
469 
470 
471 // ============================================================================
472 void
// NOTE(review): both the function-name line and the only body statement of
// this void function are absent from this listing; its behavior cannot be
// documented from the visible text.
474 // ============================================================================
475 {
477 }
478 
479 
480 // =============================================================================
481 void
483  LOCATIONS& locations)
// Order a list of location records.
// The loop looks for any record whose mPartNum is 0 and returns from inside
// that branch; the code after the loop runs only when every record has a
// non-zero part number.
// NOTE(review): the function-name line and both statements that do the
// ordering (one before the early return, one after the loop) are absent from
// this listing; presumably they sort by position and by part number
// respectively - TODO confirm.
484 // =============================================================================
485 {
486  for (const auto& location: locations) {
487  if (location.mPartNum == 0) {
489  return;
490  }
491  }
493 }
494 
495 
bool GetAttribute(const string &, string &) const
Definition: gff2_data.cpp:305
void GetLocation(const string &, CRef< CSeq_loc > &, CCdregion::EFrame &)
map< string, TSeqPos > mSequenceSizes
CReaderListener * m_pMessageListener
void MergeLocation(CRef< CSeq_loc > &, CCdregion::EFrame &, LOCATIONS &)
CGff3ReadRecord::SeqIdResolver mIdResolver
TSeqPos GetSequenceSize(const string &) const
CRef< CSeq_loc > xGetRecordLocation(const CGff3LocationRecord &)
bool AddRecord(const CGff2Record &)
static bool xGetLocationIds(const CGff2Record &, list< string > &)
list< CGff3LocationRecord > LOCATIONS
CGff3LocationMerger(unsigned int flags=0, CGff3ReadRecord::SeqIdResolver=CReadUtil::AsSeqId, TSeqPos sequenceSize=0, CReaderListener *pListener=nullptr)
void VerifyRecordLocation(const CGff2Record &)
void AddRecordForId(const string &, const CGff2Record &)
static void xSortLocations(LOCATIONS &)
CCdregion::EFrame mFrame
static bool ComparePositions(const CGff3LocationRecord &lhs, const CGff3LocationRecord &rhs)
static bool ComparePartNumbers(const CGff3LocationRecord &lhs, const CGff3LocationRecord &rhs)
CGff3LocationRecord(const CGff2Record &, unsigned int, CGff3ReadRecord::SeqIdResolver)
CRef< CSeq_id > GetSeqId(TReaderFlags, SeqIdResolver=nullptr) const
TSeqPos SeqStop() const
const string & Type() const
ENa_strand Strand() const
TSeqPos SeqStart() const
bool IsSetStrand() const
const string & Id() const
TFrame Phase() const
const string & NormalizedType() const
CGffIdTracker(CReaderListener *pListener=nullptr)
CReaderListener * m_pMessageListener
map< string, list< CGffIdTrackRecord > > mIds
set< string > mParentIds
void CheckAndIndexRecord(string id, const CGff2Record &record)
virtual bool PutMessage(const IObjtoolsMessage &message)
Definition: listener.cpp:53
CRef –.
Definition: ncbiobj.hpp:618
CStringException –.
Definition: ncbistr.hpp:4505
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
static const char location[]
Definition: config.c:97
The NCBI C++ standard methods for dealing with std::string.
static uch flags
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
void SetPacked_int(TPacked_int &v)
Definition: Seq_loc.hpp:984
void SetMix(TMix &v)
Definition: Seq_loc.hpp:987
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
void ChangeToMix(void)
Definition: Seq_loc.cpp:3633
void SetNull(void)
Override all setters to incorporate cache invalidation.
Definition: Seq_loc.hpp:960
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5429
@ eFrame_not_set
not set, code uses one
Definition: Cdregion_.hpp:95
void SetTo(TTo value)
Assign a value to To data member.
void SetId(TId &value)
Assign a value to Id data member.
void SetFrom(TFrom value)
Assign a value to From data member.
virtual void Reset(void)
Reset the whole object.
Definition: Seq_loc_.cpp:59
void SetStrand(TStrand value)
Assign a value to Strand data member.
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
constexpr auto front(list< Head, As... >, T=T()) noexcept -> Head
constexpr bool empty(list< Ts... >) noexcept
#define _ASSERT
Modified on Tue Nov 28 02:30:09 2023 by modify_doxy.py rev. 669887