1 /* $Id: update_align.cpp 47479 2023-05-02 13:24:02Z ucko $
2  * ===========================================================================
3  *
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Igor Filippov
27  */
30 #include <ncbi_pch.hpp>
33 #include <util/line_reader.hpp>
34 #include <objmgr/bioseq_ci.hpp>
49 #include <wx/filedlg.h>
50 #include <wx/choicdlg.h>
51 #include <wx/msgdlg.h>
/// Imports a text alignment chosen by the user and submits it through the command processor.
///
/// Visible flow: a file-open dialog selects the file (its path is stored in
/// m_FilePath), default CTextAlignParams are filled in and handed to `dlg` for
/// editing, x_ReadAlign() is called with the confirmed parameters, and, if
/// m_SeqAlign is set and x_ProcessNotPresentIDs() returns true, the alignment is
/// placed into a new CSeq_annot and `cmd` is executed via cmdProcessor.
/// The function returns without doing anything if the file dialog does not
/// return wxID_OK, and skips all processing if `dlg` does not return wxID_OK.
///
/// @param tse           not referenced in the lines visible here
/// @param cmdProcessor  executes the resulting command; dereferenced without a null check
/// @param parent        passed to the file dialog and to x_ProcessNotPresentIDs()
///
/// NOTE(review): the declarations of `dlg` and `cmd` are not visible in this
/// excerpt (the listing has gaps), so their types and the way new_annot is
/// turned into `cmd` cannot be confirmed from here.
56 void CUpdateAlign::apply(CSeq_entry_Handle tse, ICommandProccessor* cmdProcessor, wxWindow *parent)
57 {
58  wxFileDialog file(parent, wxT("Import file"), wxEmptyString, wxEmptyString,
59  CFileExtensions::GetDialogFilter(CFileExtensions::kAllFiles), wxFD_OPEN | wxFD_FILE_MUST_EXIST);
60  if (file.ShowModal() != wxID_OK) {
61  return;
62  }
64  m_FilePath = file.GetPath();
    // Default character classes offered to the user before the parameters dialog is shown.
65  CTextAlignParams params;
66  params.SetUnknown(wxT("?Nn"));
67  params.SetMatch(wxT(":"));
68  params.SetBegin(wxT("-.Nn?"));
69  params.SetMiddle(wxT("-.")); // for interpreting gaps
70  params.SetEnd(wxT("-.Nn?"));
    // NOTE(review): 1 appears to mean "nucleotide" - the GetSeqType() == 1 check
    // further down this file selects CAlnReader::eAlpha_Nucleotide.
71  params.SetSeqType(1);
74  dlg.SetData(params);
75  if (dlg.ShowModal() == wxID_OK) {
76  params = dlg.GetData();
77  x_ReadAlign(params);
    // Only continue when an alignment object is available in m_SeqAlign.
79  if (m_SeqAlign) {
    // A false return (user cancelled the unmatched-ID dialog) abandons the import.
81  if (x_ProcessNotPresentIDs(parent)) {
82  CRef<CSeq_annot> new_annot(new CSeq_annot);
83  new_annot->SetData().SetAlign().push_back(m_SeqAlign);
85  cmdProcessor->Execute(cmd);
86  }
87  }
88  }
89 }
92 {
    // NOTE(review): the signature line of this function is not visible in this
    // excerpt; the body reads `tse`, m_Ids and writes m_NonPresentIDs.
    //
    // Rebuilds m_NonPresentIDs: for every string in m_Ids, an entry
    // (id string, row index) is recorded when no bioseq iterated by
    // CBioseq_CI(tse) carries a Seq-id that matches it.
93  m_NonPresentIDs.clear();
94  _ASSERT(m_NonPresentIDs.empty());
96  for (size_t index = 0; index < m_Ids.size(); ++index) {
97  CRef<CSeq_id> id1;
    // If constructing the CSeq_id throws, id1 is left null; such an ID is
    // never searched for and therefore ends up in m_NonPresentIDs below.
98  try {
99  id1.Reset(new CSeq_id(m_Ids[index], CSeq_id::fParse_Default));
100  } catch (exception &) {
101  id1.Reset();
102  }
104  bool found = false;
105  if (id1) {
    // The outer loop stops at the first bioseq that has a matching id.
106  for (CBioseq_CI bi(tse); bi && !found; ++bi) {
107  for (auto&& id_it : bi->GetId()) {
108  CConstRef<CSeq_id> id2 = id_it.GetSeqId();
109  if (id2 && id1->Match(*id2)) {
110  found = true;
111  break;
112  }
113  }
115  }
116  }
    // Remember both the original ID text and its position in m_Ids.
117  if (!found) {
118  m_NonPresentIDs.emplace_back(m_Ids[index], (int)index);
119  }
120  }
121 }
123  // TODO ask user if not-present id should be a far pointer, deleted, etc.
124 /*
125  "This is a far pointer"// id = "acc"+id;
126  "Remove this sequence from the alignment"
127  "Use this ID for this sequence"
128 */
130 {
    // NOTE(review): the signature line of this function is not visible in this
    // excerpt; the body uses `parent` and returns bool.
    //
    // Asks the user how to treat the alignment rows recorded in m_NonPresentIDs.
    // Returns true when there is nothing to resolve or when the user picked one
    // of the options, and false when the choice dialog returned -1.
131  if (m_NonPresentIDs.empty()) return true;
133  wxArrayString choices;
134  choices.Add(_("All unmatched sequences are far pointers"));
135  choices.Add(_("Remove all unmatched sequences from the alignment"));
136  choices.Add(_("Read in a file that maps alignment IDs to sequence IDs"));
    // Build the prompt, pluralizing "sequence" when more than one ID is unmatched.
138  wxString msg;
139  msg << "Unable to find " << m_NonPresentIDs.size() << " sequence";
140  if (m_NonPresentIDs.size() > 1) {
141  msg << "s";
142  }
143  msg << " from alignment in set";
    // `answer` is the index into `choices` (same order as added above).
145  int answer = wxGetSingleChoiceIndex(msg, _("Unmatched sequences found"), choices);
146  if (answer == 0) {
    // NOTE(review): the statement handling choice 0 ("far pointers") is not visible in this excerpt.
148  }
149  else if (answer == 1) {
    // NOTE(review): the statement handling choice 1 ("remove") is not visible in this excerpt.
151  }
152  else if (answer == 2) {
153  x_MapNonPresentIDs(parent);
154  }
156  return (answer != -1); // -1 corresponds to pressing the Cancel button
157 }
159 namespace {
161  struct SFind_ID
162  {
163  SFind_ID(const string& id_name) : m_IdName(id_name) {}
165  bool operator() (const pair<string, string>& str_pair) const
166  {
167  return NStr::FindNoCase(str_pair.first, m_IdName) != NPOS;
168  }
169  private:
170  string m_IdName;
171  };
173  CUpdateAlign::TIdToRowVec::iterator s_IsIDNonPresent(CUpdateAlign::TIdToRowVec& nonPresentIDs, int index)
174  {
175  for (CUpdateAlign::TIdToRowVec::iterator it = nonPresentIDs.begin();
176  it != nonPresentIDs.end(); ++it) {
177  if (it->second == index) {
178  return it;
179  }
180  }
181  return nonPresentIDs.end();
182  }
/// Returns the part of initial_val that follows its first '|' character.
/// When the string contains no '|', it is returned unchanged.
string s_GetIDLabel(const string& initial_val)
{
    const size_t bar_pos = initial_val.find("|");
    if (bar_pos == string::npos) {
        return initial_val;
    }
    return initial_val.substr(bar_pos + 1);
}
193 }
196 {
    // NOTE(review): the signature line of this function is not visible in this excerpt.
    //
    // Drops the alignment rows listed in m_NonPresentIDs from the dense-seg of
    // m_SeqAlign: the starts, strands and ids vectors are filtered, the dim of
    // both the dense-seg and the Seq-align is updated, and RemovePureGapSegs()
    // is called at the end.
    //
    // Nothing is done unless the alignment has dense-seg segments.
    // NOTE(review): it also returns early when GetDim() == 2 - presumably to
    // avoid leaving fewer than two rows; confirm the intent.
198  if (!m_SeqAlign->IsSetSegs()
199  || !m_SeqAlign->GetSegs().IsDenseg()
200  || m_SeqAlign->GetDim() == 2) {
201  return;
202  }
204  CDense_seg& dense_seg = m_SeqAlign->SetSegs().SetDenseg();
    // Row count is taken from the number of ids, not from the stored dim.
205  int dim = static_cast<int>(dense_seg.GetIds().size());
    // Collect the row indices that must be removed.
207  vector<int> rows;
208  for (auto& it : m_NonPresentIDs) {
209  rows.push_back(it.second);
210  }
212  if (dense_seg.IsSetNumseg()) {
213  CDense_seg::TNumseg numseg = dense_seg.GetNumseg();
    // starts is walked as numseg consecutive groups of dim entries; the iterator
    // advances for every entry whether it is kept or dropped.
    // NOTE(review): there is no check that GetStarts() holds numseg * dim entries.
215  // remove rows from starts
216  if (dense_seg.IsSetStarts()) {
217  CDense_seg::TStarts new_starts;
218  auto starts_it = dense_seg.GetStarts().begin();
220  for (int seg = 0; seg < numseg; ++seg) {
221  for (int index = 0; index < dim; ++index) {
222  if (find(rows.begin(), rows.end(), index) == rows.end()) {
223  new_starts.push_back(*starts_it);
224  }
225  ++starts_it;
226  }
227  }
228  dense_seg.SetStarts().swap(new_starts);
229  }
    // strands is filtered with the same numseg x dim traversal as starts.
231  // remove rows from strands
232  if (dense_seg.IsSetStrands()) {
233  CDense_seg::TStrands new_strands;
234  auto strand_it = dense_seg.GetStrands().begin();
236  for (int seg = 0; seg < numseg; ++seg) {
237  for (int index = 0; index < dim; ++index) {
238  if (find(rows.begin(), rows.end(), index) == rows.end()) {
239  new_strands.push_back(*strand_it);
240  }
241  ++strand_it;
242  }
243  }
244  dense_seg.SetStrands().swap(new_strands);
245  }
246  }
248  // remove ids that are not present
249  CDense_seg::TIds new_ids;
250  for (size_t index = 0; index < dense_seg.GetIds().size(); ++index) {
251  if (find(rows.begin(), rows.end(), (int)index) == rows.end()) {
252  new_ids.push_back(dense_seg.GetIds()[index]);
253  }
254  }
    // Both dim fields are set to the number of surviving ids before the id list is swapped in.
256  dense_seg.SetDim(static_cast<CDense_seg::TDim>(new_ids.size()));
257  m_SeqAlign->SetDim(static_cast<CSeq_align::TDim>(new_ids.size()));
258  dense_seg.SetIds().swap(new_ids);
259  dense_seg.RemovePureGapSegs(); // sets numsegs correctly
260 }
264 {
    // NOTE(review): the signature line of this function is not visible in this excerpt.
    //
    // Rewrites the id list of the dense-seg in m_SeqAlign: every row listed in
    // m_NonPresentIDs gets a new local Seq-id whose text is "acc" followed by
    // s_GetIDLabel() of the original ID string; all other rows keep their id.
    // The number of ids is unchanged (asserted at the end).
266  _ASSERT(!m_NonPresentIDs.empty());
    // Only dense-seg alignments are handled; anything else is left untouched.
268  if (!m_SeqAlign->IsSetSegs()
269  || !m_SeqAlign->GetSegs().IsDenseg()) {
270  return;
271  }
273  CDense_seg& dense_seg = m_SeqAlign->SetSegs().SetDenseg();
    // Used only by the closing _ASSERT.
274  size_t previous_ids = dense_seg.GetIds().size();
276  CDense_seg::TIds new_ids;
277  for (size_t index = 0; index < dense_seg.GetIds().size(); ++index) {
278  bool found = false;
    // Linear search for an unmatched-ID entry that refers to this row.
279  for (auto& it : m_NonPresentIDs) {
280  if ((int)index == it.second) {
281  string farptrID = "acc" + s_GetIDLabel(it.first);
282  CRef<CSeq_id> new_id(new CSeq_id(CSeq_id::e_Local, farptrID));
283  new_ids.push_back(new_id);
284  found = true;
285  break;
286  }
287  }
    // Rows that were matched keep their existing Seq-id.
289  if (!found) {
290  new_ids.push_back(dense_seg.GetIds()[index]);
291  }
292  }
294  dense_seg.SetIds().swap(new_ids);
295  _ASSERT(previous_ids == dense_seg.GetIds().size());
296 }
/// Replaces unmatched alignment IDs using a user-supplied mapping file.
///
/// The user picks a file; each non-empty line that splits on tab into exactly
/// two fields contributes an (old ID, new ID) pair. For every dense-seg row
/// listed in m_NonPresentIDs, the first pair accepted by SFind_ID for that
/// row's ID text supplies the replacement: a local Seq-id built from
/// s_GetIDLabel(new ID), and the row's entry is erased from m_NonPresentIDs.
/// Rows without a mapping keep their id and are reported in a yes/no message box.
///
/// @param parent  parent window for the file dialog and the message box
///
/// NOTE(review): part of the wxFileDialog constructor call and the body of
/// the final `wxYES` branch are not visible in this excerpt.
298 void CUpdateAlign::x_MapNonPresentIDs(wxWindow *parent)
299 {
301  _ASSERT(!m_NonPresentIDs.empty());
    // Only dense-seg alignments are handled.
303  if (!m_SeqAlign->IsSetSegs()
304  || !m_SeqAlign->GetSegs().IsDenseg()) {
305  return;
306  }
308  vector<pair<string, string>> oldId_toNewId;
309  wxFileDialog file(parent, wxT("Import ID map from file"), wxEmptyString, wxEmptyString,
312  if (file.ShowModal() != wxID_OK) {
313  return;
314  }
316  wxString path = file.GetPath();
317  if (!path.IsEmpty()) {
319  CNcbiIfstream istr(path.fn_str());
320  CStreamLineReader line_reader(istr);
    // NOTE(review): the reader is advanced before the first AtEOF() check -
    // confirm the behaviour for an empty or unreadable file.
321  do {
322  string str = *++line_reader;
324  if (str.empty())
325  continue;
    // Lines that do not yield exactly two tab-separated fields are silently ignored.
326  list<string> row_values;
327  NStr::Split(str, "\t", row_values, NStr::fSplit_MergeDelimiters);
328  if (row_values.size() == 2) {
329  oldId_toNewId.emplace_back(*row_values.begin(), *row_values.rbegin());
330  }
331  } while (!line_reader.AtEOF());
332  }
334  CDense_seg& dense_seg = m_SeqAlign->SetSegs().SetDenseg();
    // Used only by the closing _ASSERT.
335  size_t previous_ids = dense_seg.GetIds().size();
337  int unmapped = 0;
338  string unmapped_ids = "(";
339  CDense_seg::TIds new_ids;
341  for (size_t index = 0; index < dense_seg.SetIds().size(); ++index) {
    // id_it is looked up afresh on every pass, so the erase() below does not
    // leave a stale iterator in use.
342  auto id_it = s_IsIDNonPresent(m_NonPresentIDs, (int)index);
343  if (id_it == m_NonPresentIDs.end()) {
344  new_ids.push_back(dense_seg.GetIds()[index]);
345  }
346  else {
    // SFind_ID tests the first field of each pair with NStr::FindNoCase
    // (presumably a case-insensitive substring search - confirm argument order).
347  std::function<bool(const pair<string, string>& str_pair)> tester = SFind_ID(id_it->first);
348  auto map_it = find_if(begin(oldId_toNewId), end(oldId_toNewId), tester);
349  if (map_it == oldId_toNewId.end()) {
    // No mapping: keep the existing id and remember the ID text for the report.
350  unmapped++;
351  unmapped_ids += id_it->first;
352  unmapped_ids += ", ";
353  new_ids.push_back(dense_seg.GetIds()[index]);
354  }
355  else {
356  CRef<CSeq_id> new_id(new CSeq_id(CSeq_id::e_Local, s_GetIDLabel(map_it->second)));
357  new_ids.push_back(new_id);
358  m_NonPresentIDs.erase(id_it);
359  }
360  }
361  }
362  dense_seg.SetIds().swap(new_ids);
364  if (unmapped > 0) {
    // Strip the trailing ", " left by the loop above before closing the list.
365  unmapped_ids.pop_back();
366  unmapped_ids.pop_back();
367  unmapped_ids += ")";
    // NOTE(review): the count and the "(...)" list are concatenated without a
    // space, producing e.g. "Mapping not found for 2(a, b) sequences".
369  string msg = "Mapping not found for ";
370  msg += NStr::NumericToString(unmapped) + unmapped_ids;
371  msg += " sequence";
372  if (unmapped > 1) {
373  msg += "s";
374  }
376  msg += "\n";
377  if (unmapped == 1) {
378  msg += "Is this a far pointer? ";
379  }
380  else {
381  msg += "Are these far pointers?";
382  }
384  if (wxYES == wxMessageBox(ToWxString(msg), "Info", wxYES_DEFAULT|wxYES_NO, parent)) {
    // NOTE(review): the statement executed on "Yes" is not visible in this excerpt.
386  }
387  }
389  _ASSERT(previous_ids == dense_seg.GetIds().size());
390 }
392 /*
393 int CUpdateAlign::FindMostFrequentLength(const map<string,string> &id_to_seq)
394 {
395  map<int,int> lengths;
396  for (map<string, string>::const_iterator i = id_to_seq.begin(); i != id_to_seq.end(); ++i)
397  {
398  size_t length = i->second.length();
399  lengths[length]++;
400  }
401  int length = 0;
402  int max_occ = 0;
403  for (map<int,int>::iterator c = lengths.begin(); c != lengths.end(); ++c)
404  {
405  if (c->second > max_occ)
406  {
407  max_occ = c->second;
408  length = c->first;
409  }
410  }
411  return length;
412 }
414 bool CUpdateAlign::ProcessDiffLengths(map<string,string> &id_to_seq, int length)
415 {
416  set<string> diff_length;
417  for (map<string, string>::const_iterator i = id_to_seq.begin(); i != id_to_seq.end(); ++i)
418  {
419  size_t length2 = i->second.length();
420  if (length2 != length)
421  diff_length.insert(i->first);
422  }
424  if (!diff_length.empty())
425  {
426  wxString msg;
427  msg << "Remove " << diff_length.size() << " sequence";
428  if (diff_length.size() > 1)
429  msg << "s";
430  msg << " with different length";
431  if (diff_length.size() > 1)
432  msg << "s";
433  int answer = wxMessageBox (msg, _("Different length sequences detected"), wxYES_NO | wxICON_QUESTION);
434  if (answer == wxYES)
435  {
436  for (set<string>::const_iterator s = diff_length.begin(); s != diff_length.end(); ++s)
437  id_to_seq.erase(*s);
438  }
439  else
440  return true;
441  }
442  return false;
443 }
444 */
446 static string s_FormatErrors(const CAlnReader::TErrorList& errors)
447 {
448  string errmsg = "";
450  ITERATE(CAlnReader::TErrorList, iter2, errors) {
451  const string& id = (*iter2).GetID();
452  int line_num = (*iter2).GetLineNum();
454  if (!NStr::IsBlank(errmsg)) {
455  errmsg += "\n";
456  }
457  if (line_num > -1) {
458  errmsg += "At line ";
459  errmsg += NStr::IntToString(line_num);
460  }
461  if (!NStr::IsBlank(id)) {
462  errmsg += "(Sequence ID ";
463  errmsg += id;
464  errmsg += ") ";
465  }
466  errmsg += (*iter2).GetMsg();
467  }
468  return errmsg;
469 }
472 {
    // NOTE(review): the signature line, the declaration of `file` and the
    // `catch` header that introduces `e` are not visible in this excerpt.
    //
    // Reads a text alignment with CAlnReader using the character classes in
    // `params`. On the first successful Read(), m_SeqAlign and m_Ids are taken
    // from the reader and the loop stops; when an attempt fails, the formatted
    // reader errors are shown via NcbiMessageBox and the next alphabet (if any)
    // is tried.
473  string unknown = ToStdString(params.GetUnknown());
474  string match = ToStdString(params.GetMatch());
475  string gapbegin = ToStdString(params.GetBegin());
476  string gapmiddle = ToStdString(params.GetMiddle());
477  string gapend = ToStdString(params.GetEnd());
    // Sequence type 1 -> nucleotide only, 2 -> protein only, anything else ->
    // try nucleotide first, then protein.
479  vector<CAlnReader::EAlphabet> alphas;
480  if (params.GetSeqType() == 1) {
481  alphas.push_back(CAlnReader::eAlpha_Nucleotide);
482  }
483  else if (params.GetSeqType() == 2) {
484  alphas.push_back(CAlnReader::eAlpha_Protein);
485  }
486  else {
487  alphas.push_back(CAlnReader::eAlpha_Nucleotide);
488  alphas.push_back(CAlnReader::eAlpha_Protein);
489  }
492  string errmsg;
494  ITERATE(vector<CAlnReader::EAlphabet>, iter, alphas) {
    // A new reader is created for each alphabet on the same stream.
    // NOTE(review): no rewind of the stream between attempts is visible here - confirm.
495  CAlnReader reader(file.GetIstream());
496  try {
497  reader.SetMissing(unknown);
498  reader.SetMatch(match);
499  reader.SetBeginningGap(gapbegin);
500  reader.SetMiddleGap(gapmiddle);
501  reader.SetEndGap(gapend);
502  reader.SetAlphabet(*iter);
504  reader.Read();
505  m_SeqAlign = reader.GetSeqAlign();
506  m_Ids = reader.GetIds();
    // On success errmsg is computed, but it is not displayed in the visible lines.
507  errmsg = s_FormatErrors(reader.GetErrorList());
508  break;
509  }
511  _TRACE("reading text alignment failed: " << e.what());
512  errmsg = s_FormatErrors(reader.GetErrorList());
513  NcbiMessageBox(errmsg);
514  }
515  }
516 }
