1 /* $Id: biosample_chk.cpp 97547 2022-07-27 16:13:18Z grichenk $
2  * ===========================================================================
3  *
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Colleen Bollin
27  *
28  * File Description:
29  * check biosource and structured comment descriptors against biosample database
30  *
31  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistre.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbienv.hpp>
38 #include <corelib/ncbiargs.hpp>
39 #include <corelib/ncbiutil.hpp>
41 #include <serial/serial.hpp>
42 #include <serial/objistr.hpp>
43 #include <serial/objectio.hpp>
46 #include <connect/ncbi_util.h>
49 // Objects includes
51 #include <objects/seq/Bioseq.hpp>
55 #include <objects/seq/Seq_inst.hpp>
67 #include <objects/seq/Pubdesc.hpp>
68 #include <objects/pub/Pub.hpp>
74 #include <objects/biblio/Affil.hpp>
84 #include <util/line_reader.hpp>
86 #include <util/format_guess.hpp>
90 // Object Manager includes
92 #include <objmgr/scope.hpp>
93 #include <objmgr/seq_descr_ci.hpp>
94 #include <objmgr/bioseq_handle.hpp>
95 #include <objmgr/bioseq_ci.hpp>
96 #include <objmgr/seqdesc_ci.hpp>
99 #ifdef HAVE_NCBI_VDB
101 #endif
109 #include <common/test_assert.h> /* This header must go last */
112 using namespace ncbi;
113 using namespace objects;
114 using namespace xml;
116 const char * BIOSAMPLE_CHK_APP_VER = "1.0";
118 /////////////////////////////////////////////////////////////////////////////
119 //
120 // Demo application
121 //
125 {
126 public:
128  m_ReportStream(0),
129  m_UseDevServer(false)
130  {}
132  virtual ~CBiosampleHandler() {}
134  virtual void ProcessBioseq(CBioseq_Handle bh) {}
135  virtual bool NeedsReportStream() { return false; }
136  virtual void AddSummary() {}
138  void SetReportStream(CNcbiOstream* stream) { m_ReportStream = stream; }
140 protected:
143 };
147 {
148 public:
151  virtual void ProcessBioseq(CBioseq_Handle bh);
152  virtual bool NeedsReportStream() { return true; }
153  virtual void AddSummary();
155 protected:
157 };
161 {
162  vector<string> ids = biosample_util::GetBiosampleIDs(bsh);
163  if (ids.empty()) {
164  return;
165  }
167  for (const auto &it : ids) {
168  if (m_Status.find(it) == m_Status.end()) {
170  m_Status.insert(new_pair);
171  }
172  }
173 }
// Writes the collected BioSample ID statuses to the report stream and then
// empties the collection.
// NOTE(review): presumably the body of the status handler's AddSummary();
// the signature line is not part of this listing - confirm.
176 {
177  if (m_Status.empty()) {
178  *m_ReportStream << "No BioSample IDs found" << endl;
179  } else {
     // Presumably fills in the status value for every ID key already stored
     // in m_Status (dev or production server per m_UseDevServer) - confirm
     // against biosample_util.
180  biosample_util::GetBiosampleStatus(m_Status, m_UseDevServer);
     // One tab-separated "ID<TAB>status name" line per entry.
181  biosample_util::TStatuses::iterator it = m_Status.begin();
182  while (it != m_Status.end()) {
183  *m_ReportStream << it->first << "\t" << biosample_util::GetBiosampleStatusName(it->second) << endl;
184  ++it;
185  }
186  }
     // Start from an empty set for whatever is processed next.
187  m_Status.clear();
188 }
192 {
193 public:
194  CBiosampleChkApp(void);
196  virtual void Init(void);
197  virtual int Run (void);
199  void ReadClassMember(CObjectIStream& in,
200  const CObjectInfo::CMemberIterator& member);
202 private:
204  void Setup(const CArgs& args);
206  unique_ptr<CObjectIStream> OpenFile(const CArgs& args);
207  unique_ptr<CObjectIStream> OpenFile(const string &fname);
208  void SaveFile(const string &fname, bool useBinaryOutputFormat);
211  void PushToRecord(CBioseq_Handle bh);
213  void ProcessBioseqForUpdate(CBioseq_Handle bh);
214  void ProcessBioseqHandle(CBioseq_Handle bh);
215  void ProcessSeqEntry(CRef<CSeq_entry> se);
216  void ProcessSeqEntry(void);
217  void ProcessSet(void);
218  void ProcessSeqSubmit(void);
219  void ProcessAsnInput (void);
220  void ProcessList (const string& fname);
221  void ProcessFileList (const string& fname);
222  int ProcessOneDirectory(const string& dir_name, const string& file_suffix, const string& file_mask, bool recurse);
223  void ProcessOneFile(string fname);
224  void ProcessReleaseFile(const CArgs& args);
225  CRef<CSeq_entry> ReadSeqEntry(void);
226  CRef<CBioseq_set> ReadBioseqSet(void);
228  void CreateBiosampleUpdateWebService(biosample_util::TBiosampleFieldDiffList& diffs, bool del_okay);
230  void PrintDiffs(biosample_util::TBiosampleFieldDiffList& diffs);
231  void PrintTable(CRef<CSeq_table> table);
233  CRef<CScope> BuildScope(void);
235  // for mode 3, biosample_push
236  void UpdateBioSource (CBioseq_Handle bh, const CBioSource& src);
237  vector<CRef<CSeqdesc> > GetBiosampleDescriptors(string fname);
238  vector<CRef<CSeqdesc> > GetBiosampleDescriptorsFromSeqSubmit();
239  vector<CRef<CSeqdesc> > GetBiosampleDescriptorsFromSeqEntry();
240  vector<CRef<CSeqdesc> > GetBiosampleDescriptorsFromSeqEntry(const CSeq_entry& se);
243  unique_ptr<CObjectIStream> m_In;
246  size_t m_Level;
253  enum E_Mode {
254  e_report_diffs = 1, // Default - report diffs between biosources on records with biosample accessions
255  // and biosample data
258  e_take_from_biosample, // update with qualifiers from BioSample, stop if conflict
259  e_take_from_biosample_force, // update with qualifiers from BioSample, no stop on conflict
260  e_report_status, // make table with list of BioSample IDs and statuses
261  e_update_with, // use web API for update (with delete)
262  e_update_no // use web API for update (no delete)
263  };
265  enum E_ListType {
266  e_none = 0,
268  e_files
269  };
271  int m_Mode;
278  string m_IDPrefix;
279  string m_HUPDate;
282  string m_Owner;
283  string m_Comment;
287  size_t m_Processed;
292  vector<CRef<CSeqdesc> > m_Descriptors;
297 };
301  m_ObjMgr(0), m_Continue(false),
302  m_Level(0), m_ReportStream(0), m_NeedReportHeader(true), m_AsnOut(0),
303  m_LogStream(0), m_Mode(e_report_diffs), m_ReturnCode(0),
304  m_StructuredCommentPrefix(""), m_CompareStructuredComments(true),
305  m_FirstSeqOnly(false), m_IDPrefix(""), m_HUPDate(""),
306  m_BioSampleAccession(""), m_BioProjectAccession(""),
307  m_Owner(""), m_Comment(""),
308  m_Processed(0), m_Unprocessed(0), m_Handler(NULL)
309 {
310 }
314 {
315  // Prepare command line descriptions
317  // Create
318  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
320  arg_desc->AddOptionalKey
321  ("p", "Directory", "Path to ASN.1 Files",
323  arg_desc->AddOptionalKey
324  ("i", "InFile", "Single Input File",
326  arg_desc->AddOptionalKey(
327  "o", "OutFile", "Single Output File",
329  arg_desc->AddOptionalKey(
330  "f", "Filter", "Substring Filter",
332  arg_desc->AddDefaultKey
333  ("x", "String", "File Selection Substring", CArgDescriptions::eString, ".sqn");
334  arg_desc->AddFlag("u", "Recurse");
335  arg_desc->AddFlag("d", "Use development Biosample server");
337  arg_desc->AddDefaultKey("a", "a",
338  "ASN.1 Type (a Automatic, z Any, e Seq-entry, b Bioseq, s Bioseq-set, m Seq-submit, t Batch Bioseq-set, u Batch Seq-submit) or accession list (l)",
340  "a");
342  arg_desc->AddFlag("b", "Output binary ASN.1");
343  //arg_desc->AddFlag("c", "Batch File is Compressed");
344  arg_desc->AddFlag("M", "Process only first sequence in file (master)");
345  arg_desc->AddOptionalKey("R", "BioSampleIDPrefix", "BioSample ID Prefix", CArgDescriptions::eString);
346  arg_desc->AddOptionalKey("HUP", "HUPDate", "Hold Until Publish Date", CArgDescriptions::eString);
348  arg_desc->AddOptionalKey(
349  "L", "OutFile", "Log File",
352  arg_desc->AddDefaultKey(
353  "m", "mode", "Mode:\n"
354  "\t1 create update file\n"
355  "\t2 generate file for creating new biosample entries\n"
356  "\t3 push source info from one file (-i) to others (-p)\n"
357  "\t4 update with source qualifiers from BioSample unless conflict\n"
358  "\t5 update with source qualifiers from BioSample (continue with conflict))\n"
359  "\t6 report transaction status\n"
360  "\t7 use web API for update (with delete)\n"
361  "\t8 use web API for update (no delete)\n",
364  arg_desc->SetConstraint("m", constraint);
366  arg_desc->AddOptionalKey(
367  "P", "Prefix", "StructuredCommentPrefix", CArgDescriptions::eString);
369  arg_desc->AddOptionalKey(
370  "biosample", "BioSampleAccession", "BioSample Accession to use for sequences in record. Report error if sequences contain a reference to a different BioSample accession.", CArgDescriptions::eString);
371  arg_desc->AddOptionalKey(
372  "bioproject", "BioProjectAccession", "BioProject Accession to use for sequences in record. Report error if sequences contain a reference to a different BioProject accession.", CArgDescriptions::eString);
373  arg_desc->AddOptionalKey("comment", "BioSampleComment", "Comment to use for creating new BioSample xml", CArgDescriptions::eString);
375  arg_desc->AddOptionalKey("apikey_file", "BioSampleWebAPIKey", "File containing Web API Key needed to update BioSample database", CArgDescriptions::eString);
377  // Program description
378  string prog_description = "BioSample Checker\n";
379  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
380  prog_description, false);
382  // Pass argument descriptions to the application
383  SetupArgDescriptions(arg_desc.release());
385 }
// Reads the ASN.1 type header from m_In and dispatches on it.
// NOTE(review): presumably the body of ProcessAsnInput(); the signature line
// is not part of this listing - confirm.
389 {
390  // Process file based on its content
391  // Unless otherwise specified we assume the file in hand is
392  // a Seq-entry ASN.1 file, other options are a Seq-submit or NCBI
393  // Release file (batch processing) where we process one Seq-entry
394  // at a time.
395  string header = m_In->ReadFileHeader();
     // Set when the header names a type none of the branches below handle;
     // the throw for it happens after the try block so that it is not
     // intercepted by the catch clause.
397  bool unhandled = false;
398  try {
399  if (header == "Seq-submit" ) { // Seq-submit
401  } else if ( header == "Seq-entry" ) { // Seq-entry
402  ProcessSeqEntry();
403  } else if (header == "Bioseq-set" ) { // Bioseq-set
404  ProcessSet();
405  } else {
406  unhandled = true;
407  }
408  } catch (CException& e) {
     // A message starting with "duplicate Bioseq id" is logged and ends the
     // whole program with exit status 4; everything else is passed on.
409  if (NStr::StartsWith(e.GetMsg(), "duplicate Bioseq id")) {
410  *m_LogStream << e.GetMsg();
411  exit(4);
412  } else {
     // NOTE(review): `throw e;` throws a copy whose type is the static type
     // CException; a bare `throw;` would keep the original exception object.
413  throw e;
414  }
415  }
416  if (unhandled) {
417  NCBI_THROW(CException, eUnknown, "Unhandled type " + header);
418  }
420 }
// Processes a text file that holds one sequence identifier per line.
//
// fname - path of the list file, read line by line through ILineReader.
//
// Each non-blank line is parsed as a CSeq_id and looked up in a local scope.
// A sequence that is found goes to ProcessBioseqHandle(); one that cannot be
// fetched, or a line that raises a CException, is written to m_LogStream and
// counted in m_Unprocessed.
423 void CBiosampleChkApp::ProcessList (const string& fname)
424 {
425  // Process file with list of accessions
429 #ifdef HAVE_NCBI_VDB
431 #endif
432  CScope scope(*objmgr);
433  scope.AddDefaults();
435  CRef<ILineReader> lr = ILineReader::New (fname);
436  while ( !lr->AtEOF() ) {
     // `++*lr` advances the reader to the next line, the outer `*` yields it.
437  CTempString line = *++*lr;
438  if (!NStr::IsBlank(line)) {
439  try {
440  CRef<CSeq_id> id(new CSeq_id(line));
441  if (id) {
442  CBioseq_Handle bsh = scope.GetBioseqHandle(*id);
443  if (bsh) {
444  ProcessBioseqHandle(bsh);
445  } else {
     // Log both the raw text and the label the ID was parsed into, so that a
     // misinterpreted line can be recognised in the log.
446  *m_LogStream << "Unable to fetch Bioseq for " << line << endl;
447  string label = "";
448  id->GetLabel(&label);
449  *m_LogStream << " (interpreted as " << label << ")" << endl;
450  m_Unprocessed++;
451  }
452  }
453  } catch (CException& e) {
     // A failing line does not stop the run; it is logged and counted.
454  *m_LogStream << e.GetMsg() << endl;
455  m_Unprocessed++;
456  }
457  }
458  }
460 }
// Processes a text file that holds one input file path per line.
//
// fname - path of the list file, read line by line through ILineReader.
//
// Every non-blank line is handed to ProcessOneFile().
463 void CBiosampleChkApp::ProcessFileList (const string& fname)
464 {
465  // Process file with list of files
469  CScope scope(*objmgr);
470  scope.AddDefaults();
     // ProcessOneFile() switches on m_ListType; resetting it to e_none makes
     // each listed path go through the plain ASN.1 file branch instead of
     // being treated as another list.
472  m_ListType = e_none;
473  CRef<ILineReader> lr = ILineReader::New (fname);
474  while ( !lr->AtEOF() ) {
     // `++*lr` advances the reader to the next line, the outer `*` yields it.
475  CTempString line = *++*lr;
476  if (!NStr::IsBlank(line)) {
477  ProcessOneFile(line);
478  }
479  }
481 }
485 {
486  const CArgs& args = GetArgs();
488  bool need_to_close_report = false;
489  bool need_to_close_asn = false;
491  if (!m_ReportStream &&
494  string path = fname;
495  size_t pos = NStr::Find(path, ".", NStr::eCase, NStr::eReverseSearch);
496  if (pos != string::npos) {
497  path = path.substr(0, pos);
498  }
499  path = path + ".val";
500  m_Table.Reset(new CSeq_table());
501  m_Table->SetNum_rows(0);
502  m_ReportStream = new CNcbiOfstream(path.c_str());
503  if (!m_ReportStream)
504  {
505  NCBI_THROW(CException, eUnknown, "Unable to open " + path);
506  }
507  need_to_close_report = true;
508  m_NeedReportHeader = true;
511  }
512  }
514  string path = fname;
515  size_t pos = NStr::Find(path, ".", NStr::eCase, NStr::eReverseSearch);
516  if (pos != string::npos) {
517  path = path.substr(0, pos);
518  }
519  path = path + ".out";
520  SaveFile(path, args["b"]);
521  need_to_close_asn = true;
522  }
524  m_Diffs.clear();
525  switch (m_ListType) {
526  case e_accessions:
527  ProcessList (fname);
528  break;
529  case e_files:
530  ProcessFileList (fname);
531  break;
532  case e_none:
533  m_In = OpenFile(fname);
534  if (m_In.get() == nullptr) {
535  NCBI_THROW(CException, eUnknown, "Unable to open " + fname);
536  }
537  if (!m_In->InGoodState()) {
538  NCBI_THROW(CException, eUnknown, "Unable to open " + fname);
539  }
540  ProcessAsnInput();
541  break;
542  }
544  if (m_Mode == e_report_diffs) {
546  }
547  if (m_Mode == e_update_with) {
549  } else if (m_Mode == e_update_no) {
551  }
552  if (m_Handler != NULL) {
554  }
556  // TODO! Must free diffs
557  m_Diffs.clear();
559  if (need_to_close_report) {
560  if (m_Mode == e_take_from_biosample) {
562  m_Table->Reset();
563  m_Table = new CSeq_table();
564  m_Table->SetNum_rows(0);
565  }
566  m_ReportStream->flush();
567  m_ReportStream = 0;
568  }
569  if (need_to_close_asn) {
570  m_AsnOut->flush();
571  m_AsnOut->close();
572  m_AsnOut = 0;
573  }
574 }
578 {
579  // Get seq-entry to process
583 }
587 {
588  vector<CRef<CSeqdesc> > descriptors;
590  CRef<CScope> scope = BuildScope();
591  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(se);
593  if (bi) {
594  CSeqdesc_CI src_desc_ci(*bi, CSeqdesc::e_Source);
595  if (src_desc_ci) {
596  CRef<CSeqdesc> src_desc(new CSeqdesc());
597  src_desc->Assign(*src_desc_ci);
598  descriptors.push_back(src_desc);
599  }
600  }
602  return descriptors;
603 }
// Collects descriptors from a Seq-submit.
// NOTE(review): presumably the body of GetBiosampleDescriptorsFromSeqSubmit();
// the signature line and the statement that produces `ss` are not part of
// this listing - confirm.
607 {
608  vector<CRef<CSeqdesc> > descriptors;
611  // Get seq-submit to process
614  // Build a scope for the Seq-submit
615  CRef<CScope> scope = BuildScope();
     // Only the first entry of the submission is consulted; a Seq-submit
     // whose data is not an entry list, or whose list is empty, yields an
     // empty result.
616  if (ss->GetData().IsEntrys() && ! ss->GetData().GetEntrys().empty()) {
617  descriptors = GetBiosampleDescriptorsFromSeqEntry(**(ss->GetData().GetEntrys().begin()));
618  }
619  return descriptors;
620 }
623 vector<CRef<CSeqdesc> > CBiosampleChkApp::GetBiosampleDescriptors(string fname)
624 {
625  m_In = OpenFile(fname);
627  // Process file based on its content
628  // Unless otherwise specifien we assume the file in hand is
629  // a Seq-entry ASN.1 file, other option are a Seq-submit or NCBI
630  // Release file (batch processing) where we process each Seq-entry
631  // at a time.
633  string header = m_In->ReadFileHeader();
635  vector<CRef<CSeqdesc> > descriptors;
636  if (header == "Seq-submit" ) { // Seq-submit
637  descriptors = GetBiosampleDescriptorsFromSeqSubmit();
638  } else if ( header == "Seq-entry" ) { // Seq-entry
639  descriptors = GetBiosampleDescriptorsFromSeqEntry();
641  } else {
642  NCBI_THROW(CException, eUnknown, "Unhandled type " + header);
643  }
644  return descriptors;
645 }
648 int CBiosampleChkApp::ProcessOneDirectory(const string& dir_name, const string& file_suffix, const string& file_mask, bool recurse)
649 {
650  int num_of_files = 0;
652  CDir dir(dir_name);
653  CDir::TEntries files (dir.GetEntries(file_mask, CDir::eFile));
654  for (const auto &ii : files) {
655  string fname = ii->GetName();
656  if (ii->IsFile() &&
657  (!file_suffix.empty() || NStr::Find (fname, file_suffix) != string::npos)) {
658  ++num_of_files;
659  string fname = CDirEntry::MakePath(dir_name, ii->GetName());
660  ProcessOneFile (fname);
661  }
662  }
663  if (recurse) {
664  CDir::TEntries subdirs (dir.GetEntries("", CDir::eDir));
665  for (const auto &ii : subdirs) {
666  string subdir = ii->GetName();
667  if (ii->IsDir() && !NStr::Equal(subdir, ".") && !NStr::Equal(subdir, "..")) {
668  string subname = CDirEntry::MakePath(dir_name, ii->GetName());
669  num_of_files += ProcessOneDirectory (subname, file_suffix, file_mask, recurse);
670  }
671  }
672  }
673  if (!num_of_files)
674  {
675  NCBI_THROW(CException, eUnknown, "No input '" + file_mask + "' files found in directory '" + dir_name + "'");
676  }
677  return num_of_files;
678 }
682 {
683  const CArgs& args = GetArgs();
684  Setup(args);
686  m_Mode = args["m"].AsInteger();
687  m_FirstSeqOnly = args["M"].AsBoolean();
688  m_IDPrefix = args["R"] ? args["R"].AsString() : "";
689  m_HUPDate = args["HUP"] ? args["HUP"].AsString() : "";
690  m_BioSampleAccession = args["biosample"] ? args["biosample"].AsString() : "";
691  m_BioProjectAccession = args["bioproject"] ? args["bioproject"].AsString() : "";
692  m_Comment = args["comment"] ? args["comment"].AsString() : "";
694  string apikey_file = args["apikey_file"] ? args["apikey_file"].AsString() : "";
695  if (!apikey_file.empty()) {
696  ifstream is(apikey_file.c_str());
697  is >> m_BioSampleWebAPIKey;
698  }
700  if (m_Mode == e_report_status) {
702  }
704  if (args["o"]) {
706  //|| m_Mode == e_take_from_biosample
707  || (m_Handler != NULL && m_Handler->NeedsReportStream())) {
708  m_ReportStream = &(args["o"].AsOutputFile());
709  if (!m_ReportStream)
710  {
711  NCBI_THROW(CException, eUnknown, "Unable to open " + args["o"].AsString());
712  }
713  if (m_Handler) {
715  }
716  if (m_Mode == e_take_from_biosample) {
717  m_Table.Reset(new CSeq_table());
718  m_Table->SetNum_rows(0);
719  }
720  } else {
721  SaveFile(args["o"].AsString(), args["b"]);
722  }
723  } else if (m_Mode == e_update_with || m_Mode == e_update_no) {
725  if (!m_ReportStream)
726  {
727  NCBI_THROW(CException, eUnknown, "Unable to open " + args["o"].AsString());
728  }
729  if (m_Handler) {
731  }
732  if (m_Mode == e_take_from_biosample) {
733  m_Table.Reset(new CSeq_table());
734  m_Table->SetNum_rows(0);
735  }
736  }
738  m_LogStream = args["L"] ? &(args["L"].AsOutputFile()) : &NcbiCout;
739  m_StructuredCommentPrefix = args["P"] ? args["P"].AsString() : "";
742  }
744  m_UseDevServer = args["d"].AsBoolean();
747  // error
748  *m_LogStream << "Structured comment prefix is only appropriate for generating a biosample table." << endl;
749  return 1;
750  }
752  if (m_Mode == e_report_diffs) {
754  }
756  // Process file based on its content
757  // Unless otherwise specified we assume the file in hand is
758  // a Seq-entry ASN.1 file, other option are a Seq-submit or NCBI
759  // Release file (batch processing) where we process each Seq-entry
760  // at a time.
761  if (NStr::Equal(args["a"].AsString(), "l")) {
763  } else if (NStr::Equal(args["a"].AsString(), "f")) {
765  } else {
766  m_ListType = e_none;
767  }
769  string dir_name = (args["p"]) ? args["p"].AsString() : "";
770  string file_suffix = (args["f"]) ? args["f"].AsString() : "";
771  string file_mask = (args["x"]) ? args["x"].AsString() : ".sqn";
772  file_mask = "*" + file_mask;
773  bool dir_recurse = args["u"];
776  if (m_ReportStream) {
778  } else {
780  }
781  } else if ( m_Mode == e_push) {
782  if (m_ListType != e_none) {
783  // error
784  *m_LogStream << "List type (-a l or -a f) is not appropriate for push mode." << endl;
785  return 1;
786  } else if (!args["p"] || !args["i"]) {
787  // error
788  *m_LogStream << "Both directory containing contigs (-p) and master file (-i) are required for push mode." << endl;
789  return 1;
790  } else {
791  m_Descriptors = GetBiosampleDescriptors(args["i"].AsString());
792  ProcessOneDirectory (dir_name, file_suffix, file_mask, dir_recurse);
793  }
794  } else if ( args["p"] ) {
795  ProcessOneDirectory (dir_name, file_suffix, file_mask, dir_recurse);
796  if (m_Mode == e_take_from_biosample) {
797  if (m_Table && m_Table->GetNum_rows() > 0) {
799  }
800  }
801  } else {
802  if (args["i"]) {
803  ProcessOneFile (args["i"].AsString());
804  if (m_Mode == e_take_from_biosample) {
805  if (m_Table && m_Table->GetNum_rows() > 0) {
807  }
808  }
809  }
810  }
812  if (m_Unprocessed > 0) {
813  if (m_Mode != e_report_diffs) {
814  *m_LogStream << m_Unprocessed << " results failed" << endl;
815  }
816  return 1;
817  } else {
818  return m_ReturnCode;
819  }
820 }
824 {
825  CRef<CScope> scope(new CScope (*m_ObjMgr));
826  scope->AddDefaults();
828  return scope;
829 }
834  const CObjectInfo::CMemberIterator& member)
// Read hook body: at the outermost nesting level the container member is
// read element by element and each element is processed immediately; at
// deeper levels the member is read the default way.
835 {
     // m_Level tracks how deeply the hook is nested.
     // NOTE(review): the matching decrement at the end is skipped when an
     // exception leaves this function (the rethrow below).
836  m_Level++;
838  if ( m_Level == 1 ) {
839  size_t n = 0;
840  // Read each element separately to a local TSeqEntry,
841  // process it somehow, and... not store it in the container.
842  for ( CIStreamContainerIterator i(in, member); i; ++i ) {
843  try {
844  // Get seq-entry to process
846  i >> *se;
     // The diff list is emptied before and after each entry, so differences
     // found for one entry do not carry over to the next.
850  m_Diffs.clear();
851  ProcessSeqEntry(se);
853  // TODO! Must free diffs
854  m_Diffs.clear();
856  if (m_ReportStream) {
857  *m_ReportStream << "Elapsed = " << sw.Elapsed() << endl;
858  }
859  n++;
     // NOTE(review): the exception is caught by value, which copies and
     // slices it; `const std::exception&` would avoid that. The bare `throw;`
     // below still rethrows the original object.
860  } catch (std::exception e) {
     // With m_Continue set the failing element is skipped silently.
861  if ( !m_Continue ) {
862  throw;
863  }
864  // should we issue some sort of warning?
865  }
866  }
867  } else {
868  in.ReadClassMember(member);
869  }
871  m_Level--;
872 }
876 (const CArgs& args)
877 {
878  CRef<CBioseq_set> seqset(new CBioseq_set);
880  // Register the Seq-entry hook
881  CObjectTypeInfo set_type = CType<CBioseq_set>();
882  set_type.FindMember("seq-set").SetLocalReadHook(*m_In, this);
884  // Read the CBioseq_set, it will call the hook object each time we
885  // encounter a Seq-entry
886  *m_In >> *seqset;
887 }
891 {
895  return se;
896 }
900 {
901  CRef<CBioseq_set> set(new CBioseq_set());
904  return set;
905 }
// Writes `table` to m_ReportStream as tab-separated text: one line of column
// titles followed by one line per row. A table with zero rows prints nothing.
// NOTE(review): presumably the body of PrintTable(CRef<CSeq_table> table);
// the signature line is not part of this listing - confirm.
909 {
910  if (table->GetNum_rows() == 0) {
911  // do nothing
912  return;
913  }
     // Header line: every column title is followed by a tab, including the
     // last one.
915  for (const auto &it : table->GetColumns()) {
916  *m_ReportStream << it->GetHeader().GetTitle() << "\t";
917  }
918  *m_ReportStream << endl;
919  for (size_t row = 0; row < (size_t)table->GetNum_rows(); row++) {
920  for (const auto &it : table->GetColumns()) {
     // A column holding fewer strings than the table has rows contributes an
     // empty cell, which keeps the remaining columns aligned.
921  if (row < it->GetData().GetString().size()) {
922  *m_ReportStream << it->GetData().GetString()[row] << "\t";
923  } else {
924  *m_ReportStream << "\t";
925  }
926  }
927  *m_ReportStream << endl;
928  }
929 }
// Writes the difference list `diffs` to m_ReportStream.
// NOTE(review): presumably the body of PrintDiffs(); the signature line is
// not part of this listing - confirm.
933 {
934  if (diffs.empty()) {
     // With nothing to list, say whether that is because nothing was
     // processed at all or because processing found no differences.
935  if (m_Processed == 0) {
936  *m_ReportStream << "No results processed" << endl;
937  } else {
938  *m_ReportStream << "No differences found" << endl;
939  }
940  } else {
     // The header is written once; m_NeedReportHeader stays false until some
     // other code sets it again.
941  if (m_NeedReportHeader) {
942  biosample_util::CBiosampleFieldDiff::PrintHeader(*m_ReportStream, false);
943  m_NeedReportHeader = false;
944  }
946  for (const auto &it : diffs) {
947  it->Print(*m_ReportStream, false);
948  }
949  }
     // The failure count is appended in both cases.
950  if (m_Unprocessed > 0) {
951  *m_ReportStream << m_Unprocessed << " results failed" << endl;
952  }
953 }
957 {
958  PrintDiffs(diffs);
959 }
963 {
964  if (diffs.empty()) {
965  return;
966  }
968  vector< CRef<biosample_util::CBiosampleFieldDiff> > add_item;
969  vector< CRef<biosample_util::CBiosampleFieldDiff> > change_item;
970  vector< CRef<biosample_util::CBiosampleFieldDiff> > delete_item;
971  vector< CRef<biosample_util::CBiosampleFieldDiff> > change_organism;
973  set<string> ids;
975  for (const auto &it : diffs) {
976  string id = it->GetBioSample();
977  string smp = it->GetSampleVal();
978  string src = it->GetSrcVal();
979  string fld = it->GetFieldName();
980  bool blank_smp = NStr::IsBlank(smp);
981  bool blank_src = NStr::IsBlank(src);
982  if (blank_smp && blank_src) {
983  continue;
984  }
985  if (smp == src) {
986  continue;
987  }
988  ids.insert(id);
989  if (fld == "Organism Name") {
990  change_organism.push_back(it);
991  } else if (blank_smp) {
992  add_item.push_back(it);
993  } else if (blank_src) {
994  if (del_okay) {
995  delete_item.push_back(it);
996  }
997  } else {
998  change_item.push_back(it);
999  }
1000  }
1002  CJson_Document req;
1003  CJson_Object top_obj = req.SetObject();
1004  CJson_Array biosample_array = top_obj.insert_array("update");
1006  CJson_Object options_obj = top_obj.insert_object("options");
1007  options_obj.insert("attribute_synonyms", "true");
1009  for (auto& id : ids) {
1010  CJson_Object obj1 = biosample_array.push_back_object();
1011  obj1.insert("samples", id);
1013  if (! add_item.empty()) {
1014  CJson_Object add_obj = obj1.insert_object("add");
1015  CJson_Array add_arr = add_obj.insert_array("attribute");
1016  for (auto& itm : add_item) {
1017  CJson_Object obj2 = add_arr.push_back_object();
1018  obj2.insert("name", itm->GetFieldName());
1019  obj2.insert("new_value", itm->GetSrcVal());
1020  }
1021  }
1023  if (! delete_item.empty()) {
1024  CJson_Object del_obj = obj1.insert_object("delete");
1025  CJson_Array del_arr = del_obj.insert_array("attribute");
1026  for (auto& itm : delete_item) {
1027  CJson_Object obj2 = del_arr.push_back_object();
1028  obj2.insert("name", itm->GetFieldName());
1029  obj2.insert("old_value", itm->GetSampleVal());
1030  }
1031  }
1033  if (! change_item.empty() || ! change_organism.empty()) {
1034  CJson_Object chg_obj = obj1.insert_object("change");
1035  if (! change_organism.empty()) {
1036  CJson_Object chg_org = chg_obj.insert_object("organism");
1037  for (auto& itm : change_organism) {
1038  chg_org.insert("new_value", itm->GetSrcVal());
1039  }
1040  }
1041  if (! change_item.empty()) {
1042  CJson_Array chg_arr = chg_obj.insert_array("attribute");
1043  for (auto& itm : change_item) {
1044  string fld = itm->GetFieldName();
1045  if (fld == "Tax ID") {
1046  continue;
1047  }
1048  CJson_Object obj2 = chg_arr.push_back_object();
1049  obj2.insert("name", fld);
1050  obj2.insert("old_value", itm->GetSampleVal());
1051  obj2.insert("new_value", itm->GetSrcVal());
1052  }
1053  }
1054  }
1055  }
1057  if ( ids.size() > 1 ) {
1058  *m_LogStream << "ERROR: More than one BioSample ID is not supported by -m 7." << endl;
1059  exit(6);
1060  }
1062  string sData = req.ToString();
1064  NcbiCout << sData << endl;
1066  // BioSample update
1067  string sUrl = "";
1068  if (m_UseDevServer) {
1069  sUrl = "";
1070  }
1071  string sContentType = "application/json; charset=utf-8";
1073  CUrl curl(sUrl);
1074  CHttpHeaders headers;
1075  headers.SetValue("NCBI-BioSample-Authorization", m_BioSampleWebAPIKey);
1076  CHttpResponse response = g_HttpPost(curl, headers, sData, sContentType);
1078  if (response.GetStatusCode() != 200) {
1079  NcbiStreamCopy(cout, response.ErrorStream());
1080  cout << endl;
1081  } else {
1082  NcbiStreamCopy(cout, response.ContentStream());
1083  cout << endl;
1084  }
1085 }
1089 {
1090  vector<string> unprocessed_ids;
1094  m_Processed,
1095  unprocessed_ids,
1099  &m_cache);
1100  if (! new_diffs.empty()) {
1101  m_Diffs.insert(m_Diffs.end(), new_diffs.begin(), new_diffs.end());
1102  for (const auto &id : unprocessed_ids) {
1103  *m_LogStream << "Failed to retrieve BioSample data for " << id << endl;
1104  }
1105  m_Unprocessed += unprocessed_ids.size();
1106  }
1107 }
1111 {
1112  for (const auto &it : m_Descriptors) {
1113  if (it->IsSource()) {
1114  UpdateBioSource(bh, it->GetSource());
1115  }
1116  }
1117 }
1121 {
1122  vector<string> biosample_ids = biosample_util::GetBiosampleIDs(bh);
1125  // error
1127  *m_LogStream << label << " has conflicting BioSample Accession " << biosample_ids[0] << endl;
1128  return;
1129  }
1131  if (biosample_ids.empty()) {
1132  // for report mode, do not report if no biosample ID
1133  return;
1134  }
1136  for (const auto &id : biosample_ids) {
1138  if (descr) {
1139  m_Descriptors.clear();
1140  copy(descr->Set().begin(), descr->Set().end(),
1141  back_inserter(m_Descriptors));
1142  PushToRecord(bh);
1143  m_Descriptors.clear();
1144  }
1145  }
1147 }
1151 {
1152  switch (m_Mode) {
1153  case e_report_diffs:
1154  GetBioseqDiffs(bh);
1155  break;
1156  case e_generate_biosample:
1157  try {
1159  bh,
1160  m_IDPrefix,
1163  m_Owner,
1164  m_HUPDate,
1165  m_Comment,
1169  } catch (CException& e) {
1170  *m_LogStream << e.GetMsg() << endl;
1171  }
1172  break;
1173  case e_push:
1174  PushToRecord(bh);
1175  break;
1176  case e_take_from_biosample:
1177  m_Diffs.clear();
1178  GetBioseqDiffs(bh);
1180  m_ReturnCode = 1;
1181  string sequence_id = biosample_util::GetBestBioseqLabel(bh);
1182  *m_LogStream << "Conflicts found for " << sequence_id << endl;
1183  try {
1185  bh, *m_Table,
1186  true,
1189  } catch (CException& e) {
1190  *m_LogStream << e.GetMsg() << endl;
1191  }
1192  } else {
1194  }
1195  break;
1198  break;
1199  case e_update_with:
1200  case e_update_no:
1201  GetBioseqDiffs(bh);
1202  break;
1203  default:
1204  if (m_Handler != NULL) {
1205  m_Handler->ProcessBioseq(bh);
1206  }
1207  break;
1208  }
1210 }
// Runs ProcessBioseqHandle() over the sequences of one Seq-entry `se`.
// NOTE(review): presumably the body of ProcessSeqEntry(CRef<CSeq_entry> se);
// the signature line is not part of this listing - confirm.
1214 {
      // The entry is added to a freshly built scope for the duration of the
      // call and removed again at the end.
1215  CRef<CScope> scope = BuildScope();
1216  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*se);
      // Iteration is restricted by CSeq_inst::eMol_na (presumably nucleotide
      // sequences only - confirm against CBioseq_CI).
1217  CBioseq_CI bi(seh, CSeq_inst::eMol_na);
1218  while (bi) {
1219  ProcessBioseqHandle(*bi);
      // With m_FirstSeqOnly set, only the first sequence is processed.
1220  if (m_FirstSeqOnly) {
1221  break;
1222  }
1223  ++bi;
1224  }
1225  scope->RemoveTopLevelSeqEntry(seh);
1226 }
1230 {
1231  // Get seq-entry to process
1234  ProcessSeqEntry(se);
1236  // write out copy after processing, if requested
1237  if (m_AsnOut) {
1238  *m_AsnOut << *se;
1239  }
1240 }
1244 {
1245  // Get Bioseq-set to process
1247  if (set && set->IsSetSeq_set()) {
1248  for (const auto &se : set->GetSeq_set()) {
1249  ProcessSeqEntry(se);
1250  }
1251  }
1253  // write out copy after processing, if requested
1254  if (m_AsnOut) {
1255  *m_AsnOut << *set;
1256  }
1257 }
1261 {
1264  // Get seq-submit to process
1267  m_Owner = "";
1268  // get owner from Seq-submit to use if no pub is found
1269  if (ss->IsSetSub()) {
1270  if (ss->GetSub().IsSetCit()
1271  && ss->GetSub().GetCit().IsSetAuthors()
1272  && ss->GetSub().GetCit().GetAuthors().IsSetAffil()) {
1274  } else if (ss->GetSub().IsSetContact() && ss->GetSub().GetContact().IsSetContact()
1275  && ss->GetSub().GetContact().GetContact().IsSetAffil()) {
1277  }
1278  }
1280  // Process Seq-submit
1281  CRef<CScope> scope = BuildScope();
1282  if (ss->GetData().IsEntrys()) {
1283  for (const auto &se : ss->GetData().GetEntrys()) {
1284  ProcessSeqEntry(se);
1285  }
1286  }
1287  // write out copy after processing, if requested
1288  if (m_AsnOut) {
1289  *m_AsnOut << *ss;
1290  }
1291 }
1293 static bool s_IsEmptyBioSource(const CSeqdesc& src)
1294 {
1295  return !src.GetSource().IsSetSubtype() && !src.GetSource().IsSetGenome() && !src.GetSource().IsSetOrigin() &&
1296  (!src.GetSource().IsSetOrg() || (!src.GetSource().IsSetOrgname() && !src.GetSource().IsSetTaxname() && !src.GetSource().IsSetDivision()));
1297 }
// Applies the BioSource `src` to the sequence `bh`: empty Source descriptors
// are removed, and then either a new Source descriptor is added (when none
// with content exists) or the first one with content is updated in place.
// NOTE(review): presumably the body of UpdateBioSource(CBioseq_Handle bh,
// const CBioSource& src); the signature line and the statement that declares
// `parent` are not part of this listing - confirm.
1300 {
1301  CSeqdesc_CI src_desc_ci(bh, CSeqdesc::e_Source);
1303  CBioseq_EditHandle beh = bh.GetEditHandle();
1304  // Removes empty BioSources
      // Stops at the first Source descriptor that has content.
1305  for (; src_desc_ci;) {
1307  if (s_IsEmptyBioSource(*src_desc_ci)) {
      // The iterator is advanced before the descriptor it pointed at is
      // removed.
1308  const CSeqdesc& cur_descr = *src_desc_ci;
1309  ++src_desc_ci;
1310  beh.RemoveSeqdesc(cur_descr);
1311  }
1312  else {
1313  break;
1314  }
1315  }
1317  if (!src_desc_ci) {
      // No Source descriptor with content: create one from `src`.
1318  CRef<CSeqdesc> new_desc(new CSeqdesc());
1319  new_desc->SetSource().Assign(src);
      // When the parent set is a nuc-prot set the descriptor is attached to
      // the set, otherwise to the sequence itself.
1322  if (parent && parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
1323  CBioseq_set_EditHandle bseh = parent.GetEditHandle();
1324  bseh.AddSeqdesc(*new_desc);
1325  } else {
1326  beh.AddSeqdesc(*new_desc);
1327  }
1328  } else {
      // The iterator only exposes the BioSource as const; the cast removes
      // that so the existing object can be modified in place.
      // NOTE(review): this bypasses the edit handle - confirm that modifying
      // the descriptor this way is intended.
1330  const CBioSource& bs = src_desc_ci->GetSource();
1331  CBioSource* old_src = const_cast<CBioSource *> (&bs);
1332  old_src->UpdateWithBioSample(src, true, true);
1334  // Removes the rest of empty BioSources
      // Unlike the first loop this one walks to the end, skipping Source
      // descriptors that have content.
1335  for (++src_desc_ci; src_desc_ci;) {
1337  if (s_IsEmptyBioSource(*src_desc_ci)) {
1338  const CSeqdesc& cur_descr = *src_desc_ci;
1339  ++src_desc_ci;
1340  beh.RemoveSeqdesc(cur_descr);
1341  }
1342  else {
1343  ++src_desc_ci;
1344  }
1345  }
1346  }
1347 }
1351 {
1352  // Setup application registry and logs for CONNECT library
1354  CORE_SetREG(REG_cxx2c(&GetConfig(), false));
1355  // Setup MT-safety for CONNECT library
1356  // CORE_SetLOCK(MT_LOCK_cxx2c());
1358  // Create object manager
1360 }
1363 unique_ptr<CObjectIStream> CBiosampleChkApp::OpenFile(const CArgs& args)
1364 {
1365  string fname = args["i"].AsString();
1366  return CBiosampleChkApp::OpenFile(fname);
1367 }
// Opens `fname` as an ASN.1 object stream, decompressing it first when the
// content is recognised as gzip, bzip2 or LZO.
//
// fname - path of the file to open (opened in binary mode).
//
// Returns the object stream, or an empty pointer when the second switch
// below falls through to its default branch.
1369 unique_ptr<CObjectIStream> CBiosampleChkApp::OpenFile(const string &fname)
1370 {
      // hold_stream owns the raw stream until ownership is handed to a
      // wrapper below; InputStream is a non-owning alias of the current
      // outermost stream.
1373  unique_ptr<CNcbiIstream> hold_stream(new CNcbiIfstream (fname.c_str(), ios::binary));
1374  CNcbiIstream* InputStream = hold_stream.get();
1376  CFormatGuess::EFormat formatGuess = CFormatGuess::Format(*InputStream);
      // Map a recognised compression format to the decompression method.
1378  CCompressStream::EMethod method;
1379  switch (formatGuess)
1380  {
1381  case CFormatGuess::eGZip: method = CCompressStream::eGZipFile; break;
1382  case CFormatGuess::eBZip2: method = CCompressStream::eBZip2; break;
1383  case CFormatGuess::eLzo: method = CCompressStream::eLZO; break;
1384  default: method = CCompressStream::eNone; break;
1385  }
1386  if (method != CCompressStream::eNone)
1387  {
      // The decompressing stream is created with eTakeOwnership for the file
      // stream, so hold_stream gives the file stream up before it starts to
      // hold the wrapper instead.
1388  CDecompressIStream* decompress(new CDecompressIStream(*InputStream, method, CCompressStream::fDefault, eTakeOwnership));
1389  hold_stream.release();
1390  hold_stream.reset(decompress);
1391  InputStream = hold_stream.get();
      // Guess again, this time on the decompressed content.
1392  formatGuess = CFormatGuess::Format(*InputStream);
1393  }
1395  unique_ptr<CObjectIStream> objectStream;
1396  switch (formatGuess)
1397  {
      // The object stream is opened with eTakeOwnership for InputStream, so
      // hold_stream is released afterwards.
1402  objectStream.reset(CObjectIStream::Open(format, *InputStream, eTakeOwnership));
1403  hold_stream.release();
1404  break;
1405  default:
      // objectStream stays empty; hold_stream still owns the stream and
      // destroys it on return.
1406  break;
1407  }
1408  return objectStream;
1409 }
// Opens `fname` for output and stores the stream in m_AsnOut.
//
// fname                 - path of the output file.
// useBinaryOutputFormat - selects between the two (otherwise empty in this
//                         listing) branches at the end.
1411 void CBiosampleChkApp::SaveFile(const string &fname, bool useBinaryOutputFormat)
1412 {
1413  ios::openmode mode = ios::out;
1414  m_AsnOut = new CNcbiOfstream(fname.c_str(), mode);
      // NOTE(review): if m_AsnOut is a plain pointer, `new` does not return
      // null, so this test would not detect a file that failed to open;
      // checking the stream state would - confirm against the member's
      // declaration.
1415  if (!m_AsnOut)
1416  {
1417  NCBI_THROW(CException, eUnknown, "Unable to open " + fname);
1418  }
1419  if ( useBinaryOutputFormat ) {
1421  } else {
1423  }
1424 }
1427 /////////////////////////////////////////////////////////////////////////////
1428 // MAIN
1431 int main(int argc, const char* argv[])
1432 {
1433  return CBiosampleChkApp().AppMain(argc, argv, 0, eDS_Default, 0);
1434 }
Modified on Wed Apr 17 13:08:16 2024 by rev. 669887