35 #include <unordered_map>
39 #define NCBI_USE_ERRCODE_X Util_Diff
45 #define F_ISSET(mask) (((flags) & (mask)) == (mask))
48 #define DIFF_DELETE CDiffOperation::eDelete
49 #define DIFF_EQUAL CDiffOperation::eEqual
50 #define DIFF_INSERT CDiffOperation::eInsert
61 m_Length(
str.length())
81 switch (it->GetOperation()) {
83 len_insert += it->GetString().length();
86 len_delete += it->GetString().length();
90 edit_distance +=
max(len_insert, len_delete);
96 edit_distance +=
max(len_insert, len_delete);
107 TList::const_iterator max_pos = diffs.end();
118 if (!max_len || max_pos == diffs.end()) {
121 return max_pos->GetString();
129 if (it->GetString().length() != it->GetLength()) {
133 "Possible fRemoveEOL flag was used, it is impossible "
134 "to perform cleanup and merge in this mode");
162 switch (it->GetOperation()) {
165 off_second += it->GetLength();
169 off_first += it->GetLength();
173 off_first += it->GetLength();
174 off_second += it->GetLength();
185 size_t count_delete = 0;
186 size_t count_insert = 0;
188 CDiffList::TList::iterator this_diff =
m_List.begin();
189 CDiffList::TList::iterator ins_diff =
m_List.end();
190 CDiffList::TList::iterator del_diff =
m_List.end();
191 CDiffList::TList::iterator equal_diff =
m_List.end();
193 while (this_diff !=
m_List.end()) {
194 const CTempString& current = this_diff->GetString();
195 switch (this_diff->GetOperation()) {
202 this_diff =
m_List.erase(this_diff);
204 ins_diff = this_diff++;
214 this_diff =
m_List.erase(this_diff);
216 del_diff = this_diff++;
223 if (count_delete + count_insert >= 1) {
225 if (count_delete != 0 && count_insert != 0) {
230 if (common_prefix_len) {
232 if (equal_diff ==
m_List.end()) {
239 ins_str = ins_str.
substr(common_prefix_len);
240 del_str = del_str.
substr(common_prefix_len);
245 if (common_suffix_len) {
248 this_diff->SetString(
252 ins_str = ins_str.
substr(0, ins_str.
length() - common_suffix_len);
253 del_str = del_str.
substr(0, del_str.
length() - common_suffix_len);
256 if (common_prefix_len || common_suffix_len) {
257 ins_diff->SetString(ins_str);
258 del_diff->SetString(del_str);
262 equal_diff = this_diff++;
266 if (equal_diff ==
m_List.end()) {
268 equal_diff = this_diff++;
273 this_diff =
m_List.erase(this_diff);
294 CDiffList::TList::iterator next_diff =
m_List.begin();
295 CDiffList::TList::iterator prev_diff = next_diff++;
296 CDiffList::TList::iterator this_diff = next_diff++;
298 bool changes =
false;
300 while (next_diff !=
m_List.end()) {
302 if (prev_diff->GetOperation() ==
DIFF_EQUAL &&
323 prev_diff->SetString(
327 this_diff->SetString(
330 next_diff =
m_List.erase(next_diff);
331 if (next_diff ==
m_List.end()) {
332 next_diff = this_diff;
340 next_diff->SetString(
344 this_diff->SetString(
351 prev_diff = this_diff;
352 this_diff = next_diff;
369 unique_ptr<CDeadline> real_deadline;
386 if (common_prefix_len) {
387 common_prefix = s1.
substr(0, common_prefix_len);
388 s1 = s1.
substr(common_prefix_len);
389 s2 = s2.
substr(common_prefix_len);
395 if (common_suffix_len) {
396 common_suffix = s1.
substr(s1.
length() - common_suffix_len);
404 if (common_prefix_len) {
407 if (common_suffix_len) {
462 if (!hm1_res && !hm2_res) {
464 }
else if (!hm1_res) {
466 }
else if (!hm2_res) {
470 hm = hm1[4].length() > hm2[4].length() ? hm1 : hm2;
520 if (best_common.
length() < suffix_len + prefix_len) {
521 best_common = short_str.
substr(j - suffix_len, suffix_len + prefix_len);
522 best_longtext_a = long_str.
substr(0,
i - suffix_len);
523 best_longtext_b = long_str.
substr(
i + prefix_len);
524 best_shorttext_a = short_str.
substr(0, j - suffix_len);
525 best_shorttext_b = short_str.
substr(j + prefix_len);
529 hm[0] = best_longtext_a;
530 hm[1] = best_longtext_b;
531 hm[2] = best_shorttext_a;
532 hm[3] = best_shorttext_b;
556 const int max_d = (s1_len + s2_len + 1) / 2;
557 const int v_offset = max_d;
558 const int v_length = 2 * max_d;
559 int *v1 =
new int[v_length];
560 int *
v2 =
new int[v_length];
561 for (
int x = 0; x < v_length; x++) {
565 v1[v_offset + 1] = 0;
566 v2[v_offset + 1] = 0;
567 const int delta = s1_len - s2_len;
578 for (
int d = 0; d < max_d; d++)
584 for (
int k1 = -d + k1start; k1 <= d - k1end; k1 += 2) {
585 const int k1_offset = v_offset + k1;
587 if (k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1])) {
588 x1 = v1[k1_offset + 1];
590 x1 = v1[k1_offset - 1] + 1;
593 while (x1 < s1_len && y1 < s2_len && s1[x1] == s2[y1]) {
601 }
else if (y1 > s2_len) {
605 int k2_offset = v_offset +
delta - k1;
606 if (k2_offset >= 0 && k2_offset < v_length &&
v2[k2_offset] != -1) {
608 int x2 = s1_len -
v2[k2_offset];
621 for (
int k2 = -d + k2start; k2 <= d - k2end; k2 += 2) {
622 const int k2_offset = v_offset + k2;
624 if (k2 == -d || (k2 != d &&
v2[k2_offset - 1] <
v2[k2_offset + 1])) {
625 x2 =
v2[k2_offset + 1];
627 x2 =
v2[k2_offset - 1] + 1;
630 while (x2 < s1_len && y2 < s2_len &&
631 s1[s1_len - x2 - 1] == s2[s2_len - y2 - 1]) {
639 }
else if (y2 > s2_len) {
643 int k1_offset = v_offset +
delta - k2;
644 if (k1_offset >= 0 && k1_offset < v_length && v1[k1_offset] != -1) {
645 int x1 = v1[k1_offset];
646 int y1 = v_offset + x1 - k1_offset;
729 if (short_str.
length() == 1) {
743 x_Diff(hm[0], hm[2], diffs_a);
744 x_Diff(hm[1], hm[3], diffs_b);
775 unique_ptr<CDeadline> real_deadline;
783 vector<CTempString> lines;
786 s1_num_lines = lines.size();
788 s2_num_lines = lines.size() - s1_num_lines;
800 if (
i != s1_num_lines-1 &&
i != lines.size()-1) {
808 typedef vector<size_type> TSequence;
809 TSequence idx1(s1_num_lines);
810 TSequence idx2(s2_num_lines);
813 TStringToLineNumMap::const_iterator it;
821 if (
i < s1_num_lines-1) {
826 idx1[
i] = it->second;
830 TStringToLineNumMap::const_iterator it;
831 string s = lines[pos];
838 if (
i < s2_num_lines-1) {
843 idx2[
i] = it->second;
848 if (s1_num_lines > 10000 && s2_num_lines > 10000) {
852 vector< pair<size_type, dtl::elemInfo> > ses_seq = d.
getSes().getSequence();
853 vector< pair<size_type, dtl::elemInfo> >::iterator it;
872 for (it = ses_seq.begin(); it != ses_seq.end(); it++)
880 bool is_last_line =
false;
887 line2 = pos2 - s1_num_lines + 1;
888 is_last_line = (
line2 >= s2_num_lines);
895 is_last_line = (
line1 >= s1_num_lines);
903 line2 = pos2 - s1_num_lines + 1;
904 is_last_line = (
line2 >= s2_num_lines);
918 if (s[
len-1] ==
'\r') {
938 insertions.push_back(op);
942 deletions.push_back(op);
946 if (!deletions.empty()) {
950 if (!insertions.empty()) {
960 if (!deletions.empty()) {
963 if (!insertions.empty()) {
990 CDiffList::TList::const_iterator start_iter,
991 CDiffList::TList::const_iterator end_iter,
995 const char eol =
'\n';
996 unsigned long n1 = 0;
997 unsigned long n2 = 0;
1002 CDiffList::TList::const_iterator it = start_iter;
1003 while (it != end_iter)
1006 switch (it->GetOperation()) {
1022 out <<
"@@ -" << (
unsigned long)hunk_s1 <<
"," << n1 <<
" +"
1023 << (
unsigned long)hunk_s2 <<
"," << n2 <<
" @@" << eol;
1027 while (it != end_iter)
1030 switch (it->GetOperation()) {
1041 out << op << it->GetString() << eol;
1050 unsigned int num_common_lines)
1056 if (diffs.empty()) {
1061 CDiffList::TList::const_iterator it = diffs.begin();
1062 CDiffList::TList::const_iterator hunk_start = diffs.end();
1063 CDiffList::TList::const_iterator hunk_end = diffs.end();
1065 bool is_hunk =
false;
1066 unsigned int num_equal = 0;
1077 while (it != diffs.end()) {
1078 switch (it->GetOperation()) {
1089 if (!num_common_lines || !num_equal) {
1112 if (num_common_lines == 0) {
1118 }
else if (!num_equal) {
1121 }
else if (num_equal <= num_common_lines) {
1139 }
else if (num_equal < num_common_lines) {
1167 case eEmpty:
return "eEmpty";
@ eEmpty
no filtering at all.
CDiffList – the list of diff operations.
CDiffOperation – The storage class for one diff operation.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
diff class template sequence must support random_access_iterator.
void compose()
compose Longest Common Subsequence and Shortest Edit Script.
Ses< elem > getSes() const
unordered_map< string, CDiffList::size_type > TStringToLineNumMap
CNcbiOstream & s_PrintUnifiedHunk(CNcbiOstream &out, CDiffList::TList::const_iterator start_iter, CDiffList::TList::const_iterator end_iter, CDiffList::size_type hunk_s1, CDiffList::size_type hunk_s2)
API to diff strings/texts and to find the differences.
std::ofstream out("events_result.xml")
main entry point for tests
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
void CalculateOffsets(void)
Calculate offsets for all substrings in the difference list and find its position from the start of t...
CDeadline * m_Deadline
Deadline for processing (NULL if not set)
void x_DiffBisect(CTempString s1, CTempString s2, CDiffList &diffs) const
Find the 'middle snake' of a diff, split the problem in two and return the recursively constructed di...
void Prepend(CDiffOperation::EType operation, CTempString str)
Add element to the front of the list.
bool x_DiffHalfMatch(CTempString s1, CTempString s2, TDiffHalfMatchList &hm) const
Do the two texts share a substring which is at least half the length of the longer text?...
unsigned int TFlags
Bitwise OR of "Flags".
void CleanupAndMerge(void)
Reorder and merge like edit sections, merge equalities.
CDiffList & Diff(CTempString text1, CTempString text2, TFlags flags=0)
Find the differences between two texts (line mode).
virtual const char * GetErrCodeString(void) const override
Get error code interpreted as text.
unsigned int TFlags
Bitwise OR of "EFlags".
bool x_CleanupAndMerge_SingleEdits(void)
Look for single edits surrounded on both sides by equalities which can be shifted sideways to elimina...
CTempString GetLongestCommonSubstring(void) const
Find the longest common substring for current difference list.
void x_CleanupAndMerge_Equities(void)
Merge adjacent parts with the same operation.
void Reset(void)
Reset internal state and prepare to next Diff()
bool x_DiffHalfMatchI(CTempString long_str, CTempString short_str, size_type i, TDiffHalfMatchList &hm) const
Does a substring of short string exist within long string such that the substring is at least half th...
void x_DiffBisectSplit(CTempString s1, CTempString s2, int x, int y, CDiffList &diffs) const
Given the location of the 'middle snake', split the diff in two parts and recurse.
void Append(CDiffOperation::EType operation, CTempString str)
Add element to the end of the list.
vector< CTempString > TDiffHalfMatchList
Five element array for the list of strings, returned by x_DiffHalfMatch()
bool IsTimeoutExpired() const
Check if timeout is expired.
CDiffList m_Diffs
The list of differences from the last diff.
list< CDiffOperation > TList
Storage class type for the list of diff operations.
CDiffList & Diff(CTempString s1, CTempString s2, TFlags flags=0)
Find the differences between two texts (character mode).
void x_Diff(CTempString s1, CTempString s2, CDiffList &diffs) const
Find the differences between two texts.
CDiffOperation::size_type size_type
Size type definition.
size_type GetEditDistance(void) const
Compute the edit distance (Levenshtein distance).
TList m_List
List of the differences.
CDiffOperation::size_type size_type
Type definition.
EType
Type of the current diff operation.
CDiffOperation(EType operation, CTempString str)
Constructor.
CTimeout m_Timeout
Relative timeout for processing.
const TList & GetList(void) const
Get list of the diff operations as list<>.
void SetLength(size_type length)
CNcbiOstream & PrintUnifiedDiff(CNcbiOstream &out, CTempString text1, CTempString text2, unsigned int num_common_lines=3)
Find the differences between two texts and print result into output stream in unified-like format.
@ fCalculateOffsets
Automatically call CalculateOffests() for the generated list of differences.
@ fNoCleanup
By default Diff() automatically call CleanupAndMerge() for the generated list of differences to have ...
@ fCleanup
Automatically call CleanupAndMerge() for the generated list of differences after Diff() to have a sho...
@ fCalculateOffsets
Automatically call CalculateOffests() for the generated list of differences.
@ fIgnoreEOL
Ignore differences in end-of-line types on string comparison.
@ fRemoveEOL
Remove end-of-line symbols from each string added to the list, which can be obtained using CDiffOpera...
TErrCode GetErrCode(void) const
Get error code.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
static SIZE_TYPE CommonSuffixSize(const CTempString s1, const CTempString s2)
Determine the common suffix of two strings.
const char * data(void) const
Return a pointer to the array represented.
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
size_type length(void) const
Return the length of the represented array.
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
static SIZE_TYPE CommonPrefixSize(const CTempString s1, const CTempString s2)
Determine the common prefix of two strings.
virtual void Reset(void)
Reset the whole object.
unsigned int
A callback function used to compare two keys in a database.
constexpr auto front(list< Head, As... >, T=T()) noexcept -> Head
int edit_t
type of edit for SES
double value_type
The numeric datatype used by the parser.
The NCBI C++/STL use hints.
Int4 delta(size_t dimension_, const Int4 *score_)
static const char * str(char *buf, int n)
Structure to save offset/length in the compared strings.
static char line1[1024 *16]
static char line2[1024 *16]