## ## Name: edit.py ## Purpose: Compute minimum edits between strings. ## Author: M. J. Fromberger ## Info: $Id: edit.py 122 2006-05-14 21:12:02Z sting $ ## ## Notes: ## As currently written, min_edit_sequence() only works on strings ## whose length will fit in an unsigned short (usu. 16 bits). If you ## want to change this, edit the pack/unpack formats that read HBH3s ## to read IBI3s or LBL3s as appropriate. Doing so will increase the ## memory requirements of the dynamic program, however, and it is ## already pretty slow, due to Python overhead, for strings longer ## than a hundred characters or so. import struct as _struct def fold_edit_sequence(seq): """Compress an edit sequence returned by the min_edit_sequence() function to remove consecutive runs of insertions or deletions that can occur at the same location, and strips out unnecessary edit arguments. Returns a list of tuples, each having the form (pos, edit). The "pos" is an integer offset into the string, and "edit" is a string describing the edit. The first character of "edit" describes what it is: i - insertion of a sequence of characters d - deletion of a sequence of characters c - replacement of a single character with another The remainder of the edit string is either the sequence of chars to be inserted in left-to-right order (i), the sequence of chars to be deleted (d), or the character to replace the current offset with (c). See also: min_edit_sequence().""" iseq = iter(seq) try: last = list(iseq.next()) except StopIteration: return seq out = [] for pos, edit in seq[1:]: if edit[0] == last[1][0] == 'd' and \ pos in (last[0], last[0] + len(last[1])): last[1] += edit[1:] elif edit[0] == last[1][0] == 'i' and \ pos in (last[0], last[0] + len(last[1]) - 1): last[1] += edit[1:] else: if last[1][0] == 'c': last[1] = 'c' + last[1][2:] out.append(tuple(last)) last = [pos, edit] if last[1][0] == 'c': last[1] = 'c' + last[1][2:] out.append(tuple(last)) return out def min_edit_distance(A, B): """Compute the minimum number of point mutations that will convert string A into string B. See min_edit_sequence() for the details of the algorithm.""" return len(min_edit_sequence(A, B)) def min_edit_sequence(A, B): """Compute a minimal-length sequence of point mutations that will convert string A into string B. The result is a list of tuples, each of the form (pos, edit). The "pos" is an integer offset into the string, "edit" is a string describing the edit. The first character of the edit describes what it is: i - insert the specified character d - delete the character at this position c - change the character at this position to another The remainder of the edit string depends upon the edit code: i - second is the character to be inserted d - second is the character to be deleted c - second is the old character, third is the new character If the strings are identical, the empty list is returned. Note that this routine uses a straightforward dynamic programming algorithm, which requires storage proportional to len(A) * len(B) to compute an optimal solution.""" # Directional constants NONE = 0; LEFT = 1; DIAG = 2; DOWN = 3 def P(length, prev, pos, edit): return _struct.pack('HBH3s', length, prev, pos, edit) def U(s): return _struct.unpack('HBH3s', s) a_len = len(A); b_len = len(B) mem = [None] * (a_len + 1) for pos in xrange(a_len + 1): mem[pos] = [None] * (b_len + 1) # Each entry in the table has the form (len, next, pos, action) # len gives length of optimal path of this form # back gives location in table of previous step # pos gives the location of the edit action # action gives editing action to complete this step # # An editing action is a 3-char string. The first char of the # action is 'i' for insert, 'd' for delete, and 'c' for change. # For 'i' and 'd', the second character gives the character to # be inserted or deleted; for 'c', the second and third chars # give the old and new char. The editing action takes place at # offset 'pos' from the beginning of the string. # # Strictly speaking, 'd' does not need this, and 'c' needs only # one argument, but having both helps with debugging. # Fill in the base cases mem[0][0] = P(0, NONE, 0, '') for pos in xrange(1, b_len + 1): mem[0][pos] = P(U(mem[0][pos - 1])[0] + 1, DOWN, pos - 1, 'i' + B[pos - 1]) # Fill in the remainder of the table for a_plen in xrange(1, a_len + 1): apos = a_plen - 1 mem[a_plen][0] = P(U(mem[a_plen - 1][0])[0] + 1, LEFT, 0, 'd' + A[apos]) for b_plen in xrange(1, b_len + 1): bpos = b_plen - 1 opt_len = U(mem[a_plen - 1][b_plen - 1])[0] opt_dir = DIAG act = (0, '') if A[apos] <> B[bpos]: act = (bpos, 'c' + A[apos] + B[bpos]) opt_len += 1 next = U(mem[a_plen - 1][b_plen]) if next[0] + 1 < opt_len: opt_len = next[0] + 1 opt_dir = LEFT act = (bpos + 1, 'd' + A[apos]) next = U(mem[a_plen][b_plen - 1]) if next[0] + 1 < opt_len: opt_len = next[0] + 1 opt_dir = DOWN act = (bpos, 'i' + B[bpos]) mem[a_plen][b_plen] = P(opt_len, opt_dir, act[0], act[1]) # Chase links backward to form an optimal solution # Solution is computed in reverse order out = [] ; xpos = a_len ; ypos = b_len next = DOWN while next <> NONE: (score, next, loc, action) = U(mem[xpos][ypos]) action = action.rstrip('\x00') if action: out.append((loc, action)) if next in (LEFT, DIAG): xpos -= 1 if next in (DIAG, DOWN): ypos -= 1 out.reverse() return out def apply_edit_sequence(rules, input, verb = False): """Apply a sequence of edits to the given input string, in order, and return the string that results. If verb is True, verbose debugging output is sent to stderr.""" if verb: from sys import stderr def dp(msg): if verb: print >> stderr, msg else: def dp(msg): pass dp('in = %s' % input) input = list(input) for pos, rule in rules: if rule[0] == 'i': dp('Pos %d: Insert "%s"' % (pos, rule[1:])) for i, c in enumerate(rule[1:]): input.insert(pos + i, c) elif rule[0] == 'd': dp('Pos %d: Delete "%s"' % (pos, rule[1:])) for i, d in enumerate(rule[1:]): if input[pos] <> d: raise ValueError("Unmatching delete of '%s' at position %d" % (d, pos)) del input[pos] elif rule[0] == 'c': if len(rule) > 2: src, tgt = rule[1], rule[2] else: src, tgt = input[pos], rule[1] dp('Pos %d: Change "%s" to "%s"' % (pos, src, tgt)) if input[pos] <> src: raise ValueError("Unmatching change of '%s' at position %d" % (src, pos)) input[pos] = tgt else: raise TypeError("Unknown rule type: %s" % (rule,)) dp(' ==> %s' % ''.join(input)) return ''.join(input) if __name__ == '__main__': def test(A, B, seq, verb = False): if apply_edit_sequence(seq, A, verb) <> B: raise ValueError("Output does not match requirement:\n" " wanted: %s\n" " but got: %s" % (A, B)) return True from pprint import pprint A = 'This is the picture!' B = 'Those were the coolest dudes!' try: seq = min_edit_sequence(A, B) if test(A, B, seq): print "" cmp = fold_edit_sequence(seq) if test(A, B, cmp): print "" except ValueError, e: print ": %s" % e # Here there be dragons