##
## Name: edit.py
## Purpose: Compute minimum edits between strings.
## Author: M. J. Fromberger
## Info: $Id: edit.py 122 2006-05-14 21:12:02Z sting $
##
## Notes:
## As currently written, min_edit_sequence() only works on strings
## whose length will fit in an unsigned short (usu. 16 bits). If you
## want to change this, edit the pack/unpack formats that read HBH3s
## to read IBI3s or LBL3s as appropriate. Doing so will increase the
## memory requirements of the dynamic program, however, and it is
## already pretty slow, due to Python overhead, for strings longer
## than a hundred characters or so.
import struct as _struct
def fold_edit_sequence(seq):
"""Compress an edit sequence returned by the min_edit_sequence()
function to remove consecutive runs of insertions or deletions
that can occur at the same location, and strips out unnecessary
edit arguments. Returns a list of tuples, each having the form
(pos, edit). The "pos" is an integer offset into the string, and
"edit" is a string describing the edit.
The first character of "edit" describes what it is:
i - insertion of a sequence of characters
d - deletion of a sequence of characters
c - replacement of a single character with another
The remainder of the edit string is either the sequence of chars
to be inserted in left-to-right order (i), the sequence of chars
to be deleted (d), or the character to replace the current offset
with (c).
See also: min_edit_sequence()."""
iseq = iter(seq)
try:
last = list(iseq.next())
except StopIteration:
return seq
out = []
for pos, edit in seq[1:]:
if edit[0] == last[1][0] == 'd' and \
pos in (last[0], last[0] + len(last[1])):
last[1] += edit[1:]
elif edit[0] == last[1][0] == 'i' and \
pos in (last[0], last[0] + len(last[1]) - 1):
last[1] += edit[1:]
else:
if last[1][0] == 'c':
last[1] = 'c' + last[1][2:]
out.append(tuple(last))
last = [pos, edit]
if last[1][0] == 'c':
last[1] = 'c' + last[1][2:]
out.append(tuple(last))
return out
def min_edit_distance(A, B):
"""Compute the minimum number of point mutations that will convert
string A into string B. See min_edit_sequence() for the details
of the algorithm."""
return len(min_edit_sequence(A, B))
def min_edit_sequence(A, B):
"""Compute a minimal-length sequence of point mutations that will
convert string A into string B. The result is a list of tuples,
each of the form (pos, edit). The "pos" is an integer offset into
the string, "edit" is a string describing the edit.
The first character of the edit describes what it is:
i - insert the specified character
d - delete the character at this position
c - change the character at this position to another
The remainder of the edit string depends upon the edit code:
i - second is the character to be inserted
d - second is the character to be deleted
c - second is the old character, third is the new character
If the strings are identical, the empty list is returned.
Note that this routine uses a straightforward dynamic programming
algorithm, which requires storage proportional to len(A) * len(B)
to compute an optimal solution."""
# Directional constants
NONE = 0; LEFT = 1; DIAG = 2; DOWN = 3
def P(length, prev, pos, edit):
return _struct.pack('HBH3s', length, prev, pos, edit)
def U(s):
return _struct.unpack('HBH3s', s)
a_len = len(A); b_len = len(B)
mem = [None] * (a_len + 1)
for pos in xrange(a_len + 1):
mem[pos] = [None] * (b_len + 1)
# Each entry in the table has the form (len, next, pos, action)
# len gives length of optimal path of this form
# back gives location in table of previous step
# pos gives the location of the edit action
# action gives editing action to complete this step
#
# An editing action is a 3-char string. The first char of the
# action is 'i' for insert, 'd' for delete, and 'c' for change.
# For 'i' and 'd', the second character gives the character to
# be inserted or deleted; for 'c', the second and third chars
# give the old and new char. The editing action takes place at
# offset 'pos' from the beginning of the string.
#
# Strictly speaking, 'd' does not need this, and 'c' needs only
# one argument, but having both helps with debugging.
# Fill in the base cases
mem[0][0] = P(0, NONE, 0, '')
for pos in xrange(1, b_len + 1):
mem[0][pos] = P(U(mem[0][pos - 1])[0] + 1,
DOWN, pos - 1, 'i' + B[pos - 1])
# Fill in the remainder of the table
for a_plen in xrange(1, a_len + 1):
apos = a_plen - 1
mem[a_plen][0] = P(U(mem[a_plen - 1][0])[0] + 1,
LEFT, 0, 'd' + A[apos])
for b_plen in xrange(1, b_len + 1):
bpos = b_plen - 1
opt_len = U(mem[a_plen - 1][b_plen - 1])[0]
opt_dir = DIAG
act = (0, '')
if A[apos] <> B[bpos]:
act = (bpos, 'c' + A[apos] + B[bpos])
opt_len += 1
next = U(mem[a_plen - 1][b_plen])
if next[0] + 1 < opt_len:
opt_len = next[0] + 1
opt_dir = LEFT
act = (bpos + 1, 'd' + A[apos])
next = U(mem[a_plen][b_plen - 1])
if next[0] + 1 < opt_len:
opt_len = next[0] + 1
opt_dir = DOWN
act = (bpos, 'i' + B[bpos])
mem[a_plen][b_plen] = P(opt_len, opt_dir, act[0], act[1])
# Chase links backward to form an optimal solution
# Solution is computed in reverse order
out = [] ; xpos = a_len ; ypos = b_len
next = DOWN
while next <> NONE:
(score, next, loc, action) = U(mem[xpos][ypos])
action = action.rstrip('\x00')
if action:
out.append((loc, action))
if next in (LEFT, DIAG):
xpos -= 1
if next in (DIAG, DOWN):
ypos -= 1
out.reverse()
return out
def apply_edit_sequence(rules, input, verb = False):
"""Apply a sequence of edits to the given input string, in order,
and return the string that results.
If verb is True, verbose debugging output is sent to stderr."""
if verb:
from sys import stderr
def dp(msg):
if verb:
print >> stderr, msg
else:
def dp(msg): pass
dp('in = %s' % input)
input = list(input)
for pos, rule in rules:
if rule[0] == 'i':
dp('Pos %d: Insert "%s"' % (pos, rule[1:]))
for i, c in enumerate(rule[1:]):
input.insert(pos + i, c)
elif rule[0] == 'd':
dp('Pos %d: Delete "%s"' % (pos, rule[1:]))
for i, d in enumerate(rule[1:]):
if input[pos] <> d:
raise ValueError("Unmatching delete of '%s' at position %d"
% (d, pos))
del input[pos]
elif rule[0] == 'c':
if len(rule) > 2:
src, tgt = rule[1], rule[2]
else:
src, tgt = input[pos], rule[1]
dp('Pos %d: Change "%s" to "%s"' %
(pos, src, tgt))
if input[pos] <> src:
raise ValueError("Unmatching change of '%s' at position %d" %
(src, pos))
input[pos] = tgt
else:
raise TypeError("Unknown rule type: %s" % (rule,))
dp(' ==> %s' % ''.join(input))
return ''.join(input)
if __name__ == '__main__':
def test(A, B, seq, verb = False):
if apply_edit_sequence(seq, A, verb) <> B:
raise ValueError("Output does not match requirement:\n"
" wanted: %s\n"
" but got: %s" % (A, B))
return True
from pprint import pprint
A = 'This is the picture!'
B = 'Those were the coolest dudes!'
try:
seq = min_edit_sequence(A, B)
if test(A, B, seq):
print ""
cmp = fold_edit_sequence(seq)
if test(A, B, cmp):
print ""
except ValueError, e:
print ": %s" % e
# Here there be dragons