#!perl -w
use strict;
use SelfLoader;
test() if grep(/\-t/, @ARGV);
($#ARGV == 1) or die "Usage: $0 \n";
my ($source, $target) = (@ARGV);
print "The Levenshtein distance between $source and $target is: " .
levenshtein($source, $target) . "\n";
# Return the Levenshtein distance (also called Edit distance)
# between two strings
#
# The Levenshtein distance (LD) is a measure of similarity between two
# strings, source and target. The distance is the number of
# deletions, insertions or substitutions required to transform source
# into the target string.
# The greater the distance, the more different the strings are.
#
# The algorithm employs a proximity matrix, which denotes the distances
# between substrings of the two given strings.
# Read the embedded comments for more info.
# If you want a deep understanding of the algorithm, print
# the matrix for some test strings and study it
#
# The beauty of this system is that nothing is magical - the distance
# is intuitively understandable by humans
#
# The distance is named after Russian scientist Vladimir Levenshtein
# He devised the algorithm in 1965
#
sub levenshtein {
# $source and $target are the two strings
# $slen and $tlen are their respective lengths
#
my ($source, $target) = @_;
my ($slen, $tlen) = (length $source, length $target);
# If one of the strings is empty, the distance is the length
# of the other string
#
return $tlen if ($slen == 0);
return $slen if ($tlen == 0);
my %mat = ( );
# Init the distance matrix
#
# The first row to 0..$slen
# The first column to 0..$tlen
# The rest to 0
#
# The first row and column are initialized so to denote distance
# from the empty string
# b l u e
# b 0 1 2 3
# l 1 0 0 0
# u 2 0 0 0
# e 3 0 0 0
for (my $i = 0; $i <= $slen; ++$i) {
for (my $j = 0; $j <= $tlen; ++$j) {
$mat{$i}{$j} = 0;
$mat{0}{$j} = $j;
}
$mat{$i}{0} = $i;
}
# Some char-by-char processing is ahead, so prepare
# array of chars from the strings
#
my @source = split(//, $source);
my @target = split(//, $target);
for (my $i = 1; $i <= $slen; ++$i) {
for (my $j = 1; $j <= $tlen; ++$j) {
# Set the cost to 1 if the ith char of $source
# equals the jth of $target
#
# Denotes a substitution cost. When the char are equal
# there is no need to substitute, so the cost is 0
#
my $cost = ($source[$i-1] eq $target[$j-1]) ? 0 : 1;
# Cell $mat{$i}{$j} equals the minimum of:
#
# - The cell immediately to the left plus 1
# - The cell immediately above plus 1
# - The cell diagonally above and to the left plus the cost
#
# We can either insert a new char, delete a char or
# substitute an existing char (with an associated cost)
#
$mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
$mat{$i}{$j-1} + 1,
$mat{$i-1}{$j-1} + $cost]);
}
}
# The Levenshtein distance equals the rightmost bottom cell
# of the matrix
#
# $mat{$x}{$y} denotes the distance between the substrings
# 1..$x and 1..$y
#
return $mat{$slen}{$tlen};
}
# minimal element of a list
#
sub min {
my @list = @{$_[0]};
my $min = $list[0];
foreach my $i (@list) {
$min = $i if ($i < $min);
}
return $min;
}
__END__
__DATA__
sub test {
use Test::More tests => 10;
is( min( [ 10, 20, 0, 50 ] ), 0, 'min([10,20,0,50])');
is( min( [ -3, 20, 0, 50 ] ), -3, 'min([-3,20,0,50])');
is( min( [ 3, 20, 30, 50 ] ), 3, 'min([3,20,30,50])');
is( levenshtein("GCAT", ""), 4, 'levenshtein("GCAT", "")');
is( levenshtein("", "GCAT"), 4, 'levenshtein("", "GCAT")');
is( levenshtein("CGTA", "GCAT"), 3, 'levenshtein("CGTA", "GCAT")');
is( levenshtein("xxxx", "yyyy"), 4, 'levenshtein("xxxx", "yyyy")');
is( levenshtein("restrant", "restaurant"), 2,
'levenshtein("restrant", "restaurant") 2 inserts');
is( levenshtein("optamologist", "ophthamologist"), 2,
'levenshtein("optamologist", "ophthamologist") 2 inserts');
is( levenshtein("cheapmonk", "chipmonk"), 2,
'levenshtein("cheapmonk", "chipmonk") 1 inserts, 1 substitute');
exit(0);
}