levenshtein.pl

#!perl -w use strict; use SelfLoader; test() if grep(/\-t/, @ARGV); ($#ARGV == 1) or die "Usage: $0 <source> <target>\n"; my ($source, $target) = (@ARGV); print "The Levenshtein distance between $source and $target is: " . levenshtein($source, $target) . "\n"; # Return the Levenshtein distance (also called Edit distance) # between two strings # # The Levenshtein distance (LD) is a measure of similarity between two # strings, source and target. The distance is the number of # deletions, insertions or substitutions required to transform source # into the target string. # The greater the distance, the more different the strings are. # # The algorithm employs a proximity matrix, which denotes the distances # between substrings of the two given strings. # Read the embedded comments for more info. # If you want a deep understanding of the algorithm, print # the matrix for some test strings and study it # # The beauty of this system is that nothing is magical - the distance # is intuitively understandable by humans # # The distance is named after Russian scientist Vladimir Levenshtein # He devised the algorithm in 1965 # sub levenshtein { # $source and $target are the two strings # $slen and $tlen are their respective lengths # my ($source, $target) = @_; my ($slen, $tlen) = (length $source, length $target); # If one of the strings is empty, the distance is the length # of the other string # return $tlen if ($slen == 0); return $slen if ($tlen == 0); my %mat = ( ); # Init the distance matrix # # The first row to 0..$slen # The first column to 0..$tlen # The rest to 0 # # The first row and column are initialized so to denote distance # from the empty string # b l u e # b 0 1 2 3 # l 1 0 0 0 # u 2 0 0 0 # e 3 0 0 0 for (my $i = 0; $i <= $slen; ++$i) { for (my $j = 0; $j <= $tlen; ++$j) { $mat{$i}{$j} = 0; $mat{0}{$j} = $j; } $mat{$i}{0} = $i; } # Some char-by-char processing is ahead, so prepare # array of chars from the strings # my @source = split(//, $source); my @target = split(//, $target); for (my $i = 1; $i <= $slen; ++$i) { for (my $j = 1; $j <= $tlen; ++$j) { # Set the cost to 1 if the ith char of $source # equals the jth of $target # # Denotes a substitution cost. When the char are equal # there is no need to substitute, so the cost is 0 # my $cost = ($source[$i-1] eq $target[$j-1]) ? 0 : 1; # Cell $mat{$i}{$j} equals the minimum of: # # - The cell immediately to the left plus 1 # - The cell immediately above plus 1 # - The cell diagonally above and to the left plus the cost # # We can either insert a new char, delete a char or # substitute an existing char (with an associated cost) # $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1, $mat{$i}{$j-1} + 1, $mat{$i-1}{$j-1} + $cost]); } } # The Levenshtein distance equals the rightmost bottom cell # of the matrix # # $mat{$x}{$y} denotes the distance between the substrings # 1..$x and 1..$y # return $mat{$slen}{$tlen}; } # minimal element of a list # sub min { my @list = @{$_[0]}; my $min = $list[0]; foreach my $i (@list) { $min = $i if ($i < $min); } return $min; } __END__ __DATA__ sub test { use Test::More tests => 10; is( min( [ 10, 20, 0, 50 ] ), 0, 'min([10,20,0,50])'); is( min( [ -3, 20, 0, 50 ] ), -3, 'min([-3,20,0,50])'); is( min( [ 3, 20, 30, 50 ] ), 3, 'min([3,20,30,50])'); is( levenshtein("GCAT", ""), 4, 'levenshtein("GCAT", "")'); is( levenshtein("", "GCAT"), 4, 'levenshtein("", "GCAT")'); is( levenshtein("CGTA", "GCAT"), 3, 'levenshtein("CGTA", "GCAT")'); is( levenshtein("xxxx", "yyyy"), 4, 'levenshtein("xxxx", "yyyy")'); is( levenshtein("restrant", "restaurant"), 2, 'levenshtein("restrant", "restaurant") 2 inserts'); is( levenshtein("optamologist", "ophthamologist"), 2, 'levenshtein("optamologist", "ophthamologist") 2 inserts'); is( levenshtein("cheapmonk", "chipmonk"), 2, 'levenshtein("cheapmonk", "chipmonk") 1 inserts, 1 substitute'); exit(0); }