Algorithm Implementation/Strings/Levenshtein distance

The implementations of the Levenshtein algorithm on this page are illustrative only. Applications will, in most cases, use implementations which use heap allocations sparingly, in particular when large lists of words are compared to each other. The following remarks indicate some of the variations on this and related topics:

  • Most implementations use one- or two-dimensional arrays to store the distances of prefixes of the words compared. In most applications the size of these structures is previously known. This is the case, when, for instance the distance is relevant only if it is below a certain maximally allowed distance (this happens when words are selected from a dictionary to approximately match a given word). In this case the arrays can be preallocated and reused over the various runs of the algorithm over successive words.
  • Using a maximum allowed distance puts an upper bound on the search time. The search can be stopped as soon as the minimum Levenshtein distance between prefixes of the strings exceeds the maximum allowed distance.
  • Deletion, insertion, and replacement of characters can be assigned different weights. The usual choice is to set all three weights to 1. Different values for these weights allows for more flexible search strategies in lists of words.

Action Script 3

function levenshtein(s1:String, s2:String):uint
                {
                     const len1:uint = s1.length, len2:uint = s2.length;
                     var d:Vector.<Vector.<uint> >=new Vector.<Vector.<uint> >(len1+1);
                     for(i=0; i<=len1; ++i) 
                        d[i] = new Vector.<uint>(len2+1);
 
                     d[0][0]=0;
 
                     var i:int;
                     var j:int;
 
                     for(i=1; i<=len1; ++i) d[i][0]=i; //int faster than uint
                     for(i=1; i<=len2; ++i) d[0][i]=i;
 
                     for(i = 1; i <= len1; ++i)
                          for(j = 1; j <= len2; ++j)
                               d[i][j] = Math.min( Math.min(d[i - 1][j] + 1,d[i][j - 1] + 1),
                                           d[i - 1][j - 1] + (s1.charAt(i - 1) == s2.charAt(j - 1) ? 0 : 1) );
                     return(d[len1][len2]);
 
                }
↑Jump back a section

Ada

function Levenshtein(Left, Right : String) return Natural is
    D : array(0 .. Left'Last, 0 .. Right'Last) of Natural;
  begin
    for I in D'range(1) loop D(I, 0) := I;end loop;
 
    for J in D'range(2) loop D(0, J) := J;end loop;
 
    for I in Left'range loop
      for J in Right'range loop
        D(I, J) := Natural'Min(D(I - 1, J), D(I, J - 1)) + 1;
        D(I, J) := Natural'Min(D(I, J), D(I - 1, J - 1) + Boolean'Pos(Left(I) /= Right(J)));
      end loop;
    end loop;
 
    return D(D'Last(1), D'Last(2));
  end Levenshtein;
↑Jump back a section

Common Lisp

(defun levenshtein-distance (str1 str2)
  "Calculates the Levenshtein distance between str1 and str2, returns an editing distance (int)."
  (let ((n (length str1))
        (m (length str2)))
    ;; Check trivial cases
    (cond ((= 0 n) (return-from levenshtein-distance m))
          ((= 0 m) (return-from levenshtein-distance n)))
    (let ((col (make-array (1+ m) :element-type 'integer))
          (prev-col (make-array (1+ m) :element-type 'integer)))
      ;; We need to store only two columns---the current one that
      ;; is being built and the previous one
      (dotimes (i (1+ m))
        (setf (svref prev-col i) i))
      ;; Loop across all chars of each string
      (dotimes (i n)
        (setf (svref col 0) (1+ i))
        (dotimes (j m)
          (setf (svref col (1+ j))
                (min (1+ (svref col j))
                     (1+ (svref prev-col (1+ j)))
                     (+ (svref prev-col j)
                        (if (char-equal (schar str1 i) (schar str2 j)) 0 1)))))
        (rotatef col prev-col))
      (svref prev-col m))))
↑Jump back a section

C

#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
 
int levenshtein(char *s1, char *s2) {
    unsigned int x, y, s1len, s2len;
    s1len = strlen(s1);
    s2len = strlen(s2);
    unsigned int matrix[s2len+1][s1len+1];
    matrix[0][0] = 0;
    for (x = 1; x <= s2len; x++)
        matrix[x][0] = matrix[x-1][0] + 1;
    for (y = 1; y <= s1len; y++)
        matrix[0][y] = matrix[0][y-1] + 1;
    for (x = 1; x <= s2len; x++)
        for (y = 1; y <= s1len; y++)
            matrix[x][y] = MIN3(matrix[x-1][y] + 1, matrix[x][y-1] + 1, matrix[x-1][y-1] + (s1[y-1] == s2[x-1] ? 0 : 1));
 
    return(matrix[s2len][s1len]);
}

The above can be optimized to use O(min(m,n)) space instead of O(mn). The key observation is that we only need to access the contents of the previous column when filling the matrix column-by-column. Hence, we can re-use a single column over and over, overwriting its contents as we proceed.

#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
 
int levenshtein(char *s1, char *s2) {
    unsigned int s1len, s2len, x, y, lastdiag, olddiag;
    s1len = strlen(s1);
    s2len = strlen(s2);
    unsigned int column[s1len+1];
    for (y = 1; y <= s1len; y++)
        column[y] = y;
    for (x = 1; x <= s2len; x++) {
        column[0] = x;
        for (y = 1, lastdiag = x-1; y <= s1len; y++) {
            olddiag = column[y];
            column[y] = MIN3(column[y] + 1, column[y-1] + 1, lastdiag + (s1[y-1] == s2[x-1] ? 0 : 1));
            lastdiag = olddiag;
        }
    }
    return(column[s1len]);
}
↑Jump back a section

C++

template <class T> unsigned int edit_distance(const T& s1, const T& s2)
{
        const size_t len1 = s1.size(), len2 = s2.size();
        vector<vector<unsigned int> > d(len1 + 1, vector<unsigned int>(len2 + 1));
 
        d[0][0] = 0;
        for(unsigned int i = 1; i <= len1; ++i) d[i][0] = i;
        for(unsigned int i = 1; i <= len2; ++i) d[0][i] = i;
 
        for(unsigned int i = 1; i <= len1; ++i)
                for(unsigned int j = 1; j <= len2; ++j)
 
                      d[i][j] = std::min( std::min(d[i - 1][j] + 1,d[i][j - 1] + 1),
                                          d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : 1) );
        return d[len1][len2];
}

Please note, that using

vector<vector< > >

to represent 2 dimensional matrices in C++ is an anti-pattern. The allocation wastes space and each access introduces an unnecessary memory access (which is expensive).

Here's another implementation, based on the Common Lisp implementation. (It uses the insight that only two columns of the matrix are actually used, avoiding the vector<vector<> > (anti-)pattern, and it uses vector::swap to efficiently swap the workspace after processing each column.)

template<class T>
unsigned int levenshtein_distance(const T &s1, const T & s2) {
        const size_t len1 = s1.size(), len2 = s2.size();
        vector<unsigned int> col(len2+1), prevCol(len2+1);
 
        for (unsigned int i = 0; i < prevCol.size(); i++)
                prevCol[i] = i;
        for (unsigned int i = 0; i < len1; i++) {
                col[0] = i+1;
                for (unsigned int j = 0; j < len2; j++)
                        col[j+1] = min( min( 1 + col[j], 1 + prevCol[1 + j]),
                                                                prevCol[j] + (s1[i]==s2[j] ? 0 : 1) );
                col.swap(prevCol);
        }
        return prevCol[len2];
}
↑Jump back a section

C#

        private Int32 levenshtein(String a, String b)
        {
 
            if (string.IsNullOrEmpty(a))
            {
                if (!string.IsNullOrEmpty(b))
                {
                    return b.Length;
                }
                return 0;
            }
 
            if (string.IsNullOrEmpty(b))
            {
                if (!string.IsNullOrEmpty(a))
                {
                    return a.Length;
                }
                return 0;
            }
 
            Int32 cost;
            Int32[,] d = new int[a.Length + 1, b.Length + 1];
            Int32 min1;
            Int32 min2;
            Int32 min3;
 
            for (Int32 i = 0; i <= d.GetUpperBound(0); i += 1)
            {
                d[i, 0] = i;
            }
 
            for (Int32 i = 0; i <= d.GetUpperBound(1); i += 1)
            {
                d[0, i] = i;
            }
 
            for (Int32 i = 1; i <= d.GetUpperBound(0); i += 1)
            {
                for (Int32 j = 1; j <= d.GetUpperBound(1); j += 1)
                {
                    cost = Convert.ToInt32(!(a[i-1] == b[j - 1]));
 
                    min1 = d[i - 1, j] + 1;
                    min2 = d[i, j - 1] + 1;
                    min3 = d[i - 1, j - 1] + cost;
                    d[i, j] = Math.Min(Math.Min(min1, min2), min3);
                }
            }
 
            return d[d.GetUpperBound(0), d.GetUpperBound(1)];
 
        }

An implementation with reduced memory useage

public int LevenshteinDistance(string source, string target){
        if(String.IsNullOrEmpty(source)){
                if(String.IsNullOrEmpty(target)) return 0;
                return target.Length;
        }
        if(String.IsNullOrEmpty(target)) return source.Length;
 
        if(source.Length > target.Length){
                var temp = target;
                target = source;
                source = temp;
        }
 
        var m = target.Length;
        var n = source.Length;
        var distance = new int[2, m + 1];
        // Initialize the distance 'matrix'
        for(var j = 1; j <= m; j++) distance[0, j] = j;
 
        var currentRow = 0;
        for(var i = 1; i <= n; ++i){
                currentRow = i & 1;
                distance[currentRow, 0] = i;
                var previousRow = currentRow ^ 1;
                for(var j = 1; j <= m; j++){
                        var cost = (target[j - 1] == source[i - 1] ? 0 : 1);
                        distance[currentRow, j] = Math.Min(Math.Min(
                                                distance[previousRow, j] + 1,
                                                distance[currentRow, j - 1] + 1),
                                                distance[previousRow, j - 1] + cost);
                }
        }
        return distance[currentRow, m];
}

Damerau-Levenshtein distance is computed in the asymptotic time O ((max + 1) * min (first.length (), second.length ()))

    /// <summary>
    /// Метрика Дамерау-Левенштейна.
    /// </summary>
    public class DamerauLevensteinMetric
    {
        private const int DEFAULT_LENGTH = 255;
            private int[] _currentRow;
        private int[] _previousRow;
        private int[] _transpositionRow;
 
        /// <summary>
        /// 
        /// </summary>
        public DamerauLevensteinMetric()
            : this(DEFAULT_LENGTH)
        {
        }
 
        /// <summary>
        /// 
        /// </summary>
        /// <param name="maxLength"></param>
        public DamerauLevensteinMetric(int maxLength)
        {
            _currentRow = new int[maxLength + 1];
            _previousRow = new int[maxLength + 1];
            _transpositionRow = new int[maxLength + 1];
        }
 
        /// <summary>
        /// Расстояние Дамерау-Левенштейна вычисляется за асимптотическое время O((max + 1) * min(first.length(), second.length()))
        /// </summary>
        /// <param name="first"></param>
        /// <param name="second"></param>
        /// <param name="max"></param>
        /// <returns></returns>
        public int GetDistance(string first, string second, int max)
        {
            int firstLength = first.Length;
            int secondLength = second.Length;
 
            if (firstLength == 0)
                return secondLength;
 
            if (secondLength == 0) return firstLength;
 
            if (firstLength > secondLength)
            {
                string tmp = first;
                first = second;
                second = tmp;
                firstLength = secondLength;
                secondLength = second.Length;
            }
 
            if (max < 0) max = secondLength;
            if (secondLength - firstLength > max) return max + 1;
 
            if (firstLength > _currentRow.Length)
            {
                _currentRow = new int[firstLength + 1];
                _previousRow = new int[firstLength + 1];
                _transpositionRow = new int[firstLength + 1];
            }
 
            for (int i = 0; i <= firstLength; i++)
                _previousRow[i] = i;
 
            char lastSecondCh = (char) 0;
            for (int i = 1; i <= secondLength; i++)
            {
                char secondCh = second[i - 1];
                _currentRow[0] = i;
 
                // Вычисляем только диагональную полосу шириной 2 * (max + 1)
                int from = Math.Max(i - max - 1, 1);
                int to = Math.Min(i + max + 1, firstLength);
 
                char lastFirstCh = (char) 0;
                for (int j = from; j <= to; j++)
                {
                    char firstCh = first[j - 1];
 
                    // Вычисляем минимальную цену перехода в текущее состояние из предыдущих среди удаления, вставки и
                    // замены соответственно.
                    int cost = firstCh == secondCh ? 0 : 1;
                    int value = Math.Min(Math.Min(_currentRow[j - 1] + 1, _previousRow[j] + 1), _previousRow[j - 1] + cost);
 
                    // Если вдруг была транспозиция, надо также учесть и её стоимость.
                    if (firstCh == lastSecondCh && secondCh == lastFirstCh)
                        value = Math.Min(value, _transpositionRow[j - 2] + cost);
 
                    _currentRow[j] = value;
                    lastFirstCh = firstCh;
                }
                lastSecondCh = secondCh;
 
                int[] tempRow = _transpositionRow;
                _transpositionRow = _previousRow;
                _previousRow = _currentRow;
                _currentRow = tempRow;
            }
 
            return _previousRow[firstLength];
        }
    }
↑Jump back a section

Clojure

(defn nextelt
  "Given two characters, the previous row, and a row we are
  building, determine out the next element for this row."
  [char1 char2 prevrow thisrow position]
  (if (= char1 char2)
    (prevrow (- position 1))
    (+ 1 (min
      (prevrow (- position 1))
      (prevrow position)
      (last thisrow))))
  )
 
(defn nextrow
  "Based on the next character from string1 and the whole of string2
  calculate the next row. Initially thisrow contains one number."
  [char1 str2 prevrow thisrow]
  (let [char2 (first str2)
        position (count thisrow)]
    (if (= (count thisrow) (count prevrow))
      thisrow
      (recur
        char1
        (rest str2)
        prevrow
        (conj thisrow (nextelt char1 char2 prevrow thisrow position))))))
 
(defn levenshtein
  "Calculate the Levenshtein distance between two strings."
  ([str1 str2]
  (let [row0 (vec (map first (map vector (iterate inc 1) str2)))]
    (levenshtein 1 (vec (cons 0 row0)) str1 str2)))
  ([row-nr prevrow str1 str2]
    (let [next-row (nextrow (first str1) str2 prevrow (vector row-nr))
          str1-remainder (.substring str1 1)]
      (if (= "" str1-remainder)
        (last next-row)
        (recur (inc row-nr) next-row str1-remainder str2))))
  )
↑Jump back a section

Delphi

A simple implementation that can certainly be improved upon.

function Levenshtein(Word1, Word2: String): integer;
var lev : array of array of integer;
    i,j : integer;
    s : string;
begin
  result := 0;
  // If the words are identical, do nothing
  if LowerCase(Word1) = LowerCase(Word2) then
  exit;
 
  SetLength(lev, length(Word1) + 1);
  for i := low(lev) to high(lev) do
    setLength(lev[i], length(Word2) + 1);
 
  for i := low(lev) to high(lev) do lev[i][0] := i;
  for j := low(lev[low(lev)]) to high(lev[low(lev)]) do lev[0][j] := j;
 
  for i := low(lev)+1 to high(lev) do
    for j := low(lev[i])+1 to high(lev[i]) do
      lev[i][j] := min(min(lev[i-1][j] + 1,lev[i][j-1] + 1)
                      ,lev[i-1][j-1] + ifthen(Word1[i] = Word2[j], 0, 1));
 
  result := lev[length(word1)][length(word2)];
 
end;
↑Jump back a section

Emacs Lisp

 (defun levenshtein-distance (str1 str2)
   "Return the edit distance between strings STR1 and STR2."
   (if (not (stringp str1))
       (error "Argument was not a string: %s" str1))
   (if (not (stringp str2))
       (error "Argument was not a string: %s" str2))
   (let* ((make-table (function (lambda (columns rows init)
            (make-vector rows (make-vector columns init)))))
          (tref (function (lambda (table x y)
            (aref (aref table y) x))))
          (tset (function (lambda (table x y object)
            (let ((row (copy-sequence (aref table y))))
              (aset row x object)
              (aset table y row) object))))
          (length-str1 (length str1))
          (length-str2 (length str2))
          (d (funcall make-table (1+ length-str1) (1+ length-str2) 0)))
     (let ((i 0) (j 0))
       (while (<= i length-str1)
         (funcall tset d i 0 i)
         (setq i (1+ i)))
       (while (<= j length-str2)
         (funcall tset d 0 j j)
         (setq j (1+ j))))
     (let ((i 1))
       (while (<= i length-str1)
         (let ((j 1))
           (while (<= j length-str2)
             (let* ((cost (if (equal (aref str1 (1- i)) (aref str2 (1- j)))
                              0
                            1))
                    (deletion (1+ (funcall tref d (1- i) j)))
                    (insertion (1+ (funcall tref d i (1- j))))
                    (substitution (+ (funcall tref d (1- i) (1- j)) cost)))
               (funcall tset d i j (min insertion deletion substitution)))
             (setq j (1+ j))))
         (setq i (1+ i))))
     (funcall tref d length-str1 length-str2)))
↑Jump back a section

Erlang

levenshtein(A, B) ->
        LenA = length(A), LenB = length(B),
        case LenA == 0 of
                true -> LenB;
                false -> 
                case LenB == 0 of
                        true -> LenA; false ->
                        Cost = case hd(A) == hd(B) of
                                true ->  0; false -> 1
                        end,
                        lists:min([
                                levenshtein(tl(A), B) + 1,
                                levenshtein(A, tl(B)) + 1,
                                levenshtein(tl(A), tl(B)) + Cost
                                ])
                end
        end.
↑Jump back a section

F#

The inlined min function gives a big speed boost.

let inline min3 one two three = 
    if one < two && one < three then one
    elif two < three then two
    else three
 
let wagnerFischer (s: string) (t: string) =
    let m = s.Length
    let n = t.Length
    let d = Array2D.create (m + 1) (n + 1) 0
 
    for i = 0 to m do d.[i, 0] <- i
    for j = 0 to n do d.[0, j] <- j    
 
    for j = 1 to n do
        for i = 1 to m do
            if s.[i-1] = t.[j-1] then
                d.[i, j] <- d.[i-1, j-1]
            else
                d.[i, j] <-
                    min3
                        (d.[i-1, j  ] + 1) // a deletion
                        (d.[i  , j-1] + 1) // an insertion
                        (d.[i-1, j-1] + 1) // a substitution
    d.[m,n]

and here's a slightly faster lazy version.

let wagnerFischerLazy (s: string) (t: string) =
    let m = s.Length
    let n = t.Length
    let d = Array2D.create (m+1) (n+1) -1
    let rec dist =
        function
        | i, 0 -> i
        | 0, j -> j
        | i, j when d.[i,j] <> -1 -> d.[i,j]
        | i, j ->
            let dval = 
                if s.[i-1] = t.[j-1] then dist (i-1, j-1)
                else
                    min3
                        (dist (i-1, j)   + 1) // a deletion
                        (dist (i,   j-1) + 1) // an insertion
                        (dist (i-1, j-1) + 1) // a substitution
            d.[i, j] <- dval; dval 
    dist (m, n)
↑Jump back a section

Groovy

This version is based on the Java version below

public class Levenshtein {
    public static int distance(String str1, String str2) {
        int[][] distance = new int[str1.size() + 1][str2.size() + 1]
        for (int i in 0..str1.size()) distance[i][0] = i
        for (int j in 0..str2.size()) distance[0][j] = j
        for (int i in 1..str1.size())
            for (int j in 1..str2.size())
                distance[i][j] = [distance[i-1][j]+1,distance[i][j-1]+1,distance[i-1][j-1]+((str1[i-1]==str2[j-1])?0:1)].min()
        return distance[str1.size()][str2.size()]
    }
}
↑Jump back a section

Haskell

Tested with GHCi.

levenshtein::String->String->Int
levenshtein s t = 
    d!!(length s)!!(length t) 
    where d = [[distance m n|n<-[0..length t]]|m<-[0..length s]]
          distance i 0 = i
          distance 0 j = j
          distance i j = minimum [d!!(i-1)!!j+1, d!!i!!(j-1)+1, d!!(i-1)!!(j-1) + (if s!!(i-1)==t!!(j-1) then 0 else 1)]

For large strings, using arrays is much faster

import Data.Array.Unboxed
import Data.Array.ST
import Control.Monad.ST

for_ xs f =  mapM_ f xs

levenshtein :: [Char] -> [Char] -> Int
levenshtein s t = d ! (ls , lt)
    where s' = array (0,ls) [ (i,x) | (i,x) <- zip [0..] s ]::UArray Int Char
          t' = array (0,lt) [ (i,x) | (i,x) <- zip [0..] t ]::UArray Int Char
          ls = length s
          lt = length t
          (l,h) = ((0,0),(length s,length t))
          d = runSTUArray $ do
                m <- newArray (l,h) 0 :: ST s (STUArray s (Int,Int) Int)
                for_ [0..ls] $ \i -> writeArray m (i,0) i
                for_ [0..lt] $ \j -> writeArray m (0,j) j
                for_ [1..lt] $ \j -> do
                              for_ [1..ls] $ \i -> do
                                  let c = if s'!(i-1)==t'! (j-1) 
                                          then 0 else 1
                                  x <- readArray m (i-1,j)
                                  y <- readArray m (i,j-1)
                                  z <- readArray m (i-1,j-1)
                                  writeArray m (i,j) $ minimum [x+1, y+1, z+c ]
                return m

As recursively defined array:

import Array
toArray l = listArray (0, length l - 1) l                  -- makes an Array from a List
mkArray f bnds = array bnds [ (i, f i) | i <- range bnds ] -- defines an Array over its indices

levenshtein sa sb = table
  where
    arrA = toArray sa
    arrB = toArray sb
    table = mkArray f ((0,0), (length sa , length sb))
    f (ia, 0) = ia
    f (0 ,ib) = ib
    f (ia,ib)
      | a == b    = table ! (ia-1,ib-1)
      | otherwise = 1 + minimum [ table ! x | x <- [(ia-1,ib-1),(ia-1,ib),(ia,ib-1)] ]
      where
        a = arrA ! (ia - 1)
        b = arrB ! (ib - 1)

And finally: fast but cryptic implementation

levenshtein2 sa sb = last $ foldl transform [0..length sa] sb 
   where
      transform xs@(x:xs') c = scanl compute (x+1) (zip3 sa xs xs') 
         where
            compute z (c', x, y) = minimum [y+1, z+1, x + fromEnum (c' /= c)]
↑Jump back a section

Io

 Levenshtein := method(left, right,
   if(right size < left size, return Levenshtein(right, left))
   current := 0 to(left size) asList
   right foreach(i, righti,
     previous := current
     current = List clone with(i + 1)
     left foreach(j, leftj,
       current append((current at(j) + 1) min(previous at(j + 1) + 1) min(previous at(j) + if(leftj == righti, 0, 1)))
     )
   )
   current last
 )
↑Jump back a section

JavaScript

Faster version:

function levenshteinDistance (s, t) {
        if (!s.length) return t.length;
        if (!t.length) return s.length;
 
        return Math.min(
                levenshteinDistance(s.substr(1), t) + 1,
                levenshteinDistance(t.substr(1), s) + 1,
                levenshteinDistance(s.substr(1), t.substr(1)) + (s[0] !== t[0] ? 1 : 0)
        );
}

Another approach

// Compute the edit distance between the two given strings
exports.getEditDistance = function(a, b){
  if(a.length == 0) return b.length; 
  if(b.length == 0) return a.length; 
 
  var matrix = [];
 
  // increment along the first column of each row
  var i;
  for(i = 0; i <= b.length; i++){
    matrix[i] = [i];
  }
 
  // increment each column in the first row
  var j;
  for(j = 0; j <= a.length; j++){
    matrix[0][j] = j;
  }
 
  // Fill in the rest of the matrix
  for(i = 1; i <= b.length; i++){
    for(j = 1; j <= a.length; j++){
      if(b.charAt(i-1) == a.charAt(j-1)){
        matrix[i][j] = matrix[i-1][j-1];
      } else {
        matrix[i][j] = Math.min(matrix[i-1][j-1] + 1, // substitution
                                Math.min(matrix[i][j-1] + 1, // insertion
                                         matrix[i-1][j] + 1)); // deletion
      }
    }
  }
 
  return matrix[b.length][a.length];
};

(source)

↑Jump back a section

Java

public class LevenshteinDistance {
        private static int minimum(int a, int b, int c) {
                return Math.min(Math.min(a, b), c);
        }
 
        public static int computeLevenshteinDistance(CharSequence str1,
                        CharSequence str2) {
                int[][] distance = new int[str1.length() + 1][str2.length() + 1];
 
                for (int i = 0; i <= str1.length(); i++)
                        distance[i][0] = i;
                for (int j = 1; j <= str2.length(); j++)
                        distance[0][j] = j;
 
                for (int i = 1; i <= str1.length(); i++)
                        for (int j = 1; j <= str2.length(); j++)
                                distance[i][j] = minimum(
                                                distance[i - 1][j] + 1,
                                                distance[i][j - 1] + 1,
                                                distance[i - 1][j - 1]
                                                                + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0
                                                                                : 1));
 
                return distance[str1.length()][str2.length()];
        }
}
↑Jump back a section

OCaml

(* Minimum of three integers. This function is deliberately
 * not polymorphic because (1) we only need to compare integers 
 * and (2) the OCaml compilers do not perform type specialization 
 * for user-defined functions. *)
let minimum (x:int) y z =
  let m' (a:int) b = if a < b then a else b in
    m' (m' x y) z
 
(* Matrix initialization. *)
let init_matrix n m =
  let init_col = Array.init m in
  Array.init n (function
    | 0 -> init_col (function j -> j)
    | i -> init_col (function 0 -> i | _ -> 0)
  )
 
(* Computes the Levenshtein distance between two unistring.
 * If you want to run it faster, add the -unsafe option when
 * compiling or use Array.unsafe_* functions (but be carefull 
 * with these well-named unsafe features). *)
let distance_utf8 x y =
  match Array.length x, Array.length y with
    | 0, n -> n
    | m, 0 -> m
    | m, n ->
       let matrix = init_matrix (m + 1) (n + 1) in
         for i = 1 to m do
           let s = matrix.(i) and t = matrix.(i - 1) in
             for j = 1 to n do
               let cost = abs (compare x.(i - 1) y.(j - 1)) in
                 s.(j) <- minimum (t.(j) + 1) (s.(j - 1) + 1) (t.(j - 1) + cost)
             done
         done;
         matrix.(m).(n)
 
(* This function takes two strings, convert them to unistring (int array)
 * and then call distance_utf8, so we can compare utf8 strings. Please
 * note that you need Glib (see LablGTK). *)
let distance x y =
  distance_utf8 (Glib.Utf8.to_unistring x) (Glib.Utf8.to_unistring y)
↑Jump back a section

Octave And MATLAB

function [dist,L]=levenshtein_distance(str1,str2)
    L1=length(str1)+1;
    L2=length(str2)+1;
    L=zeros(L1,L2);
 
    g=+1;%just constant
    m=+0;%match is cheaper, we seek to minimize
    d=+1;%not-a-match is more costly.
 
    %do BC's
    L(:,1)=([0:L1-1]*g)';
    L(1,:)=[0:L2-1]*g;
 
    m4=0;%loop invariant
    for idx=2:L1;
        for idy=2:L2
            if(str1(idx-1)==str2(idy-1))
                score=m;
            else
                score=d;
            end
            m1=L(idx-1,idy-1) + score;
            m2=L(idx-1,idy) + g;
            m3=L(idx,idy-1) + g;
            L(idx,idy)=min(m1,min(m2,m3));
        end
    end
 
    dist=L(L1,L2);
    return
end
%I think this function generates errors: 
%>> levenshtein('aaa','ab')
% ans =
%     1
%correct answer here is 2, I believe: 1 deletion, 1 replacement?
function q = levenshtein(s1,s2)
d = toeplitz(find(s1),find(s2))-1;
for j = 1:numel(s2)-1
    for i = 1:numel(s1)-1
           d(i+1,j+1) = min(min(d(i,j+1),d(i+1,j)) + 1,d(i,j) + ne(s1(i), s2(j)));
    end
end
q = d(end) + eq(i,j)*ne(s1(end),s2(end)); % check if last chars are different
↑Jump back a section

Perl

use List::Util qw(min);
 
sub levenshtein {
    my ($str1, $str2) = @_;
    my @ar1 = split //, $str1;
    my @ar2 = split //, $str2;
 
    my @dist;
    $dist[$_][0] = $_ foreach (0 .. @ar1);
    $dist[0][$_] = $_ foreach (0 .. @ar2);
 
    foreach my $i (1 .. @ar1){
        foreach my $j (1 .. @ar2){
            my $cost = $ar1[$i - 1] eq $ar2[$j - 1] ? 0 : 1;
            $dist[$i][$j] = min(
                        $dist[$i - 1][$j] + 1, 
                        $dist[$i][$j - 1] + 1, 
                        $dist[$i - 1][$j - 1] + $cost );
        }
    }
 
    return $dist[@ar1][@ar2];
}
↑Jump back a section

PHP

Please note that there is a standard library call levenshtein() in PHP as of version 4.0.1. It is limited to comparing strings of no more than 255 characters in length, however, limiting its utility.

function lev($s,$t) {
        $m = strlen($s);
        $n = strlen($t);
 
        for($i=0;$i<=$m;$i++) $d[$i][0] = $i;
        for($j=0;$j<=$n;$j++) $d[0][$j] = $j;
 
        for($i=1;$i<=$m;$i++) {
                for($j=1;$j<=$n;$j++) {
                        $c = ($s[$i-1] == $t[$j-1])?0:1;
                        $d[$i][$j] = min($d[$i-1][$j]+1,$d[$i][$j-1]+1,$d[$i-1][$j-1]+$c);
                }
        }
 
        return $d[$m][$n];
}


Variation with lineary memory usage.

function leven($s1,$s2){
        $l1 = strlen($s1);                    // Länge des $s1 Strings
        $l2 = strlen($s2);                    // Länge des $s2 Strings
        $dis = range(0,$l2);                  // Erste Zeile mit (0,1,2,...,n) erzeugen 
                                              // $dis stellt die vorrangeganene Zeile da. 
        for($x=1;$x<=$l1;$x++){        
                $dis_new[0]=$x;               // Das erste element der darauffolgenden Zeile ist $x, $dis_new ist damit die aktuelle Zeile mit der gearbeitet wird
                for($y=1;$y<=$l2;$y++){
                        $c = ($s1[$x-1] == $s2[$y-1])?0:1;
                        $dis_new[$y] = min($dis[$y]+1,$dis_new[$y-1]+1,$dis[$y-1]+$c);   
                }
                $dis = $dis_new;              
        }       
 
        return $dis[$l2];
}
↑Jump back a section

Python

First version:

def levenshtein(s1, s2):
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
 
    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)
 
    previous_row = xrange(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1       # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
 
    return previous_row[-1]

Second version:

def lev(a, b):
    if not a: return len(b)
    if not b: return len(a)
    return min(lev(a[1:], b[1:])+(a[0] != b[0]), lev(a[1:], b)+1, lev(a, b[1:])+1)

(Note that while very compact, the runtime of this implementation is really poor, making it unusable in practical usage.)

Third version (works):

def LD(s,t):
    s = ' ' + s
    t = ' ' + t
    d = {}
    S = len(s)
    T = len(t)
    for i in range(S):
        d[i, 0] = i
    for j in range (T):
        d[0, j] = j
    for j in range(1,T):
        for i in range(1,S):
            if s[i] == t[j]:
                d[i, j] = d[i-1, j-1]
            else:
                d[i, j] = min(d[i-1, j] + 1, d[i, j-1] + 1, d[i-1, j-1] + 1)
    return d[S-1, T-1]

(Note that while compact, the runtime of this implementation is relatively poor.)

4th version:

def levenshtein(seq1, seq2):
    oneago = None
    thisrow = range(1, len(seq2) + 1) + [0]
    for x in xrange(len(seq1)):
        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
        for y in xrange(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
    return thisrow[len(seq2) - 1]

(Note this implementation is O(N*M) time and O(M) space, for N and M the lengths of the two sequences.)

↑Jump back a section

R/S+

function (str1, str2) 
{
    if (typeof(str1) != "character" && class(str1) != "factor") 
        stop(sprintf("Illegal data type: %s", typeof(str1)))
    if (class(str1) == "factor") 
        str = as.character(str1)
    if (typeof(str2) != "character" && class(str2) != "factor") 
        stop(sprintf("Illegal data type: %s", typeof(str2)))
    if (class(str2) == "factor") 
        str = as.character(str2)
    if ((is.array(str1) || is.array(str2)) && dim(str1) != dim(str2)) 
        stop("non-conformable arrays")
    if (length(str1) == 0 || length(str2) == 0) 
        return(integer(0))
    l1 <- length(str1)
    l2 <- length(str2)
    out <- .C("levenshtein", as.character(str1), as.character(str2), 
        l1, l2, ans = integer(max(l1, l2)), PACKAGE = "RecordLinkage")
    if (any(is.na(str1), is.na(str2))) 
        out$ans[is.na(str1) | is.na(str2)] = NA
    if (is.array(str1)) 
        return(array(out$ans, dim(str1)))
    if (is.array(str2)) 
        return(array(out$ans, dim(str2)))
    return(out$ans)
}

"(Note) this is just one of many implementations of the Levenshtein Distance, but I've not been able to get the others to work

↑Jump back a section

rexx

/* rexx levenshtein calculates the edit distance   Karlocai 2009-01-18
    between 2 strings s and t
 
   This implementation of the Levenshtein algorithm 
   uses one row only (0..n), containing 
   - values of the previous line in columns [i-1]..n and 
   - values of the current  line in columns 1..[i-2]
   The current left value is kept in the variable lc
   i: column 1..n of s, 1st index
   j: row    1..m of t, 2nd index
*/
 
 Parse Arg s,t                -- gets 2 strings as parameter
 
 n = Length(s)                -- checks the parameters
 m = Length(t)
 If n = 0 Then
   Return m
 If m = 0 Then
   Return n
 
 Do i = 0 To n                -- initializes the 1st row
   r.i = i
 End
 
 Do j = 1 To m                -- for each row
   lc = j                     -- left column start value
   Do i = 1 To n              -- for each column
     nv = Min(r.i + 1,   ,
              lc  + 1,   ,
              r.[i-1] + (Substr(s,i,1) <> Substr(t,j,1)))  
     r.[i-1] = lc             -- sets previous left column
     lc = nv                  -- current left column
   End
   r.n = lc                   -- sets last current value
 End
 Return r.n
↑Jump back a section

Ruby

This Ruby version is simple, but extremely slow, though it works with any Array with elements that implement '=='.

def levenshtein(a, b)
  case
    when a.empty? then b.length
    when b.empty? then a.length
    else [(a[0] == b[0] ? 0 : 1) + levenshtein(a[1..-1], b[1..-1]),
          1 + levenshtein(a[1..-1], b),
          1 + levenshtein(a, b[1..-1])].min
  end
end

A faster implementation for the Levenshtein distance of strings is available here

↑Jump back a section

Scala

Imperative version. A functional version would likely be far more concise.

def levenshtein(str1: String, str2: String): Int = {
    val lenStr1 = str1.length
    val lenStr2 = str2.length
 
    val d: Array[Array[Int]] = Array.ofDim(lenStr1 + 1, lenStr2 + 1)
 
    for (i <- 0 to lenStr1) d(i)(0) = i
    for (j <- 0 to lenStr2) d(0)(j) = j
 
    for (i <- 1 to lenStr1; j <- 1 to lenStr2) {
      val cost = if (str1(i - 1) == str2(j-1)) 0 else 1
 
      d(i)(j) = min(
        d(i-1)(j  ) + 1,     // deletion
        d(i  )(j-1) + 1,     // insertion
        d(i-1)(j-1) + cost   // substitution
      )
    }
 
    d(lenStr1)(lenStr2)
  }
 
  def min(nums: Int*): Int = nums.min
↑Jump back a section

VBScript

This version is identical to JavaScript and PHP implementations in this article.

Function levenshtein( a, b )
        Dim i,j,cost,d,min1,min2,min3
 
 ' Avoid calculations where there there are empty words
        If Len( a ) = 0 Then levenshtein = Len( b ): Exit Function
        If Len( b ) = 0 Then levenshtein = Len( a ): Exit Function
 
 ' Array initialization 
        ReDim d( Len( a ), Len( b ) )
 
        For i = 0 To Len( a ): d( i, 0 ) = i: Next
        For j = 0 To Len( b ): d( 0, j ) = j: Next
 
 ' Actual calculation
        For i = 1 To Len( a )
                For j = 1 To Len( b )
                        If Mid(a, i, 1) = Mid(b, j, 1) Then cost = 0 Else cost = 1 End If
 
                        ' Since min() function is not a part of VBScript, we'll "emulate" it below
                        min1 = ( d( i - 1, j ) + 1 )
                        min2 = ( d( i, j - 1 ) + 1 )
                        min3 = ( d( i - 1, j - 1 ) + cost )
 
                        If min1 <= min2 And min1 <= min3 Then
                                d( i, j ) = min1
                        ElseIf min2 <= min1 And min2 <= min3 Then
                                d( i, j ) = min2
                        Else
                                d( i, j ) = min3
                        End If
                Next
        Next
 
        levenshtein = d( Len( a ), Len( b ) )
End Function
↑Jump back a section

Visual Basic for Applications (no Damerau extension)

This version is identical to JavaScript and PHP implementations in this article. I had problems when I tried to use the other VBA implementation in this article, so I had to adopt the version below.

Application.WorksheetFunction.Min() method is Excel-specific. If you implement it with other VBA-enabled applications, uncomment the conditional block and comment out the Application.WorksheetFunction.Min() line.

Function levenshtein(a As String, b As String) As Integer
 
    Dim i As Integer
    Dim j As Integer
    Dim cost As Integer
    Dim d() As Integer
    Dim min1 As Integer
    Dim min2 As Integer
    Dim min3 As Integer
 
    If Len( a ) = 0 Then
        levenshtein = Len( b )
        Exit Function
    End If
 
    If Len( b ) = 0 Then
        levenshtein = Len( a )
        Exit Function
    End If
 
    ReDim d(Len(a), Len(b))
 
    For i = 0 To Len(a)
        d(i, 0) = i
    Next
 
    For j = 0 To Len(b)
        d(0, j) = j
    Next
 
    For i = 1 To Len(a)
        For j = 1 To Len(b)
            If Mid(a, i, 1) = Mid(b, j, 1) Then
                cost = 0
            Else
                cost = 1
            End If
 
            ' Since Min() function is not a part of VBA, we'll "emulate" it below
            min1 = (d(i - 1, j) + 1)
            min2 = (d(i, j - 1) + 1)
            min3 = (d(i - 1, j - 1) + cost)
 
'            If min1 <= min2 And min1 <= min3 Then
'                d(i, j) = min1
'            ElseIf min2 <= min1 And min2 <= min3 Then
'                d(i, j) = min2
'            Else
'                d(i, j) = min3
'            End If
            
            ' In Excel we can use Min() function that is included
            ' as a method of WorksheetFunction object
            d(i, j) = Application.WorksheetFunction.Min(min1, min2, min3)
        Next
    Next
 
    levenshtein = d(Len(a), Len(b))
 
End Function
↑Jump back a section

MapBasic

This version is identical to VB implementations in this article.

type twoDimArrayType
        secondArray() as integer
end type
dim FirstArray() as twoDimArrayType
 
Dim i As Integer
Dim j As Integer
Dim cost As Integer
Dim d() As Integer
Dim min1 As Integer
Dim min2 As Integer
Dim min3 As Integer
 
declare Function calculateLevenshteinDistance(byval firstString As String, byval secondString As String) As Integer
Function calculateLevenshteinDistance(byval firstString As String, byval secondString As String) As Integer
        print chr$(12)
'If one of the parameter is null, then the result will be the length of the other parameter...
  if Len(firstString) = 0 Then
      calculateLevenshteinDistance = Len(secondString)
      exit Function
  end if
  if Len(secondString) = 0 Then
      calculateLevenshteinDistance = Len(firstString)
       exit Function
  end if
 
'Initializing array...
        redim FirstArray(len(firstString)+1)
        for i=1 to ubound(FirstArray)
                redim FirstArray(i).secondArray(len(secondString)+1)
        next
 
'Deletion...
  For i = 1 To ubound(FirstArray)
      FirstArray(i).secondArray(1) = i-1
  Next
 
'Insertion ...
  For i = 1 To ubound(FirstArray(ubound(FirstArray)).secondArray)
      FirstArray(1).secondArray(i) = i-1
  Next
 
'Actual calculation...  
  for i=2 to ubound(FirstArray)
        for j=2 to ubound(FirstArray(i).secondArray)    
        If StringCompare(Mid$(firstString, i-1, 1), Mid$(secondString, j-1, 1))=0  Then
        cost = 0
      Else
        cost = 1
      End If
                        min1 =  FirstArray(i-1).secondArray(j) + 1 
                        min2 =  FirstArray(i).secondArray(j-1) + 1 
                        min3 =  FirstArray(i-1).secondArray(j-1) + cost    
                        If min1 <= min2 And min1 <= min3 Then
                                FirstArray(i).secondArray(j) = min1
                        ElseIf min2 <= min1 And min2 <= min3 Then
                                FirstArray(i).secondArray(j) = min2
                        Else
                                FirstArray(i).secondArray(j) = min3
                        End If                  
        Next
  Next
 
'Calculating Return Value...
  calculateLevenshteinDistance = FirstArray(ubound(FirstArray)).secondArray(ubound(FirstArray(ubound(FirstArray)).secondArray))
End Function
↑Jump back a section

Teslock

This is the Levenshtein distance calculation in Teslock Machine Language.

.declare singlecall virtual LevenshteinDistance[args(2) string s1, string s2]: out unsigned int
 main
    (
    define unsigned int constant "cost_ins" == 1;
    define unsigned int constant "cost_del" == 1;
    define unsigned int constant "cost_sub" == 1;

    define unsigned int variable "n1" == calculate::string_operations>length("s1");
    define unsigned int variable "n2" == calculate::string_operations>length("s2");

    define unsigned int_array(calculate::string_operations>array_instantiation("p")>fixed_length("n2", ++1));
    define unsigned int_array(calculate::string_operations>array_instantiation("q")>fixed_length("n2", ++1));
    define unsigned int_array(calculate::string_operations>array_instantiation("r")>variable_length);

    p>array_vector>0 == 0;
    loop_for>finalized(define unsigned int variable "j" == 1)>break_condition("j" <= "n2")>forward_condition(++j)
    (
        p>array_vector>j == p>array_vector>j::access>+constants::cost_ins;
    )

    q>array_vector>0 == 0;
    loop_for>finalized(define unsigned int variable "i" == 1)>break_condition("i" <= "n1")>forward_condition(++i)
    (
        q>array_vector>0 == p>array_vector>0::access>+constants::cost_del;
        
        loop for>finalized(define unsigned int variable "j" == 1)>break_condition("j" <= "n2")>forward_condition(++j)
        (
            define unsigned int variable "d_del" == p>array_vector>j::access>+constants::cost_del;
            define unsigned int variable "d_ins" == q>array_vector>j[delegate_handle::j::access>-1]>+constants::cost_ins;
            define unsigned int veriable "d_sub" == p>array_vector>j[delegate_handle::j::access>-1]>+logical_operations>xor_result["s1"::access>"s1"[delegate_handle::j::access>-1]?0>return_handle_as_result constants::"cost_sub"];
            q>array_vector>j == dll_extern::math_interop_singlecall(min)::[args(3) (d_del, d_ins), d_sub;
        )
        local>"r" == "p"(self_typecast::ignore_unsafe_condition);
        local>"p" == "q"(self_typecast);
        local>"q" == "r"(self_typecast);    
    )
    
    logical_result(param p::singlecall)::define>"return" == p>array_vector>"n2";
)

↑Jump back a section

Abap

REPORT  zlevenshtein.
*----------------------------------------------------------------------*
*       CLASS lcl_levenshtein DEFINITION
*----------------------------------------------------------------------*
*
*----------------------------------------------------------------------*
CLASS lcl_levenshtein DEFINITION.
  PUBLIC SECTION.
    CLASS-METHODS:
      distance IMPORTING i_s TYPE csequence
                         i_t TYPE csequence
               RETURNING value(r) TYPE i.
ENDCLASS.                    "lcl_c DEFINITION
*----------------------------------------------------------------------*
*       CLASS lcl_levenshtein IMPLEMENTATION
*----------------------------------------------------------------------*
*
*----------------------------------------------------------------------*
CLASS lcl_levenshtein IMPLEMENTATION.
  METHOD distance.
    DEFINE m_get.
      l_m_index = ( ( l_l_t * ( l_m_i + ( &2 ) ) ) + l_m_j + ( &1 ) ) + 1 .
      read table l_d into r index l_m_index.
      add &3 to r.
      insert r into table l_v.
    END-OF-DEFINITION.
    DATA: l_d       TYPE STANDARD TABLE OF i,
          l_v       TYPE SORTED TABLE OF i WITH UNIQUE KEY table_line,
          l_cost    TYPE i,
          l_m_i     TYPE i,
          l_m_j     TYPE i,
          l_m_index TYPE i,
          l_l_s     TYPE i,
          l_l_t     TYPE i.

    l_l_s = STRLEN( i_s ).
    l_l_t = STRLEN( i_t ).

    DO l_l_s TIMES.
      l_m_i = sy-index - 1.

      DO l_l_t TIMES. "#EC CI_NESTED
        l_m_j = sy-index - 1.

        IF l_m_j = 0.
          r = l_m_i.
        ELSEIF l_m_i = 0.
          r = l_m_j.
        ELSE.
          IF i_s+l_m_i(1) = i_t+l_m_j(1).
            l_cost = 0.
          ELSE.
            l_cost = 1.
          ENDIF.

          CLEAR l_v.
          m_get: -1  0 1, 0 -1 1, -1 -1 l_cost.
          READ TABLE l_v INTO r INDEX 1.
        ENDIF.
        APPEND r TO l_d.

      ENDDO.
    ENDDO.
  ENDMETHOD.                    "distance
ENDCLASS.                    "lcl_levenshtein IMPLEMENTATION

START-OF-SELECTION.
  DATA: d TYPE i.

  d = lcl_levenshtein=>distance( i_s = 'sitting' i_t = 'kitten' ).
  WRITE: / d.
↑Jump back a section

Pick Basic

       IF STRING1 = STRING2 THEN
          LD = 0
       END ELSE
          S.LEN = LEN(STRING1)
          C.LEN = LEN(STRING2)
          MAT LD.MTX = ''
          DIM LD.MTX(100,100)
          FOR I = 3 TO S.LEN + 2
             LD.MTX(I,1) = STRING1[I-2,1]
          NEXT I
          FOR I = 3 TO S.LEN + 2
             LD.MTX(I,2) = I - 2
          NEXT I
          FOR I = 3 TO C.LEN + 2
             LD.MTX(1,I) = STRING2[I-2,1]
          NEXT I
          FOR I = 3 TO C.LEN + 2
             LD.MTX(2,I) = I - 2
          NEXT I
          FOR I = 3 TO (S.LEN+2)
             S.LETTER = LD.MTX(I,1)
             FOR J = 3 TO (C.LEN+2)
                C.LETTER = LD.MTX(1,J)
                IF C.LETTER = S.LETTER THEN COST = 0 ELSE COST = 1
                P1 = LD.MTX(I-1,J) + 1
                P2 = LD.MTX(I,J-1) + 1
                P3 = LD.MTX(I-1,J-1) + COST
                IF P1 < P2 THEN LD.NUM = P1 ELSE LD.NUM = P2
                IF P3 < P2 THEN LD.NUM = P3
                LD.MTX(I,J) = LD.NUM
             NEXT J
          NEXT I
          LD = LD.MTX(S.LEN+2,C.LEN+2)
       END
↑Jump back a section
Last modified on 7 March 2013, at 17:23