Algorithm Implementation/Strings/Dice's coefficient

Dice's coefficient measures how similar a set and another set are. It can be used to measure how similar two strings are in terms of the number of common bigrams (a bigram is a pair of adjacent letters in the string). Below we give implementations of Dice's coefficient of two strings in different programming languages.

C++

/*
 * dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b
 * (C) 2007 Francis Tyers 
 * Modifications made by Stefan Koshiw 2010
 * Now it outputs values [0..1]
 * Released under the terms of the GNU GPL.
 */
 
float dice_coefficient(wstring string1, wstring string2)
{
 
        set<string> string1_bigrams;
        set<string> string2_bigrams;
 
        //base case
        if(string1.length() == 0 || string2.length() == 0)
        {
                return 0;
        }        
 
        for(unsigned int i = 0; i < (string1.length() - 1); i++) {      // extract character bigrams from string1
                string1_bigrams.insert(string1.substr(i, 2));
        }
        for(unsigned int i = 0; i < (string2.length() - 1); i++) {      // extract character bigrams from string2
                string2_bigrams.insert(string2.substr(i, 2));
        }
 
        int intersection = 0;
 
        // find the intersection between the two sets
 
        for(set<string>::iterator IT = string2_bigrams.begin(); 
            IT != string2_bigrams.end(); 
            IT++)
        {    
                intersection += string1_bigrams.count((*IT));
        }
 
        // calculate dice coefficient
        int total = string1_bigrams.size() + string2_bigrams.size();
        float dice = (float)(intersection * 2) / (float)total;
 
        return dice;
}


↑Jump back a section

Haskell

a naive, non-optimized version

import  Data.List       (intersect, nub)
 
diceCoefficient sa sb =
        fromIntegral (2 * length (intersect agrams bgrams)) / fromIntegral (length agrams + length bgrams)      
        where
                agrams          = bigrams sa
                bgrams          = bigrams sb
                bigrams xs      = nub $ zip xs (tail xs)
↑Jump back a section

Java

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
 
//Note that this implementation is case-sensitive!
public static double diceCoefficient(String s1, String s2)
{
        Set<String> nx = new HashSet<String>();
        Set<String> ny = new HashSet<String>();
 
        for (int i=0; i < s1.length()-1; i++) {
                char x1 = s1.charAt(i);
                char x2 = s1.charAt(i+1);
                String tmp = "" + x1 + x2;
                nx.add(tmp);
        }
        for (int j=0; j < s2.length()-1; j++) {
                char y1 = s2.charAt(j);
                char y2 = s2.charAt(j+1);
                String tmp = "" + y1 + y2;
                ny.add(tmp);
        }
 
        Set<String> intersection = new HashSet<String>(nx);
        intersection.retainAll(ny);
        double totcombigrams = intersection.size();
 
        return (2*totcombigrams) / (nx.size()+ny.size());
}
 
/**
 * Here's an optimized version of the dice coefficient calculation. It takes
 * advantage of the fact that a bigram of 2 chars can be stored in 1 int, and
 * applies a matching algorithm of O(n*log(n)) instead of O(n*n).
 * 
 * <p>Note that, at the time of writing, this implementation differs from the
 * other implementations on this page. Where the other algorithms incorrectly
 * store the generated bigrams in a set (discarding duplicates), this
 * implementation actually treats multiple occurrences of a bigram as unique.
 * The correctness of this behavior is most easily seen when getting the
 * similarity between "GG" and "GGGGGGGG", which should obviously not be 1.
 * 
 * @param s The first string
 * @param t The second String
 * @return The dice coefficient between the two input strings. Returns 0 if one
 *         or both of the strings are {@code null}. Also returns 0 if one or both
 *         of the strings contain less than 2 characters and are not equal.
 * @author Jelle Fresen
 */
public static double diceCoefficientOptimized(String s, String t)
{
        // Verifying the input:
        if (s == null || t == null)
                return 0;
        // Quick check to catch identical objects:
        if (s == t)
                return 1;
        // avoid exception for single character searches
        if (s.length() < 2 || t.length() < 2)
            return 0;
 
        // Create the bigrams for string s:
        final int n = s.length()-1;
        final int[] sPairs = new int[n];
        for (int i = 0; i <= n; i++)
                if (i == 0)
                        sPairs[i] = s.charAt(i) << 16;
                else if (i == n)
                        sPairs[i-1] |= s.charAt(i);
                else
                        sPairs[i] = (sPairs[i-1] |= s.charAt(i)) << 16;
 
        // Create the bigrams for string t:
        final int m = t.length()-1;
        final int[] tPairs = new int[m];
        for (int i = 0; i <= m; i++)
                if (i == 0)
                        tPairs[i] = t.charAt(i) << 16;
                else if (i == m)
                        tPairs[i-1] |= t.charAt(i);
                else
                        tPairs[i] = (tPairs[i-1] |= t.charAt(i)) << 16;
 
        // Sort the bigram lists:
        Arrays.sort(sPairs);
        Arrays.sort(tPairs);
 
        // Count the matches:
        int matches = 0, i = 0, j = 0;
        while (i < n && j < m)
        {
                if (sPairs[i] == tPairs[j])
                {
                        matches += 2;
                        i++;
                        j++;
                }
                else if (sPairs[i] < tPairs[j])
                        i++;
                else
                        j++;
        }
        return (double)matches/(n+m);
}
↑Jump back a section

Javascript

function dice_coefficient(string1, string2) {
  var intersection = 0;
  var length1 = string1.length - 1;
  var length2 = string2.length - 1;
  if(length1 < 1 || length2 < 1) return 0;
  var bigrams2 = [];
  for(var i = 0; i < length2; i++) {
    bigrams2.push(string2.substr(i,2));
  }
  for(var i = 0; i < length1; i++) {
    var bigram1 = string1.substr(i, 2);
    for(var j = 0; j < length2; j++) {
      if(bigram1 == bigrams2[j]) {
        intersection++;
        bigrams2[j] = null;
        break;
  }}} 
  return (2.0 * intersection) / (length1 + length2);  
}
↑Jump back a section

Perl

sub dice_coefficient(){
     my ($str1, $str2) = @_;
 
     return 0 if (! length $str1 || ! length $str2 );
 
     #### bigrams using REGEX.
     ##my @bigrams_str1 = $str1 =~ m/ (?= (..) ) /xmsg;
     ##my @bigrams_str2 = $str2 =~ m/ (?= (..) ) /xmsg;
 
     my $len1 = ( length($str1) -  1 );
     for (my $i=0; $i<$len1; $i++){
         push @bigrams_str1, substr($str1, $i, 2);
     }
 
     my $len2 = ( length($str2) -  1 );
     for (my $i=0; $i<$len2; $i++){
         push @bigrams_str2, substr($str2, $i, 2);
     }
 
     my $intersect = 0;
     for (my $i=0; $i<=$#bigrams_str1; $i++){
          if ( grep /^$bigrams_str1[$i]$/, @bigrams_str2){
               $intersect++;
          }
     }
 
     my $combinedLength = ($#bigram_str1+1 + $#bigrams_str2+1);
     my $dice = ($intersect * 2 / $combinedLength);
 
     return $dice;
}
↑Jump back a section

Python

def dice_coefficient(a, b):
    """dice coefficient 2nt/na + nb."""
    a_bigrams = set(a)
    b_bigrams = set(b)
    overlap = len(a_bigrams & b_bigrams)
    return overlap * 2.0/(len(a_bigrams) + len(b_bigrams))
 
""" more orthodox and robust implementation """
def dice_coefficient(a, b):
    """dice coefficient 2nt/na + nb."""
    if not len(a) or not len(b): return 0.0
    if len(a) == 1:  a=a+u'.'
    if len(b) == 1:  b=b+u'.'
 
    a_bigram_list=[]
    for i in range(len(a)-1):
      a_bigram_list.append(a[i:i+2])
    b_bigram_list=[]
    for i in range(len(b)-1):
      b_bigram_list.append(b[i:i+2])
 
    a_bigrams = set(a_bigram_list)
    b_bigrams = set(b_bigram_list)
    overlap = len(a_bigrams & b_bigrams)
    dice_coeff = overlap * 2.0/(len(a_bigrams) + len(b_bigrams))
    return dice_coeff
↑Jump back a section

Ruby

require 'set'
 
# dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b
def dice_coefficient(a, b)
        a_bigrams = a.each_char.each_cons(2).to_set
        b_bigrams = b.each_char.each_cons(2).to_set
 
        overlap = (a_bigrams & b_bigrams).size
 
        total = a_bigrams.size + b_bigrams.size
        dice  = overlap * 2.0 / total
 
        dice
end
↑Jump back a section

C#

public static double DiceCoefficient(string stOne, string stTwo)
        {
            HashSet<string> nx = new HashSet<string>();
            HashSet<string> ny = new HashSet<string>();
 
            for(int i = 0; i < stOne.Length - 1; i++)
            {
                char x1 = stOne[i];
                char x2 = stOne[i + 1];
                string temp = x1.ToString() + x2.ToString();
                nx.Add(temp);
            }
            for(int j = 0; j < stTwo.Length - 1; j++)
            {
                char y1 = stTwo[j];
                char y2 = stTwo[j + 1];
                string temp = y1.ToString() + y2.ToString();
                ny.Add(temp);
            }
 
            HashSet<string> intersection = new HashSet<string>(nx);
            intersection.IntersectWith(ny);
 
            double dbOne = intersection.Count;
            return (2 * dbOne) / (nx.Count + ny.Count);
 
        }
↑Jump back a section
Last modified on 27 February 2013, at 21:56