Algorithm Implementation/Strings/Dice's coefficient
Dice's coefficient measures how similar a set and another set are. It can be used to measure how similar two strings are in terms of the number of common bigrams (a bigram is a pair of adjacent letters in the string). Below we give implementations of Dice's coefficient of two strings in different programming languages.
C++
/* * dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b * (C) 2007 Francis Tyers * Modifications made by Stefan Koshiw 2010 * Now it outputs values [0..1] * Released under the terms of the GNU GPL. */ float dice_coefficient(wstring string1, wstring string2) { set<string> string1_bigrams; set<string> string2_bigrams; //base case if(string1.length() == 0 || string2.length() == 0) { return 0; } for(unsigned int i = 0; i < (string1.length() - 1); i++) { // extract character bigrams from string1 string1_bigrams.insert(string1.substr(i, 2)); } for(unsigned int i = 0; i < (string2.length() - 1); i++) { // extract character bigrams from string2 string2_bigrams.insert(string2.substr(i, 2)); } int intersection = 0; // find the intersection between the two sets for(set<string>::iterator IT = string2_bigrams.begin(); IT != string2_bigrams.end(); IT++) { intersection += string1_bigrams.count((*IT)); } // calculate dice coefficient int total = string1_bigrams.size() + string2_bigrams.size(); float dice = (float)(intersection * 2) / (float)total; return dice; }
Haskell
a naive, non-optimized version
import Data.List (intersect, nub) diceCoefficient sa sb = fromIntegral (2 * length (intersect agrams bgrams)) / fromIntegral (length agrams + length bgrams) where agrams = bigrams sa bgrams = bigrams sb bigrams xs = nub $ zip xs (tail xs)
Java
import java.util.Arrays; import java.util.HashSet; import java.util.Set; //Note that this implementation is case-sensitive! public static double diceCoefficient(String s1, String s2) { Set<String> nx = new HashSet<String>(); Set<String> ny = new HashSet<String>(); for (int i=0; i < s1.length()-1; i++) { char x1 = s1.charAt(i); char x2 = s1.charAt(i+1); String tmp = "" + x1 + x2; nx.add(tmp); } for (int j=0; j < s2.length()-1; j++) { char y1 = s2.charAt(j); char y2 = s2.charAt(j+1); String tmp = "" + y1 + y2; ny.add(tmp); } Set<String> intersection = new HashSet<String>(nx); intersection.retainAll(ny); double totcombigrams = intersection.size(); return (2*totcombigrams) / (nx.size()+ny.size()); } /** * Here's an optimized version of the dice coefficient calculation. It takes * advantage of the fact that a bigram of 2 chars can be stored in 1 int, and * applies a matching algorithm of O(n*log(n)) instead of O(n*n). * * <p>Note that, at the time of writing, this implementation differs from the * other implementations on this page. Where the other algorithms incorrectly * store the generated bigrams in a set (discarding duplicates), this * implementation actually treats multiple occurrences of a bigram as unique. * The correctness of this behavior is most easily seen when getting the * similarity between "GG" and "GGGGGGGG", which should obviously not be 1. * * @param s The first string * @param t The second String * @return The dice coefficient between the two input strings. Returns 0 if one * or both of the strings are {@code null}. Also returns 0 if one or both * of the strings contain less than 2 characters and are not equal. * @author Jelle Fresen */ public static double diceCoefficientOptimized(String s, String t) { // Verifying the input: if (s == null || t == null) return 0; // Quick check to catch identical objects: if (s == t) return 1; // avoid exception for single character searches if (s.length() < 2 || t.length() < 2) return 0; // Create the bigrams for string s: final int n = s.length()-1; final int[] sPairs = new int[n]; for (int i = 0; i <= n; i++) if (i == 0) sPairs[i] = s.charAt(i) << 16; else if (i == n) sPairs[i-1] |= s.charAt(i); else sPairs[i] = (sPairs[i-1] |= s.charAt(i)) << 16; // Create the bigrams for string t: final int m = t.length()-1; final int[] tPairs = new int[m]; for (int i = 0; i <= m; i++) if (i == 0) tPairs[i] = t.charAt(i) << 16; else if (i == m) tPairs[i-1] |= t.charAt(i); else tPairs[i] = (tPairs[i-1] |= t.charAt(i)) << 16; // Sort the bigram lists: Arrays.sort(sPairs); Arrays.sort(tPairs); // Count the matches: int matches = 0, i = 0, j = 0; while (i < n && j < m) { if (sPairs[i] == tPairs[j]) { matches += 2; i++; j++; } else if (sPairs[i] < tPairs[j]) i++; else j++; } return (double)matches/(n+m); }
Javascript
function dice_coefficient(string1, string2) { var intersection = 0; var length1 = string1.length - 1; var length2 = string2.length - 1; if(length1 < 1 || length2 < 1) return 0; var bigrams2 = []; for(var i = 0; i < length2; i++) { bigrams2.push(string2.substr(i,2)); } for(var i = 0; i < length1; i++) { var bigram1 = string1.substr(i, 2); for(var j = 0; j < length2; j++) { if(bigram1 == bigrams2[j]) { intersection++; bigrams2[j] = null; break; }}} return (2.0 * intersection) / (length1 + length2); }
Perl
sub dice_coefficient(){ my ($str1, $str2) = @_; return 0 if (! length $str1 || ! length $str2 ); #### bigrams using REGEX. ##my @bigrams_str1 = $str1 =~ m/ (?= (..) ) /xmsg; ##my @bigrams_str2 = $str2 =~ m/ (?= (..) ) /xmsg; my $len1 = ( length($str1) - 1 ); for (my $i=0; $i<$len1; $i++){ push @bigrams_str1, substr($str1, $i, 2); } my $len2 = ( length($str2) - 1 ); for (my $i=0; $i<$len2; $i++){ push @bigrams_str2, substr($str2, $i, 2); } my $intersect = 0; for (my $i=0; $i<=$#bigrams_str1; $i++){ if ( grep /^$bigrams_str1[$i]$/, @bigrams_str2){ $intersect++; } } my $combinedLength = ($#bigram_str1+1 + $#bigrams_str2+1); my $dice = ($intersect * 2 / $combinedLength); return $dice; }
Python
def dice_coefficient(a, b): """dice coefficient 2nt/na + nb.""" a_bigrams = set(a) b_bigrams = set(b) overlap = len(a_bigrams & b_bigrams) return overlap * 2.0/(len(a_bigrams) + len(b_bigrams)) """ more orthodox and robust implementation """ def dice_coefficient(a, b): """dice coefficient 2nt/na + nb.""" if not len(a) or not len(b): return 0.0 if len(a) == 1: a=a+u'.' if len(b) == 1: b=b+u'.' a_bigram_list=[] for i in range(len(a)-1): a_bigram_list.append(a[i:i+2]) b_bigram_list=[] for i in range(len(b)-1): b_bigram_list.append(b[i:i+2]) a_bigrams = set(a_bigram_list) b_bigrams = set(b_bigram_list) overlap = len(a_bigrams & b_bigrams) dice_coeff = overlap * 2.0/(len(a_bigrams) + len(b_bigrams)) return dice_coeff
Ruby
require 'set' # dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b def dice_coefficient(a, b) a_bigrams = a.each_char.each_cons(2).to_set b_bigrams = b.each_char.each_cons(2).to_set overlap = (a_bigrams & b_bigrams).size total = a_bigrams.size + b_bigrams.size dice = overlap * 2.0 / total dice end
C#
public static double DiceCoefficient(string stOne, string stTwo) { HashSet<string> nx = new HashSet<string>(); HashSet<string> ny = new HashSet<string>(); for(int i = 0; i < stOne.Length - 1; i++) { char x1 = stOne[i]; char x2 = stOne[i + 1]; string temp = x1.ToString() + x2.ToString(); nx.Add(temp); } for(int j = 0; j < stTwo.Length - 1; j++) { char y1 = stTwo[j]; char y2 = stTwo[j + 1]; string temp = y1.ToString() + y2.ToString(); ny.Add(temp); } HashSet<string> intersection = new HashSet<string>(nx); intersection.IntersectWith(ny); double dbOne = intersection.Count; return (2 * dbOne) / (nx.Count + ny.Count); }