# Algorithm Implementation/Strings/Dice's coefficient

Dice's coefficient measures how similar a set and another set are. It can be used to measure how similar two strings are in terms of the number of common bigrams (a bigram is a pair of adjacent letters in the string). Below we give implementations of Dice's coefficient of two strings in different programming languages.

## C++Edit

```/*
* dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b
* (C) 2007 Francis Tyers
* Modifications made by Stefan Koshiw 2010
* Now it outputs values [0..1]
* Released under the terms of the GNU GPL.
*/

float dice_coefficient(wstring string1, wstring string2)
{

set<string> string1_bigrams;
set<string> string2_bigrams;

//base case
if(string1.length() == 0 || string2.length() == 0)
{
return 0;
}

for(unsigned int i = 0; i < (string1.length() - 1); i++) {      // extract character bigrams from string1
string1_bigrams.insert(string1.substr(i, 2));
}
for(unsigned int i = 0; i < (string2.length() - 1); i++) {      // extract character bigrams from string2
string2_bigrams.insert(string2.substr(i, 2));
}

int intersection = 0;

// find the intersection between the two sets

for(set<string>::iterator IT = string2_bigrams.begin();
IT != string2_bigrams.end();
IT++)
{
intersection += string1_bigrams.count((*IT));
}

// calculate dice coefficient
int total = string1_bigrams.size() + string2_bigrams.size();
float dice = (float)(intersection * 2) / (float)total;

return dice;
}
```

a naive, non-optimized version

```import	Data.List	(intersect, nub)

diceCoefficient sa sb =
fromIntegral (2 * length (intersect agrams bgrams)) / fromIntegral (length agrams + length bgrams)
where
agrams		= bigrams sa
bgrams		= bigrams sb
bigrams xs	= nub \$ zip xs (tail xs)
```

## JavaEdit

```import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

//Note that this implementation is case-sensitive!
public static double diceCoefficient(String s1, String s2)
{
Set<String> nx = new HashSet<String>();
Set<String> ny = new HashSet<String>();

for (int i=0; i < s1.length()-1; i++) {
char x1 = s1.charAt(i);
char x2 = s1.charAt(i+1);
String tmp = "" + x1 + x2;
}
for (int j=0; j < s2.length()-1; j++) {
char y1 = s2.charAt(j);
char y2 = s2.charAt(j+1);
String tmp = "" + y1 + y2;
}

Set<String> intersection = new HashSet<String>(nx);
intersection.retainAll(ny);
double totcombigrams = intersection.size();

return (2*totcombigrams) / (nx.size()+ny.size());
}

/**
* Here's an optimized version of the dice coefficient calculation. It takes
* advantage of the fact that a bigram of 2 chars can be stored in 1 int, and
* applies a matching algorithm of O(n*log(n)) instead of O(n*n).
*
* <p>Note that, at the time of writing, this implementation differs from the
* store the generated bigrams in a set (discarding duplicates), this
* implementation actually treats multiple occurrences of a bigram as unique.
* The correctness of this behavior is most easily seen when getting the
* similarity between "GG" and "GGGGGGGG", which should obviously not be 1.
*
* @param s The first string
* @param t The second String
* @return The dice coefficient between the two input strings. Returns 0 if one
*         or both of the strings are {@code null}. Also returns 0 if one or both
*         of the strings contain less than 2 characters and are not equal.
* @author Jelle Fresen
*/
public static double diceCoefficientOptimized(String s, String t)
{
// Verifying the input:
if (s == null || t == null)
return 0;
// Quick check to catch identical objects:
if (s == t)
return 1;
// avoid exception for single character searches
if (s.length() < 2 || t.length() < 2)
return 0;

// Create the bigrams for string s:
final int n = s.length()-1;
final int[] sPairs = new int[n];
for (int i = 0; i <= n; i++)
if (i == 0)
sPairs[i] = s.charAt(i) << 16;
else if (i == n)
sPairs[i-1] |= s.charAt(i);
else
sPairs[i] = (sPairs[i-1] |= s.charAt(i)) << 16;

// Create the bigrams for string t:
final int m = t.length()-1;
final int[] tPairs = new int[m];
for (int i = 0; i <= m; i++)
if (i == 0)
tPairs[i] = t.charAt(i) << 16;
else if (i == m)
tPairs[i-1] |= t.charAt(i);
else
tPairs[i] = (tPairs[i-1] |= t.charAt(i)) << 16;

// Sort the bigram lists:
Arrays.sort(sPairs);
Arrays.sort(tPairs);

// Count the matches:
int matches = 0, i = 0, j = 0;
while (i < n && j < m)
{
if (sPairs[i] == tPairs[j])
{
matches += 2;
i++;
j++;
}
else if (sPairs[i] < tPairs[j])
i++;
else
j++;
}
return (double)matches/(n+m);
}
```

## JavascriptEdit

```function dice_coefficient(string1, string2) {
var intersection = 0;
var length1 = string1.length - 1;
var length2 = string2.length - 1;
if(length1 < 1 || length2 < 1) return 0;
var bigrams2 = [];
for(var i = 0; i < length2; i++) {
bigrams2.push(string2.substr(i,2));
}
for(var i = 0; i < length1; i++) {
var bigram1 = string1.substr(i, 2);
for(var j = 0; j < length2; j++) {
if(bigram1 == bigrams2[j]) {
intersection++;
bigrams2[j] = null;
break;
}}}
return (2.0 * intersection) / (length1 + length2);
}

// The following version reduces the time complexity to O(n) rather than O(n^2) and

// QQQ HOWEVER, the following is not JavaScript. It is a version of BASIC. QQQ

const diceCoefficient = (l, r) => {
if (l.length < 2 || r.length < 2) return 0;

let lBigrams = new Map();
for (let i = 0; i < l.length - 1; i++) {
const bigram = l.substr(i, 2);
const count = lBigrams.has(bigram)
? lBigrams.get(bigram) + 1
: 1;

lBigrams.set(bigram, count);
};

let intersectionSize = 0;
for (let i = 0; i < r.length - 1; i++) {
const bigram = r.substr(i + 2);
const count = lBigrams.has(bigram)
? lBigrams.get(bigram)
: 0;

if (count > 0) {
lBigrams.set(bigram, count - 1);
intersectionSize++;
}
}

return (2.0 * intersectionSize) / (l.length + r.length - 2);
}
```

## PerlEdit

```sub dice_coefficient(){
my (\$str1, \$str2) = @_;

return 0 if (! length \$str1 || ! length \$str2 );

#### bigrams using REGEX.
##my @bigrams_str1 = \$str1 =~ m/ (?= (..) ) /xmsg;
##my @bigrams_str2 = \$str2 =~ m/ (?= (..) ) /xmsg;

my \$len1 = ( length(\$str1) -  1 );
for (my \$i=0; \$i<\$len1; \$i++){
push @bigrams_str1, substr(\$str1, \$i, 2);
}

my \$len2 = ( length(\$str2) -  1 );
for (my \$i=0; \$i<\$len2; \$i++){
push @bigrams_str2, substr(\$str2, \$i, 2);
}

my \$intersect = 0;
STR1: for my \$bigram (@bigrams_str1){
for my \$i (0 .. \$#bigrams_str2){
next unless \$bigrams_str2[\$i];
if (\$bigram eq \$bigrams_str2[\$i]){
\$intersect++;
\$bigrams_str2[\$i] = undef;
next STR1;
}
}
}

my \$combinedLength = (\$#bigrams_str1+1 + \$#bigrams_str2+1);
my \$dice = (\$intersect * 2 / \$combinedLength);

return \$dice;
}
```

## PythonEdit

```def dice_coefficient(a, b):
"""dice coefficient 2nt/na + nb."""
a_bigrams = set(a)
b_bigrams = set(b)
overlap = len(a_bigrams & b_bigrams)
return overlap * 2.0/(len(a_bigrams) + len(b_bigrams))

""" more orthodox and robust implementation """
def dice_coefficient(a, b):
"""dice coefficient 2nt/na + nb."""
if not len(a) or not len(b): return 0.0
if len(a) == 1:  a=a+u'.'
if len(b) == 1:  b=b+u'.'

a_bigram_list=[]
for i in range(len(a)-1):
a_bigram_list.append(a[i:i+2])
b_bigram_list=[]
for i in range(len(b)-1):
b_bigram_list.append(b[i:i+2])

a_bigrams = set(a_bigram_list)
b_bigrams = set(b_bigram_list)
overlap = len(a_bigrams & b_bigrams)
dice_coeff = overlap * 2.0/(len(a_bigrams) + len(b_bigrams))
return dice_coeff

""" duplicate bigrams in a word should be counted distinctly
(per discussion), otherwise 'AA' and 'AAAA' would have a
dice coefficient of 1...
"""
def dice_coefficient(a,b):
if not len(a) or not len(b): return 0.0
""" quick case for true duplicates """
if a == b: return 1.0
""" if a != b, and a or b are single chars, then they can't possibly match """
if len(a) == 1 or len(b) == 1: return 0.0

""" use python list comprehension, preferred over list.append() """
a_bigram_list = [a[i:i+2] for i in range(len(a)-1)]
b_bigram_list = [b[i:i+2] for i in range(len(b)-1)]

a_bigram_list.sort()
b_bigram_list.sort()

# assignments to save function calls
lena = len(a_bigram_list)
lenb = len(b_bigram_list)
# initialize match counters
matches = i = j = 0
while (i < lena and j < lenb):
if a_bigram_list[i] == b_bigram_list[j]:
matches += 2
i += 1
j += 1
elif a_bigram_list[i] < b_bigram_list[j]:
i += 1
else:
j += 1

score = float(matches)/float(lena + lenb)
return score
```

## Objective CEdit

```int compare (const void * a, const void * b)
{
return ( *(int*)a - *(int*)b );
}

+(float)diceCoefficient:(NSString*)s s2:(NSString*)t
{
// Verifying the input:
if (s == nil || t == nil) return 0;
// Quick check to catch identical objects:
if (s == t) return 1;
// avoid exception for single character searches
if (s.length < 2 || t.length < 2) return 0;

// Create the bigrams for string s:
int n = s.length-1;
int sPairs[n];
for (int i = 0; i <= n; i++)
if (i == 0) sPairs[i] = [s characterAtIndex:i] << 16;
else if (i == n) sPairs[i-1] |= [s characterAtIndex:i];
else sPairs[i] = (sPairs[i-1] |= [s characterAtIndex:i]) << 16;

// Create the bigrams for string t:
int m = t.length-1;
int tPairs[m];
for (int i = 0; i <= m; i++)
if (i == 0) tPairs[i] = [t characterAtIndex:i] << 16;
else if (i == m) tPairs[i-1] |= [t characterAtIndex:i];
else tPairs[i] = (tPairs[i-1] |= [t characterAtIndex:i]) << 16;

// Sort the bigram lists:
qsort(sPairs, n, sizeof(int), compare);
qsort(tPairs, m, sizeof(int), compare);

// Count the matches:
int matches = 0, i = 0, j = 0;
while (i < n && j < m) {
if (sPairs[i] == tPairs[j]) {
matches += 2;
i++;
j++;
} else
if (sPairs[i] < tPairs[j]) i++; else j++;
}
return (float)matches/(n+m);
}
```

## RubyEdit

```# dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b
def dice_coefficient(a, b)
a_bigrams = a.each_char.each_cons(2).to_a
b_bigrams = b.each_char.each_cons(2).to_a

overlap = (a_bigrams & b_bigrams).size

total = a_bigrams.size + b_bigrams.size
dice  = overlap * 2.0 / total

dice
end
```

## C#Edit

```        public static double DiceCoefficient(string strA, string strB)
{
HashSet<string> setA = new HashSet<string>();
HashSet<string> setB = new HashSet<string>();

for (int i = 0; i < strA.Length - 1; ++i)

for (int i = 0; i < strB.Length - 1; ++i)

HashSet<string> intersection = new HashSet<string>(setA);
intersection.IntersectWith(setB);

return (2.0 * intersection.Count) / (setA.Count + setB.Count);
}
```

## JEdit

```bigrams =: (_1}.])(<@,"0)(1}.])

dice_coefficient =: (2&*@~.@%@#)@(bigrams,bigrams)
```

## CEdit

```#include <string.h>

double dice_match(const char *string1, const char *string2) {

//check fast cases
if (((string1 != NULL) && (string1[0] == '\0')) ||
((string2 != NULL) && (string2[0] == '\0'))) {
return 0;
}
if (string1 == string2) {
return 1;
}

size_t strlen1 = strlen(string1);
size_t strlen2 = strlen(string2);
if (strlen1 < 2 || strlen2 < 2) {
return 0;
}

size_t length1 = strlen1 - 1;
size_t length2 = strlen2 - 1;

double matches = 0;
int i = 0, j = 0;

//get bigrams and compare
while (i < length1 && j < length2) {
char a[3] = {string1[i], string1[i + 1], '\0'};
char b[3] = {string2[j], string2[j + 1], '\0'};
int cmp = strcmpi(a, b);
if (cmp == 0) {
matches += 2;
}
i++;
j++;
}

return matches / (length1 + length2);
}
```

A possible problem with the above example is that it considers the position of the bigrams as well as their presence/absence. An alternative implementation which is independent of bigram position is given below:

```#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* unsafe macro hashes bigrams into single int */
#define HASH(x,y) (((int) (x) << 16) | (y))

/* comparison function for qsort */
int
cmp ( const void *a, const void *b ) {
int x = *(int*)a, y = *(int*)b;
return x-y;
}

double
dice ( const char *str1, const char *str2 ) {
int *bg1, *bg2;
double matches;
size_t i, strlen1, strlen2, setsize1, setsize2;

/* make sure we've been given strings to compare and that they point to */
/* two distinct places in memory                                        */
if ( str1 == NULL || str2 == NULL ) { return 0; }
if ( str1 == str2 ) { return 1; }

/* make sure strings are long enough (must have at least two chars) */
strlen1 = strlen(str1);
strlen2 = strlen(str2);
if (strlen1 < 2 || strlen2 < 2) { return 0; }

/* allocate memory for bigram sets */
setsize1 = strlen1-1;
bg1 = calloc(setsize1, sizeof(int));
if (!bg1) { return -1; }

setsize2 = strlen2-1;
bg2 = calloc(setsize2, sizeof(int));
if (!bg2) { free(bg1); return -1; }

/* hash the strings to produce bigram sets */
for (i=0; i<setsize1; i++) {
bg1[i] = HASH(str1[i], str1[i+1]);
}

for (i=0; i<setsize2; i++) {
bg2[i] = HASH(str2[i], str2[i+1]);
}

/* sort sets for ease of comparison */
qsort(bg1, setsize1, sizeof(int), cmp);
qsort(bg2, setsize2, sizeof(int), cmp);

/* compute the size of the intersection of bigram sets */
matches = 0;
for ( size_t i=0, j=0; i<setsize1 && j<setsize2; ) {
if ( bg1[i] == bg2[j] ) {
matches++;
i++; j++;
} else if (bg1[i] < bg2[j]) {
i++;
} else {
j++;
}
}

/* always remember to free your memory */
free(bg1);
free(bg2);

/* compute dice */
return (2*matches)/(setsize1+setsize2);
}
```