In [1]:
# pip install rapidfuzz scikit-learn numpy
import numpy as np
from collections import Counter
import string
from rapidfuzz import fuzz, distance
test_pairs = [
# Direct copies and near-copies
("The cat sat on the mat.", "The cat sat on the mat."), # Exact copy
("The cat sat on the mat.", "The cat sat on the mat"), # No punctuation
("The cat sat on the mat.", "The cat sat on the mat."), # Extra spaces
# Paraphrases with same meaning
("The cat sat on the mat.", "On the mat, the cat was sitting."), # Structural change
("The cat sat on the mat.", "The feline rested on the rug."), # Synonym replacement
("The quick brown fox jumps.", "A fast brown fox leaps."), # Partial synonym
# Different sentences
("The cat sat on the mat.", "The dog ran in the park."), # Different content
("I love programming.", "She enjoys reading books."), # Completely different
("The weather is nice today.", "It's raining outside."), # Opposite meaning
# Edge cases
("Short.", "Short."), # Very short
("A B C D E F G", "A B C D E F G"), # Repeated words
("", ""), # Empty strings
]
print("done")
done
Jaccard Similarity¶
In [58]:
def jaccard_similarity(sent1, sent2):
# make lowercase and split into words
words1 = set(sent1.lower().split())
words2 = set(sent2.lower().split())
intersection = words1.intersection(words2)
union = words1.union(words2)
return float(len(intersection)) / len(union) if union else 0.0
small_test_pairs = [
("The cat sat on the mat.", "The cat sat on the mat."), # Copy
("The cat sat on the mat.", "On the mat sat the cat."), # Same words rearranged
("The cat sat on the mat.", "The dog ran in the park") # Different
]
print(f"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}")
print("=" * 100)
for sent1, sent2 in test_pairs:
similarity = jaccard_similarity(sent1, sent2)
print(f"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}") # 3 decimal places
#print("-"* 50)
Sentence 1 Sentence 2: Similarity Score:
====================================================================================================
The cat sat on the mat. vs The cat sat on the mat. 1.000
The cat sat on the mat. vs The cat sat on the mat 0.667
The cat sat on the mat. vs The cat sat on the mat. 1.000
The cat sat on the mat. vs On the mat, the cat was sitting. 0.375
The cat sat on the mat. vs The feline rested on the rug. 0.250
The quick brown fox jumps. vs A fast brown fox leaps. 0.250
The cat sat on the mat. vs The dog ran in the park. 0.111
I love programming. vs She enjoys reading books. 0.000
The weather is nice today. vs It's raining outside. 0.000
Short. vs Short. 1.000
A B C D E F G vs A B C D E F G 1.000
vs 0.000
--- Leneshtein Similarity ---¶
Character & Word
In [57]:
def char_levenshtein_similarity(sent1, sent2):
""" Character based edit-distance similarity """
if not sent1 and not sent2:
return 1.0
if not sent1 or not sent2:
return 0.0
max_len = max(len(sent1), len(sent2))
edit_distance = distance.Levenshtein.distance(sent1, sent2)
return 1 - (edit_distance / max_len)
def word_levenshtein_similarity(sent1, sent2):
""" Word based edit-distance similarity """
words1 = sent1.lower().split()
words2 = sent2.lower().split()
if not words1 and not words2:
return 1.0
if not words1 or not words2:
return 0.0
max_len = max(len(words1), len(words2))
edit_distance = distance.Levenshtein.distance(words1, words2)
return 1 - (edit_distance / max_len)
print(f"{'Sentence 1':<41} {'Sentence 2:':<20} {'Similarities ->':<19} {'Char:':<10} {'Word:'}")
print("=" * 100)
for sent1, sent2 in test_pairs:
char_similarity = char_levenshtein_similarity(sent1, sent2)
word_similarity = word_levenshtein_similarity(sent1, sent2)
print(f"{sent1:<30} {'vs':<10} {sent2:<40} {char_similarity:.3f}{'':<5} {word_similarity:.3F} ") # 3 decimal places
#print("-"* 50
Sentence 1 Sentence 2: Similarities -> Char: Word:
====================================================================================================
The cat sat on the mat. vs The cat sat on the mat. 1.000 1.000
The cat sat on the mat. vs The cat sat on the mat 0.957 0.833
The cat sat on the mat. vs The cat sat on the mat. 0.821 1.000
The cat sat on the mat. vs On the mat, the cat was sitting. 0.344 0.143
The cat sat on the mat. vs The feline rested on the rug. 0.517 0.500
The quick brown fox jumps. vs A fast brown fox leaps. 0.577 0.400
The cat sat on the mat. vs The dog ran in the park. 0.625 0.333
I love programming. vs She enjoys reading books. 0.200 0.000
The weather is nice today. vs It's raining outside. 0.192 0.000
Short. vs Short. 1.000 1.000
A B C D E F G vs A B C D E F G 1.000 1.000
vs 1.000 1.000
--- Cosine Similarity ---¶
In [38]:
def cosine_similarity_bow(sent1, sent2):
""" dosine similarity using bag-of-words """
words1 = sent1.lower().translate(str.maketrans('', '', string.punctuation)).split()
words2 = sent2.lower().translate(str.maketrans('', '', string.punctuation)).split()
if not words1 and not words2:
return 1.0
vocabulary = set(words1 + words2)
if not vocabulary:
return 0.0
# Create frequency vectors
freq1 = Counter(words1)
freq2 = Counter(words2)
# Convert to vectors
vec1 = np.array([freq1[word] for word in vocabulary])
vec2 = np.array([freq2[word] for word in vocabulary])
# Compute cosine similarity
dot_product = np.dot(vec1, vec2)
norm1 = np.linalg.norm(vec1)
norm2 = np.linalg.norm(vec2)
if norm1 == 0 or norm2 == 0:
return 0.0
return dot_product / (norm1 * norm2)
print(f"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}")
print("=" * 100)
for sent1, sent2 in test_pairs:
similarity = cosine_similarity_bow(sent1, sent2)
print(f"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}") # 3 decimal places
#print("-"* 50)
Sentence 1 Sentence 2: Similarity Score:
====================================================================================================
The cat sat on the mat. vs The cat sat on the mat. 1.000
The cat sat on the mat. vs The cat sat on the mat 1.000
The cat sat on the mat. vs The cat sat on the mat. 1.000
The cat sat on the mat. vs On the mat, the cat was sitting. 0.825
The cat sat on the mat. vs The feline rested on the rug. 0.625
The quick brown fox jumps. vs A fast brown fox leaps. 0.400
The cat sat on the mat. vs The dog ran in the park. 0.500
I love programming. vs She enjoys reading books. 0.000
The weather is nice today. vs It's raining outside. 0.000
Short. vs Short. 1.000
A B C D E F G vs A B C D E F G 1.000
vs 1.000
--- Fuzzy ratios ---¶
In [5]:
def fuzzy_ratio_similarity(sent1, sent2):
"""Fuzzy string matching ratio"""
return fuzz.ratio(sent1.lower(), sent2.lower()) / 100.0
def fuzzy_partial_ratio(sent1, sent2):
"""Fuzzy partial string matching"""
return fuzz.partial_ratio(sent1.lower(), sent2.lower()) / 100.0
def fuzzy_token_sort_ratio(sent1, sent2):
"""Fuzzy token sorting ratio (ignore order)"""
return fuzz.token_sort_ratio(sent1.lower(), sent2.lower()) / 100.0
def fuzzy_token_set_ratio(sent1, sent2):
""" Fuzzy token set ratio (duplicates) """
return fuzz.token_set_ratio(sent1.lower(), sent2.lower()) / 100.0
print("Fuzzy ratio Examples")
print("=" * 90)
print(f"{'Case':<30} {'Ratio':<8} {'Partial':<8} {'Token Sort':<12} {'Token Set':<8} Description")
print("=" * 90)
for sent1, sent2 in test_pairs:
ratio = int(fuzz.ratio(sent1, sent2))
partial = int(fuzz.partial_ratio(sent1, sent2))
token_sort = int(fuzz.token_sort_ratio(sent1, sent2))
token_set = int(fuzz.token_set_ratio(sent1, sent2))
# description
if sent1 == sent2:
desc = "Exact copy"
elif sorted(sent1.split()) == sorted(sent2.split()):
desc = "Same words, different order"
elif set(sent1.split()).issubset(set(sent2.split())) or set(sent2.split()).issubset(set(sent1.split())):
desc = "Subset relationship"
else:
desc = "Different content"
print(f"{sent1[:28]:<30} {ratio:<8} {partial:<8} {token_sort:<12} {token_set:<8} {desc}")
Fuzzy ratio Examples
==========================================================================================
Case Ratio Partial Token Sort Token Set Description
==========================================================================================
The cat sat on the mat. 100 100 100 100 Exact copy
The cat sat on the mat. 97 100 97 97 Different content
The cat sat on the mat. 90 82 100 100 Same words, different order
The cat sat on the mat. 47 57 58 62 Different content
The cat sat on the mat. 61 60 53 60 Different content
The quick brown fox jumps. 61 70 57 57 Different content
The cat sat on the mat. 63 63 55 55 Different content
I love programming. 40 43 40 40 Different content
The weather is nice today. 38 47 29 29 Different content
Short. 100 100 100 100 Exact copy
A B C D E F G 100 100 100 100 Exact copy
100 100 100 0 Exact copy
--- Longest common sub-sequence ----¶
In [37]:
def longest_common_subsequence(sent1, sent2):
""" Longest common subsequence similarity """
if not sent1 or not sent2:
return 0.0
# Remove punctuation for better matching
sent1_clean = sent1.lower().translate(str.maketrans('', '', string.punctuation))
sent2_clean = sent2.lower().translate(str.maketrans('', '', string.punctuation))
m, n = len(sent1_clean), len(sent2_clean)
dp = [[0] * (n + 1) for _ in range(m + 1)]
for i in range(1, m + 1):
for j in range(1, n + 1):
if sent1_clean[i-1] == sent2_clean[j-1]:
dp[i][j] = dp[i-1][j-1] + 1
else:
dp[i][j] = max(dp[i-1][j], dp[i][j-1])
lcs_length = dp[m][n]
return lcs_length / max(m, n) if max(m, n) > 0 else 0.0
print(f"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}")
print("=" * 100)
for sent1, sent2 in test_pairs:
similarity = longest_common_subsequence(sent1, sent2)
print(f"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}") # 3 decimal places
#print("-"* 50)
Sentence 1 Sentence 2: Similarity Score:
====================================================================================================
The cat sat on the mat. vs The cat sat on the mat. 1.000
The cat sat on the mat. vs The cat sat on the mat 1.000
The cat sat on the mat. vs The cat sat on the mat. 0.815
The cat sat on the mat. vs On the mat, the cat was sitting. 0.433
The cat sat on the mat. vs The feline rested on the rug. 0.536
The quick brown fox jumps. vs A fast brown fox leaps. 0.560
The cat sat on the mat. vs The dog ran in the park. 0.609
I love programming. vs She enjoys reading books. 0.333
The weather is nice today. vs It's raining outside. 0.360
Short. vs Short. 1.000
A B C D E F G vs A B C D E F G 1.000
vs 0.000
--- Containment Similarity ---¶
Percentage of Sentance A in Sentance B
containment(A, B) = |words(A) ∩ words(B)| / |words(A)|
In [ ]:
def containment_similarity(sent1, sent2):
# sent1 in sent2 (asymmetric)
""" What percentage of sent1's words are in sent2 """
words1 = set(sent1.lower().translate(str.maketrans('', '', string.punctuation)).split())
words2 = set(sent2.lower().translate(str.maketrans('', '', string.punctuation)).split())
if not words1:
return 0.0
common = words1.intersection(words2)
return len(common) / len(words1)
Evaluate BaseLine Methods¶
In [25]:
def evaluate_baseline_methods(pairs):
""" Evaluate all baseline methods on test pairs"""
methods = {
'Jaccard': jaccard_similarity,
'Levenshtein (char)': char_levenshtein_similarity,
'Levenshtein (word)': word_levenshtein_similarity,
'Cosine BOW': cosine_similarity_bow,
'Fuzzy Ratio': fuzzy_ratio_similarity,
'Fuzzy Partial': fuzzy_partial_ratio,
'Fuzzy Token Sort': fuzzy_token_sort_ratio,
'Fuzzy Token Set': fuzzy_token_set_ratio,
'LCS': longest_common_subsequence,
'Containment': containment_similarity,
}
results = {method: [] for method in methods}
for sent1, sent2 in pairs:
for method_name, method_func in methods.items():
similarity = method_func(sent1, sent2)
results[method_name].append(similarity)
return results, methods
def print_comparison_table(results, pairs):
""" Print a formatted comparison table """
print(f"{'Pair':<30} {'Sentence 1':<30} {'Sentence 2':<30}")
print("=" * 100)
for i, (sent1, sent2) in enumerate(pairs):
# if sentence too long
display_sent1 = sent1[:27] + "..." if len(sent1) > 30 else sent1
display_sent2 = sent2[:27] + "..." if len(sent2) > 30 else sent2
print(f"{f'Pair {i+1}:':<30} {display_sent1:<30} {display_sent2:<30}")
# Print similarities for this pair
for method_name in results:
similarity = results[method_name][i]
print(f"{'':<30} {method_name + ':':<20} {similarity:.3f}")
print("-" * 100)
results, methods = evaluate_baseline_methods(test_pairs)
print_comparison_table(results, test_pairs)
Pair Sentence 1 Sentence 2
====================================================================================================
Pair 1: The cat sat on the mat. The cat sat on the mat.
Jaccard: 1.000
Levenshtein (char): 1.000
Levenshtein (word): 1.000
Cosine BOW: 1.000
Fuzzy Ratio: 1.000
Fuzzy Partial: 1.000
Fuzzy Token Sort: 1.000
Fuzzy Token Set: 1.000
LCS: 1.000
Containment: 1.000
----------------------------------------------------------------------------------------------------
Pair 2: The cat sat on the mat. The cat sat on the mat
Jaccard: 0.667
Levenshtein (char): 0.957
Levenshtein (word): 0.833
Cosine BOW: 1.000
Fuzzy Ratio: 0.978
Fuzzy Partial: 1.000
Fuzzy Token Sort: 0.978
Fuzzy Token Set: 0.973
LCS: 1.000
Containment: 1.000
----------------------------------------------------------------------------------------------------
Pair 3: The cat sat on the mat. The cat sat on the mat.
Jaccard: 1.000
Levenshtein (char): 0.821
Levenshtein (word): 1.000
Cosine BOW: 1.000
Fuzzy Ratio: 0.902
Fuzzy Partial: 0.826
Fuzzy Token Sort: 1.000
Fuzzy Token Set: 1.000
LCS: 0.815
Containment: 1.000
----------------------------------------------------------------------------------------------------
Pair 4: The cat sat on the mat. On the mat, the cat was sit...
Jaccard: 0.375
Levenshtein (char): 0.344
Levenshtein (word): 0.143
Cosine BOW: 0.825
Fuzzy Ratio: 0.509
Fuzzy Partial: 0.619
Fuzzy Token Sort: 0.764
Fuzzy Token Set: 0.723
LCS: 0.433
Containment: 0.800
----------------------------------------------------------------------------------------------------
Pair 5: The cat sat on the mat. The feline rested on the rug.
Jaccard: 0.250
Levenshtein (char): 0.517
Levenshtein (word): 0.500
Cosine BOW: 0.625
Fuzzy Ratio: 0.615
Fuzzy Partial: 0.605
Fuzzy Token Sort: 0.538
Fuzzy Token Set: 0.480
LCS: 0.536
Containment: 0.400
----------------------------------------------------------------------------------------------------
Pair 6: The quick brown fox jumps. A fast brown fox leaps.
Jaccard: 0.250
Levenshtein (char): 0.577
Levenshtein (word): 0.400
Cosine BOW: 0.400
Fuzzy Ratio: 0.612
Fuzzy Partial: 0.700
Fuzzy Token Sort: 0.531
Fuzzy Token Set: 0.562
LCS: 0.560
Containment: 0.400
----------------------------------------------------------------------------------------------------
Pair 7: The cat sat on the mat. The dog ran in the park.
Jaccard: 0.111
Levenshtein (char): 0.625
Levenshtein (word): 0.333
Cosine BOW: 0.500
Fuzzy Ratio: 0.638
Fuzzy Partial: 0.636
Fuzzy Token Sort: 0.553
Fuzzy Token Set: 0.462
LCS: 0.609
Containment: 0.200
----------------------------------------------------------------------------------------------------
Pair 8: I love programming. She enjoys reading books.
Jaccard: 0.000
Levenshtein (char): 0.200
Levenshtein (word): 0.000
Cosine BOW: 0.000
Fuzzy Ratio: 0.409
Fuzzy Partial: 0.432
Fuzzy Token Sort: 0.364
Fuzzy Token Set: 0.364
LCS: 0.333
Containment: 0.000
----------------------------------------------------------------------------------------------------
Pair 9: The weather is nice today. It's raining outside.
Jaccard: 0.000
Levenshtein (char): 0.192
Levenshtein (word): 0.000
Cosine BOW: 0.000
Fuzzy Ratio: 0.426
Fuzzy Partial: 0.514
Fuzzy Token Sort: 0.340
Fuzzy Token Set: 0.340
LCS: 0.360
Containment: 0.000
----------------------------------------------------------------------------------------------------
Pair 10: Short. Short.
Jaccard: 1.000
Levenshtein (char): 1.000
Levenshtein (word): 1.000
Cosine BOW: 1.000
Fuzzy Ratio: 1.000
Fuzzy Partial: 1.000
Fuzzy Token Sort: 1.000
Fuzzy Token Set: 1.000
LCS: 1.000
Containment: 1.000
----------------------------------------------------------------------------------------------------
Pair 11: A B C D E F G A B C D E F G
Jaccard: 1.000
Levenshtein (char): 1.000
Levenshtein (word): 1.000
Cosine BOW: 1.000
Fuzzy Ratio: 1.000
Fuzzy Partial: 1.000
Fuzzy Token Sort: 1.000
Fuzzy Token Set: 1.000
LCS: 1.000
Containment: 1.000
----------------------------------------------------------------------------------------------------
Pair 12:
Jaccard: 0.000
Levenshtein (char): 1.000
Levenshtein (word): 1.000
Cosine BOW: 1.000
Fuzzy Ratio: 1.000
Fuzzy Partial: 1.000
Fuzzy Token Sort: 1.000
Fuzzy Token Set: 0.000
LCS: 0.000
Containment: 0.000
----------------------------------------------------------------------------------------------------