# pip install rapidfuzz scikit-learn numpy

import numpy as np
from collections import Counter
import string
from rapidfuzz import fuzz, distance

# Sentence pairs fed to every similarity metric in this file, grouped by how
# closely the second sentence follows the first.
test_pairs = [
    # Direct copies and near-copies
    ("The cat sat on the mat.", "The cat sat on the mat."),           # Exact copy
    ("The cat sat on the mat.", "The cat sat on the mat"),            # Trailing period removed
    ("The cat sat on the mat.", "The  cat  sat  on  the  mat."),      # Extra spaces

    # Paraphrases with same meaning
    ("The cat sat on the mat.", "On the mat, the cat was sitting."),  # Structural change
    ("The cat sat on the mat.", "The feline rested on the rug."),     # Synonym replacement
    ("The quick brown fox jumps.", "A fast brown fox leaps."),        # Partial synonym

    # Different sentences
    ("The cat sat on the mat.", "The dog ran in the park."),          # Different content
    ("I love programming.", "She enjoys reading books."),             # Completely different
    ("The weather is nice today.", "It's raining outside."),          # Contrasting meaning

    # Edge cases
    ("Short.", "Short."),                                             # Very short
    ("A B C D E F G", "A B C D E F G"),                              # Single-letter tokens, identical pair
    ("", ""),                                                         # Empty strings
]
print("done")

done

def jaccard_similarity(sent1, sent2):
    """Return the Jaccard similarity of the two sentences' word sets.

    Each sentence is lower-cased and split on whitespace. Punctuation is
    left attached to its word, so "mat." and "mat" are different tokens.

    Args:
        sent1: First sentence.
        sent2: Second sentence.

    Returns:
        ``|A & B| / |A | B|`` as a float in ``[0.0, 1.0]``. Two inputs that
        both contain no tokens are identical and score ``1.0``.
    """
    words1 = set(sent1.lower().split())
    words2 = set(sent2.lower().split())
    union = words1 | words2
    if not union:
        # Both inputs are empty or whitespace-only. They are identical, so
        # report a full match -- the same convention the Levenshtein and
        # cosine metrics in this file use -- instead of 0.0.
        return 1.0
    return len(words1 & words2) / len(union)

# Three-pair sample: an exact copy, a reordering, and an unrelated sentence.
# NOTE(review): this list is defined here, but the loop below iterates
# test_pairs, not small_test_pairs -- confirm which one was intended.
small_test_pairs = [
    ("The cat sat on the mat.", "The cat sat on the mat."),     # Copy
    ("The cat sat on the mat.", "On the mat sat the cat."),     # Same words rearranged
    ("The cat sat on the mat.", "The dog ran in the park")      # Different
]

# Header row: width 41 = 30 (sentence 1) + 1 space + 10 ('vs'), so the
# 'Sentence 2:' heading lines up with the second sentence in the rows below.
print(f"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}")
print("=" * 100)

# One row per pair, with the Jaccard score shown to three decimal places.
for sent1, sent2 in test_pairs:
    similarity = jaccard_similarity(sent1, sent2)
    print(f"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}")

Sentence 1                                Sentence 2:                              Similarity Score:
====================================================================================================
The cat sat on the mat.        vs         The cat sat on the mat.                  1.000
The cat sat on the mat.        vs         The cat sat on the mat                   0.667
The cat sat on the mat.        vs         The  cat  sat  on  the  mat.             1.000
The cat sat on the mat.        vs         On the mat, the cat was sitting.         0.375
The cat sat on the mat.        vs         The feline rested on the rug.            0.250
The quick brown fox jumps.     vs         A fast brown fox leaps.                  0.250
The cat sat on the mat.        vs         The dog ran in the park.                 0.111
I love programming.            vs         She enjoys reading books.                0.000
The weather is nice today.     vs         It's raining outside.                    0.000
Short.                         vs         Short.                                   1.000
A B C D E F G                  vs         A B C D E F G                            1.000
                               vs                                                  0.000

def char_levenshtein_similarity(sent1, sent2):
    """Score two strings by normalised character-level edit distance.

    The strings are compared exactly as given (no lower-casing or
    punctuation stripping).

    Args:
        sent1: First string.
        sent2: Second string.

    Returns:
        ``1.0`` when both strings are empty, ``0.0`` when exactly one is
        empty, otherwise ``1 - distance / len(longer string)``.
    """
    if sent1 and sent2:
        longest = max(len(sent1), len(sent2))
        edits = distance.Levenshtein.distance(sent1, sent2)
        return 1 - (edits / longest)

    # At least one side is empty: a match only if the other side is too.
    return 0.0 if (sent1 or sent2) else 1.0

def word_levenshtein_similarity(sent1, sent2):
    """Score two sentences by normalised word-level edit distance.

    Both sentences are lower-cased and split on whitespace; the edit
    distance is then taken over the resulting word lists.

    Args:
        sent1: First sentence.
        sent2: Second sentence.

    Returns:
        ``1.0`` when neither sentence has any words, ``0.0`` when exactly
        one has none, otherwise ``1 - distance / len(longer word list)``.
    """
    tokens_a = sent1.lower().split()
    tokens_b = sent2.lower().split()

    if not (tokens_a or tokens_b):
        return 1.0
    if not (tokens_a and tokens_b):
        return 0.0

    longest = max(len(tokens_a), len(tokens_b))
    edits = distance.Levenshtein.distance(tokens_a, tokens_b)
    return 1 - (edits / longest)

    

# Header row for the character-level and word-level Levenshtein scores.
print(f"{'Sentence 1':<41} {'Sentence 2:':<20} {'Similarities ->':<19} {'Char:':<10} {'Word:'}")
print("=" * 100)

for sent1, sent2 in test_pairs:
    char_similarity = char_levenshtein_similarity(sent1, sent2)
    word_similarity = word_levenshtein_similarity(sent1, sent2)
    # Both scores to three decimals; the empty {'':<5} field inserts five
    # spaces of padding between the two score columns.
    print(f"{sent1:<30} {'vs':<10} {sent2:<40} {char_similarity:.3f}{'':<5} {word_similarity:.3F} ")

Sentence 1                                Sentence 2:          Similarities ->     Char:      Word:
====================================================================================================
The cat sat on the mat.        vs         The cat sat on the mat.                  1.000      1.000 
The cat sat on the mat.        vs         The cat sat on the mat                   0.957      0.833 
The cat sat on the mat.        vs         The  cat  sat  on  the  mat.             0.821      1.000 
The cat sat on the mat.        vs         On the mat, the cat was sitting.         0.344      0.143 
The cat sat on the mat.        vs         The feline rested on the rug.            0.517      0.500 
The quick brown fox jumps.     vs         A fast brown fox leaps.                  0.577      0.400 
The cat sat on the mat.        vs         The dog ran in the park.                 0.625      0.333 
I love programming.            vs         She enjoys reading books.                0.200      0.000 
The weather is nice today.     vs         It's raining outside.                    0.192      0.000 
Short.                         vs         Short.                                   1.000      1.000 
A B C D E F G                  vs         A B C D E F G                            1.000      1.000 
                               vs                                                  1.000      1.000

def cosine_similarity_bow(sent1, sent2):
    """Return the cosine similarity of two sentences' bag-of-words vectors.

    Each sentence is lower-cased, stripped of ASCII punctuation and split on
    whitespace; the word counts over the combined vocabulary form the two
    vectors that are compared.

    Args:
        sent1: First sentence.
        sent2: Second sentence.

    Returns:
        A builtin ``float``: ``1.0`` when neither sentence has any words,
        ``0.0`` when exactly one has none, otherwise
        ``dot(v1, v2) / (|v1| * |v2|)``.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    words1 = sent1.lower().translate(strip_punct).split()
    words2 = sent2.lower().translate(strip_punct).split()

    if not words1 and not words2:
        return 1.0
    if not words1 or not words2:
        # One side has no words, so its vector would have zero norm.
        return 0.0

    freq1 = Counter(words1)
    freq2 = Counter(words2)

    # Fix one ordering of the combined vocabulary so both vectors share axes.
    vocabulary = sorted(freq1.keys() | freq2.keys())
    vec1 = np.array([freq1[word] for word in vocabulary])
    vec2 = np.array([freq2[word] for word in vocabulary])

    # Take a single square root of the product of squared norms: for
    # identical bags this is an exact perfect square, so the score is
    # exactly 1.0 rather than 0.999... from multiplying two rounded norms.
    norm_product = np.sqrt(np.dot(vec1, vec1) * np.dot(vec2, vec2))
    return float(np.dot(vec1, vec2) / norm_product)

# Header row: width 41 = 30 (sentence 1) + 1 space + 10 ('vs'), so the
# 'Sentence 2:' heading lines up with the second sentence in the rows below.
print(f"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}")
print("=" * 100)

# One row per pair, with the bag-of-words cosine score to three decimals.
for sent1, sent2 in test_pairs:
    similarity = cosine_similarity_bow(sent1, sent2)
    print(f"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}")

Sentence 1                                Sentence 2:                              Similarity Score:
====================================================================================================
The cat sat on the mat.        vs         The cat sat on the mat.                  1.000
The cat sat on the mat.        vs         The cat sat on the mat                   1.000
The cat sat on the mat.        vs         The  cat  sat  on  the  mat.             1.000
The cat sat on the mat.        vs         On the mat, the cat was sitting.         0.825
The cat sat on the mat.        vs         The feline rested on the rug.            0.625
The quick brown fox jumps.     vs         A fast brown fox leaps.                  0.400
The cat sat on the mat.        vs         The dog ran in the park.                 0.500
I love programming.            vs         She enjoys reading books.                0.000
The weather is nice today.     vs         It's raining outside.                    0.000
Short.                         vs         Short.                                   1.000
A B C D E F G                  vs         A B C D E F G                            1.000
                               vs                                                  1.000

def fuzzy_ratio_similarity(sent1, sent2):
    """Return rapidfuzz's ``fuzz.ratio`` for the lower-cased inputs, divided by 100."""
    lowered1, lowered2 = sent1.lower(), sent2.lower()
    score = fuzz.ratio(lowered1, lowered2)
    return score / 100.0

def fuzzy_partial_ratio(sent1, sent2):
    """Return rapidfuzz's ``fuzz.partial_ratio`` (partial string matching)
    for the lower-cased inputs, divided by 100."""
    lowered1, lowered2 = sent1.lower(), sent2.lower()
    score = fuzz.partial_ratio(lowered1, lowered2)
    return score / 100.0

def fuzzy_token_sort_ratio(sent1, sent2):
    """Return rapidfuzz's ``fuzz.token_sort_ratio`` (word order ignored)
    for the lower-cased inputs, divided by 100."""
    lowered1, lowered2 = sent1.lower(), sent2.lower()
    score = fuzz.token_sort_ratio(lowered1, lowered2)
    return score / 100.0

def fuzzy_token_set_ratio(sent1, sent2):
    """Return rapidfuzz's ``fuzz.token_set_ratio`` (token sets, so duplicate
    words are not counted twice) for the lower-cased inputs, divided by 100."""
    lowered1, lowered2 = sent1.lower(), sent2.lower()
    score = fuzz.token_set_ratio(lowered1, lowered2)
    return score / 100.0

# Side-by-side table of the four rapidfuzz scores for every test pair, plus a
# coarse label describing how the two sentences' words relate.
print("Fuzzy ratio Examples")
print("=" * 90)
# 'Token Set' is nine characters, so its column is 10 wide (header and rows)
# to keep the Description column aligned.
print(f"{'Case':<30} {'Ratio':<8} {'Partial':<8} {'Token Sort':<12} {'Token Set':<10} Description")
print("=" * 90)

# Used only for the description: drop ASCII punctuation so "mat." and "mat"
# count as the same word when classifying a pair.
_punct_table = str.maketrans('', '', string.punctuation)

for sent1, sent2 in test_pairs:
    # Scores are taken on the raw (case- and punctuation-sensitive) strings
    # and truncated to whole numbers for display.
    ratio = int(fuzz.ratio(sent1, sent2))
    partial = int(fuzz.partial_ratio(sent1, sent2))
    token_sort = int(fuzz.token_sort_ratio(sent1, sent2))
    token_set = int(fuzz.token_set_ratio(sent1, sent2))

    tokens1 = sent1.lower().translate(_punct_table).split()
    tokens2 = sent2.lower().translate(_punct_table).split()

    # Classify from most to least specific. Checking list equality before
    # sorted equality keeps spacing/punctuation-only variants from being
    # reported as a reordering.
    if sent1 == sent2:
        desc = "Exact copy"
    elif tokens1 == tokens2:
        desc = "Same words, same order"
    elif sorted(tokens1) == sorted(tokens2):
        desc = "Same words, different order"
    elif set(tokens1) <= set(tokens2) or set(tokens2) <= set(tokens1):
        desc = "Subset relationship"
    else:
        desc = "Different content"

    print(f"{sent1[:28]:<30} {ratio:<8} {partial:<8} {token_sort:<12} {token_set:<10} {desc}")

Fuzzy ratio Examples
==========================================================================================
Case                           Ratio    Partial  Token Sort   Token Set Description
==========================================================================================
The cat sat on the mat.        100      100      100          100      Exact copy
The cat sat on the mat.        97       100      97           97       Different content
The cat sat on the mat.        90       82       100          100      Same words, different order
The cat sat on the mat.        47       57       58           62       Different content
The cat sat on the mat.        61       60       53           60       Different content
The quick brown fox jumps.     61       70       57           57       Different content
The cat sat on the mat.        63       63       55           55       Different content
I love programming.            40       43       40           40       Different content
The weather is nice today.     38       47       29           29       Different content
Short.                         100      100      100          100      Exact copy
A B C D E F G                  100      100      100          100      Exact copy
                               100      100      100          0        Exact copy

def longest_common_subsequence(sent1, sent2):
    """Return a similarity score based on the longest common subsequence.

    Both sentences are lower-cased and stripped of ASCII punctuation, then
    compared character by character (whitespace included).

    Args:
        sent1: First sentence.
        sent2: Second sentence.

    Returns:
        ``LCS length / len(longer cleaned string)`` as a float in
        ``[0.0, 1.0]``. If both cleaned strings are empty they are identical
        and score ``1.0``; if exactly one is empty the score is ``0.0``.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    text1 = sent1.lower().translate(strip_punct)
    text2 = sent2.lower().translate(strip_punct)

    # Decide the empty cases on the cleaned text, so empty or
    # punctuation-only pairs count as a full match, as in the Levenshtein
    # and cosine metrics in this file.
    if not text1 and not text2:
        return 1.0
    if not text1 or not text2:
        return 0.0

    # Classic LCS dynamic programme, keeping only the previous row:
    # previous[j] is the LCS length of the text1 prefix processed so far
    # against text2[:j].
    previous = [0] * (len(text2) + 1)
    for ch1 in text1:
        current = [0]
        for j, ch2 in enumerate(text2, start=1):
            if ch1 == ch2:
                current.append(previous[j - 1] + 1)
            else:
                current.append(max(previous[j], current[j - 1]))
        previous = current

    return previous[-1] / max(len(text1), len(text2))

# Header row: width 41 = 30 (sentence 1) + 1 space + 10 ('vs'), so the
# 'Sentence 2:' heading lines up with the second sentence in the rows below.
print(f"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}")
print("=" * 100)

# One row per pair, with the LCS-based score to three decimal places.
for sent1, sent2 in test_pairs:
    similarity = longest_common_subsequence(sent1, sent2)
    print(f"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}")

Sentence 1                                Sentence 2:                              Similarity Score:
====================================================================================================
The cat sat on the mat.        vs         The cat sat on the mat.                  1.000
The cat sat on the mat.        vs         The cat sat on the mat                   1.000
The cat sat on the mat.        vs         The  cat  sat  on  the  mat.             0.815
The cat sat on the mat.        vs         On the mat, the cat was sitting.         0.433
The cat sat on the mat.        vs         The feline rested on the rug.            0.536
The quick brown fox jumps.     vs         A fast brown fox leaps.                  0.560
The cat sat on the mat.        vs         The dog ran in the park.                 0.609
I love programming.            vs         She enjoys reading books.                0.333
The weather is nice today.     vs         It's raining outside.                    0.360
Short.                         vs         Short.                                   1.000
A B C D E F G                  vs         A B C D E F G                            1.000
                               vs                                                  0.000

def containment_similarity(sent1, sent2):
    """Return the fraction of sent1's distinct words that also occur in sent2.

    The measure is asymmetric: swapping the arguments can change the result.
    Words are lower-cased and stripped of ASCII punctuation before comparing.

    Args:
        sent1: Sentence whose words are looked up.
        sent2: Sentence that is searched.

    Returns:
        ``|words1 & words2| / |words1|``, or ``0.0`` when sent1 has no words.
    """
    strip_punct = str.maketrans('', '', string.punctuation)

    def unique_words(sentence):
        return set(sentence.lower().translate(strip_punct).split())

    source_words = unique_words(sent1)
    target_words = unique_words(sent2)

    if len(source_words) == 0:
        return 0.0

    shared = source_words & target_words
    return len(shared) / len(source_words)

def evaluate_baseline_methods(pairs):
    """Score every sentence pair with each baseline similarity metric.

    Args:
        pairs: Iterable of ``(sent1, sent2)`` tuples.

    Returns:
        A ``(results, methods)`` tuple. ``methods`` maps each display name to
        its scoring function; ``results`` maps the same names to a list of
        scores, one per pair, in the order the pairs were supplied.
    """
    methods = dict([
        ('Jaccard', jaccard_similarity),
        ('Levenshtein (char)', char_levenshtein_similarity),
        ('Levenshtein (word)', word_levenshtein_similarity),
        ('Cosine BOW', cosine_similarity_bow),
        ('Fuzzy Ratio', fuzzy_ratio_similarity),
        ('Fuzzy Partial', fuzzy_partial_ratio),
        ('Fuzzy Token Sort', fuzzy_token_sort_ratio),
        ('Fuzzy Token Set', fuzzy_token_set_ratio),
        ('LCS', longest_common_subsequence),
        ('Containment', containment_similarity),
    ])

    # Every metric gets its own list up front, so an empty `pairs` still
    # yields one (empty) entry per metric.
    results = {}
    for name in methods:
        results[name] = []

    # Walk the pairs once in the outer loop so a one-shot iterable works.
    for first, second in pairs:
        for name, scorer in methods.items():
            results[name].append(scorer(first, second))

    return results, methods

def print_comparison_table(results, pairs):
    """Print, for each sentence pair, the score from every metric.

    Args:
        results: Mapping of metric name to a list of scores, where the score
            at index ``i`` belongs to ``pairs[i]``.
        pairs: Iterable of ``(sent1, sent2)`` tuples.
    """
    def shorten(text):
        # Sentences longer than 30 characters are cut to 27 plus "..." so
        # they fit the 30-wide column.
        if len(text) > 30:
            return text[:27] + "..."
        return text

    print(f"{'Pair':<30} {'Sentence 1':<30} {'Sentence 2':<30}")
    print("=" * 100)

    for number, (sent1, sent2) in enumerate(pairs, start=1):
        first_shown = shorten(sent1)
        second_shown = shorten(sent2)
        label = f"Pair {number}:"
        print(f"{label:<30} {first_shown:<30} {second_shown:<30}")

        # One indented line per metric for this pair.
        for method_name, scores in results.items():
            print(f"{'':<30} {method_name + ':':<20} {scores[number - 1]:.3f}")
        print("-" * 100)

# Score every pair in test_pairs with all baseline metrics, then print the
# per-pair comparison table.
results, methods = evaluate_baseline_methods(test_pairs)
print_comparison_table(results, test_pairs)

Pair                           Sentence 1                     Sentence 2                    
====================================================================================================
Pair 1:                        The cat sat on the mat.        The cat sat on the mat.       
                               Jaccard:             1.000
                               Levenshtein (char):  1.000
                               Levenshtein (word):  1.000
                               Cosine BOW:          1.000
                               Fuzzy Ratio:         1.000
                               Fuzzy Partial:       1.000
                               Fuzzy Token Sort:    1.000
                               Fuzzy Token Set:     1.000
                               LCS:                 1.000
                               Containment:         1.000
----------------------------------------------------------------------------------------------------
Pair 2:                        The cat sat on the mat.        The cat sat on the mat        
                               Jaccard:             0.667
                               Levenshtein (char):  0.957
                               Levenshtein (word):  0.833
                               Cosine BOW:          1.000
                               Fuzzy Ratio:         0.978
                               Fuzzy Partial:       1.000
                               Fuzzy Token Sort:    0.978
                               Fuzzy Token Set:     0.973
                               LCS:                 1.000
                               Containment:         1.000
----------------------------------------------------------------------------------------------------
Pair 3:                        The cat sat on the mat.        The  cat  sat  on  the  mat.  
                               Jaccard:             1.000
                               Levenshtein (char):  0.821
                               Levenshtein (word):  1.000
                               Cosine BOW:          1.000
                               Fuzzy Ratio:         0.902
                               Fuzzy Partial:       0.826
                               Fuzzy Token Sort:    1.000
                               Fuzzy Token Set:     1.000
                               LCS:                 0.815
                               Containment:         1.000
----------------------------------------------------------------------------------------------------
Pair 4:                        The cat sat on the mat.        On the mat, the cat was sit...
                               Jaccard:             0.375
                               Levenshtein (char):  0.344
                               Levenshtein (word):  0.143
                               Cosine BOW:          0.825
                               Fuzzy Ratio:         0.509
                               Fuzzy Partial:       0.619
                               Fuzzy Token Sort:    0.764
                               Fuzzy Token Set:     0.723
                               LCS:                 0.433
                               Containment:         0.800
----------------------------------------------------------------------------------------------------
Pair 5:                        The cat sat on the mat.        The feline rested on the rug. 
                               Jaccard:             0.250
                               Levenshtein (char):  0.517
                               Levenshtein (word):  0.500
                               Cosine BOW:          0.625
                               Fuzzy Ratio:         0.615
                               Fuzzy Partial:       0.605
                               Fuzzy Token Sort:    0.538
                               Fuzzy Token Set:     0.480
                               LCS:                 0.536
                               Containment:         0.400
----------------------------------------------------------------------------------------------------
Pair 6:                        The quick brown fox jumps.     A fast brown fox leaps.       
                               Jaccard:             0.250
                               Levenshtein (char):  0.577
                               Levenshtein (word):  0.400
                               Cosine BOW:          0.400
                               Fuzzy Ratio:         0.612
                               Fuzzy Partial:       0.700
                               Fuzzy Token Sort:    0.531
                               Fuzzy Token Set:     0.562
                               LCS:                 0.560
                               Containment:         0.400
----------------------------------------------------------------------------------------------------
Pair 7:                        The cat sat on the mat.        The dog ran in the park.      
                               Jaccard:             0.111
                               Levenshtein (char):  0.625
                               Levenshtein (word):  0.333
                               Cosine BOW:          0.500
                               Fuzzy Ratio:         0.638
                               Fuzzy Partial:       0.636
                               Fuzzy Token Sort:    0.553
                               Fuzzy Token Set:     0.462
                               LCS:                 0.609
                               Containment:         0.200
----------------------------------------------------------------------------------------------------
Pair 8:                        I love programming.            She enjoys reading books.     
                               Jaccard:             0.000
                               Levenshtein (char):  0.200
                               Levenshtein (word):  0.000
                               Cosine BOW:          0.000
                               Fuzzy Ratio:         0.409
                               Fuzzy Partial:       0.432
                               Fuzzy Token Sort:    0.364
                               Fuzzy Token Set:     0.364
                               LCS:                 0.333
                               Containment:         0.000
----------------------------------------------------------------------------------------------------
Pair 9:                        The weather is nice today.     It's raining outside.         
                               Jaccard:             0.000
                               Levenshtein (char):  0.192
                               Levenshtein (word):  0.000
                               Cosine BOW:          0.000
                               Fuzzy Ratio:         0.426
                               Fuzzy Partial:       0.514
                               Fuzzy Token Sort:    0.340
                               Fuzzy Token Set:     0.340
                               LCS:                 0.360
                               Containment:         0.000
----------------------------------------------------------------------------------------------------
Pair 10:                       Short.                         Short.                        
                               Jaccard:             1.000
                               Levenshtein (char):  1.000
                               Levenshtein (word):  1.000
                               Cosine BOW:          1.000
                               Fuzzy Ratio:         1.000
                               Fuzzy Partial:       1.000
                               Fuzzy Token Sort:    1.000
                               Fuzzy Token Set:     1.000
                               LCS:                 1.000
                               Containment:         1.000
----------------------------------------------------------------------------------------------------
Pair 11:                       A B C D E F G                  A B C D E F G                 
                               Jaccard:             1.000
                               Levenshtein (char):  1.000
                               Levenshtein (word):  1.000
                               Cosine BOW:          1.000
                               Fuzzy Ratio:         1.000
                               Fuzzy Partial:       1.000
                               Fuzzy Token Sort:    1.000
                               Fuzzy Token Set:     1.000
                               LCS:                 1.000
                               Containment:         1.000
----------------------------------------------------------------------------------------------------
Pair 12:                                                                                    
                               Jaccard:             0.000
                               Levenshtein (char):  1.000
                               Levenshtein (word):  1.000
                               Cosine BOW:          1.000
                               Fuzzy Ratio:         1.000
                               Fuzzy Partial:       1.000
                               Fuzzy Token Sort:    1.000
                               Fuzzy Token Set:     0.000
                               LCS:                 0.000
                               Containment:         0.000
----------------------------------------------------------------------------------------------------

Jaccard Similarity¶

--- Levenshtein Similarity ---¶

--- Cosine Similarity ---¶

--- Fuzzy ratios ---¶

--- Longest common sub-sequence ----¶

--- Containment Similarity ---¶

Evaluate BaseLine Methods¶