Import Cell

In [4]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import string
from IPython.display import display, HTML

# en_core_web_lg supplies static word vectors; the transformer pipeline
# (en_core_web_trf) is loaded as well for optional comparison
nlp_lg = spacy.load("en_core_web_lg")
nlp_trf = spacy.load("en_core_web_trf")

# Default pipeline used throughout this notebook
nlp = nlp_lg

# Test sentences covering different similarity types
test_sentences = [
    # Direct copies and near-copies
    "The cat sat on the mat.",
    "The cat sat on the mat",
    "The  cat  sat  on  the  mat.",
    
    # Paraphrases
    "On the mat, the cat was sitting.",
    "The feline rested on the rug.",
    
    # Structural changes
    "The quick brown fox jumps over the lazy dog.",
    "Over the lazy dog jumps the quick brown fox.",
    
    # Different content
    "The dog ran in the park.",
    "I love programming.",
    "She enjoys reading books.",
    
    # Edge cases
    "Short.",
    "A B C D E F G",
    ""
]

print(f"Loaded {len(test_sentences)} test sentences")
Loaded 13 test sentences
In [5]:
def analyze_text_statistics(sentences):
    stats = []
    for i, sent in enumerate(sentences):
        doc = nlp(sent)
        words = [token.text for token in doc if not token.is_space]
        alpha_words = [token.text for token in doc if token.is_alpha]
        
        stats.append({
            'sentence_id': i,
            'sentence': sent,
            'char_length': len(sent),
            'word_count': len(words),
            'alpha_word_count': len(alpha_words),
            'avg_word_length': np.mean([len(word) for word in alpha_words]) if alpha_words else 0,
            'has_punctuation': any(token.is_punct for token in doc),
            'is_empty': len(sent.strip()) == 0
        })
    
    return pd.DataFrame(stats)

stats_df = analyze_text_statistics(test_sentences)
display(stats_df)

# Visualize basic statistics
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Character length distribution
axes[0,0].bar(range(len(stats_df)), stats_df['char_length'])
axes[0,0].set_title('Character Length by Sentence')
axes[0,0].set_xlabel('Sentence ID')
axes[0,0].set_ylabel('Characters')

# Word count distribution
axes[0,1].bar(range(len(stats_df)), stats_df['word_count'])
axes[0,1].set_title('Word Count by Sentence')
axes[0,1].set_xlabel('Sentence ID')
axes[0,1].set_ylabel('Words')

# Average word length
axes[1,0].bar(range(len(stats_df)), stats_df['avg_word_length'])
axes[1,0].set_title('Average Word Length')
axes[1,0].set_xlabel('Sentence ID')
axes[1,0].set_ylabel('Characters')

# Sentence type breakdown
types = ['With Punctuation' if x else 'No Punctuation' for x in stats_df['has_punctuation']]
type_counts = pd.Series(types).value_counts()
axes[1,1].pie(type_counts.values, labels=type_counts.index, autopct='%1.1f%%')
axes[1,1].set_title('Punctuation Distribution')

plt.tight_layout()
plt.show()
    sentence_id  sentence                                       char_length  word_count  alpha_word_count  avg_word_length  has_punctuation  is_empty
0             0  The cat sat on the mat.                                 23           7                 6         2.833333             True     False
1             1  The cat sat on the mat                                  22           6                 6         2.833333            False     False
2             2  The  cat  sat  on  the  mat.                            28           7                 6         2.833333             True     False
3             3  On the mat, the cat was sitting.                        32           9                 7         3.428571             True     False
4             4  The feline rested on the rug.                           29           7                 6         3.833333             True     False
5             5  The quick brown fox jumps over the lazy dog.            44          10                 9         3.888889             True     False
6             6  Over the lazy dog jumps the quick brown fox.            44          10                 9         3.888889             True     False
7             7  The dog ran in the park.                                24           7                 6         3.000000             True     False
8             8  I love programming.                                     19           4                 3         5.333333             True     False
9             9  She enjoys reading books.                               25           5                 4         5.250000             True     False
10           10  Short.                                                   6           2                 1         5.000000             True     False
11           11  A B C D E F G                                           13           7                 7         1.000000            False     False
12           12                                                           0           0                 0         0.000000            False      True
[Figure: bar charts of character length, word count, and average word length per sentence, plus a pie chart of punctuation distribution]
In [15]:
class TextPreprocessor:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_lg")
    
    def extract_linguistic_features(self, text):
        """ Extract linguistic features """
        doc = self.nlp(text)
        
        features = {
            'tokens': [token.text for token in doc],
            'lemmas': [token.lemma_ for token in doc if token.is_alpha],
            'pos_tags': [token.pos_ for token in doc],
            'dep_tags': [token.dep_ for token in doc],
            'is_stop': [token.is_stop for token in doc],
            'is_alpha': [token.is_alpha for token in doc],
            'is_punct': [token.is_punct for token in doc]
        }
        
        return features, doc
    
    def direct_preprocessing(self, text):
        """ For direct copy detection """
        return text.lower().strip()
    
    def semantic_preprocessing(self, text):
        """ For semantic similarity """
        # Keep only content lemmas: is_alpha already excludes punctuation and
        # whitespace, and stop words are dropped as noise
        doc = self.nlp(text)
        processed_tokens = []
        for token in doc:
            if token.is_alpha and not token.is_stop:
                processed_tokens.append(token.lemma_.lower())
        return " ".join(processed_tokens)
    
    def syntactic_preprocessing(self, text):
        """ For syntactic similarity """
        # Preserve word order and punctuation, but collapse extra whitespace
        # and normalize every word to its lowercased lemma
        doc = self.nlp(text)
        processed_tokens = []
        for token in doc:
            if token.is_space:
                continue
            elif token.is_punct:
                processed_tokens.append(token.text)
            else:
                processed_tokens.append(token.lemma_.lower())
        return " ".join(processed_tokens)


# Initialize preprocessor
preprocessor = TextPreprocessor()

# Analyze all sentences
print("Linguistic Analysis")
print("=" * 60)

for i, sentence in enumerate(test_sentences):
    print(f"\n{i+1:2d}. {'ORIGINAL: ':<20}'{sentence}'")
    features, doc = preprocessor.extract_linguistic_features(sentence)
    
    print(f"    {'Direct:     ':<20}'{preprocessor.direct_preprocessing(sentence)}'")
    print(f"    {'Semantic:   ':<20}'{preprocessor.semantic_preprocessing(sentence)}'")
    print(f"    {'Syntactic:  ':<20}'{preprocessor.syntactic_preprocessing(sentence)}'")
    
    # Basic counts
    print(f"    {'Tokens: ':<20}{len(features['tokens']):2d} | "
          f"Alpha: {sum(features['is_alpha']):2d} | "
          f"Stopwords: {sum(features['is_stop']):2d} | "
          f"Punctuation: {sum(features['is_punct']):2d}")
    
    # POS distribution
    pos_counts = Counter(features['pos_tags'])
    print(f"    {'POS:':<20}{dict(pos_counts)}")
Linguistic Analysis
============================================================

 1. ORIGINAL:           'The cat sat on the mat.'
    Direct:             'the cat sat on the mat.'
    Semantic:           'cat sit mat'
    Syntactic:          'the cat sit on the mat .'
    Tokens:              7 | Alpha:  6 | Stopwords:  3 | Punctuation:  1
    POS:                {'DET': 2, 'NOUN': 2, 'VERB': 1, 'ADP': 1, 'PUNCT': 1}

 2. ORIGINAL:           'The cat sat on the mat'
    Direct:             'the cat sat on the mat'
    Semantic:           'cat sit mat'
    Syntactic:          'the cat sit on the mat'
    Tokens:              6 | Alpha:  6 | Stopwords:  3 | Punctuation:  0
    POS:                {'DET': 2, 'NOUN': 2, 'VERB': 1, 'ADP': 1}

 3. ORIGINAL:           'The  cat  sat  on  the  mat.'
    Direct:             'the  cat  sat  on  the  mat.'
    Semantic:           'cat sit mat'
    Syntactic:          'the cat sit on the mat .'
    Tokens:             12 | Alpha:  6 | Stopwords:  3 | Punctuation:  1
    POS:                {'DET': 2, 'SPACE': 5, 'NOUN': 2, 'VERB': 1, 'ADP': 1, 'PUNCT': 1}

 4. ORIGINAL:           'On the mat, the cat was sitting.'
    Direct:             'on the mat, the cat was sitting.'
    Semantic:           'mat cat sit'
    Syntactic:          'on the mat , the cat be sit .'
    Tokens:              9 | Alpha:  7 | Stopwords:  4 | Punctuation:  2
    POS:                {'ADP': 1, 'DET': 2, 'NOUN': 2, 'PUNCT': 2, 'AUX': 1, 'VERB': 1}

 5. ORIGINAL:           'The feline rested on the rug.'
    Direct:             'the feline rested on the rug.'
    Semantic:           'feline rest rug'
    Syntactic:          'the feline rest on the rug .'
    Tokens:              7 | Alpha:  6 | Stopwords:  3 | Punctuation:  1
    POS:                {'DET': 2, 'NOUN': 2, 'VERB': 1, 'ADP': 1, 'PUNCT': 1}

 6. ORIGINAL:           'The quick brown fox jumps over the lazy dog.'
    Direct:             'the quick brown fox jumps over the lazy dog.'
    Semantic:           'quick brown fox jump lazy dog'
    Syntactic:          'the quick brown fox jump over the lazy dog .'
    Tokens:             10 | Alpha:  9 | Stopwords:  3 | Punctuation:  1
    POS:                {'DET': 2, 'ADJ': 3, 'PROPN': 1, 'VERB': 1, 'ADP': 1, 'NOUN': 1, 'PUNCT': 1}

 7. ORIGINAL:           'Over the lazy dog jumps the quick brown fox.'
    Direct:             'over the lazy dog jumps the quick brown fox.'
    Semantic:           'lazy dog jump quick brown fox'
    Syntactic:          'over the lazy dog jump the quick brown fox .'
    Tokens:             10 | Alpha:  9 | Stopwords:  3 | Punctuation:  1
    POS:                {'ADP': 1, 'DET': 2, 'ADJ': 3, 'NOUN': 1, 'VERB': 1, 'PROPN': 1, 'PUNCT': 1}

 8. ORIGINAL:           'The dog ran in the park.'
    Direct:             'the dog ran in the park.'
    Semantic:           'dog run park'
    Syntactic:          'the dog run in the park .'
    Tokens:              7 | Alpha:  6 | Stopwords:  3 | Punctuation:  1
    POS:                {'DET': 2, 'NOUN': 2, 'VERB': 1, 'ADP': 1, 'PUNCT': 1}

 9. ORIGINAL:           'I love programming.'
    Direct:             'i love programming.'
    Semantic:           'love programming'
    Syntactic:          'i love programming .'
    Tokens:              4 | Alpha:  3 | Stopwords:  1 | Punctuation:  1
    POS:                {'PRON': 1, 'VERB': 1, 'NOUN': 1, 'PUNCT': 1}

10. ORIGINAL:           'She enjoys reading books.'
    Direct:             'she enjoys reading books.'
    Semantic:           'enjoy read book'
    Syntactic:          'she enjoy read book .'
    Tokens:              5 | Alpha:  4 | Stopwords:  1 | Punctuation:  1
    POS:                {'PRON': 1, 'VERB': 2, 'NOUN': 1, 'PUNCT': 1}

11. ORIGINAL:           'Short.'
    Direct:             'short.'
    Semantic:           'short'
    Syntactic:          'short .'
    Tokens:              2 | Alpha:  1 | Stopwords:  0 | Punctuation:  1
    POS:                {'ADJ': 1, 'PUNCT': 1}

12. ORIGINAL:           'A B C D E F G'
    Direct:             'a b c d e f g'
    Semantic:           'b c d e f g'
    Syntactic:          'a b c d e f g'
    Tokens:              7 | Alpha:  7 | Stopwords:  1 | Punctuation:  0
    POS:                {'DET': 1, 'NOUN': 2, 'PROPN': 4}

13. ORIGINAL:           ''
    Direct:             ''
    Semantic:           ''
    Syntactic:          ''
    Tokens:              0 | Alpha:  0 | Stopwords:  0 | Punctuation:  0
    POS:                {}
In [16]:
def analyze_dependency_structure(text, sentence_id):
    """Analyze and display dependency parse information"""
    if not text.strip():
        print(f"Sentence {sentence_id}: [EMPTY]")
        return None
    
    doc = nlp(text)
    
    print(f"\nSentence {sentence_id}: '{text}'")
    print("Dependency Parse:")
    print("-" * 50)
    
    # Extract dependency relationships
    dependencies = []
    for token in doc:
        dependencies.append({
            'token': token.text,
            'lemma': token.lemma_,
            'pos': token.pos_,
            'dep': token.dep_,
            'head': token.head.text,
            'children': [child.text for child in token.children]
        })
        print(f"{token.text:<12} {token.dep_:<12} {token.head.text:<12} {[child.text for child in token.children]}")
    
    return dependencies

def visualize_dependency_tree(text, sentence_id):
    """Render the dependency tree inline with displaCy"""
    if not text.strip():
        return
    
    doc = nlp(text)
    
    # displaCy renders directly into the notebook when jupyter=True
    # (render() returns None in that mode, so there is nothing to capture)
    from spacy import displacy
    
    print(f"\nDependency Tree Visualization - Sentence {sentence_id}:")
    displacy.render(doc, style="dep", jupyter=True, options={'distance': 120})

# Analyze dependency structures for key sentence pairs
key_pairs = [
    (0, "The cat sat on the mat."),
    (3, "On the mat, the cat was sitting."),
    (4, "The feline rested on the rug."),
    (5, "The quick brown fox jumps over the lazy dog.")
]

for sentence_id, sentence in key_pairs:
    deps = analyze_dependency_structure(sentence, sentence_id)
    visualize_dependency_tree(sentence, sentence_id)
    print("\n" + "="*70 + "\n")
Sentence 0: 'The cat sat on the mat.'
Dependency Parse:
--------------------------------------------------
The          det          cat          []
cat          nsubj        sat          ['The']
sat          ROOT         sat          ['cat', 'on', '.']
on           prep         sat          ['mat']
the          det          mat          []
mat          pobj         on           ['the']
.            punct        sat          []

Dependency Tree Visualization - Sentence 0:
[rendered displaCy dependency tree]
======================================================================


Sentence 3: 'On the mat, the cat was sitting.'
Dependency Parse:
--------------------------------------------------
On           prep         sitting      ['mat']
the          det          mat          []
mat          pobj         On           ['the']
,            punct        sitting      []
the          det          cat          []
cat          nsubj        sitting      ['the']
was          aux          sitting      []
sitting      ROOT         sitting      ['On', ',', 'cat', 'was', '.']
.            punct        sitting      []

Dependency Tree Visualization - Sentence 3:
[rendered displaCy dependency tree]
======================================================================


Sentence 4: 'The feline rested on the rug.'
Dependency Parse:
--------------------------------------------------
The          det          feline       []
feline       nsubj        rested       ['The']
rested       ROOT         rested       ['feline', 'on', '.']
on           prep         rested       ['rug']
the          det          rug          []
rug          pobj         on           ['the']
.            punct        rested       []

Dependency Tree Visualization - Sentence 4:
[rendered displaCy dependency tree]
======================================================================


Sentence 5: 'The quick brown fox jumps over the lazy dog.'
Dependency Parse:
--------------------------------------------------
The          det          fox          []
quick        amod         fox          []
brown        amod         fox          []
fox          nsubj        jumps        ['The', 'quick', 'brown']
jumps        ROOT         jumps        ['fox', 'over', '.']
over         prep         jumps        ['dog']
the          det          dog          []
lazy         amod         dog          []
dog          pobj         over         ['the', 'lazy']
.            punct        jumps        []

Dependency Tree Visualization - Sentence 5:
[rendered displaCy dependency tree]
======================================================================

In [17]:
def analyze_vocabulary_distribution(sentences):
    """Analyze word frequencies and vocabulary across sentences"""
    all_words = []
    all_lemmas = []
    content_words = []
    
    for sent in sentences:
        doc = nlp(sent)
        for token in doc:
            if token.is_alpha and not token.is_space:
                all_words.append(token.text.lower())
                all_lemmas.append(token.lemma_.lower())
                if not token.is_stop and token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV']:
                    content_words.append(token.lemma_.lower())
    
    # Calculate frequencies
    word_freq = Counter(all_words)
    lemma_freq = Counter(all_lemmas)
    content_freq = Counter(content_words)
    
    return word_freq, lemma_freq, content_freq

word_freq, lemma_freq, content_freq = analyze_vocabulary_distribution(test_sentences)

print("VOCABULARY ANALYSIS")
print("=" * 50)
print(f"Total unique words: {len(word_freq)}")
print(f"Total unique lemmas: {len(lemma_freq)}")
print(f"Total content words: {len(content_freq)}")

# Plot most common words
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Original words
common_words = word_freq.most_common(10)
axes[0].bar([w[0] for w in common_words], [w[1] for w in common_words])
axes[0].set_title('Top 10 Most Common Words')
axes[0].tick_params(axis='x', rotation=45)

# Lemmas
common_lemmas = lemma_freq.most_common(10)
axes[1].bar([w[0] for w in common_lemmas], [w[1] for w in common_lemmas])
axes[1].set_title('Top 10 Most Common Lemmas')
axes[1].tick_params(axis='x', rotation=45)

# Content words
common_content = content_freq.most_common(10)
axes[2].bar([w[0] for w in common_content], [w[1] for w in common_content])
axes[2].set_title('Top 10 Content Words')
axes[2].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()
VOCABULARY ANALYSIS
==================================================
Total unique words: 35
Total unique lemmas: 34
Total content words: 21
[Figure: bar charts of the top 10 most common words, lemmas, and content words]
In [19]:
def create_similarity_matrix(sentences, method='direct'):
    """Create similarity matrix between all sentences"""
    n = len(sentences)
    similarity_matrix = np.zeros((n, n))
    
    preprocessor = TextPreprocessor()
    
    for i in range(n):
        for j in range(n):
            if i == j:
                similarity_matrix[i][j] = 1.0
            else:
                if method == 'direct':
                    sent1 = preprocessor.direct_preprocessing(sentences[i])
                    sent2 = preprocessor.direct_preprocessing(sentences[j])
                    # Length-ratio similarity: a crude proxy for character overlap
                    if not sent1 or not sent2:
                        similarity = 0.0
                    else:
                        min_len = min(len(sent1), len(sent2))
                        max_len = max(len(sent1), len(sent2))
                        similarity = min_len / max_len if max_len > 0 else 0.0
                
                elif method == 'semantic':
                    sent1 = preprocessor.semantic_preprocessing(sentences[i])
                    sent2 = preprocessor.semantic_preprocessing(sentences[j])
                    # Jaccard similarity on semantic tokens
                    words1 = set(sent1.split())
                    words2 = set(sent2.split())
                    if not words1 and not words2:
                        similarity = 1.0
                    elif not words1 or not words2:
                        similarity = 0.0
                    else:
                        intersection = words1.intersection(words2)
                        union = words1.union(words2)
                        similarity = len(intersection) / len(union)

                else:
                    raise ValueError(f"Unknown similarity method: {method}")

                similarity_matrix[i][j] = similarity
    
    return similarity_matrix

# Create and visualize similarity matrices
direct_matrix = create_similarity_matrix(test_sentences, 'direct')
semantic_matrix = create_similarity_matrix(test_sentences, 'semantic')

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Direct similarity heatmap
im1 = axes[0].imshow(direct_matrix, cmap='viridis', aspect='auto', vmin=0, vmax=1)
axes[0].set_title('Direct Preprocessing Similarity Matrix')
axes[0].set_xlabel('Sentence ID')
axes[0].set_ylabel('Sentence ID')
plt.colorbar(im1, ax=axes[0])

# Semantic similarity heatmap
im2 = axes[1].imshow(semantic_matrix, cmap='viridis', aspect='auto', vmin=0, vmax=1)
axes[1].set_title('Semantic Preprocessing Similarity Matrix')
axes[1].set_xlabel('Sentence ID')
axes[1].set_ylabel('Sentence ID')
plt.colorbar(im2, ax=axes[1])

plt.tight_layout()
plt.show()
[Figure: heatmaps of the direct and semantic similarity matrices]

Key observations:

  • Direct (length-ratio) similarity flags near-identical copies such as sentences 0-2, but because it only compares lengths, unrelated sentences of similar length can also score high
  • Semantic (lemma-overlap) similarity captures meaning-based relationships: sentence 3, the word-order paraphrase, matches sentences 0-2 perfectly after preprocessing
  • Sentence 4, the synonym paraphrase, still scores 0 against sentences 0-2 because lemma overlap cannot capture synonymy
  • The spot-check below reads a few of these matrix entries out explicitly
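
A minimal extra cell for that spot-check, reusing the two matrices already computed (the pair labels are just descriptive annotations):

In [ ]:
# Read selected sentence pairs out of the similarity matrices computed above
pairs = [
    (0, 1, "exact copy, missing final period"),
    (0, 2, "exact copy, extra whitespace"),
    (0, 3, "word-order paraphrase"),
    (0, 4, "synonym paraphrase"),
    (0, 8, "unrelated sentence"),
]
for i, j, label in pairs:
    print(f"{label:<34} direct={direct_matrix[i, j]:.2f}  semantic={semantic_matrix[i, j]:.2f}")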
In [ ]:
 
DATA EXPLORATION SUMMARY
==================================================

1. TEXT CHARACTERISTICS:
   - Sentence length varies from 0 to 44 characters
   - Word count ranges from 0 to 10 words
   - 10/13 sentences contain punctuation

2. LINGUISTIC FEATURES:
   - Direct preprocessing preserves exact character matching (lowercasing and stripping only)
   - Semantic preprocessing keeps lowercased content lemmas (stop words, punctuation, and whitespace removed)
   - Syntactic preprocessing lemmatizes and lowercases words while preserving word order and punctuation

3. DEPENDENCY PARSING INSIGHTS:
   - Different sentence structures can express the same meaning
   - Dependency trees reveal grammatical relationships
   - Root verbs often indicate the main action of the sentence

4. SIMILARITY PATTERNS:
   - Direct copies show high similarity under both preprocessing methods
   - The word-order paraphrase reaches full semantic (lemma-overlap) similarity despite only modest direct similarity
   - The synonym paraphrase scores 0 on lemma overlap, so simple token matching misses it
   - Unrelated sentences show low semantic similarity, though the length-ratio measure can still rate same-length sentences as similar

5. IMPLICATIONS FOR PLAGIARISM DETECTION:
   - No single measure is sufficient; multiple similarity signals should be combined
   - Semantic analysis beyond lemma overlap (e.g. word vectors) is needed to catch paraphrased and synonym-substituted content
   - Syntactic analysis may help with structural plagiarism, where wording changes but grammar is reused
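
As a closing sketch (purely illustrative: the weights and the use of spaCy's built-in vector similarity are assumptions, not choices validated above), the individual signals could be folded into one combined score:

In [ ]:
def combined_similarity(text1, text2, w_direct=0.2, w_semantic=0.4, w_vector=0.4):
    """Illustrative weighted combination of the signals explored above.
    The weights are arbitrary placeholders, not tuned values."""
    # Direct signal: the same length-ratio proxy used in create_similarity_matrix
    s1 = preprocessor.direct_preprocessing(text1)
    s2 = preprocessor.direct_preprocessing(text2)
    direct = min(len(s1), len(s2)) / max(len(s1), len(s2)) if s1 and s2 else 0.0

    # Semantic signal: Jaccard overlap of content lemmas
    set1 = set(preprocessor.semantic_preprocessing(text1).split())
    set2 = set(preprocessor.semantic_preprocessing(text2).split())
    semantic = len(set1 & set2) / len(set1 | set2) if (set1 or set2) else 1.0

    # Vector signal: cosine similarity of en_core_web_lg doc vectors,
    # which can catch synonym paraphrases that lemma overlap misses
    doc1, doc2 = nlp(text1), nlp(text2)
    vector = doc1.similarity(doc2) if doc1.vector_norm and doc2.vector_norm else 0.0

    return w_direct * direct + w_semantic * semantic + w_vector * vector

# Synonym paraphrase pair that lemma overlap scored 0
print(f"{combined_similarity('The cat sat on the mat.', 'The feline rested on the rug.'):.3f}")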