Question: TF-IDF and PPMI Code Template. This template will help you implement TF-IDF and PPMI calculations using the NLTK library and the Brown corpus.

# TF-IDF and PPMI Code Template
"""
This template will help you implement TF-IDF and PPMI calculations using the NLTK library and the Brown corpus.
You will preprocess the corpus, compute term frequencies, document frequencies, TF-IDF scores, and create
a word co-occurrence matrix to compute Positive Pointwise Mutual Information (PPMI) scores.
"""
import nltk
from nltk.corpus import brown
from collections import defaultdict, Counter
import math
import numpy as np
# Download the Brown corpus only if it is not already available locally.
# nltk.data.find raises LookupError when the resource is missing, so the
# downloader is not invoked on every import once the corpus is installed.
try:
    nltk.data.find('corpora/brown')
except LookupError:
    nltk.download('brown')
# Preprocess the corpus: Tokenize, lowercase, and add start/end tokens
def preprocess(corpus, start_token="<s>", end_token="</s>"):
    """
    Preprocess the corpus by tokenizing, lowercasing, and adding boundary tokens.

    Args:
        corpus (iterable): Sentences from the corpus. Each sentence is normally a
            sequence of word tokens; a raw string is also accepted and is split
            on whitespace.
        start_token (str): Marker inserted at the start of every sentence.
        end_token (str): Marker appended at the end of every sentence.
    Returns:
        list: One list of lowercased tokens per sentence, wrapped in the
        start/end markers.
    """
    # NOTE(review): the original template text lost its marker names (they read
    # as empty strings); "<s>" and "</s>" are the presumed intended values --
    # confirm against the assignment handout.
    tokenized_corpus = []
    for sentence in corpus:
        # Iterating a bare string would yield characters, not words.
        if isinstance(sentence, str):
            sentence = sentence.split()
        tokens = [word.lower() for word in sentence]
        tokenized_corpus.append([start_token, *tokens, end_token])
    return tokenized_corpus
# Calculate Term Frequency (TF)
def compute_tf(corpus):
    """
    Calculate the term frequency for each word in each document.

    Term frequency here is the raw occurrence count of the word in the document.

    Args:
        corpus (list): Preprocessed corpus where each document is a list of words.
    Returns:
        dict: Maps each document index (its position in ``corpus``) to a Counter
        of word counts. Empty documents get an empty Counter, so the number of
        entries equals the number of documents.
    """
    tf = defaultdict(Counter)
    for doc_id, document in enumerate(corpus):
        # Touching tf[doc_id] creates the entry even when the document is empty.
        tf[doc_id].update(document)
    return tf
# Calculate Document Frequency (DF)
def compute_df(tf):
    """
    Calculate the document frequency for each word across all documents.

    Args:
        tf (dict): Term frequencies for each document (document id -> mapping of
            word -> count).
    Returns:
        Counter: For each word, the number of documents in which it occurs at
        least once.
    """
    df = Counter()
    for word_counts in tf.values():
        # A set guarantees each word contributes at most 1 per document; the
        # count filter skips zero-count entries that a Counter may carry.
        df.update({word for word, count in word_counts.items() if count > 0})
    return df
# Calculate TF-IDF for each word
def compute_tfidf(tf, df, num_docs):
    """
    Calculate the TF-IDF score for each word in each document.

    The score is ``TF(word) * log(N / (1 + DF(word)))`` using the natural
    logarithm. Because of the ``1 +`` smoothing term, a word that occurs in
    every document receives a slightly negative score.

    Args:
        tf (dict): Term frequencies for each document.
        df (Counter): Document frequencies for each word. Words missing from
            ``df`` are treated as having a document frequency of 0.
        num_docs (int): Total number of documents.
    Returns:
        dict: Maps each document id to a dict of word -> TF-IDF score.
    Raises:
        ValueError: If ``num_docs`` is not positive while ``tf`` contains words
            (raised by ``math.log``).
    """
    tfidf = defaultdict(dict)
    idf_cache = {}  # IDF depends only on the word, so compute it once per word
    for doc_id, word_counts in tf.items():
        scores = tfidf[doc_id]
        for word, count in word_counts.items():
            if word not in idf_cache:
                idf_cache[word] = math.log(num_docs / (1 + df.get(word, 0)))
            scores[word] = count * idf_cache[word]
    return tfidf
# Create a word co-occurrence matrix
def create_cooccurrence_matrix(corpus, window_size=5):
    """
    Create a word co-occurrence matrix from the corpus.

    Two words co-occur when they appear in the same document at most
    ``window_size`` positions apart (the window extends ``window_size`` words to
    each side of the target word). A word is never counted as its own context
    at the same position, and windows do not cross document boundaries.

    Args:
        corpus (list): Preprocessed corpus where each document is a list of words.
        window_size (int): Number of context words considered on each side.
    Returns:
        tuple: ``(cooccurrence_matrix, word_to_id, id_to_word)`` where the matrix
        is a dense ``vocab_size x vocab_size`` numpy array of counts and the two
        dicts map words to row/column indices and back.
    """
    # Materialize once: the corpus is traversed twice (vocabulary, then counts).
    documents = list(corpus)

    # Sorting makes the index assignment deterministic across runs.
    vocabulary = sorted({word for document in documents for word in document})
    word_to_id = {word: index for index, word in enumerate(vocabulary)}
    id_to_word = dict(enumerate(vocabulary))

    # NOTE: dense storage grows quadratically with the vocabulary size; restrict
    # the corpus/vocabulary before calling this on a large corpus.
    vocab_size = len(vocabulary)
    cooccurrence_matrix = np.zeros((vocab_size, vocab_size))

    for document in documents:
        ids = [word_to_id[word] for word in document]
        doc_length = len(ids)
        for position, target_id in enumerate(ids):
            window_start = max(0, position - window_size)
            window_stop = min(doc_length, position + window_size + 1)
            for context_position in range(window_start, window_stop):
                if context_position != position:
                    cooccurrence_matrix[target_id, ids[context_position]] += 1

    return cooccurrence_matrix, word_to_id, id_to_word
# Calculate PPMI from co-occurrence matrix
def compute_ppmi(cooccurrence_matrix):
    """
    Compute the Positive Pointwise Mutual Information (PPMI) matrix.

    For each cell, ``PPMI(i, j) = max(0, log2(p_ij / (p_i * p_j)))`` where
    ``p_ij`` is the cell's share of all co-occurrences and ``p_i`` / ``p_j`` are
    the row and column marginals. Cells with a zero count stay at 0.

    Args:
        cooccurrence_matrix (numpy.ndarray): Co-occurrence matrix of
            non-negative word counts.
    Returns:
        numpy.ndarray: PPMI matrix with the same shape as the input.
    """
    counts = np.asarray(cooccurrence_matrix, dtype=float)
    ppmi_matrix = np.zeros(counts.shape)

    total = counts.sum()
    if total == 0:
        # No co-occurrences at all: every probability is undefined, keep zeros.
        return ppmi_matrix

    row_totals = counts.sum(axis=1, keepdims=True)
    column_totals = counts.sum(axis=0, keepdims=True)
    # Count expected in each cell if rows and columns were independent;
    # counts / expected equals p_ij / (p_i * p_j).
    expected = row_totals * column_totals / total

    # Only cells with p_ij > 0 have a defined PMI (avoids log of zero).
    observed = counts > 0
    pmi = np.log2(counts[observed] / expected[observed])
    ppmi_matrix[observed] = np.maximum(0.0, pmi)
    return ppmi_matrix
# Main execution
if __name__=="__main_

Step by Step Solution

There are 3 Steps involved in it

1 Expert Approved Answer
Step: 1 Unlock blur-text-image
Question Has Been Solved by an Expert!

Get step-by-step solutions from verified subject matter experts

Step: 2 Unlock
Step: 3 Unlock

Students Have Also Explored These Related Programming Questions!