Question:

import urllib.request

sms_corpus = []

with urllib.request.urlopen("https://storage.googleapis.com/wd13/SMSSpamCollection.txt") as url:
    for line in url.readlines():
        # each line is "label<TAB>text"; strip the trailing newline before splitting
        sms_corpus.append(line.decode().strip().split('\t'))

# print the label and text of document 16
docid = 16
print(sms_corpus[docid])

# print the label of document 16
print(sms_corpus[docid][0])

# print the text of document 16
print(sms_corpus[docid][1])

# TOKENIZER WITH LEMMATIZER & STOPWORDS #

# import the nltk libraries and download the required resources
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

def tokenize(doc):
    punctuation = ['.', ',', ';', ':', '!', '?', '(', ')', '{', '}', '"', '\'']
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # tokenize the document into words
    words = word_tokenize(doc)

    # lowercase, drop punctuation and stopwords, then lemmatize
    words = [lemmatizer.lemmatize(word.lower()) for word in words
             if word not in punctuation and word.lower() not in stop_words]
    return words

# testing the tokenizer on one document's text
docid = 14
print(sms_corpus[docid][0])            # label
print(tokenize(sms_corpus[docid][1]))  # tokenized document

from math import log
log(1)   # sanity check: this is the natural log, and log(1) == 0.0

-----------------------------

QUESTIONS:

Q1: I provided the code above, and I used the code below to calculate scores (computing TF-IDF scores) for every token in the corpus, storing them in a dictionary called token_scores.

My problem is that the scores end up keyed by whole sentences and not by words (check the output). How can I calculate them per word, working with the array? Please explain as well.

token_scores = {}

def calculate_token_scores(tokenized_docs):
    token_scores = {}

    # count the frequency of each token in the corpus
    token_counts = {}
    for doc in tokenized_docs:
        for token in doc:
            if token in token_counts:
                token_counts[token] += 1
            else:
                token_counts[token] = 1

    # calculate the token scores based on their frequency
    total_docs = len(tokenized_docs)
    for token, count in token_counts.items():
        token_scores[token] = log(total_docs / count)
    return token_scores

calculate_token_scores(sms_corpus)
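
The function itself already works token by token; the problem is the argument. sms_corpus holds raw [label, text] pairs, so the inner loop sees exactly two "tokens" per document: the label string and the entire message string. Tokenize each message first, then pass the token lists in. While doing so, it is also worth counting document frequency (how many documents contain the token at least once) rather than total occurrences, since log(N / df) is the usual IDF. A minimal sketch, assuming every corpus line splits cleanly into [label, text] and reusing the tokenize function defined above:

from math import log

# tokenize only the message text, not the label
tokenized_docs = [tokenize(text) for label, text in sms_corpus]

def calculate_token_scores(tokenized_docs):
    token_scores = {}

    # document frequency: set(doc) counts each document at most once per token
    doc_freq = {}
    for doc in tokenized_docs:
        for token in set(doc):
            doc_freq[token] = doc_freq.get(token, 0) + 1

    # IDF-style score: rarer tokens get higher weights
    total_docs = len(tokenized_docs)
    for token, count in doc_freq.items():
        token_scores[token] = log(total_docs / count)
    return token_scores

token_scores = calculate_token_scores(tokenized_docs)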

Q2: Which tokens are most predictive of a message being SPAM? Please explain.
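
One common way to answer this with the pieces already built here: count how often each token appears in spam versus ham messages, then rank tokens by the smoothed log-odds ratio log(P(token|spam) / P(token|ham)). Tokens with a large positive score occur far more often in spam than in ham. A rough sketch under those assumptions, reusing sms_corpus and tokenize from above:

from collections import Counter
from math import log

spam_counts, ham_counts = Counter(), Counter()
for label, text in sms_corpus:
    counts = spam_counts if label == 'spam' else ham_counts
    counts.update(tokenize(text))

total_spam = sum(spam_counts.values())
total_ham = sum(ham_counts.values())

def spamminess(token):
    # Laplace (+1) smoothing so tokens unseen in one class don't divide by zero
    p_spam = (spam_counts[token] + 1) / (total_spam + 1)
    p_ham = (ham_counts[token] + 1) / (total_ham + 1)
    return log(p_spam / p_ham)

# print the 20 tokens most associated with SPAM
vocab = set(spam_counts) | set(ham_counts)
for token in sorted(vocab, key=spamminess, reverse=True)[:20]:
    print(token, round(spamminess(token), 2))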
