Question: I am working on a project using Python and Jupyter. This is what I have to work with: # -*- coding: utf-8 -*-

I am working on a project using Python and Jupyter. This is what I have to work with:

# -*- coding: utf-8 -*-
"""
Created on Fri Sep 20 2024
@author: CS634 Group 1, Fall 2024
"""
import pandas as pd
import json
import numpy as np
# Dataset JSON file paths.
# Forward slashes are used instead of backslashes: they work on every platform
# (including Windows) and avoid backslash sequences such as "\E" or "\F" being
# parsed as (invalid) string escapes.
EN_FILE_PATH = "DataSets/Eng_sentences.json"
FRA_FILE_PATH = "DataSets/Fra_sentences.json"
MULTI_LANG_SENTENCE_FILE_PATH = "DataSets/CC0_sentences.json"
# Contains translations to match the sentences from CC0_sentences.json
LINKS_FILE_PATH = "DataSets/Links.json"

# Read the JSON Lines files (one JSON object per line) into DataFrames
en_data = pd.read_json(EN_FILE_PATH, lines=True)  # English
fra_data = pd.read_json(FRA_FILE_PATH, lines=True)  # French
sentences_data = pd.read_json(MULTI_LANG_SENTENCE_FILE_PATH, lines=True)  # Multi Lang Sentences

# The link/translation file is parsed record by record with the json module.
# Blank lines (e.g. a trailing newline at the end of the file) are skipped so
# that json.loads is never handed an empty string.
with open(LINKS_FILE_PATH, 'r', encoding='utf-8') as link_file:
    link_data = [json.loads(line) for line in link_file if line.strip()]

# Round-trip 'Sentence id' through the index. This leaves it as a regular
# column but moves it to the first position, which the later positional slice
# sentences_data.iloc[:, :3] depends on. It also fails early with a KeyError
# if the column is missing from a dataset.
en_data = en_data.set_index('Sentence id').reset_index()  # English
fra_data = fra_data.set_index('Sentence id').reset_index()  # French
sentences_data = sentences_data.set_index('Sentence id').reset_index()  # Multi Lang Sentences

# A DataFrame built from a list of records already has a fresh 0..n-1 index,
# so no additional reset_index is required for the links.
link_df = pd.DataFrame(link_data)  # Link/Translation

# Display the first few rows of each DataFrame
print(en_data.head())
print(fra_data.head())
print(sentences_data.head())
print(link_df.head())
# =============================================================================
# Owen's part
# =============================================================================
# sentences_data contains all the sentences available on the website.
# Count the number of sentences in each available language.
lang_count = sentences_data.groupby(['Lang'])['Sentence id'].agg('count').reset_index()
lang_count = lang_count.rename(columns={'Sentence id': 'lang_count'})

# idxmax returns the index *label* of the largest count. Looking the row up
# with .loc and explicit column names avoids the deprecated positional
# Series[int] access (row[0] / row[1]) on a label-indexed row.
idx = lang_count['lang_count'].idxmax()
print('\n{} language has {} available sentences which is the most in dataset'.format(
    lang_count.loc[idx, 'Lang'], lang_count.loc[idx, 'lang_count']))
# link_df contains the links between the sentences. A row (1, 77) means that
# sentence #77 is the translation of sentence #1.
# The reciprocal link is also present, so the file will also contain a row
# that says (77, 1).

# Count, for every sentence id that appears in a link, how many links it has.
# NOTE(review): the rename below assumes link_df has exactly the columns
# 'Sentence id' and 'Translation id' - confirm against Links.json.
link_translate = link_df.groupby(['Sentence id']).agg('count').reset_index()
link_translate = link_translate.rename(columns={'Translation id': 'translation_count'})

# Attach the first three columns of sentences_data to each linked sentence.
# NOTE(review): presumably these are 'Sentence id', 'Lang' and the sentence
# text - verify the column order of CC0_sentences.json.
link_translate = link_translate.merge(sentences_data.iloc[:, :3], how='left', on='Sentence id')

# Keep only the links whose sentence exists in sentences_data; after the left
# merge, unmatched rows have a missing 'Lang'.
valid_translation = link_translate[~link_translate['Lang'].isna()]

# Count the number of valid (matched) sentences per language
valid_translation_count = valid_translation.groupby('Lang')['Sentence id'].agg('count').reset_index()
valid_translation_count = valid_translation_count.rename(columns={'Sentence id': 'valid_count'})

# idxmax returns an index label, so use label-based .loc with column names
# instead of the deprecated positional Series[int] access.
idx = valid_translation_count['valid_count'].idxmax()
print('\n{} language has {} valid sentences which is the most in dataset'.format(
    valid_translation_count.loc[idx, 'Lang'], valid_translation_count.loc[idx, 'valid_count']))

# Most popular translated sentence.
# valid_translation keeps the (non-contiguous) labels of link_translate, so the
# row is fetched by label; its last column is then read positionally with
# .iloc[-1], because Series[-1] on a label-indexed row is deprecated.
idx = valid_translation['translation_count'].idxmax()
print('\n{} is the most translated sentences'.format(valid_translation.loc[idx].iloc[-1]))
# Translation that has the most different meaning in English
# Attach every link to its English sentence; with a left merge, sentences that
# have no link are kept with a missing 'Translation id'.
en_data = en_data.merge(link_df, how='left', on='Sentence id')

# value_counts sorts by frequency in descending order, so the first index entry
# is the Translation id linked to the largest number of English sentences.
en_translate_count = en_data['Translation id'].value_counts()
idx = en_translate_count.index[0]
print('\nTranslation that has the most different meaning in English:\n')
# Print explicitly: a bare expression is only displayed when it is the last
# statement of a notebook cell, and never when the file runs as a script.
print(en_data[en_data['Translation id'] == idx][['Translation id', 'Text']])
# Translation that has the most different meaning in French
# Attach every link to its French sentence; with a left merge, sentences that
# have no link are kept with a missing 'Translation id'.
fra_data = fra_data.merge(link_df, how='left', on='Sentence id')

# value_counts sorts by frequency in descending order, so the first index entry
# is the Translation id linked to the largest number of French sentences.
fra_translate_count = fra_data['Translation id'].value_counts()
idx = fra_translate_count.index[0]
print('\nTranslation that has the most different meaning in French:\n')
# Print explicitly: a bare expression is only displayed when it is the last
# statement of a notebook cell, and never when the file runs as a script.
print(fra_data[fra_data['Translation id'] == idx][['Translation id', 'Text']])

Step by Step Solution

There are 3 Steps involved in it

1 Expert Approved Answer
Step: 1 Unlock blur-text-image
Question Has Been Solved by an Expert!

Get step-by-step solutions from verified subject matter experts

Step: 2 Unlock
Step: 3 Unlock

Students Have Also Explored These Related Programming Questions!