Question: import pandas as pd import numpy as np from scipy.stats import pearsonr import matplotlib.pyplot as plt # Read data files house _ prices _ df

import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
# Read data files
house_prices_df = pd.read_excel("MeanHousePricesClean-1.xlsx")
crime_df = pd.read_excel("CrimeClean-1-1.xlsx")
population_df = pd.read_excel("PopulationClean.xlsx")
area_df = pd.read_excel("SuburbAreas-1.xlsx")
# Step B: Clean and prepare data
def prepare_data(df, columns):
df = df.dropna(subset=columns) # Remove rows with missing values in key columns
return df
# Rename columns for consistency
house_prices_df = house_prices_df.rename(columns={'Year': 'year'})
crime_df = crime_df.rename(columns={
'Year': 'year',
'Crime rate per 100,000 population': 'crime_rate',
'Local Government Area': 'local_government_area'
})
population_df = population_df.rename(columns={'Year': 'year'})
area_df = area_df.rename(columns={'Property': 'local_government_area'})
# Clean the area DataFrame to remove non-relevant rows
area_df = area_df[area_df['local_government_area']!= 'Area sq Km']
# Step C: Analysis functions
def analyze_correlation(df, col1, col2):
df = df.dropna(subset=[col1, col2])
if len(df)2:
return np.nan
correlation, _= pearsonr(df[col1], df[col2])
return correlation
# Reshape house_prices_df to long format
house_prices_long = house_prices_df.melt(id_vars=['year'], var_name='local_government_area', value_name='mean_house_price')
# Reshape population_df to long format
population_long = population_df.melt(id_vars=['year'], var_name='local_government_area', value_name='population')
# Reshape area_df to long format
area_long = area_df.melt(id_vars=['local_government_area'], var_name='year', value_name='area')
# Merge the datasets on 'year' and 'local_government_area'
merged_df = pd.merge(crime_df, house_prices_long, on=['year', 'local_government_area'], how='inner')
merged_df = pd.merge(merged_df, population_long, on=['year', 'local_government_area'], how='inner')
merged_df = pd.merge(merged_df, area_long, on='local_government_area', how='inner')
# Calculate population density
merged_df['population_density']= merged_df['population']/ merged_df['area']
# Step D: Prepare the data by cleaning
merged_df = prepare_data(merged_df,['mean_house_price', 'crime_rate', 'population_density'])
# Step E: Perform correlation analysis
house_price_population_corr = analyze_correlation(merged_df, 'mean_house_price', 'population_density')
crime_house_price_corr = analyze_correlation(merged_df, 'crime_rate', 'mean_house_price')
crime_population_density_corr = analyze_correlation(merged_df, 'crime_rate', 'population_density')
# Step F: Print the results
print(f"Correlation between house prices and population density: {house_price_population_corr}")
print(f"Correlation between crime rate and house prices: {crime_house_price_corr}")
print(f"Correlation between crime rate and population density: {crime_population_density_corr}")
# Plotting for visual analysis
plt.figure(figsize=(10,6))
plt.scatter(merged_df['population_density'], merged_df['mean_house_price'])
plt.title('House Price vs Population Density')
plt.xlabel('Population Density (people per square km)')
plt.ylabel('Mean House Price')
plt.grid(True)
plt.show()
plt.figure(figsize=(10,6))
plt.scatter(merged_df['mean_house_price'], merged_df['crime_rate'])
plt.title('Crime Rate vs House Price')
plt.xlabel('Mean House Price')
plt.ylabel('Crime Rate (per 100,000 population)')
plt.grid(True)
plt.show()
plt.figure(figsize=(10,6))
plt.scatter(merged_df['population_density'], merged_df['crime_rate'])
plt.title('Crime Rate vs Population Density')
plt.xlabel('Population Density (people per square km)')
plt.ylabel('Crime Rate (per 100,000 population)')
plt.grid(True)
plt.show()
The above python code is run in google colab and it produces nil output as nan . modify and correct the code and add some more correlation if it helps to achieve my output. i have access to all the 4 excel sheets of data. something is wrong since it keeps on producing Nan. i am getting no correlation no matter how i edit so i will explain what each data sheet looks like. i suspect it is not aligning with the merging in code. MeanHousePricesClean-1(1).xlsx:
The first row contains years and area names.
From the second row onwards, each row represents a year, and the values in each column represent mean house prices for the corresponding area.
CrimeClean-1-1.xlsx:
The first row contains years, local government area names, incidents recorded, and crime rate per 100,000 population.
From the second row onwards, each row represents data for a specific year and area, including incidents recorded and crime rate.
PopulationClean.xlsx:
Similar to MeanHousePricesClean-1(1).xlsx, the first row contains years and area names.
From the second row onwards, each row represents a year
SuburbAreas-1.xlsx:
The first row contains property names.
The second row contains the corresponding area in square kilometers

Step by Step Solution

There are 3 Steps involved in it

1 Expert Approved Answer
Step: 1 Unlock blur-text-image
Question Has Been Solved by an Expert!

Get step-by-step solutions from verified subject matter experts

Step: 2 Unlock
Step: 3 Unlock

Students Have Also Explored These Related Databases Questions!