Below is my code and I'm using Visual Studio Code (Python 3.9.13) to program and for some
Question:
Below is my code. I'm using Visual Studio Code (Python 3.9.13), and for some reason it no longer reads in my breast_cancer_wisconsin.data file. It worked previously and generated random centroid outputs, but now it doesn't recognize the data file. Below is the error message I get when I run my code.
FileNotFoundError: [Errno 2] No such file or directory: 'breast_cancer_wisconsin.data'
I deleted and re-saved the data file, tried renaming it, removed other folders from the workspace, and restarted VS Code, but I still get the same "FileNotFoundError" message in my terminal.
Any help appreciated, thank you!
import math
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def distance(u, point):
    """Return the Euclidean distance between *u* and *point*, rounded to 6 places.

    Helper used to measure how far a data point is from a centroid.
    Coordinates are paired with ``zip``, so if one sequence is longer than
    the other the extra trailing coordinates are ignored.
    """
    squared_gaps = ((a - b) ** 2 for a, b in zip(u, point))
    return round(math.sqrt(sum(squared_gaps)), 6)
def _resolve_data_path(data_path):
    """Locate the data file, falling back to the directory holding this script."""
    candidate = Path(data_path)
    if candidate.is_absolute() or candidate.exists():
        return candidate
    try:
        beside_script = Path(__file__).resolve().parent / candidate
    except NameError:  # no __file__ (e.g. interactive session)
        return candidate
    # If neither location has the file, return the name as given so that
    # read_csv raises the usual FileNotFoundError for it.
    return beside_script if beside_script.exists() else candidate


def _assign_clusters(points, u2, u4):
    """Label each row of *points* 2 or 4 by its nearer centroid (ties go to 2)."""
    # Distances are rounded to 6 places, the same precision distance() uses.
    dist_u2 = np.round(np.sqrt(((points - u2) ** 2).sum(axis=1)), 6)
    dist_u4 = np.round(np.sqrt(((points - u4) ** 2).sum(axis=1)), 6)
    return np.where(dist_u2 <= dist_u4, 2, 4)


def main(data_path='breast_cancer_wisconsin.data', output_path='output.csv',
         max_iter=50):
    """Cluster the data file into two groups (2 and 4) with k-means.

    Reads the header-less CSV, replaces missing 'A7' values with the column
    mean, seeds two centroids from two distinct random rows, then alternates
    assign/recompute steps until the centroids stop changing or *max_iter*
    rounds have run. Progress and final centroids are printed, and the
    'SCN', 'Class' and 'Predicted_Class' columns are written to *output_path*.

    Args:
        data_path: Data file to read. A relative path that does not exist
            under the current working directory is also looked up next to
            this script, so the program works regardless of the directory
            it is launched from.
        output_path: Destination CSV for the cluster assignments.
        max_iter: Upper bound on assign/recompute rounds. Defaults to 50,
            the limit stated in the original loop comment (the code itself
            previously stopped after 10).

    Raises:
        FileNotFoundError: If the data file cannot be found.
        ValueError: If the file has fewer than two rows.
    """
    col = ['SCN', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10',
           'Class']  # dataframe column names
    df = pd.read_csv(_resolve_data_path(data_path), na_values='?', names=col)

    # Fill NaN values with the column mean. Assign the result back rather
    # than calling fillna(inplace=True) on a column selection, which newer
    # pandas versions deprecate.
    df['A7'] = df['A7'].fillna(df['A7'].mean())

    # Feature columns A2..A10 only (drops 'SCN' and 'Class').
    features = df.iloc[:, 1:10].astype('float64')
    points = features.to_numpy()
    nrows = len(points)
    if nrows < 2:
        raise ValueError('need at least two rows to seed two centroids')

    # Two random centroids, labelled u2 & u4. replace=False guarantees two
    # different rows, and sampling from nrows keeps the last row eligible.
    u2_idx, u4_idx = np.random.choice(nrows, 2, replace=False)
    u2 = points[u2_idx]
    u4 = points[u4_idx]

    # INITIAL STEP: print the initial centroids.
    print('Randomly selected row', u2_idx, 'for centroid mu_2.\n')
    print('Initial centroid mu_2:')
    print(features.iloc[u2_idx])
    print()
    print('Randomly selected row', u4_idx, 'for centroid mu_4.\n')
    print('Initial centroid mu_4:')
    print(features.iloc[u4_idx])

    labels = np.full(nrows, -1)
    itr = 0
    for itr in range(1, max_iter + 1):
        # ASSIGN STEP
        labels = _assign_clusters(points, u2, u4)
        in_2 = labels == 2
        in_4 = ~in_2

        # RECOMPUTE STEP: each centroid becomes the mean of its cluster.
        # An empty cluster has no mean, so its previous centroid is kept.
        mu_2 = points[in_2].mean(axis=0) if in_2.any() else u2
        mu_4 = points[in_4].mean(axis=0) if in_4.any() else u4

        converged = np.array_equal(mu_2, u2) and np.array_equal(mu_4, u4)
        u2, u4 = mu_2, mu_4
        if converged:  # stop looping when centroids no longer move
            break

    print('Program ended after', itr, 'iterations.\n')

    # Print final results.
    print('Final centroid mu_2:')
    for name, value in zip(features.columns, u2):
        print(name, ' ', value)
    print('')
    print('Final centroid mu_4:')
    for name, value in zip(features.columns, u4):
        print(name, ' ', value)

    # Show the first 20 rows for the report.
    print('\nFinal cluster assignment:\n')
    df['Predicted_Class'] = labels
    print(df[['SCN', 'Class', 'Predicted_Class']].head(20))

    # Save output to CSV.
    df[['SCN', 'Class', 'Predicted_Class']].to_csv(output_path)
# Run the clustering only when this file is executed as a script, so that
# importing the module does not trigger file I/O.
if __name__ == '__main__':
    main()
Income Tax Fundamentals 2013
ISBN: 9781285586618
31st Edition
Authors: Gerald E. Whittenburg, Martha Altus Buller, Steven L Gill