Question: Edit the Python code below so that it is one of a kind, and extract the experiment results to CSV files: a training-size CSV, a criterion (gini vs. entropy) CSV, a min_samples_leaf and min_samples_split CSV, and a max_depth CSV.
Thank you!!
import pandas as pd
import numpy as np
import time
from sklearn.tree import DecisionTreeClassifier as sklearnDT
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_score, recall_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.model_selection import StratifiedKFold
# Binary tree node: internal nodes store a (column, value) split,
# leaves store a class label
class Tree:
    def __init__(self, content, left=None, right=None):
        self.content = content
        self.left = left
        self.right = right
# Decision tree classifier
# Data needs to be a numpy array
class myDecisionTree:
    """
    Initialise the class with parameters:
        criterion
        max_depth
        min_samples_split
        min_samples_leaf
        min_impurity_decrease
    """
    def __init__(self, criterion="gini", min_impurity_decrease=0, max_depth=None,
                 min_samples_split=2, min_samples_leaf=1):
        self.criterion = criterion
        self.min_impurity_decrease = min_impurity_decrease
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf

    def _classCounts(self, rows):
        """
        Returns a dictionary counting the number of each class.

        >>> classCounts([["A"], ["B"], ["B"], ["C"]])
        {'A': 1, 'B': 2, 'C': 1}
        """
        counts = {}
        for row in rows:
            # label is in the last column
            label = row[-1]
            if label not in counts:
                counts[label] = 0
            counts[label] += 1
        return counts
    def _findClass(self, rows):
        # Find the majority class in a leaf node
        counts = self._classCounts(rows)
        predclass = max(counts, key=counts.get)
        return predclass
    def _gini(self, rows):
        # Returns the gini impurity of a list: 1 - sum of squared class proportions
        counts = self._classCounts(rows)
        impurity = 1
        for label in counts:  # loop over all classes
            p_label = counts[label] / len(rows)  # proportion of this class
            impurity -= p_label ** 2
        return impurity
    def _entropy(self, rows):
        # Returns the entropy of a list (natural log, so values are in nats)
        counts = self._classCounts(rows)
        impurity = 0
        for label in counts:
            p_label = counts[label] / len(rows)
            impurity += -p_label * np.log(p_label)
        return impurity
    def _infoGain(self, left, right, current_uncertainty):
        # Information gain from splitting: parent impurity minus the
        # size-weighted impurity of the two children
        p = len(left) / (len(left) + len(right))
        if self.criterion == "gini":
            return current_uncertainty - p * self._gini(left) - (1 - p) * self._gini(right)
        elif self.criterion == "entropy":
            return current_uncertainty - p * self._entropy(left) - (1 - p) * self._entropy(right)
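    # A quick sanity check of the impurity helpers (a sketch; these example
    # values assume the natural-log entropy used above, so an even two-class
    # node has entropy ln(2) ~ 0.693 rather than 1.0):
    #   dt = myDecisionTree()
    #   dt._gini([["A"], ["A"]])     # 0.0  (pure node)
    #   dt._gini([["A"], ["B"]])     # 0.5  (even two-class node)
    #   dt._entropy([["A"], ["B"]])  # ~0.693
    #   # Splitting an even node into two pure children recovers all impurity:
    #   dt._infoGain([["A"]], [["B"]], dt._gini([["A"], ["B"]]))  # 0.5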
    def _partition(self, column, value, rows):
        # Create a partition on a feature: rows with a feature value below the
        # threshold go to the true side, the rest to the false side
        true_rows = rows[rows[:, column] < value]
        false_rows = rows[rows[:, column] >= value]
        return true_rows, false_rows
    # Iterate over each feature to find the best split feature and value
    def _findBestSplit(self, rows):
        best_gain = 0
        best_col = 0
        best_value = None
        n_features = len(rows[0]) - 1

        if self.criterion == "gini":
            current_uncertainty = self._gini(rows)
        elif self.criterion == "entropy":
            current_uncertainty = self._entropy(rows)

        for col in range(n_features):  # for each feature
            rows = rows[np.argsort(rows[:, col])]  # sort data by column

            values = set([row[col] for row in rows])  # unique values in the column

            if len(values) == 1:  # no possible splits for this feature
                continue

            # Candidate thresholds: midpoints between consecutive unique values
            values = sorted(values)
            midpoints = []
            for i in range(len(values) - 1):
                midpoints.append((values[i] + values[i + 1]) / 2)

            for val in midpoints:  # try splitting the dataset
                true_rows, false_rows = self._partition(col, val, rows)

                # Skip this split if either side would have fewer rows
                # than min_samples_leaf
                if len(true_rows) < self.min_samples_leaf or len(false_rows) < self.min_samples_leaf:
                    continue

                # Calculate the information gain from this split
                gain = self._infoGain(true_rows, false_rows, current_uncertainty)
                # Update the current best gain found
                if gain > best_gain:
                    best_gain, best_col, best_value = gain, col, val

        return best_gain, best_col, best_value
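    # Example (a sketch with hypothetical toy rows, labels in the last column):
    # with one feature whose values are 1, 2, 10, 11, the best threshold is the
    # midpoint between 2 and 10, which separates the two classes perfectly:
    #   rows = np.array([[1, "A"], [2, "A"], [10, "B"], [11, "B"]], dtype=object)
    #   myDecisionTree()._findBestSplit(rows)  # -> (0.5, 0, 6.0)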
    def _build_tree(self, rows, depth=0):
        # Create a leaf if at max depth or below min_samples_split
        if depth == self.max_depth or len(rows) < self.min_samples_split:
            return Tree(self._findClass(rows))
        depth += 1

        # Find the best split
        gain, col, val = self._findBestSplit(rows)

        # Check whether the split decreases impurity enough; if not, create a leaf node
        if gain <= self.min_impurity_decrease:
            return Tree(self._findClass(rows))

        # Make the split
        true_rows, false_rows = self._partition(col, val, rows)

        # Recursively build the decision tree
        true_branch = self._build_tree(true_rows, depth)
        false_branch = self._build_tree(false_rows, depth)

        return Tree((col, val), true_branch, false_branch)

    # Build a tree for self.tree by calling _build_tree
    def fit(self, X, y):
        rows = np.column_stack((X, y))  # merge X and y arrays
        self.tree = self._build_tree(rows)
    # Find the class of a single input by walking the tree
    def _classify(self, row, tree):
        # If at a leaf node, return the class label
        if (tree.left is None) and (tree.right is None):
            return tree.content
        # Otherwise move down the decision tree recursively
        col, val = tree.content
        if row[col] < val:
            return self._classify(row, tree.left)
        else:
            return self._classify(row, tree.right)

    # Predict the class for each row of the input data
    def predict(self, data):
        pred = []
        for row in data:
            pred.append(self._classify(row, self.tree))
        return np.array(pred)
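# A minimal end-to-end check of fit/predict (a sketch; the toy rows and the
# expected output are my own illustration, not part of the original script):
#   rows = np.array([[1, "A"], [2, "A"], [10, "B"], [11, "B"]], dtype=object)
#   toy = myDecisionTree()
#   toy.fit(rows[:, :-1], rows[:, -1])
#   toy.predict(np.array([[0], [12]]))  # expected: array(['A', 'B'], ...)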
# Load the iris dataset; the class label is in the last column
data = pd.read_csv("iris.data", names=["sepal length", "sepal width",
                                       "petal length", "petal width", "class"])
data = np.array(data)

X = data[:, :-1]
y = data[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Sklearn decision tree with matching hyperparameters
dt = sklearnDT(criterion='entropy', max_depth=None, min_samples_split=2,
               min_samples_leaf=1, max_leaf_nodes=None, random_state=0,
               min_impurity_decrease=0.0, ccp_alpha=0.0)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print("sklearn dt: ")
print(classification_report(y_test, y_pred))
# My decision tree with the same hyperparameters
dt = myDecisionTree(criterion="entropy", min_impurity_decrease=0, max_depth=None,
                    min_samples_split=2, min_samples_leaf=1)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print("my dt: ")
print(classification_report(y_test, y_pred))
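One way to produce the requested CSVs is sketched below: sweep each hyperparameter of myDecisionTree, record test accuracy, and save each sweep with pandas. Note that the evaluate helper, the parameter grids, and the output file names (training_size.csv, criterion.csv, min_leaf_min_split.csv, max_depth.csv) are illustrative choices, not something fixed by the original code.

# A sketch of the requested CSV extraction; grids and file names are illustrative
def evaluate(**params):
    model = myDecisionTree(**params)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

# Training size CSV: vary the fraction of data used for training
results = []
for frac in [0.5, 0.6, 0.7, 0.8, 0.9]:
    Xtr, Xte, ytr, yte = train_test_split(X, y, train_size=frac, random_state=42)
    model = myDecisionTree(criterion="entropy")
    model.fit(Xtr, ytr)
    results.append({"train_size": frac,
                    "accuracy": accuracy_score(yte, model.predict(Xte))})
pd.DataFrame(results).to_csv("training_size.csv", index=False)

# Criterion CSV: gini vs. entropy
results = [{"criterion": c, "accuracy": evaluate(criterion=c)}
           for c in ["gini", "entropy"]]
pd.DataFrame(results).to_csv("criterion.csv", index=False)

# min_samples_leaf / min_samples_split CSV: sweep both together
results = []
for leaf in [1, 2, 5, 10]:
    for split in [2, 5, 10, 20]:
        results.append({"min_samples_leaf": leaf,
                        "min_samples_split": split,
                        "accuracy": evaluate(min_samples_leaf=leaf,
                                             min_samples_split=split)})
pd.DataFrame(results).to_csv("min_leaf_min_split.csv", index=False)

# max_depth CSV: shallow to unlimited depth
results = [{"max_depth": d, "accuracy": evaluate(max_depth=d)}
           for d in [1, 2, 3, 4, 5, None]]
pd.DataFrame(results).to_csv("max_depth.csv", index=False)

Each CSV ends up with one row per setting, which makes it straightforward to plot accuracy against each hyperparameter.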
