Question: Edit the Python code below so that it is one of a kind, and extract the experiment results to CSV files: a training-size CSV, a criterion (gini vs. entropy) CSV, a min_samples_leaf and min_samples_split CSV, and a max_depth CSV.
Thank you!!
import pandas as pd
import numpy as np
import time
from sklearn.tree import DecisionTreeClassifier as sklearnDT
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_score, recall_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.model_selection import StratifiedKFold
# Binary tree node: internal nodes store a (column, value) split,
# leaves store a class label
class Tree:
    def __init__(self, content, left=None, right=None):
        self.content = content
        self.left = left
        self.right = right
# Decision tree classifier
# Data needs to be a numpy array
class myDecisionTree:
    """
    Initialise the class with parameters:
        criterion
        max_depth
        min_samples_split
        min_samples_leaf
        min_impurity_decrease
    """
    def __init__(self, criterion="gini", min_impurity_decrease=0, max_depth=None,
                 min_samples_split=2, min_samples_leaf=1):
        self.criterion = criterion
        self.min_impurity_decrease = min_impurity_decrease
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf

    def _classCounts(self, rows):
        """
        Returns a dictionary counting the number of each class.

        >>> classCounts([["A"], ["B"], ["B"], ["C"]])
        {'A': 1, 'B': 2, 'C': 1}
        """
        counts = {}
        for row in rows:
            # label is in the last column
            label = row[-1]
            if label not in counts:
                counts[label] = 0
            counts[label] += 1
        return counts
    def _findClass(self, rows):
        # Find the majority class in a leaf node
        counts = self._classCounts(rows)
        predclass = max(counts, key=counts.get)
        return predclass
    def _gini(self, rows):
        # Returns the gini impurity of a list: 1 - sum of squared class proportions
        counts = self._classCounts(rows)
        impurity = 1
        for label in counts:  # loop over all classes
            p_label = counts[label] / len(rows)  # proportion of this class
            impurity -= p_label ** 2
        return impurity
    def _entropy(self, rows):
        # Returns the entropy of a list (natural log, so values are in nats)
        counts = self._classCounts(rows)
        impurity = 0
        for label in counts:
            p_label = counts[label] / len(rows)
            impurity += -p_label * np.log(p_label)
        return impurity
    def _infoGain(self, left, right, current_uncertainty):
        # Information gain from splitting: parent impurity minus the
        # size-weighted impurity of the two children
        p = len(left) / (len(left) + len(right))
        if self.criterion == "gini":
            return current_uncertainty - p * self._gini(left) - (1 - p) * self._gini(right)
        elif self.criterion == "entropy":
            return current_uncertainty - p * self._entropy(left) - (1 - p) * self._entropy(right)
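    # A quick sanity check of the impurity helpers (a sketch; these example
    # values assume the natural-log entropy used above, so an even two-class
    # node has entropy ln(2) ~ 0.693 rather than 1.0):
    #   dt = myDecisionTree()
    #   dt._gini([["A"], ["A"]])     # 0.0  (pure node)
    #   dt._gini([["A"], ["B"]])     # 0.5  (even two-class node)
    #   dt._entropy([["A"], ["B"]])  # ~0.693
    #   # Splitting an even node into two pure children recovers all impurity:
    #   dt._infoGain([["A"]], [["B"]], dt._gini([["A"], ["B"]]))  # 0.5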
    def _partition(self, column, value, rows):
        # Create a partition on a feature: rows with a feature value below the
        # threshold go to the true side, the rest to the false side
        true_rows = rows[rows[:, column] < value]
        false_rows = rows[rows[:, column] >= value]
        return true_rows, false_rows
    # Iterate over each feature to find the best split feature and value
    def _findBestSplit(self, rows):
        best_gain = 0
        best_col = 0
        best_value = None
        n_features = len(rows[0]) - 1

        if self.criterion == "gini":
            current_uncertainty = self._gini(rows)
        elif self.criterion == "entropy":
            current_uncertainty = self._entropy(rows)

        for col in range(n_features):  # for each feature
            rows = rows[np.argsort(rows[:, col])]  # sort data by column

            values = set([row[col] for row in rows])  # unique values in the column

            if len(values) == 1:  # no possible splits for this feature
                continue

            # Candidate thresholds: midpoints between consecutive unique values
            values = sorted(values)
            midpoints = []
            for i in range(len(values) - 1):
                midpoints.append((values[i] + values[i + 1]) / 2)

            for val in midpoints:  # try splitting the dataset
                true_rows, false_rows = self._partition(col, val, rows)

                # Skip this split if either side would have fewer rows
                # than min_samples_leaf
                if len(true_rows) < self.min_samples_leaf or len(false_rows) < self.min_samples_leaf:
                    continue

                # Calculate the information gain from this split
                gain = self._infoGain(true_rows, false_rows, current_uncertainty)
                # Update the current best gain found
                if gain > best_gain:
                    best_gain, best_col, best_value = gain, col, val

        return best_gain, best_col, best_value
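    # Example (a sketch with hypothetical toy rows, labels in the last column):
    # with one feature whose values are 1, 2, 10, 11, the best threshold is the
    # midpoint between 2 and 10, which separates the two classes perfectly:
    #   rows = np.array([[1, "A"], [2, "A"], [10, "B"], [11, "B"]], dtype=object)
    #   myDecisionTree()._findBestSplit(rows)  # -> (0.5, 0, 6.0)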
    def _build_tree(self, rows, depth=0):
        # Create a leaf if at max depth or below min_samples_split
        if depth == self.max_depth or len(rows) < self.min_samples_split:
            return Tree(self._findClass(rows))
        depth += 1

        # Find the best split
        gain, col, val = self._findBestSplit(rows)

        # Check whether the split decreases impurity enough; if not, create a leaf node
        if gain <= self.min_impurity_decrease:
            return Tree(self._findClass(rows))

        # Make the split
        true_rows, false_rows = self._partition(col, val, rows)

        # Recursively build the decision tree
        true_branch = self._build_tree(true_rows, depth)
        false_branch = self._build_tree(false_rows, depth)

        return Tree((col, val), true_branch, false_branch)

    # Build a tree for self.tree by calling _build_tree
    def fit(self, X, y):
        rows = np.column_stack((X, y))  # merge X and y arrays
        self.tree = self._build_tree(rows)
    # Find the class of a single input by walking the tree
    def _classify(self, row, tree):
        # If at a leaf node, return the class label
        if (tree.left is None) and (tree.right is None):
            return tree.content
        # Otherwise move down the decision tree recursively
        col, val = tree.content
        if row[col] < val:
            return self._classify(row, tree.left)
        else:
            return self._classify(row, tree.right)

    # Predict the class for each row of the input data
    def predict(self, data):
        pred = []
        for row in data:
            pred.append(self._classify(row, self.tree))
        return np.array(pred)
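# A minimal end-to-end check of fit/predict (a sketch; the toy rows and the
# expected output are my own illustration, not part of the original script):
#   rows = np.array([[1, "A"], [2, "A"], [10, "B"], [11, "B"]], dtype=object)
#   toy = myDecisionTree()
#   toy.fit(rows[:, :-1], rows[:, -1])
#   toy.predict(np.array([[0], [12]]))  # expected: array(['A', 'B'], ...)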
# Load the iris dataset; the class label is in the last column
data = pd.read_csv("iris.data", names=["sepal length", "sepal width",
                                       "petal length", "petal width", "class"])
data = np.array(data)

X = data[:, :-1]
y = data[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Sklearn decision tree with matching hyperparameters
dt = sklearnDT(criterion='entropy', max_depth=None, min_samples_split=2,
               min_samples_leaf=1, max_leaf_nodes=None, random_state=0,
               min_impurity_decrease=0.0, ccp_alpha=0.0)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print("sklearn dt: ")
print(classification_report(y_test, y_pred))
# My decision tree with the same hyperparameters
dt = myDecisionTree(criterion="entropy", min_impurity_decrease=0, max_depth=None,
                    min_samples_split=2, min_samples_leaf=1)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print("my dt: ")
print(classification_report(y_test, y_pred))
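One way to produce the requested CSVs is sketched below: sweep each hyperparameter of myDecisionTree, record test accuracy, and save each sweep with pandas. Note that the evaluate helper, the parameter grids, and the output file names (training_size.csv, criterion.csv, min_leaf_min_split.csv, max_depth.csv) are illustrative choices, not something fixed by the original code.

# A sketch of the requested CSV extraction; grids and file names are illustrative
def evaluate(**params):
    model = myDecisionTree(**params)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

# Training size CSV: vary the fraction of data used for training
results = []
for frac in [0.5, 0.6, 0.7, 0.8, 0.9]:
    Xtr, Xte, ytr, yte = train_test_split(X, y, train_size=frac, random_state=42)
    model = myDecisionTree(criterion="entropy")
    model.fit(Xtr, ytr)
    results.append({"train_size": frac,
                    "accuracy": accuracy_score(yte, model.predict(Xte))})
pd.DataFrame(results).to_csv("training_size.csv", index=False)

# Criterion CSV: gini vs. entropy
results = [{"criterion": c, "accuracy": evaluate(criterion=c)}
           for c in ["gini", "entropy"]]
pd.DataFrame(results).to_csv("criterion.csv", index=False)

# min_samples_leaf / min_samples_split CSV: sweep both together
results = []
for leaf in [1, 2, 5, 10]:
    for split in [2, 5, 10, 20]:
        results.append({"min_samples_leaf": leaf,
                        "min_samples_split": split,
                        "accuracy": evaluate(min_samples_leaf=leaf,
                                             min_samples_split=split)})
pd.DataFrame(results).to_csv("min_leaf_min_split.csv", index=False)

# max_depth CSV: shallow to unlimited depth
results = [{"max_depth": d, "accuracy": evaluate(max_depth=d)}
           for d in [1, 2, 3, 4, 5, None]]
pd.DataFrame(results).to_csv("max_depth.csv", index=False)

Each CSV ends up with one row per setting, which makes it straightforward to plot accuracy against each hyperparameter.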
