Question:

import numpy as np
from collections import Counter
from sklearn import datasets, model_selection
# No other libraries will be imported
# load the Iris Dataset, which contains 150 samples.
# each sample has 4 features.
# the dataset contains 3 classes of 50 instances each, where each class refers to a type of iris plant.
iris = datasets.load_iris()
X = np.array(iris.data) # features, numeric attributes. [Sepal length, Sepal Width, Petal length, Petal width]
Y = np.array(iris.target) # labels: class-0, class-1, class-2
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.25, random_state=0)
print("Train Shape:", X_train.shape)
print("Train Shape:", X_test.shape)
3. Calculate the Information Gain for each (numeric) attribute, and show the feature that should be used first when building a decision tree.
step-1: find the best cutpoint for each attribute (the value at which to split the data).
step-2: calculate the information gain for each attribute (to decide the order of attributes when building the DT).
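Recall: the information gain of splitting attribute X at cutpoint v is IG = H(Y) - H(Y | split), where H(Y | split) is the weighted average entropy of the two resulting groups (computed by partition_entropy below). The best cutpoint for an attribute is the one that minimizes H(Y | split), i.e., maximizes IG.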
#-------------------- Some helper functions ------------------------------
# calculate Entropy for a given distribution H(X).
def entropy(probabilities: list) -> float:
    return sum([-p * np.log2(p) for p in probabilities if p > 0])
# given a list of labels, return the probability for each class P(Y).
def class_probabilities(labels: list) -> list:
    total_count = len(labels)
    return [label_count / total_count for label_count in Counter(labels).values()]
# calculate the Entropy H(Y) for a given list of labels.
def data_entropy(labels: list) -> float:
    return entropy(class_probabilities(labels))
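# quick sanity check (illustrative): a uniform two-class distribution carries
# 1 bit of entropy, while a pure group carries none.
print(entropy([0.5, 0.5]))         # -> 1.0
print(data_entropy([0, 0, 0, 0]))  # -> 0.0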
# split data into two sub-groups [group1, group2] based on attribute [feature_idx] and value [feature_val]
# if sample[feature_idx] < feature_val:
#     group1 <- sample
# else:
#     group2 <- sample
def split_data(data: np.array, feature_idx: int, feature_val: float) -> tuple:
    mask_below_threshold = data[:, feature_idx] < feature_val
    group1 = data[mask_below_threshold]
    group2 = data[~mask_below_threshold]
    return group1, group2
# calculate the entropy for the current partition, H(Y|X=feature_val).
def partition_entropy(g1_labels: list, g2_labels: list) -> float:
    total_count = len(g1_labels) + len(g2_labels)
    # weighted combination of the conditional entropy of group1 and group2.
    return data_entropy(g1_labels) * (len(g1_labels) / total_count) + data_entropy(g2_labels) * (len(g2_labels) / total_count)
#-----------------------------------------------------------------------------------------
#---------------------------- Examples to use the Helper functions -----------------------
# calculate the H(Y) for the train and test data:
print(data_entropy(Y_train))
print(data_entropy(Y_test))
## to split the data based on feature_idx and feature_val:
train_data = np.concatenate((X_train, np.reshape(Y_train, (-1,1))), axis=1) # concatenate [X_train, Y_train]
print(train_data.shape)
# split the data into two subgroups
g1, g2 = split_data(train_data, feature_idx=1, feature_val=3)
print(g1.shape)
print(g2.shape)
# calculate the weighted entropy for the current split.
print(partition_entropy(g1[:,-1], g2[:,-1]))
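# the information gain of this example split is the drop from H(Y) to the
# weighted split entropy: IG = H(Y) - H(Y | split)
print(data_entropy(train_data[:, -1]) - partition_entropy(g1[:, -1], g2[:, -1]))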
#-----------------------------------------------------------------------------------------
#-------------------------------- Your implementation ------------------------------------
# Initialize variables to store the best cutpoint and information gain for each attribute
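# One possible completion (a sketch, not the only approach): candidate cutpoints
# are taken as midpoints between consecutive sorted unique feature values
# (an assumption; the question does not fix how candidates are chosen).
base_entropy = data_entropy(train_data[:, -1])  # H(Y) on the training labels
n_features = X_train.shape[1]
best_cutpoints = np.zeros(n_features)  # best cutpoint found for each attribute
best_gains = np.zeros(n_features)      # information gain at that cutpoint
for feature_idx in range(n_features):
    # candidate cutpoints: midpoints between consecutive sorted unique values (assumed)
    values = np.unique(train_data[:, feature_idx])
    candidates = (values[:-1] + values[1:]) / 2
    for feature_val in candidates:
        g1, g2 = split_data(train_data, feature_idx, feature_val)
        # IG = H(Y) - H(Y | split); keep the cutpoint with the largest gain
        gain = base_entropy - partition_entropy(g1[:, -1], g2[:, -1])
        if gain > best_gains[feature_idx]:
            best_gains[feature_idx] = gain
            best_cutpoints[feature_idx] = feature_val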
#-----------------------------------------------------------------------------------------
#----------------------------------- Printing --------------------------------------------
# print the calculated cutpoint [feature_val] and information gain for each attribute.
# print the feature that should be used first when building the decision tree.
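# a matching printing block (feature names taken from the comment on iris.data above):
feature_names = ["Sepal length", "Sepal width", "Petal length", "Petal width"]
for idx in range(n_features):
    print(f"{feature_names[idx]}: best cutpoint = {best_cutpoints[idx]:.2f}, "
          f"information gain = {best_gains[idx]:.4f}")
# the attribute with the highest information gain should be split on first
print("Feature to use first:", feature_names[int(np.argmax(best_gains))])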
Please help me complete this code. The dataset is part of the question; the Iris dataset is already embedded in Python via scikit-learn.
