Question: Start with iris7_explore_7models.py and perform the following: 1. For the decision tree classifier: a. Visualize the decision tree with a maximum depth of 2,
Start with iris7_explore_7models.py, and perform the following:
# iris7_explore_7models.py
# Load system libraries
import sys
import datetime
import random
# Load ML libraries
import pandas
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC as SVMClassifier
from scipy.spatial import distance
def _main():
    """Run the iris exploration: load data, evaluate models, report accuracy.

    Behaviour is driven by command-line flags read through the ``_showing*``
    helpers:

    * ``--help`` prints the usage text and exits with status 0.
    * ``--version`` prints runtime/library versions before continuing.
    * ``--sample`` prints sample data and shows the data plots.
    * ``--eval`` shows a box plot comparing the cross-validation scores.
    * ``--summaryonly`` skips progress output and cross-validation, leaving
      only the final accuracy table.
    """
    if _showingHelp():
        _showHelp()
        # sys.exit() is always available; the bare exit() helper is injected
        # by the site module and is missing e.g. under "python -S".
        sys.exit(0)
    if _showingVersions():
        _showVersions()

    verbose = not _showingSummaryOnly()

    # load dataset
    if verbose:
        print(datetime.datetime.now(), "explore_iris_7: Loading data")
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
    columns = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
    dataset = pandas.read_csv(url, names=columns)
    if _showingSamples():
        _sampleData(dataset)
        _visualizeData(dataset, pyplot, scatter_matrix)

    # split data into train/test datasets
    if verbose:
        print(datetime.datetime.now(), "explore_iris_7: Splitting data into training and test sets")
    array = dataset.values
    X = array[:, 0:4]   # the four measurement columns
    Y = array[:, 4]     # the class label column
    test_size = 0.20
    seed = 7
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    if verbose:
        # evaluate the scikit-learn models with 10-fold cross-validation
        scoring = 'accuracy'
        candidates = [
            ('Logistic Regression', LogisticRegression()),
            ('KNN Neighbors', KNeighborsClassifier()),
            ('Support Vector', SVMClassifier()),
            ('DecisionTree', DecisionTreeClassifier()),
            ('Random Forest', RandomForestClassifier()),
        ]
        results = []
        model_names = []
        rows = []
        for name, model in candidates:
            print(datetime.datetime.now(), "explore_iris_7: Evaluate model %s" % name)
            # random_state is only meaningful together with shuffle=True, and
            # recent scikit-learn raises ValueError when it is passed without
            # shuffling. Omitting it keeps the same unshuffled folds.
            kfold = model_selection.KFold(n_splits=10)
            cv_results = model_selection.cross_val_score(
                model, X_train, Y_train, cv=kfold, scoring=scoring)
            results.append(cv_results)
            model_names.append(name)
            rows.append("%20s: \t\t%f\t(%f)" % (name, cv_results.mean(), cv_results.std()) + " ")
        print("".join(rows))
        print()
        if _showingEval():
            # compare algorithms
            fig = pyplot.figure()
            fig.suptitle('Compare algorithms')
            ax = fig.add_subplot(111)
            pyplot.boxplot(results)
            ax.set_xticklabels(model_names)
            pyplot.show()

    # make predictions on test dataset
    # Each entry: (label used in the summary table, label used in the
    # progress output, fresh estimator instance).
    classifiers = [
        ('Logistic Regression', "Logistic Regression", LogisticRegression()),
        ('KNN Neighbors', "KNeighbors Classifier", KNeighborsClassifier()),
        ('Support Vector', "Support Vector", SVMClassifier()),
        ('DecisionTree', "Decision Tree Classifier", DecisionTreeClassifier()),
        ('Random Forest', "Random Forest", RandomForestClassifier()),
        ('My Random', "My Random", myRNDClassifier()),
        ('My KNN', "My KNN", myKNNClassifier()),
    ]
    for _, progress_label, estimator in classifiers:
        _train(estimator, progress_label, X_train, Y_train, X_test, Y_test)
        _predict(estimator, progress_label, X_train, Y_train, X_test, Y_test)

    models = [(summary_label, estimator) for summary_label, _, estimator in classifiers]
    _predictionAccuracySummary(models, X_train, Y_train, X_test, Y_test)
#####################################################
# My random classifier
class myRNDClassifier:
    """Baseline classifier that answers with a randomly drawn training label."""

    def fit(self, X_train, y_train):
        """Remember the training samples and labels; nothing is learned."""
        self.X_train, self.y_train = X_train, y_train

    def predict(self, X_test):
        """Return one label per row of X_test, each drawn from the training labels."""
        return [random.choice(self.y_train) for _ in X_test]
#####################################################
# My KNN K =1 classifier
class myKNNClassifier:
    """Nearest-neighbour classifier with K fixed at 1."""

    def fit(self, X_train, y_train):
        """Store the training samples and their labels for later lookups."""
        self.X_train, self.y_train = X_train, y_train

    def predict(self, X_test):
        """Return the label of the nearest training sample for each row of X_test."""
        return [self.closest(sample) for sample in X_test]

    def closest(self, row):
        """Return the label of the training sample nearest to row.

        Distance is Euclidean. When several training samples are equally
        near, the one with the lowest index is used because only a strictly
        smaller distance replaces the current best.
        """
        nearest_index = 0
        nearest_distance = distance.euclidean(row, self.X_train[0])
        for index in range(1, len(self.X_train)):
            candidate = distance.euclidean(row, self.X_train[index])
            if candidate < nearest_distance:
                nearest_index, nearest_distance = index, candidate
        return self.y_train[nearest_index]
def euc(a, b):
    """Return the Euclidean distance between the points a and b."""
    separation = distance.euclidean(a, b)
    return separation
#####################################################
# training and prediction functions
def _train(alg, algName, X_train, Y_train, X_test, Y_test):
    """Fit alg on the training data, logging start/end unless --summaryonly is set.

    X_test and Y_test are accepted for signature symmetry with _predict but
    are not used here.
    """
    verbose = not _showingSummaryOnly()
    if verbose:
        print(datetime.datetime.now(), "Begin training: ", algName)
    alg.fit(X_train, Y_train)
    if verbose:
        print(datetime.datetime.now(), "End training: ", algName)
def _predict(alg, algName, X_train, Y_train, X_test, Y_test):
    """Predict on X_test with alg and report the results unless --summaryonly is set.

    The prediction always runs. The timestamps, accuracy score, confusion
    matrix and classification report are printed only in verbose mode.
    X_train and Y_train are not used here.
    """
    quiet = _showingSummaryOnly()
    if not quiet:
        print(datetime.datetime.now(), "Begin prediction: ", algName)
    predictions = alg.predict(X_test)
    if quiet:
        return
    print(datetime.datetime.now(), "End prediction: ", algName)
    score = accuracy_score(Y_test, predictions)
    print("%s: accuracy_score=%0.2f" % (algName, score))
    print(confusion_matrix(Y_test, predictions))
    print(classification_report(Y_test, predictions))
def _predictionAccuracySummary(models, X_train, Y_train, X_test, Y_test):
    """Print a table with the test-set accuracy of every (name, model) pair.

    Each model must already be fitted. X_train and Y_train are not used here.
    """
    print("Algorithm\t\tAccuracy Score")
    for label, estimator in models:
        score = accuracy_score(Y_test, estimator.predict(X_test))
        print("%20s\t\t%0.2f" % (label, score))
#####################################################
# data sampling and visualization functions
def _sampleData(dataset):
    """Print a textual overview of dataset.

    Output, in order: the frame's shape, its first 20 rows, its descriptive
    statistics, then the row counts grouped by 'class' and by 'sepal-length'.
    """
    print(dataset.shape)
    print(dataset.head(20))
    print(dataset.describe())
    # distribution of rows per class, then per sepal-length value
    for column in ('class', 'sepal-length'):
        print(dataset.groupby(column).size())
def _visualizeData(dataset, pyplot, scatter_matrix):
    """Show three figures for dataset, one after another.

    The figures are box-and-whisker plots (2x2 layout, independent axes),
    histograms, and a scatter-plot matrix. pyplot.show() is called after
    each one.
    """
    drawers = (
        lambda: dataset.plot(kind='box', subplots=True, layout=(2, 2), sharex=False, sharey=False),
        lambda: dataset.hist(),
        lambda: scatter_matrix(dataset),
    )
    for draw in drawers:
        draw()
        pyplot.show()
#####################################################
# helper functions
def _parseArgumets(arg):
    """Return 1 if arg was given on the command line, otherwise None.

    The program name in sys.argv[0] is not considered.
    """
    return 1 if arg in sys.argv[1:] else None
def _showingHelp():
    """Return a truthy value when --help was passed on the command line."""
    help_requested = _parseArgumets("--help")
    return help_requested
def _showingSummaryOnly():
    """Return a truthy value when --summaryonly was passed on the command line."""
    summary_only = _parseArgumets("--summaryonly")
    return summary_only
def _showingVersions():
    """Return a truthy value when --version was passed and --summaryonly was not."""
    if _parseArgumets("--summaryonly"):
        # --summaryonly overrides every other display flag
        return False
    return _parseArgumets("--version")
def _showingSamples():
    """Return a truthy value when --sample was passed and --summaryonly was not."""
    if _parseArgumets("--summaryonly"):
        # --summaryonly overrides every other display flag
        return False
    return _parseArgumets("--sample")
def _showingEval():
    """Return a truthy value when --eval was passed and --summaryonly was not."""
    if _parseArgumets("--summaryonly"):
        # --summaryonly overrides every other display flag
        return False
    return _parseArgumets("--eval")
def _showHelp():
    """Print the command-line usage text, one line per supported flag."""
    usage_lines = (
        "iris7_explore_7models: syntax iris7_explore_7models --version --sample --eval --summaryonly",
        "--help: show this help message",
        "--version: show version info for Python runtime and ML libraries",
        "--sample: show sample data",
        "--eval: show evaluation of algorithms",
        "--summaryonly: show only a summary of algorithms and their accuracy scores",
    )
    for line in usage_lines:
        print(line)
def _showVersions():
    """Print the version of the Python runtime and of each ML library.

    Every library is imported immediately before its line is printed, so the
    versions that did import are still reported if a later import fails.
    """
    import sys
    python_version = sys.version
    print('Python: {}'.format(python_version))

    import scipy
    scipy_version = scipy.__version__
    print('scipy: {}'.format(scipy_version))

    import numpy
    numpy_version = numpy.__version__
    print('numpy: {}'.format(numpy_version))

    import matplotlib
    matplotlib_version = matplotlib.__version__
    print('matplotlib: {}'.format(matplotlib_version))

    import pandas
    pandas_version = pandas.__version__
    print('pandas: {}'.format(pandas_version))

    import sklearn
    sklearn_version = sklearn.__version__
    print('sklearn: {}'.format(sklearn_version))
# Script entry point: run the whole pipeline. There is no __main__ guard, so
# this call also executes if the file is imported as a module.
_main()
Step by Step Solution
There are 3 Steps involved in it
Get step-by-step solutions from verified subject matter experts
