Question: #UnivBook_classification # #Google Colaboratory #https://colab.research.google.com # #... #/usr/local/toku2/sample/UnivBook.ipynb # # #php book_category.php #tar --utc -cvzf book_category.tgz book_category # #tgz[>] # !wget http://www.cs.gunma-u.ac.jp/~michi/toku2/book_category.tgz !tar zxf book_category.tgz

# UnivBook_classification
#
# Notebook script written for Google Colaboratory
# https://colab.research.google.com
#
# Path recorded by the original author:
# /usr/local/toku2/sample/UnivBook.ipynb
#
# Commands recorded by the original author (presumably how the dataset
# archive was produced -- TODO confirm):
#   php book_category.php
#   tar --utc -cvzf book_category.tgz book_category
#
# Download and unpack the dataset archive.
# NOTE: lines starting with "!" are IPython/Colab shell escapes, not plain
# Python; this script only runs inside a notebook environment.
!wget http://www.cs.gunma-u.ac.jp/~michi/toku2/book_category.tgz
!tar zxf book_category.tgz

# List the extracted directory.
!ls ./book_category

# Show only the first entries of the listing.
!ls ./book_category | head

# The eight topic names. Each one is used below as a sub-directory name
# under ./book_category and as the class label of the files found there.
topics = [
    'computer_graphics',
    'operating_systems',
    'computer_security',
    'application_service',
    'computer_software',
    'artificial_intelligence',
    'search_engine',
    'information_society',
]

# Imports
import glob
import re
import MeCab
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Collect one (topic, body) pair per .txt file of every topic directory.
docs = []
for topic in topics:
    for f in glob.glob(f"./book_category/{topic}/*.txt"):
        # Read one file: blank lines are dropped, the remaining lines are
        # stripped and joined with single spaces into one string.
        # NOTE(review): no explicit encoding is passed, so the platform
        # default is used -- confirm it matches the encoding of the files.
        with open(f, "r") as fin:
            body = " ".join([line.strip() for line in fin if line.strip()])
            docs.append((topic,body))

# Build a two-column DataFrame from the collected pairs.
# NOTE(review): dtype="category" is applied to every column, so "body" becomes
# categorical as well as "topic" -- confirm that this is intended.
df = pd.DataFrame(
    docs,
    columns=["topic","body"],
    dtype="category"
)

# Preview the first rows (a bare expression: it is only displayed when it is
# the last statement of a notebook cell).
df.head()

# Number of documents per topic.
df.topic.value_counts()

# MeCab tagger created with the "-Owakati" option (presumably the
# space-separated word output format -- see the MeCab documentation).
tagger = MeCab.Tagger("-Owakati")

def parse_to_wakati(text):
    # Run the tagger on `text` and strip leading/trailing whitespace from
    # the string it returns.
    return tagger.parse(text).strip()

# Add a "body_wakati" column holding the tagger output for every body.
df = df.assign(body_wakati=df.body.apply(parse_to_wakati))

# Preview the frame with the new column.
df.head()

# Preview the tokenized text only.
df.body_wakati.head()

# Encode the topic names as integer class labels.
le = LabelEncoder()
y = le.fit_transform(df.topic)

# Show the class names known to the encoder.
print(le.classes_)

# Show the integer codes assigned to two of the classes.
print(le.transform(["computer_graphics"]))
print(le.transform(["operating_systems"]))

# Split into training and test data.
# NOTE(review): no `stratify=` argument is given, so the per-class proportions
# of the two parts are not guaranteed to match.
X_train, X_test, y_train, y_test = train_test_split(
    df.body_wakati,  # input texts
    y,  # integer labels
    test_size=0.2,  # 20% of the rows go to the test set
    random_state=10,  # fixed seed
    shuffle=True
)

# Rule-based baseline classifier.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, confusion_matrix

class RulebasedEstimator(BaseEstimator, TransformerMixin):
    # Baseline that labels each text with the class of the first regular
    # expression in a fixed if/elif chain that matches it.
    #
    # NOTE(review): every pattern in predict() is r"(|)", an alternation of
    # two empty strings, which matches any text. As written, the first branch
    # always fires and every document is labelled "computer_graphics". The
    # keywords look like they were lost from this copy of the code (other
    # non-ASCII text is missing too) -- restore them before trusting any
    # metric reported for this baseline.

    def __init__(self, label_encoder):
        # Keep the encoder so predict() can turn class names into codes.
        self.le = label_encoder

    def fit(self, X, y):
        # Nothing is learned; the arguments are ignored.
        return self

    def predict(self, X):
        """Return a list with one integer label per text in X.

        Each text is labelled by the first matching rule; label 0 is used
        when no rule matches.
        """
        result = []
        for text in X:
            # Fallback label when none of the rules below match.
            pred = 0
            if re.search(r"(|)", text):
                pred = self.le.transform(["computer_graphics"])[0]
            elif re.search(r"(|)", text):
                pred = self.le.transform(["operating_systems"])[0]
            elif re.search(r"(|)", text):
                pred = self.le.transform(["computer_security"])[0]
            elif re.search(r"(|)", text):
                pred = self.le.transform(["application_service"])[0]
            elif re.search(r"(|)", text):
                pred = self.le.transform(["computer_software"])[0]
            elif re.search(r"(|)", text):
                pred = self.le.transform(["artificial_intelligence"])[0]
            elif re.search(r"(|)", text):
                pred = self.le.transform(["search_engine"])[0]
            elif re.search(r"(|)", text):
                pred = self.le.transform(["information_society"])[0]
            result.append(pred)
        return result

# Instantiate the rule-based baseline with the fitted label encoder.
rulebased = RulebasedEstimator(label_encoder=le)

# Predict the test set with the rules (fit() is not called; it is a no-op).
rulebased_pred = rulebased.predict(X_test)

# Confusion matrix of the rule-based predictions.
# (confusion_matrix was already imported above; this import is redundant.)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,rulebased_pred)

# Precision, recall and F-measure per class.
print(classification_report(y_test, rulebased_pred, target_names=le.classes_))

# Random forest on TF-IDF features.
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

rf_clf = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", RandomForestClassifier()),
])
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)

# Confusion matrix of the random-forest predictions.
confusion_matrix(y_test,pred)

# Precision, recall and F-measure per class.
print(classification_report(y_test, pred, target_names=le.classes_))

# Naive Bayes (MultinomialNB) on raw term counts.
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
from sklearn.naive_bayes import MultinomialNB
# (Pipeline was already imported above; this import is redundant.)
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ("count_vec", CountVectorizer()),
    ("clf", MultinomialNB()),
])
text_clf.fit(X_train, y_train)
pred = text_clf.predict(X_test)

# Confusion matrix of the naive-Bayes predictions.
confusion_matrix(y_test,pred)

# Precision, recall and F-measure per class.
print(classification_report(y_test, pred, target_names=le.classes_))

# SVM (LinearSVC) on TF-IDF features.
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
from sklearn.svm import LinearSVC

svm_clf = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LinearSVC()),
])
svm_clf.fit(X_train, y_train)
pred = svm_clf.predict(X_test)

# Confusion matrix of the SVM predictions.
confusion_matrix(y_test,pred)

# Precision, recall and F-measure per class.
print(classification_report(y_test, pred, target_names=le.classes_))

I want to increase the accuracy of the algorithm. Please help.

Step by Step Solution

There are 3 Steps involved in it

1 Expert Approved Answer
Step: 1 Unlock blur-text-image
Question Has Been Solved by an Expert!

Get step-by-step solutions from verified subject matter experts

Step: 2 Unlock
Step: 3 Unlock

Students Have Also Explored These Related Databases Questions!