I took this piece of code out of a project I am working on. I wanted to guess the tag based on keywords in the body of text. So, I take the text, apply a hashing vectorizer, and then pass the hashed values into an AdaBoostClassifier that uses a DecisionTreeClassifier as its base estimator. I wanted to build the model once and use it over and over again, so I used pickle to save it to the file system for reuse.
This code assumes you have a dataframe populated already.
Imports:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
import pandas as pd
try:
    # train_test_split lives in sklearn.model_selection in current releases.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fall back to the legacy module for old scikit-learn installations.
    from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import pickle
import os.path
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
import pandas as pd
try:
    # train_test_split lives in sklearn.model_selection in current releases.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fall back to the legacy module for old scikit-learn installations.
    from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import pickle
import os.path
Setting up filesystem and parameters stuff:
# Set resetPickle to True to ignore any saved model and retrain from scratch.
resetPickle = False
# Becomes True once a saved model has been loaded from disk.
foundPickle = False

"""This is where you would load the dataset"""
df_tags = pd.DataFrame()

# Locations of the pickled models on disk.
pick_model_path1 = 'pickles/modelAdaDecTreeClassifier.pickle'
pick_model_tags_root_pre = 'pickles/model_tag_'
pick_model_tags_root_post = '_DecTreeClassifier.pickle'
# Path of the model for the 'PIC' tag: prefix + tag name + suffix.
tag_pickle_path = pick_model_tags_root_pre + 'PIC' + pick_model_tags_root_post
# Control flags: resetPickle forces retraining even when a saved model
# exists; foundPickle records whether a saved model was loaded.
foundPickle = False
resetPickle = False

# Building blocks for the on-disk locations of the pickled models.
pick_model_tags_root_post = '_DecTreeClassifier.pickle'
pick_model_tags_root_pre = 'pickles/model_tag_'
pick_model_path1 = 'pickles/modelAdaDecTreeClassifier.pickle'
# The per-tag path is prefix + tag name + suffix, here for the 'PIC' tag.
tag_pickle_path = ''.join(
    [pick_model_tags_root_pre, 'PIC', pick_model_tags_root_post]
)

"""This is where you would load the dataset"""
df_tags = pd.DataFrame()
Create the HashingVectorizer. An ngram_range of (1, 2) means that it will use both single words and two-word sequences, such as “Richmond” and “Richmond VA”, as tokens:
# Vectorizer that hashes single words and two-word sequences into features.
vctrizr_tag = HashingVectorizer(ngram_range=(1, 2))
# Text vectorizer; the (1, 2) range covers one- and two-word tokens.
vctrizr_tag = HashingVectorizer(
    ngram_range=(1, 2),
)
This will check to see if the pickle exists. It will load it into the model if it exists:
# Reuse a previously saved model unless a rebuild was explicitly requested.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files
# that you created yourself.
if not resetPickle and os.path.isfile(tag_pickle_path):
    # The with-block closes the file handle even if unpickling fails.
    with open(tag_pickle_path, 'rb') as pickle_in:
        model_tag = pickle.load(pickle_in)
    foundPickle = True
# Reuse a previously saved model unless a rebuild was explicitly requested.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files
# that you created yourself.
if not resetPickle and os.path.isfile(tag_pickle_path):
    # The with-block closes the file handle even if unpickling fails.
    with open(tag_pickle_path, 'rb') as pickle_in:
        model_tag = pickle.load(pickle_in)
    foundPickle = True
If the pickle does not exist, it will go and train the AdaBoostClassifier and save it into the pickle:
# Train, score and save a fresh model when no saved model was loaded.
if not foundPickle:
    # NOTE(review): the whole DataFrame is used as the target here; this
    # presumably should be the label column only - confirm against the
    # real dataset.
    y_tag = df_tags
    # Hash the text once and reuse the result instead of transforming twice.
    X_tag = vctrizr_tag.transform(df_tags['Text'])
    vctr_tag = X_tag  # kept so existing references to vctr_tag still work
    # Hold out 20% of the rows for scoring; the fixed seed keeps the split
    # reproducible between runs.
    X_train_tag, X_test_tag, y_train_tag, y_test_tag = train_test_split(
        X_tag, y_tag, test_size=0.2, random_state=1)
    # Boost 25 decision trees, each limited to a depth of 44.
    model_tag = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=44), n_estimators=25)
    model_tag = model_tag.fit(X_train_tag, y_train_tag)
    score = model_tag.score(X_test_tag, y_test_tag)
    print('score',score)
    # Make sure the target directory exists so the trained model is not
    # lost to a missing-directory error when saving.
    pickle_dir = os.path.dirname(tag_pickle_path)
    if pickle_dir and not os.path.isdir(pickle_dir):
        os.makedirs(pickle_dir)
    with open(tag_pickle_path, 'wb') as f:
        pickle.dump(model_tag, f)
# Train, score and save a fresh model when no saved model was loaded.
if not foundPickle:
    # NOTE(review): the whole DataFrame is used as the target here; this
    # presumably should be the label column only - confirm against the
    # real dataset.
    y_tag = df_tags
    # Hash the text once and reuse the result instead of transforming twice.
    X_tag = vctrizr_tag.transform(df_tags['Text'])
    vctr_tag = X_tag  # kept so existing references to vctr_tag still work
    # Hold out 20% of the rows for scoring; the fixed seed keeps the split
    # reproducible between runs.
    X_train_tag, X_test_tag, y_train_tag, y_test_tag = train_test_split(
        X_tag, y_tag, test_size=0.2, random_state=1)
    # Boost 25 decision trees, each limited to a depth of 44.
    model_tag = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=44), n_estimators=25)
    model_tag = model_tag.fit(X_train_tag, y_train_tag)
    score = model_tag.score(X_test_tag, y_test_tag)
    print('score',score)
    # Make sure the target directory exists so the trained model is not
    # lost to a missing-directory error when saving.
    pickle_dir = os.path.dirname(tag_pickle_path)
    if pickle_dir and not os.path.isdir(pickle_dir):
        os.makedirs(pickle_dir)
    with open(tag_pickle_path, 'wb') as f:
        pickle.dump(model_tag, f)
All together now:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
import pandas as pd
try:
    # train_test_split lives in sklearn.model_selection in current releases.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fall back to the legacy module for old scikit-learn installations.
    from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import pickle
import os.path

# Set resetPickle to True to ignore any saved model and retrain from scratch.
resetPickle = False
# Becomes True once a saved model has been loaded from disk.
foundPickle = False

"""This is where you would load the dataset"""
df_tags = pd.DataFrame()

# Locations of the pickled models on disk.
pick_model_path1 = 'pickles/modelAdaDecTreeClassifier.pickle'
pick_model_tags_root_pre = 'pickles/model_tag_'
pick_model_tags_root_post = '_DecTreeClassifier.pickle'
# Path of the model for the 'PIC' tag: prefix + tag name + suffix.
tag_pickle_path = pick_model_tags_root_pre + 'PIC' + pick_model_tags_root_post

# Vectorizer that hashes single words and two-word sequences into features.
vctrizr_tag = HashingVectorizer(ngram_range=(1, 2))

# Reuse a previously saved model unless a rebuild was explicitly requested.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files
# that you created yourself.
if not resetPickle and os.path.isfile(tag_pickle_path):
    # The with-block closes the file handle even if unpickling fails.
    with open(tag_pickle_path, 'rb') as pickle_in:
        model_tag = pickle.load(pickle_in)
    foundPickle = True

# Train, score and save a fresh model when no saved model was loaded.
if not foundPickle:
    # NOTE(review): the whole DataFrame is used as the target here; this
    # presumably should be the label column only - confirm against the
    # real dataset.
    y_tag = df_tags
    # Hash the text once and reuse the result instead of transforming twice.
    X_tag = vctrizr_tag.transform(df_tags['Text'])
    vctr_tag = X_tag  # kept so existing references to vctr_tag still work
    # Hold out 20% of the rows for scoring; the fixed seed keeps the split
    # reproducible between runs.
    X_train_tag, X_test_tag, y_train_tag, y_test_tag = train_test_split(
        X_tag, y_tag, test_size=0.2, random_state=1)
    # Boost 25 decision trees, each limited to a depth of 44.
    model_tag = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=44), n_estimators=25)
    model_tag = model_tag.fit(X_train_tag, y_train_tag)
    score = model_tag.score(X_test_tag, y_test_tag)
    print('score',score)
    # Make sure the target directory exists so the trained model is not
    # lost to a missing-directory error when saving.
    pickle_dir = os.path.dirname(tag_pickle_path)
    if pickle_dir and not os.path.isdir(pickle_dir):
        os.makedirs(pickle_dir)
    with open(tag_pickle_path, 'wb') as f:
        pickle.dump(model_tag, f)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
import pandas as pd
try:
    # train_test_split lives in sklearn.model_selection in current releases.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fall back to the legacy module for old scikit-learn installations.
    from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import pickle
import os.path

# Set resetPickle to True to ignore any saved model and retrain from scratch.
resetPickle = False
# Becomes True once a saved model has been loaded from disk.
foundPickle = False

"""This is where you would load the dataset"""
df_tags = pd.DataFrame()

# Locations of the pickled models on disk.
pick_model_path1 = 'pickles/modelAdaDecTreeClassifier.pickle'
pick_model_tags_root_pre = 'pickles/model_tag_'
pick_model_tags_root_post = '_DecTreeClassifier.pickle'
# Path of the model for the 'PIC' tag: prefix + tag name + suffix.
tag_pickle_path = pick_model_tags_root_pre + 'PIC' + pick_model_tags_root_post

# Vectorizer that hashes single words and two-word sequences into features.
vctrizr_tag = HashingVectorizer(ngram_range=(1, 2))

# Reuse a previously saved model unless a rebuild was explicitly requested.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files
# that you created yourself.
if not resetPickle and os.path.isfile(tag_pickle_path):
    # The with-block closes the file handle even if unpickling fails.
    with open(tag_pickle_path, 'rb') as pickle_in:
        model_tag = pickle.load(pickle_in)
    foundPickle = True

# Train, score and save a fresh model when no saved model was loaded.
if not foundPickle:
    # NOTE(review): the whole DataFrame is used as the target here; this
    # presumably should be the label column only - confirm against the
    # real dataset.
    y_tag = df_tags
    # Hash the text once and reuse the result instead of transforming twice.
    X_tag = vctrizr_tag.transform(df_tags['Text'])
    vctr_tag = X_tag  # kept so existing references to vctr_tag still work
    # Hold out 20% of the rows for scoring; the fixed seed keeps the split
    # reproducible between runs.
    X_train_tag, X_test_tag, y_train_tag, y_test_tag = train_test_split(
        X_tag, y_tag, test_size=0.2, random_state=1)
    # Boost 25 decision trees, each limited to a depth of 44.
    model_tag = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=44), n_estimators=25)
    model_tag = model_tag.fit(X_train_tag, y_train_tag)
    score = model_tag.score(X_test_tag, y_test_tag)
    print('score',score)
    # Make sure the target directory exists so the trained model is not
    # lost to a missing-directory error when saving.
    pickle_dir = os.path.dirname(tag_pickle_path)
    if pickle_dir and not os.path.isdir(pickle_dir):
        os.makedirs(pickle_dir)
    with open(tag_pickle_path, 'wb') as f:
        pickle.dump(model_tag, f)