Да погледнем правилата на състезанието:
# Dataset
import pandas as pd
# Load the competition files; the `id` column becomes the index of each frame.
train = pd.read_csv("data/spooky-authors/train.zip", index_col=['id'])
test = pd.read_csv("data/spooky-authors/test.zip", index_col=['id'])
sample_submission = pd.read_csv("data/spooky-authors/sample_submission.zip", index_col=['id'])
# Compare the shapes of the three frames.
print(train.shape, test.shape, sample_submission.shape)
# Columns that exist in train but not in test.
print(set(train.columns) - set(test.columns))
# Preview the first five training rows.
train.head(5)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Swap the author codes for first names so that plots carry readable labels.
train['author'] = train['author'].replace(
    {'EAP': 'Едгар', 'HPL': 'Хауърд', 'MWS': 'Мери'})
sns.countplot(x='author', data=train);

# Count every whitespace-separated token across all training texts.
all_words = (
    train['text']
    .str.split(expand=True)
    .unstack()
    .value_counts()
)
all_words.head(15)
all_words.tail(15)

# Raw texts of each author as arrays (used for the word clouds).
eap = train.loc[train['author'] == "Едгар", 'text'].values
hpl = train.loc[train['author'] == "Хауърд", 'text'].values
mws = train.loc[train['author'] == "Мери", 'text'].values
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import numpy as np
def plot_wordcloud_mask(words, img_path):
    """Draw a word cloud of *words* shaped by the mask image at *img_path*.

    Args:
        words: iterable of strings; they are joined with single spaces
            before being handed to ``WordCloud.generate``.
        img_path: path of the image file whose pixels are used as the mask.
    """
    # Copy the pixels inside a context manager so the image file handle is
    # released as soon as the mask array has been built.
    with Image.open(img_path) as img:
        img_mask = np.array(img)

    plt.figure(figsize=(12, 8))
    wc = WordCloud(background_color="black", max_words=10000, mask=img_mask,
                   stopwords=STOPWORDS, max_font_size=40)
    wc.generate(" ".join(words))
    # A fixed random_state keeps the recolouring reproducible between runs.
    plt.imshow(wc.recolor(colormap='Pastel1_r', random_state=17), alpha=0.98)
    plt.axis('off')
# Draw one word cloud per author, each with its own mask image.
for _texts, _mask_path in (
        (hpl, "data/spooky-authors/hpl.png"),
        (eap, "data/spooky-authors/eap.png"),
        (mws, "data/spooky-authors/mws.png")):
    plot_wordcloud_mask(_texts, _mask_path)
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
# Baseline: raw token counts fed into a linear SVM, scored with 3-fold CV.
pipeline = Pipeline(steps=[
    ('features', CountVectorizer()),
    ('clf', LinearSVC()),
])
cross_val_score(pipeline, train['text'], train['author'], cv=3, n_jobs=3)
Да проверим какво е научил CountVectorizer
# Fit on the whole training set so the vectorizer's vocabulary can be inspected.
pipeline.fit(train['text'], train['author'])
count_vectorizer = pipeline.named_steps['features']
count_vectorizer
# Show a small sample of the learned (token, column index) pairs.
list(count_vectorizer.vocabulary_.items())[:15]
Другото му име е "bag of words".
Подобно на one-hot encoding но за текст.
fit
- прави речник с всички думи в корпуса (датасета) и поставя индекс на всяка уникална дума.
transform
- взима текста от всеки ред и го превръща във вектор, където отбелязва броя на всяка дума от вектора.
vectorizer = CountVectorizer()
# A tiny corpus for showing what the vectorizer produces.
corpus = [
    "Billions and billions of dollars",
    "A lot of money",
    "We are going to make",
    "We are going ot take care of",
]
_unigram_counts = vectorizer.fit_transform(corpus)
print(_unigram_counts.todense())
print(vectorizer.vocabulary_)
Супер, да покажем и дву-грами
# Same toy corpus, now counting both single words and word pairs.
vectorizer = CountVectorizer(ngram_range=(1, 2))
_bigram_counts = vectorizer.fit_transform(corpus)
print(_bigram_counts.todense())
print(vectorizer.vocabulary_)
Това беше кратко отклонение за да видим как работи CountVectorizer.
Да се върнем на модела и да погледнем какви са предсказванията
from sklearn.model_selection import cross_val_predict
# Out-of-fold prediction for every training row (3 folds).
prediction = cross_val_predict(
    pipeline, train['text'], train['author'], cv=3, n_jobs=3)
prediction
Забележете магията - не ползвах LabelEncoder
за класовете.
sklearn
е достатъчно умен да се оправи сам с категорийните данни в y
.
import itertools
from sklearn.metrics import confusion_matrix, accuracy_score
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          figsize=(9, 7)):
    """Plot the confusion matrix of *y_pred* against *y_true* as a heat map.

    Args:
        y_true: true labels.
        y_pred: predicted labels.
        classes: tick labels for both axes.
        normalize: when true, each row is divided by its row sum and cells
            are printed with two decimals; otherwise raw counts are shown.
        title: figure title.
        cmap: colour map passed to ``plt.imshow``.
        figsize: figure size; its first element also scales the cell font.
    """
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1).reshape(-1, 1)
        cm = cm.astype('float') / row_totals

    plt.figure(figsize=figsize)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    if normalize:
        cell_format = '.2f'
    else:
        cell_format = 'd'
    # Cells above half of the maximum get white text so they stay legible.
    half_max = cm.max() / 2.
    font_size = int((figsize[0] / 10) * 38)
    n_rows, n_cols = cm.shape
    for row in range(n_rows):
        for col in range(n_cols):
            value = cm[row, col]
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     size=font_size,
                     color="white" if value > half_max else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Class labels of the fitted pipeline, used as axis tick labels.
authors = pipeline.classes_
print(accuracy_score(train['author'], prediction))
# First the raw counts, then the row-normalised version.
for _normalize in (False, True):
    plot_confusion_matrix(train['author'], prediction,
                          classes=authors, normalize=_normalize)
Да пробваме с RF
from sklearn.ensemble import RandomForestClassifier
# Same bag-of-words features, with a random forest as the classifier.
pipeline = Pipeline(steps=[
    ('features', CountVectorizer()),
    ('clf', RandomForestClassifier()),
])
cross_val_score(pipeline, train['text'], train['author'], cv=3, n_jobs=3)
В състезанието пише, че оценката ще се мери с LogLoss
.
Да видим какъв резултат ще получим с тази метрика.
# Score the current pipeline with negative log loss instead of the default.
cross_val_score(
    pipeline, train['text'], train['author'],
    scoring='neg_log_loss', cv=3, n_jobs=3,
)
И с логистична регресия, защото LinearSVC няма predict_proba
по подразбиране.
За да се добави може да се използва decision_function
+ softmax
.
from sklearn.linear_model import LogisticRegression
# Bag-of-words features with logistic regression as the classifier.
pipeline = Pipeline(steps=[
    ('features', CountVectorizer()),
    ('clf', LogisticRegression()),
])
# scoring=None is the default scorer; the second pass uses negative log loss.
for _scoring in (None, 'neg_log_loss'):
    print(cross_val_score(pipeline, train['text'], train['author'],
                          cv=3, n_jobs=3, scoring=_scoring))
# We got slightly better results
Добре, този модел ще го оптимизираме доста.
Какви други фичъри можем да измислим?
# Build the hand-crafted features on a copy so `train` itself stays untouched.
explore = train.copy()
# number of words in the text
explore['words'] = explore.text.apply(lambda s: len(str(s).split()))
# number of unique words
explore['unique_words'] = explore.text.apply(lambda s: len(set(str(s).split())))
# number of characters
explore['symbols'] = explore.text.str.len()
# number of unique characters
explore['unique_symbols'] = explore.text.apply(lambda s: len(set(str(s))))
import string
# number of capital letters
explore['capital_letters'] = explore.text.apply(lambda s: sum(c.isupper() for c in str(s)))
# number of words consisting only of capital letters
explore['only_capital_letter_words'] = explore.text.apply(lambda s: sum(w.isupper() for w in str(s).split()))
# average word length
explore['average_word_lenght'] = explore.text.apply(lambda s: np.mean([len(w) for w in str(s).split()]))
# number of digits
explore['digits'] = explore.text.apply(lambda s: sum(c.isdigit() for c in str(s)))
# number of punctuation characters; stored on `explore` like every other
# feature, so it is included when the feature list is built from
# explore.columns and `train` does not gain an extra column
explore['punctuation'] = explore.text.apply(lambda s: sum(c in string.punctuation for c in str(s)))
print(string.punctuation)
import nltk
# nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
print(len(stopwords))
print(stopwords)
# Membership is tested once per word of every text, so look words up in a
# set. `stopwords` itself stays a list because it is reused as a vectorizer
# `stop_words` option in the parameter grids.
stopwords_lookup = frozenset(stopwords)
# number of stop words
explore['stop_words'] = explore.text.apply(lambda s: sum(w in stopwords_lookup for w in str(s).split()))
explore.head()
Ще създам един лист, в който да пазя имената на фичърите
print(explore.columns)
# Every column except the raw text and the target counts as a feature.
features_names = list(set(explore.columns).difference({'text', 'author'}))
# One violin plot per feature, split by author.
for feature in features_names:
    plt.figure()
    sns.violinplot(data=explore, x=feature, y="author")
    plt.title(feature);
Няма много вариация в разпределенията на фичърите.
Нека все пак натренираме модел с тях да видим как ще се държи.
from sklearn.ensemble import RandomForestClassifier
# Train on the hand-crafted features alone: a random forest, then a linear SVM.
cross_val_score(RandomForestClassifier(), explore.loc[:, features_names],
                explore['author'], cv=3, n_jobs=3)
cross_val_score(LinearSVC(), explore.loc[:, features_names],
                explore['author'], cv=3, n_jobs=1)
Не е много добре.
Да видим какво показва confusion матрицата.
# Out-of-fold predictions of the feature-only random forest.
predict_from_features = cross_val_predict(
    RandomForestClassifier(), explore.loc[:, features_names],
    explore['author'], cv=3, n_jobs=3)
print(accuracy_score(explore['author'], predict_from_features))
plot_confusion_matrix(explore['author'], predict_from_features,
                      classes=authors, normalize=True)
Да погледнем какво е разпределението на оригиналните класове.
# Share of rows belonging to each author.
explore['author'].value_counts().div(len(explore))
Моделите са по-лоши по точност от това да предсказваме само най-популярния клас.
Тези фичъри може и да са полезни в някой нелинеен модел в комбинация с други фичъри, но за сега ги оставяме.
Ще изчистим текста от мн. ч, времена и т.н.
За целта може да се ползва Stemming
или Lemmatization
.
На кратко, stemming:
Lemmatization:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
stem = PorterStemmer()
# Stem each whitespace-separated token and join the stems back into one string.
explore['stemmed'] = explore.text.apply(lambda t: " ".join(stem.stem(w) for w in t.split()))
explore[['stemmed', 'text']].head()
# Compare the first row before and after stemming. `.iloc[0]` selects it by
# position: the index comes from the `id` column, so a plain `[0]` would rely
# on pandas' deprecated positional fallback for integer keys.
print(explore.text.iloc[0])
print()
print(explore.stemmed.iloc[0])
# The baseline pipeline again, this time trained on the stemmed texts.
pipeline = Pipeline(steps=[
    ('features', CountVectorizer()),
    ('clf', LinearSVC()),
])
cross_val_score(pipeline, explore['stemmed'], train['author'], cv=3, n_jobs=3)
# Scores of the same pipeline on the `text` column:
# array([ 0.78783701, 0.79635305, 0.79509579])
Остава да пробвам:
За сега ще разгледаме само оптимизирането на модела.
CountVectorizer
и всичките имат голям набор параметри. Пространството за изследване става огромно и ще си помогнем с RandomSearch
от sklearn.
Освен това ще трябва да гледаме LogLoss
за оценка, а не Accuracy
, защото състезанието иска това и вероятностите имат значение.
Първо да опишем параметрите за търсене в трансформацията (CountVectorizer)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
def _vectorizer_grid(analyzer, ngram_ranges):
    """Return the vectorizer search space for one analyzer kind."""
    return {
        "features__ngram_range": ngram_ranges,
        "features__analyzer": [analyzer],
        "features__max_df": [1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
        "features__min_df": [2, 3, 5, 10],
        "features__lowercase": [False, True],
        "features__stop_words": [None, stopwords],
    }


# Word-level n-grams of up to three words.
params_count_word = _vectorizer_grid('word', [(1, 1), (1, 2), (1, 3)])
# Character-level n-grams of up to four, five or six characters.
params_count_char = _vectorizer_grid('char', [(1, 4), (1, 5), (1, 6)])
def report(results, n_top=5):
    """Print the candidates ranked 1..n_top from a search-results mapping.

    Args:
        results: mapping with the keys 'rank_test_score', 'mean_test_score',
            'std_test_score' and 'params' (e.g. a search's ``cv_results_``).
        n_top: highest rank to print; tied candidates are all printed.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(ranks == rank):
            mean = results['mean_test_score'][idx]
            std = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean, std))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss
def random_search():
    """Randomised search over CountVectorizer + LogisticRegression settings.

    Samples 20 candidates, scores them with 3-fold negative log loss on the
    training texts and prints the best ones via ``report``.
    """
    search_space = {
        "clf__C": [0.01, 0.1, 0.3, 1, 3, 10],
        "clf__class_weight": [None, 'balanced'],
        **params_count_word,
    }
    model = Pipeline(steps=[
        ('features', CountVectorizer()),
        ('clf', LogisticRegression()),
    ])
    searcher = RandomizedSearchCV(estimator=model,
                                  param_distributions=search_space,
                                  n_iter=20,
                                  scoring='neg_log_loss',
                                  cv=3,
                                  n_jobs=4)
    searcher.fit(train.text, train.author)
    report(searcher.cv_results_)


# random_search()
Model with rank: 1 Mean validation score: -0.475 (std: 0.002) Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 1), 'features__lowercase': True, 'features__analyzer': 'word', 'clf__class_weight': None, 'clf__C': 1}
Model with rank: 2 Mean validation score: -0.482 (std: 0.002) Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__lowercase': True, 'features__analyzer': 'word', 'clf__class_weight': None, 'clf__C': 1}
Model with rank: 3 Mean validation score: -0.486 (std: 0.001) Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 1), 'features__lowercase': True, 'features__analyzer': 'word', 'clf__class_weight': 'balanced', 'clf__C': 3}
Model with rank: 4 Mean validation score: -0.508 (std: 0.004) Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__lowercase': False, 'features__analyzer': 'word', 'clf__class_weight': 'balanced', 'clf__C': 0.3}
Model with rank: 5 Mean validation score: -0.525 (std: 0.004) Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 3), 'features__lowercase': True, 'features__analyzer': 'word', 'clf__class_weight': 'balanced', 'clf__C': 0.3}
Търсенето отнема много време, затова за char-grams
ще пусна само едно трениране и оценяване с по-стандартни стойности на хипер параметрите.
# Single run with character 3- to 5-grams instead of a full search.
pipeline = Pipeline(steps=[
    ('features', CountVectorizer(analyzer='char', ngram_range=(3, 5))),
    ('clf', LogisticRegression()),
])
# scoring=None is the default scorer; the second pass uses negative log loss.
for _scoring in (None, 'neg_log_loss'):
    print(cross_val_score(pipeline, train['text'], train['author'],
                          cv=3, n_jobs=3, scoring=_scoring))
По-лоши резултати с chars - няма да го изследваме.
За сметка на това ще пробваме да заменим CountVectorizer
с по-големия му батко Tfidf
.
$$ \operatorname{tfidf}(w,d) = \operatorname{tf}(w,d) \cdot \Big( \log \Big( \frac{n+1}{n_w + 1} \Big) + 1 \Big) $$
където:
$$ \text{tfidf("екстраполирам", "екстраполирам нещо си")} = 1 \cdot (\log(1001 / 11) + 1) = 5.51 $$
$$ \text{tfidf("за", "отиде да тича за нещо си... за да му дойде акъла")} = 2 \cdot (\log(1001 / 901) + 1) = 2.21 $$
# Worked tf-idf examples for a corpus of n = 1000 documents.
# scikit-learn's smoothed idf is ln((1 + n) / (1 + n_w)) + 1, and the term
# frequency multiplies the WHOLE idf (hence the parentheses); the L2
# normalisation that TfidfVectorizer applies afterwards is left out here.
print(1 * (np.log(1001 / 11) + 1))
print(2 * (np.log(1001 / 901) + 1))
# The same toy corpus through TfidfVectorizer.
tfidf = TfidfVectorizer()
print(tfidf.fit_transform(corpus).todense())
print(tfidf.vocabulary_)
# Both vectorizers learn the same vocabulary from the corpus.
CountVectorizer().fit(corpus).vocabulary_ == TfidfVectorizer().fit(corpus).vocabulary_
Горното сравнение ще рече, че CountVectorizer
и TfidfVectorizer
намират един и същи речник или "торбата с думи".
Това е така защото TfidfVectorizer
вътрешно ползва CountVectorizer
а отгоре само добавя idf функционалността.
# Print the idf_ attribute of the vectorizer fitted on the toy corpus.
print(tfidf.idf_)
def random_search():
    """Randomised search over TfidfVectorizer + LogisticRegression settings.

    Samples 20 candidates, scores them with 3-fold negative log loss on the
    training texts and prints the best ones via ``report``.
    """
    search_space = {
        "clf__C": [0.01, 0.1, 0.3, 1, 3, 10],
        "clf__class_weight": [None, 'balanced'],
        **params_count_word,
    }
    model = Pipeline(steps=[
        ('features', TfidfVectorizer()),
        ('clf', LogisticRegression()),
    ])
    searcher = RandomizedSearchCV(estimator=model,
                                  param_distributions=search_space,
                                  n_iter=20,
                                  scoring='neg_log_loss',
                                  cv=3,
                                  n_jobs=4)
    searcher.fit(train.text, train.author)
    report(searcher.cv_results_)


# random_search()  # previous best result: -0.475
Model with rank: 1 Mean validation score: -0.469 (std: 0.005) Parameters: {'featuresstop_words': None, 'featuresngram_range': (1, 2), 'featuresmin_df': 2, 'featuresmax_df': 1.0, 'featureslowercase': True, 'featuresanalyzer': 'word', 'clfclass_weight': 'balanced', 'clfC': 10}
Model with rank: 2 Mean validation score: -0.471 (std: 0.006) Parameters: {'featuresstop_words': None, 'featuresngram_range': (1, 2), 'featuresmin_df': 3, 'featuresmax_df': 0.5, 'featureslowercase': True, 'featuresanalyzer': 'word', 'clfclass_weight': None, 'clfC': 10}
Model with rank: 3 Mean validation score: -0.483 (std: 0.008) Parameters: {'featuresstop_words': None, 'featuresngram_range': (1, 2), 'featuresmin_df': 5, 'featuresmax_df': 0.8, 'featureslowercase': False, 'featuresanalyzer': 'word', 'clfclass_weight': 'balanced', 'clfC': 10}
Model with rank: 4 Mean validation score: -0.495 (std: 0.002) Parameters: {'featuresstop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'], 'featuresngram_range': (1, 2), 'featuresmin_df': 2, 'featuresmax_df': 0.6, 'featureslowercase': True, 'featuresanalyzer': 'word', 'clfclass_weight': 'balanced', 'clfC': 10}
Model with rank: 5 Mean validation score: -0.522 (std: 0.005) Parameters: {'featuresstop_words': None, 'featuresngram_range': (1, 3), 'featuresmin_df': 10, 'featuresmax_df': 0.5, 'featureslowercase': True, 'featuresanalyzer': 'word', 'clfclass_weight': 'balanced', 'clfC': 10}
Има леко подобрение в LogLoss
.
Да пробваме да сменим и класификатора с друг класически за класификация на текст: Naive Bayes
def random_search():
    """Randomised search over TfidfVectorizer + MultinomialNB settings.

    Samples 20 candidates, scores them with 3-fold negative log loss on the
    training texts and prints the best ones via ``report``.
    """
    # Imported here because the file-level import of MultinomialNB only
    # appears further down; without it, calling this function would raise
    # NameError.
    from sklearn.naive_bayes import MultinomialNB

    params = {
        "clf__alpha": [0.01, 0.1, 0.5, 1, 2]
    }
    params.update(params_count_word)
    pipeline = Pipeline([
        ('features', TfidfVectorizer()),
        ('clf', MultinomialNB())
    ])
    search = RandomizedSearchCV(pipeline, param_distributions=params,
                                scoring='neg_log_loss',
                                n_iter=20, cv=3, n_jobs=4)
    search.fit(train.text, train.author)
    report(search.cv_results_)


# random_search()  # previous best result: -0.469
Model with rank: 1 Mean validation score: -0.423 (std: 0.003) Parameters: {'featuresstop_words': None, 'featuresngram_range': (1, 2), 'featuresmin_df': 2, 'featuresmax_df': 0.8, 'featureslowercase': False, 'featuresanalyzer': 'word', 'clf__alpha': 0.01}
Model with rank: 2 Mean validation score: -0.465 (std: 0.003) Parameters: {'featuresstop_words': None, 'featuresngram_range': (1, 1), 'featuresmin_df': 3, 'featuresmax_df': 0.9, 'featureslowercase': True, 'featuresanalyzer': 'word', 'clf__alpha': 0.01}
Model with rank: 3 Mean validation score: -0.469 (std: 0.004) Parameters: {'featuresstop_words': None, 'featuresngram_range': (1, 3), 'featuresmin_df': 5, 'featuresmax_df': 0.9, 'featureslowercase': True, 'featuresanalyzer': 'word', 'clf__alpha': 0.1}
Model with rank: 4 Mean validation score: -0.495 (std: 0.002) Parameters: {'featuresstop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'], 'featuresngram_range': (1, 3), 'featuresmin_df': 5, 'featuresmax_df': 0.8, 'featureslowercase': False, 'featuresanalyzer': 'word', 'clf__alpha': 0.1}
Model with rank: 5 Mean validation score: -0.496 (std: 0.004) Parameters: {'featuresstop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'], 'featuresngram_range': (1, 3), 'featuresmin_df': 5, 'featuresmax_df': 0.6, 'featureslowercase': False, 'featuresanalyzer': 'word', 'clf__alpha': 0.01}
Тук има още подобрение в метриката.
Искам да го пробвам и със stemming.
Освен това се вижда, че избира най-ниската предоставена стойност за alpha
, може би трябва да пробвам с още по-ниски.
def random_search():
    """Randomised search over TfidfVectorizer + MultinomialNB on stemmed text.

    Uses a lower range of ``alpha`` values, samples 20 candidates, scores
    them with 3-fold negative log loss on ``explore.stemmed`` and prints the
    best ones via ``report``.
    """
    # Imported here because the file-level import of MultinomialNB only
    # appears further down; without it, calling this function would raise
    # NameError.
    from sklearn.naive_bayes import MultinomialNB

    params = {
        "clf__alpha": [0.001, 0.005, 0.01, 0.05, 0.1, 0.3]
    }
    params.update(params_count_word)
    pipeline = Pipeline([
        ('features', TfidfVectorizer()),
        ('clf', MultinomialNB())
    ])
    search = RandomizedSearchCV(pipeline, param_distributions=params,
                                scoring='neg_log_loss',
                                n_iter=20, cv=3, n_jobs=4)
    search.fit(explore.stemmed, train.author)
    report(search.cv_results_)


# random_search()  # -0.423
Model with rank: 1 Mean validation score: -0.438 (std: 0.002) Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 2, 'features__max_df': 0.6, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}
Model with rank: 2 Mean validation score: -0.443 (std: 0.004) Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 3), 'features__min_df': 3, 'features__max_df': 0.6, 'features__lowercase': True, 'features__analyzer': 'word', 'clf__alpha': 0.05}
Model with rank: 3 Mean validation score: -0.453 (std: 0.002) Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 3), 'features__min_df': 2, 'features__max_df': 1.0, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}
Model with rank: 4 Mean validation score: -0.471 (std: 0.003) Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 5, 'features__max_df': 1.0, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}
Model with rank: 5 Mean validation score: -0.472 (std: 0.004) Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 3), 'features__min_df': 5, 'features__max_df': 0.5, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.05}
Ще използвам следния модел:
TfIdf + MultinomialNB, без стеминг на текста.
Mean validation score: -0.423 (std: 0.003)
Ще ползвам и следните параметри:
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 2, 'features__max_df': 0.8, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}
Последна проверка на този модел за LogLoss
и Accuracy
from sklearn.naive_bayes import MultinomialNB
# The chosen model: tf-idf over word uni- and bigrams with multinomial naive Bayes.
pipeline = Pipeline(steps=[
    ('features', TfidfVectorizer(lowercase=False, ngram_range=(1, 2),
                                 min_df=2, max_df=0.8)),
    ('clf', MultinomialNB(alpha=0.01)),
])
# scoring=None is the default scorer; the second pass uses negative log loss.
for _scoring in (None, 'neg_log_loss'):
    print(cross_val_score(pipeline, train['text'], train['author'],
                          cv=3, n_jobs=3, scoring=_scoring))
Първо да видим в какъв формат трябва да се подадат резултатите за тест
# Reload the sample submission (this time keeping `id` as a regular column)
# to look at the expected file format.
sample_submission = pd.read_csv("data/spooky-authors/sample_submission.zip")
sample_submission.head()
# Refit the chosen pipeline on the complete training set.
pipeline = pipeline.fit(train.text, train.author)
print(pipeline.predict_proba(test[:10].text))
test_predictions = pipeline.predict_proba(test.text)
print(pipeline.classes_)
# The probability columns follow the order of pipeline.classes_, which holds
# the display names the author codes were replaced with. Translate each class
# back to its code instead of hard-coding a column order that would silently
# mislabel the probabilities if the class order ever changed.
author_codes = {'Едгар': 'EAP', 'Хауърд': 'HPL', 'Мери': 'MWS'}
submit_file = pd.DataFrame(test_predictions,
                           columns=[author_codes[name] for name in pipeline.classes_],
                           index=test.index)
submit_file.head(10)
submit_file.to_csv("data/spooky-authors/submit_Tfidf_MNB_text.csv")
Очакванията за събмита са да имаме скор някъде около 0.41 - 0.42.
Може да е малко по-добър защото при крос-валидацията тренирахме на 13к и тествахме 6к.
Сега трейн сета е целия: 19.5к
# Shall we hack the Kaggle ranking?
# Print the first five texts of the test set.
print(test.text[:5].values)