Please do not reorder the assignment - fill in each chunk as requested.
Load all the libraries or functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others know what they need for the report to compile correctly.
##r chunk
library(reticulate)
Load the Python libraries or functions that you will use for this section.
##python chunk
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
ps = PorterStemmer()
import nltk
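# the stop word list below and word_tokenize (used later) rely on NLTK corpora;
# if they are not already installed, these one-time downloads are needed
nltk.download('stopwords')
nltk.download('punkt')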
stopwords = nltk.corpus.stopwords.words('english')
import unicodedata
from contractions import contractions_dict
The dataset is a set of YouTube comments that have been coded as:
- 1: spam YouTube messages
- 0: good YouTube messages
- This coding is stored in the CLASS column
Import the data using either R or Python. I put a Python chunk here because you will need one to import the data, but if you want to first import into R, that’s fine.
##python chunk
df = pd.read_csv("youtube_spam (1).csv")
df.head(3)
## COMMENT_ID ... CLASS
## 0 LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU ... 1
## 1 LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A ... 1
## 2 LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8 ... 1
##
## [3 rows x 5 columns]
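Before cleaning, a quick check of the class balance helps put the accuracy numbers later in context; a minimal sketch:
##python chunk
# number of good (0) vs. spam (1) comments
df['CLASS'].value_counts()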
Use one of our clean text functions to clean up the CONTENT column in the dataset.
##python chunk
# drop duplicates
df = df.drop_duplicates()
# lower all words
df['CONTENT_1'] = df['CONTENT'].str.lower()
# expand contractions
df['CONTENT_2'] = df['CONTENT_1']
for contraction, expansion in contractions_dict.items():
    df['CONTENT_2'] = df['CONTENT_2'].str.replace(contraction, expansion, regex=False)
# remove punctuation and other non-alphanumeric characters
df['CONTENT_3'] = df['CONTENT_2'].str.replace(r'[^a-zA-Z0-9\s]|\[|\]', '', regex=True)
#stemming
df['CONTENT_4'] = [' '.join([ps.stem(word) for word in text.split()]) for text in df['CONTENT_3'].tolist()]
#stop words
df['CONTENT_5'] = [' '.join([word for word in text.split() if word not in stopwords]) for text in df['CONTENT_4'].tolist()]
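As a sanity check, comparing the raw and fully cleaned version of a single comment shows what the pipeline does; a minimal sketch:
##python chunk
# original vs. cleaned text for the first comment
print(df['CONTENT'].iloc[0])
print(df['CONTENT_5'].iloc[0])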
Split the data into testing and training data.
##python chunk
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['CONTENT_5'],
df['CLASS'],
test_size=0.15,
random_state=8)
print(X_train.shape)
## (1660,)
print(X_test.shape)
## (293,)
print(y_train.shape)
## (1660,)
print(y_test.shape)
## (293,)
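The split above is close to balanced (156 good vs. 137 spam comments end up in the test set), but if you want to guarantee that both sets keep the same class proportions, train_test_split can stratify on the label. A minimal sketch of that variant (the _s names are new names used only for illustration):
##python chunk
# optional: stratified split keeps the CLASS proportions identical in train and test
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(df['CONTENT_5'],
                                                            df['CLASS'],
                                                            test_size=0.15,
                                                            random_state=8,
                                                            stratify=df['CLASS'])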
For word2vec, create the tokenized vectors of the text.
##python chunk
tokenized_train = [nltk.tokenize.word_tokenize(text)
for text in X_train]
tokenized_test = [nltk.tokenize.word_tokenize(text)
for text in X_test]
Create a TF-IDF matrix.
##python chunk
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
# apply to train and test
tv_train_features = tv.fit_transform(X_train)
tv_test_features = tv.transform(X_test)
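A quick shape check shows how high-dimensional (and sparse) the TF-IDF representation is; a minimal sketch:
##python chunk
# (number of comments, vocabulary size) for each TF-IDF matrix
print(tv_train_features.shape)
print(tv_test_features.shape)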
Build the word2vec model.
##python chunk
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import gensim
# build word2vec model
w2v_num_features = 300
w2v_model = gensim.models.Word2Vec(tokenized_train, #corpus
size=w2v_num_features, #number of features
window=10, #size of moving window
min_count=2, #keep only words that appear at least twice
sg = 0, #cbow model
iter=5, workers=5) #iterations and cores
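Note that size and iter are the gensim 3.x argument names; in gensim 4.x they were renamed to vector_size and epochs. One way to see what the embedding has learned is to look at the nearest neighbors of a common token; a minimal sketch, assuming the word 'song' occurs at least twice in the training comments and is therefore in the vocabulary:
##python chunk
# words closest to 'song' in the learned embedding space
w2v_model.wv.most_similar('song', topn=5)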
Convert the word2vec model into a set of features to use in our classifier.
##python chunk
#create the flattening function: average the word2vec vectors of the words in each comment
def document_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)

    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        for word in words:
            if word in vocabulary:
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)
        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)

# generate averaged word vector features from the word2vec model
avg_wv_train_features = document_vectorizer(corpus=tokenized_train, model=w2v_model,
                                            num_features=w2v_num_features)
avg_wv_test_features = document_vectorizer(corpus=tokenized_test, model=w2v_model,
                                           num_features=w2v_num_features)
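Each comment is now represented by a single 300-dimensional vector; a quick shape check (a minimal sketch):
##python chunk
# one averaged 300-dimensional vector per comment
print(avg_wv_train_features.shape)
print(avg_wv_test_features.shape)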
In class, we used a few algorithms to test which model might be the best. Pick one of the algorithms to use here (logistic regression, naive bayes, support vector machine).
Run your algorithm on both the TF-IDF matrix and the output from word2vec.
##python chunk
## model 1: logistic regression
lr = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', multi_class='ovr',
max_iter=1000, C=1, random_state=42)
lr.fit(tv_train_features, y_train)
## LogisticRegression(C=1, max_iter=1000, multi_class='ovr', random_state=42)
y_pred = lr.predict(tv_test_features)
print('accuracy %s' % accuracy_score(y_test, y_pred))
## accuracy 0.9180887372013652
print(classification_report(y_test, y_pred))
## precision recall f1-score support
##
## 0 0.90 0.95 0.92 156
## 1 0.94 0.88 0.91 137
##
## accuracy 0.92 293
## macro avg 0.92 0.92 0.92 293
## weighted avg 0.92 0.92 0.92 293
## model 2: naive bayes on the averaged word2vec features
##python chunk
# MultinomialNB does not accept negative feature values
from sklearn.naive_bayes import MultinomialNB
#find the minimum values
avg_wv_train_features.min()
## -0.5519111752510071
avg_wv_test_features.min()
## -0.4886804223060608
# add a constant to both so every value is non-negative
avg_wv_train_features = avg_wv_train_features + 4
avg_wv_test_features = avg_wv_test_features + 4
#then run bayes
mnb = MultinomialNB(alpha=1)
mnb.fit(avg_wv_train_features, y_train)
## MultinomialNB(alpha=1)
y_pred = mnb.predict(avg_wv_test_features)
print('accuracy %s' % accuracy_score(y_test, y_pred))
## accuracy 0.5255972696245734
print(classification_report(y_test, y_pred))
## precision recall f1-score support
##
## 0 0.72 0.18 0.29 156
## 1 0.50 0.92 0.64 137
##
## accuracy 0.53 293
## macro avg 0.61 0.55 0.47 293
## weighted avg 0.61 0.53 0.45 293
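Since the prompt asks for the chosen algorithm on both representations, the same logistic regression can also be fit on the averaged word2vec features for a like-for-like comparison. A minimal sketch (lr_w2v and y_pred_w2v are new names introduced here; the output is not shown above):
##python chunk
# logistic regression on the averaged word2vec features, same settings as model 1
lr_w2v = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', multi_class='ovr',
                                         max_iter=1000, C=1, random_state=42)
lr_w2v.fit(avg_wv_train_features, y_train)
y_pred_w2v = lr_w2v.predict(avg_wv_test_features)
print('accuracy %s' % accuracy_score(y_test, y_pred_w2v))
print(classification_report(y_test, y_pred_w2v))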
Print out the accuracy, recall, and precision of both of your models.
##python chunk
# get the index of the test observations
index_X_test = X_test.index
# map the index back onto the original data
df_test = df.loc[index_X_test]
# attach the prediction results
df_test['Prediction'] = y_pred
df_test = df_test[['CONTENT', 'CLASS', 'Prediction']]
df_test['Category_Predicted'] = df_test['Prediction']
df_test.head()
## CONTENT CLASS Prediction Category_Predicted
## 794 Love this song 0 1 1
## 238 2:05. Hahahahah 0 0 0
## 946 wtf. subscribe my channel thanx ;) 1 1 1
## 860 it is wonderful 0 0 0
## 1220 old and good song 0 1 1
# find the misclassified comments
condition = (df_test['CLASS'] != df_test['Category_Predicted'])
df_misclassified = df_test[condition]
df_misclassified.head(3)
## CONTENT CLASS Prediction Category_Predicted
## 794 Love this song 0 1 1
## 1220 old and good song 0 1 1
## 1799 katy perry is awesome 0 1 1
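A count of the misclassified comments by true class gives a quick sense of where the Bayes model struggles; a minimal sketch:
##python chunk
# how many test comments were misclassified, split by their true class
print(df_misclassified.shape[0])
print(df_misclassified['CLASS'].value_counts())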