Load all the libraries or functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others know what they need for the report to compile correctly.
##r chunk
library(reticulate)
py_config()
## python: /usr/bin/python3
## libpython: /usr/lib/python3.8/config-3.8-x86_64-linux-gnu/libpython3.8.so
## pythonhome: //usr://usr
## version: 3.8.5 (default, Jul 28 2020, 12:59:40) [GCC 9.3.0]
## numpy: /usr/lib/python3/dist-packages/numpy
## numpy_version: 1.17.4
Load the Python libraries or functions that you will use for this section.
##python chunk
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
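If the NLTK data files are not already on the machine, the stopword list (and the Punkt models that word_tokenize uses later) need a one-time download; a minimal sketch, assuming an internet connection is available:
##python chunk
# one-time download of the NLTK resources used below (stopwords for cleaning,
# punkt for nltk.tokenize.word_tokenize); nltk skips anything already installed
import nltk
nltk.download('stopwords')
nltk.download('punkt')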
The dataset is a set of YouTube comments that have been coded as:

- 1: spam YouTube messages
- 0: good YouTube messages

This coding is stored in the CLASS column.
Import the data using either R or Python. I put a Python chunk here because you will need one to import the data, but if you want to import it into R first, that's fine.
##python chunk
youtube = pd.read_csv("youtube_spam.csv")
youtube.head(5)
## COMMENT_ID ... CLASS
## 0 LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU ... 1
## 1 LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A ... 1
## 2 LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8 ... 1
## 3 z13jhp0bxqncu512g22wvzkasxmvvzjaz04 ... 1
## 4 z13fwbwp1oujthgqj04chlngpvzmtt3r3dw ... 1
##
## [5 rows x 5 columns]
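Before modeling, it is worth checking how balanced the two classes are; a quick sketch on the imported data frame:
##python chunk
# number of comments per class (1 = spam, 0 = good)
youtube['CLASS'].value_counts()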
Use one of our clean text functions to clean up the CONTENT column in the dataset.
##python chunk
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]') #remove symbols with space
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]') #take out symbols altogether
STOPWORDS = set(stopwords.words('english')) #stopwords
def clean_text(text):
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text
youtube['CONTENT'] = youtube['CONTENT'].apply(clean_text)
## /usr/lib/python3/dist-packages/bs4/__init__.py:385: UserWarning: "https://twitter.com/GBphotographyGB" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
##   warnings.warn(
## (the same UserWarning repeats for every comment that consists of nothing but a URL; the duplicate warnings are omitted here)
youtube.head()
## COMMENT_ID ... CLASS
## 0 LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU ... 1
## 1 LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A ... 1
## 2 LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8 ... 1
## 3 z13jhp0bxqncu512g22wvzkasxmvvzjaz04 ... 1
## 4 z13fwbwp1oujthgqj04chlngpvzmtt3r3dw ... 1
##
## [5 rows x 5 columns]
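To see what the cleaning does, clean_text can be applied to a single made-up comment (the string below is a hypothetical example, not a row from the dataset):
##python chunk
# hypothetical comment: the HTML tag, punctuation, and stopwords are stripped and the text lowercased,
# so this should return something like 'check channel subscribe please'
clean_text("Check out my channel!!! <br /> Subscribe NOW, please :)")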
Split the data into testing and training data.
##python chunk
X = youtube['CONTENT']
y = youtube['CLASS']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)
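A quick way to confirm the split behaved as intended is to check the sizes and class balance of the pieces; a minimal sketch:
##python chunk
# 80/20 split: the training set should be roughly four times the size of the test set
print(len(X_train), len(X_test))
# class balance within each piece
print(y_train.value_counts())
print(y_test.value_counts())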
For FastText OR word2vec, create the tokenized vectors of the text.
##python chunk
tokenized_train = [nltk.tokenize.word_tokenize(text)
for text in X_train.to_list()]
tokenized_test = [nltk.tokenize.word_tokenize(text)
for text in X_test.to_list()]
Build either a word2vec or FastText model.
##python chunk
import gensim
w2v_model = gensim.models.Word2Vec(tokenized_train,
size=100, window=6,
min_count=2, iter=5, workers=4)
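As a rough sanity check that the embeddings picked up something about spam comments, the nearest neighbours of a frequent token can be inspected. The query word below ('subscribe') is only an illustrative guess; the call fails if that word did not survive the min_count=2 cutoff:
##python chunk
# words closest to 'subscribe' in the learned embedding space
w2v_model.wv.most_similar('subscribe', topn=5)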
Convert the model into a set of features to use in our classifier.
##python chunk
def document_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        for word in words:
            if word in vocabulary:
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)
        return feature_vector
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)
avg_wv_train_features = document_vectorizer(corpus=tokenized_train,
model=w2v_model,
num_features=100)
avg_wv_test_features = document_vectorizer(corpus=tokenized_test,
model=w2v_model,
num_features=100)
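Each comment should now be represented by a single 100-dimensional averaged vector; a quick shape check:
##python chunk
# rows = comments, columns = embedding dimensions (100, as set when building the model)
print(avg_wv_train_features.shape, avg_wv_test_features.shape)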
In class, we used logistic regression to classify the data. You can use any machine learning algorithm you want here, and build a classification model.
##python chunk
my_tags = ["not spam", "spam"] # label names for CLASS 0/1, useful when reporting results
### Using RANDOM FOREST CLASSIFIER
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(avg_wv_train_features, y_train)
## RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
## criterion='gini', max_depth=None, max_features='auto',
## max_leaf_nodes=None, max_samples=None,
## min_impurity_decrease=0.0, min_impurity_split=None,
## min_samples_leaf=1, min_samples_split=2,
## min_weight_fraction_leaf=0.0, n_estimators=100,
## n_jobs=None, oob_score=False, random_state=None,
## verbose=0, warm_start=False)
Print out the accuracy, recall, and precision of your model.
##python chunk
#predict new data
y_pred = rf_classifier.predict(avg_wv_test_features)
#print out results
print('accuracy %s' % accuracy_score(y_pred, y_test))
## accuracy 0.8775510204081632
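Accuracy alone does not cover the recall and precision part of the question; the classification_report function imported at the top reports both for each class. A minimal sketch (its output depends on the fitted model, so it is not reproduced here):
##python chunk
# precision, recall, and F1 for each class (not spam vs. spam)
print(classification_report(y_test, y_pred, target_names=["not spam", "spam"]))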
Describe a set of texts and research question that interests you that could be explored using this method. Basically, what is a potential application of this method to another area of research?