Load the libraries + functions

##r chunk
library(reticulate)
#py_install("feature_selector")
 
#source_python("model_evaluation_utils.py")

##python chunk
import pandas as pd
import numpy as np
import nltk
import textblob
from bs4 import BeautifulSoup
import unicodedata
import contractions
from nltk import PorterStemmer
ps = PorterStemmer()
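
If the NLTK corpora are not installed yet, the stop word list and tokenizer models need a one-time download (a setup step assumed here, not shown in the original chunks):

##python chunk
#one-time downloads; safe to re-run
#nltk.download('stopwords') #used for the stop word list below
#nltk.download('punkt')     #used by word_tokenize later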

The Data

##python chunk
df = pd.read_csv('twitter_small.csv')
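
A quick look at the shape and label balance is a useful sanity check before cleaning (a minimal sketch; the exact counts depend on twitter_small.csv):

##python chunk
print(df.shape)                       #rows x columns
print(df['sentiment'].value_counts()) #tweets per sentiment label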

Clean up the data (text normalization)

##python chunk
stop_words = set(nltk.corpus.stopwords.words('english')) #stopwords
stop_words.remove('no')
stop_words.remove('but')
stop_words.remove('not')

def remove_stop_words(text):
    text = BeautifulSoup(text, 'html.parser').get_text() #strip html
    text = text.lower() #lower case
    text = contractions.fix(text) #contractions
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore') #symbols
    #text = ' '.join([ps.stem(word) for word in text.split()]) #stem
    text = ' '.join(word for word in text.split() if word not in stop_words) # stopwords
    return text
df['tweet_parse1'] = df['tweet'].apply(remove_stop_words)
df.head(2)
##   sentiment  ...                       tweet_parse1
## 0  negative  ...                     worried adara.
## 1  negative  ...  german television program boring.
## 
## [2 rows x 3 columns]

TextBlob

##python chunk
tweets = np.array(df['tweet_parse1'])
sentiments = np.array(df['sentiment'])
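
Since this section is about TextBlob, a minimal unsupervised scoring pass could look like the sketch below; treating polarity above 0 as positive is an assumed decision rule, not something the library fixes.

##python chunk
#polarity ranges from -1 (most negative) to 1 (most positive)
tb_polarity = [textblob.TextBlob(tweet).sentiment.polarity for tweet in tweets]
#assumed rule: polarity above 0 counts as positive
tb_predicted = np.array(['positive' if p > 0 else 'negative' for p in tb_polarity])
print('TextBlob agreement with labels:', np.mean(tb_predicted == sentiments))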

AFINN

##python chunk
#py_install("afinn", pip = T) #run once in an r chunk if afinn is not installed
from afinn import Afinn

#load the model, scoring emoticons as well
afn = Afinn(emoticons=True)
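
Applying the lexicon to the cleaned tweets, with a sign-based decision rule (the >= 1 cutoff is an assumption, not part of the lexicon):

##python chunk
#Afinn.score() sums the signed word scores for the whole string
afn_scores = [afn.score(tweet) for tweet in tweets]
#assumed cutoff: a total score of 1 or more counts as positive
afn_predicted = np.array(['positive' if score >= 1.0 else 'negative' for score in afn_scores])
print('AFINN agreement with labels:', np.mean(afn_predicted == sentiments))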

Split the dataset

##python chunk
from sklearn.model_selection import train_test_split

train_tweet, test_tweet, train_sentiment, test_sentiment = train_test_split(df['tweet_parse1'], df['sentiment'], test_size=0.20, random_state = 42)
train_tweet.shape
## (3200,)
test_tweet.shape
## (800,)
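
The split above is random but not stratified; if the classes were unbalanced, a stratified variant (an optional alternative, not what the results below use) would keep the label proportions equal across train and test:

##python chunk
#train_tweet, test_tweet, train_sentiment, test_sentiment = train_test_split(
#    df['tweet_parse1'], df['sentiment'],
#    test_size=0.20, random_state=42,
#    stratify=df['sentiment']) #preserve the label ratio in both splits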

TF-IDF

##python chunk
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)

# apply to train and test
tv_train_features = tv.fit_transform(train_tweet)
tv_test_features = tv.transform(test_tweet)
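
A quick check of the resulting feature space (the vocabulary size depends on the tweets, so no exact number is assumed):

##python chunk
print(tv_train_features.shape) #tweets x vocabulary terms
print(tv_test_features.shape)  #same vocabulary, fit on train only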

Logistic Regression Classifier

##python chunk
from sklearn import linear_model
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
lr = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', multi_class='ovr',
                        max_iter=1000, C=1, random_state=42)
lr.fit(tv_train_features, train_sentiment)
## LogisticRegression(C=1, max_iter=1000, multi_class='ovr', random_state=42)
y_pred = lr.predict(tv_test_features)

Accuracy and Classification Report

##python chunk
print('accuracy %s' % accuracy_score(test_sentiment, y_pred))
## accuracy 0.72125
print(classification_report(test_sentiment, y_pred))
##               precision    recall  f1-score   support
## 
##     negative       0.74      0.72      0.73       422
##     positive       0.70      0.72      0.71       378
## 
##     accuracy                           0.72       800
##    macro avg       0.72      0.72      0.72       800
## weighted avg       0.72      0.72      0.72       800
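
A confusion matrix breaks the 72% accuracy down by error type (a minimal sketch using sklearn's confusion_matrix):

##python chunk
from sklearn.metrics import confusion_matrix
#rows = true labels, columns = predicted labels (alphabetical: negative, positive)
print(confusion_matrix(test_sentiment, y_pred))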

Topic Model Positive Reviews

##python chunk
from nltk import word_tokenize
from nltk.classify import NaiveBayesClassifier
import nltk.classify.util #utility functions such as accuracy(classifier, gold)

# Given a list of words, return a dict {word: True} to use as classifier features.
def word_feats(words):
    return dict([(word, True) for word in words if word.isalpha()])

# split the cleaned tweets by sentiment label
pos_tweets = df.loc[df['sentiment'] == 'positive', 'tweet_parse1']
neg_tweets = df.loc[df['sentiment'] == 'negative', 'tweet_parse1']
print(len(pos_tweets) + len(neg_tweets))

# build (feature dict, label) pairs for each tweet
pos_feats = [(word_feats(word_tokenize(tweet)), 'pos') for tweet in pos_tweets]
neg_feats = [(word_feats(word_tokenize(tweet)), 'neg') for tweet in neg_tweets]

# 75/25 train/test split within each class
pos_len_train = int(len(pos_feats) * 3 / 4)
neg_len_train = int(len(neg_feats) * 3 / 4)
train_feats = neg_feats[:neg_len_train] + pos_feats[:pos_len_train]
test_feats = neg_feats[neg_len_train:] + pos_feats[pos_len_train:]

# NLTK's Naive Bayes trains on feature dicts, not the sparse TF-IDF matrix
classifier = NaiveBayesClassifier.train(train_feats)

print('Accuracy: ', nltk.classify.util.accuracy(classifier, test_feats))

Topic Model

##python chunk
#show the word features the Naive Bayes model leans on most
classifier.show_most_informative_features()

Terms for the Topics

##python chunk
sentence = "I feel so miserable, it makes me amazing"
tokens = [word for word in word_tokenize(sentence) if word not in stop_words]
tokens
feats = word_feats(tokens)
print(feats)
#classify with the Naive Bayes model; lr.predict() would need TF-IDF features, not a word dict
test = classifier.classify(feats)
print(test)
 
#FeatureSelector expects a dense pandas DataFrame, so the sparse TF-IDF matrix
#would need converting first (e.g., pd.DataFrame(tv_train_features.toarray())),
#and identify_zero_importance() has to run before feature_importances exists.
#from feature_selector import FeatureSelector
#fs = FeatureSelector(data = tv_train_features, labels = train_sentiment)
#fs.feature_importances.head(10)

Interpretation
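
The TF-IDF logistic regression model classifies about 72% of the held-out tweets correctly, with balanced precision and recall across the negative (0.74/0.72) and positive (0.70/0.72) classes, so neither class drives the errors. The Naive Bayes bag-of-words classifier and the lexicon scorers (TextBlob, AFINN) give useful points of comparison: the lexicons need no training data, while the trained models can pick up vocabulary specific to these tweets.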