##r chunk
library(reticulate)
py_config()
## python: C:/Users/punthakur/AppData/Local/Programs/Python/Python36/python.exe
## libpython: C:/Users/punthakur/AppData/Local/Programs/Python/Python36/python36.dll
## pythonhome: C:/Users/punthakur/AppData/Local/Programs/Python/Python36
## version: 3.6.0 (v3.6.0:41df79263a11, Dec 23 2016, 08:06:12) [MSC v.1900 64 bit (AMD64)]
## Architecture: 64bit
## numpy: C:/Users/punthakur/AppData/Local/Programs/Python/Python36/Lib/site-packages/numpy
## numpy_version: 1.19.2
##
## python versions found:
## C:/Users/punthakur/AppData/Local/Programs/Python/Python36/python.exe
## C:/Program Files (x86)/Microsoft Visual Studio/Shared/Python37_64/python.exe
library(plyr)
## Warning: package 'plyr' was built under R version 3.6.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(gbm)
## Warning: package 'gbm' was built under R version 3.6.3
## Loaded gbm 2.1.8
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.6.3
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
##
## Attaching package: 'corrgram'
## The following object is masked from 'package:lattice':
##
## panel.fill
## The following object is masked from 'package:plyr':
##
## baseball
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.6.3
sentiments
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
##python chunk
# -*- coding: utf-8 -*-
from sklearn import metrics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.preprocessing import label_binarize
from scipy import interp
from sklearn.metrics import roc_curve, auc
def get_metrics(true_labels, predicted_labels):
    print('Accuracy:', np.round(
        metrics.accuracy_score(true_labels, predicted_labels), 4))
    print('Precision:', np.round(
        metrics.precision_score(true_labels, predicted_labels, average='weighted'), 4))
    print('Recall:', np.round(
        metrics.recall_score(true_labels, predicted_labels, average='weighted'), 4))
    print('F1 Score:', np.round(
        metrics.f1_score(true_labels, predicted_labels, average='weighted'), 4))
def train_predict_model(classifier,
                        train_features, train_labels,
                        test_features, test_labels):
    # build model
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features)
    return predictions
def display_confusion_matrix(true_labels, predicted_labels, classes=[1, 0]):
    total_classes = len(classes)
    level_labels = [total_classes * [0], list(range(total_classes))]
    cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels,
                                  labels=classes)
    cm_frame = pd.DataFrame(data=cm,
                            columns=pd.MultiIndex(levels=[['Predicted:'], classes],
                                                  codes=level_labels),
                            index=pd.MultiIndex(levels=[['Actual:'], classes],
                                                codes=level_labels))
    print(cm_frame)
def display_confusion_matrix_pretty(true_labels, predicted_labels, classes=[1, 0]):
    total_classes = len(classes)
    level_labels = [total_classes * [0], list(range(total_classes))]
    cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels,
                                  labels=classes)
    cm_frame = pd.DataFrame(data=cm,
                            columns=pd.MultiIndex(levels=[['Predicted:'], classes],
                                                  codes=level_labels),
                            index=pd.MultiIndex(levels=[['Actual:'], classes],
                                                codes=level_labels))
    return cm_frame
def display_classification_report(true_labels, predicted_labels, classes=[1, 0]):
    report = metrics.classification_report(y_true=true_labels,
                                           y_pred=predicted_labels,
                                           labels=classes)
    print(report)
def display_model_performance_metrics(true_labels, predicted_labels, classes=[1, 0]):
    print('Model Performance metrics:')
    print('-' * 30)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
    print('\nModel Classification report:')
    print('-' * 30)
    display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels,
                                  classes=classes)
    print('\nPrediction Confusion Matrix:')
    print('-' * 30)
    display_confusion_matrix(true_labels=true_labels, predicted_labels=predicted_labels,
                             classes=classes)
def plot_model_decision_surface(clf, train_features, train_labels,
                                plot_step=0.02, cmap=plt.cm.RdYlBu,
                                markers=None, alphas=None, colors=None):
    if train_features.shape[1] != 2:
        raise ValueError("X_train should have exactly 2 columns!")
    x_min, x_max = train_features[:, 0].min() - plot_step, train_features[:, 0].max() + plot_step
    y_min, y_max = train_features[:, 1].min() - plot_step, train_features[:, 1].max() + plot_step
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    clf_est = clone(clf)
    clf_est.fit(train_features, train_labels)
    if hasattr(clf_est, 'predict_proba'):
        Z = clf_est.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    else:
        Z = clf_est.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=cmap)
    le = LabelEncoder()
    y_enc = le.fit_transform(train_labels)
    n_classes = len(le.classes_)
    plot_colors = ''.join(colors) if colors else [None] * n_classes
    label_names = le.classes_
    markers = markers if markers else [None] * n_classes
    alphas = alphas if alphas else [None] * n_classes
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y_enc == i)
        plt.scatter(train_features[idx, 0], train_features[idx, 1], c=color,
                    label=label_names[i], cmap=cmap, edgecolors='black',
                    marker=markers[i], alpha=alphas[i])
    plt.legend()
    plt.show()
def plot_model_roc_curve(clf, features, true_labels, label_encoder=None, class_names=None):
    ## Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    if hasattr(clf, 'classes_'):
        class_labels = clf.classes_
    elif label_encoder:
        class_labels = label_encoder.classes_
    elif class_names:
        class_labels = class_names
    else:
        raise ValueError('Unable to derive prediction classes, please specify class_names!')
    n_classes = len(class_labels)
    y_test = label_binarize(true_labels, classes=class_labels)
    if n_classes == 2:
        if hasattr(clf, 'predict_proba'):
            prob = clf.predict_proba(features)
            y_score = prob[:, prob.shape[1] - 1]
        elif hasattr(clf, 'decision_function'):
            prob = clf.decision_function(features)
            y_score = prob[:, prob.shape[1] - 1]
        else:
            raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")
        fpr, tpr, _ = roc_curve(y_test, y_score)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label='ROC curve (area = {0:0.2f})'.format(roc_auc),
                 linewidth=2.5)
    elif n_classes > 2:
        if hasattr(clf, 'predict_proba'):
            y_score = clf.predict_proba(features)
        elif hasattr(clf, 'decision_function'):
            y_score = clf.decision_function(features)
        else:
            raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
        ## Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        ## Compute macro-average ROC curve and ROC area
        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
        # Then interpolate all ROC curves at these points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += interp(all_fpr, fpr[i], tpr[i])
        # Finally average it and compute AUC
        mean_tpr /= n_classes
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
        ## Plot ROC curves
        plt.figure(figsize=(6, 4))
        plt.plot(fpr["micro"], tpr["micro"],
                 label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
                 linewidth=3)
        plt.plot(fpr["macro"], tpr["macro"],
                 label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
                 linewidth=3)
        for i, label in enumerate(class_labels):
            plt.plot(fpr[i], tpr[i],
                     label='ROC curve of class {0} (area = {1:0.2f})'.format(label, roc_auc[i]),
                     linewidth=2, linestyle=':')
    else:
        raise ValueError('Number of classes should be at least 2')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()
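As a quick sanity check (not part of the original analysis), the evaluation helpers above can be exercised on a pair of made-up label vectors before they are used on real predictions:
# hedged sketch: y_true and y_pred are hypothetical labels, only to show how the helpers are called
y_true = ['positive', 'negative', 'positive', 'positive']
y_pred = ['positive', 'negative', 'negative', 'positive']
get_metrics(true_labels=y_true, predicted_labels=y_pred)
display_confusion_matrix(true_labels=y_true, predicted_labels=y_pred,
                         classes=['positive', 'negative'])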
##python chunk
from textsearch import TextSearch
import spacy
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
ps = PorterStemmer()
import nltk
stopwords = nltk.corpus.stopwords.words('english')
import unicodedata
from contractions import contractions_dict
import pandas as pd
import textblob
import contractions
##python chunk
dataset = pd.read_csv("twitter_small.csv")
dataset.shape
## (4000, 2)
dataset.head()
## sentiment tweet
## 0 negative worried about adara.
## 1 negative German television program is boring.
## 2 negative I have a headache...
## 3 negative birthday party tomorrow early, i need to sleep...
## 4 negative i got slow internet connection in exchange of ...
##python chunk
STOPWORDS = set(nltk.corpus.stopwords.words('english')) #stopwords
STOPWORDS.remove('no')
STOPWORDS.remove('but')
STOPWORDS.remove('not')
def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()  # strip html markup
    text = text.lower()  # lower case
    text = contractions.fix(text)  # expand contractions
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')  # strip accents/symbols
    #text = ' '.join([ps.stem(word) for word in text.split()])  # stemming (not used here)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)  # remove stopwords
    return text
dataset['tweet'] = dataset['tweet'].apply(clean_text)
dataset.head()
## sentiment tweet
## 0 negative worried adara.
## 1 negative german television program boring.
## 2 negative headache...
## 3 negative birthday party tomorrow early, need sleep mayb...
## 4 negative got slow internet connection exchange rain
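For illustration only, here is roughly what clean_text does to a single made-up tweet (the string below is hypothetical, not from the dataset):
# hedged sketch of the cleaning steps on one invented string
clean_text("I can't <b>WAIT</b> for the caf\u00e9 tomorrow!!")
# expected (approximately): 'cannot wait cafe tomorrow!!'
# html tags stripped, lower-cased, contraction expanded, accent removed, stopwords dropped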
##python chunk
tweets = np.array(dataset['tweet'])
sentiments = np.array(dataset['sentiment'])
from sklearn.model_selection import train_test_split
train_tweets, test_tweets, train_sentiments, test_sentiments = train_test_split(tweets, sentiments, test_size=0.20, random_state = 42)
train_tweets.shape
## (3200,)
test_tweets.shape
## (800,)
##Calculate Sentiment Score for all Tweets
example_test_tweets = test_tweets[0:1001]
example_test_sentiments = test_sentiments[0:1001]
#calculate sentiment for smaller example set
sentiment_polarity = [textblob.TextBlob(tweet).sentiment.polarity for tweet in example_test_tweets]
#convert to categorical labels
predicted_sentiments = ['positive' if score >= 0.1 else 'negative' for score in sentiment_polarity]
display_model_performance_metrics(true_labels=example_test_sentiments, predicted_labels=predicted_sentiments,
classes=['positive', 'negative'])
## Model Performance metrics:
## ------------------------------
## Accuracy: 0.6262
## Precision: 0.6267
## Recall: 0.6262
## F1 Score: 0.6212
##
## Model Classification report:
## ------------------------------
## precision recall f1-score support
##
## positive 0.63 0.51 0.56 378
## negative 0.62 0.73 0.67 422
##
## accuracy 0.63 800
## macro avg 0.63 0.62 0.62 800
## weighted avg 0.63 0.63 0.62 800
##
##
## Prediction Confusion Matrix:
## ------------------------------
## Predicted:
## positive negative
## Actual: positive 192 186
## negative 113 309
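TextBlob polarity scores fall in [-1, 1], which is why a cutoff (0.1 above) is needed to turn them into labels; a small hypothetical check:
# hedged sketch: invented sentences, the exact scores are only approximate
textblob.TextBlob("what a great day").sentiment.polarity   # clearly positive, around 0.8
textblob.TextBlob("what an awful day").sentiment.polarity  # clearly negative, around -1.0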
##python chunk
from afinn import Afinn
#load the model
afn = Afinn(emoticons=True)
#predict the polarity
sentiment_polarity = [afn.score(tweet) for tweet in example_test_tweets]
#decide how to categorize
predicted_sentiments = ['positive' if score >= 1.0 else 'negative' for score in sentiment_polarity]
display_model_performance_metrics(true_labels=example_test_sentiments, predicted_labels=predicted_sentiments,
classes=['positive', 'negative'])
## Model Performance metrics:
## ------------------------------
## Accuracy: 0.63
## Precision: 0.6293
## Recall: 0.63
## F1 Score: 0.6293
##
## Model Classification report:
## ------------------------------
## precision recall f1-score support
##
## positive 0.61 0.58 0.60 378
## negative 0.64 0.67 0.66 422
##
## accuracy 0.63 800
## macro avg 0.63 0.63 0.63 800
## weighted avg 0.63 0.63 0.63 800
##
##
## Prediction Confusion Matrix:
## ------------------------------
## Predicted:
## positive negative
## Actual: positive 220 158
## negative 138 284
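AFINN returns an integer valence sum per text (individual word weights run from -5 to +5), so the cutoff above is an absolute score of 1.0 rather than a proportion; a small hypothetical check:
# hedged sketch: invented strings, exact totals depend on the AFINN lexicon version
afn.score("this is good")       # positive total (roughly +3)
afn.score("this is terrible")   # negative total (roughly -3)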
##python chunk
tweets = np.array(dataset['tweet'])
sentiments = np.array(dataset['sentiment'])
from sklearn.model_selection import train_test_split
train_tweets, test_tweets, train_sentiments, test_sentiments = train_test_split(tweets, sentiments, test_size=0.20, random_state = 42)
train_tweets.shape
## (3200,)
test_tweets.shape
## (800,)
##python chunk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# build BOW features on train reviews
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(train_tweets)
cv_test_features = cv.transform(test_tweets)
# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2),
sublinear_tf=True)
tv_train_features = tv.fit_transform(train_tweets)
tv_test_features = tv.transform(test_tweets)
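A quick way to confirm what the vectorizers produced (only the row count is known in advance; the vocabulary size depends on the tweets):
# hedged sketch: both matrices are sparse, one row per training tweet
cv_train_features.shape       # (3200, n_unigram_and_bigram_terms)
tv_train_features.shape       # same shape; TF-IDF only reweights the counts
len(cv.get_feature_names())   # size of the unigram + bigram vocabulary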
##python chunk
from sklearn.linear_model import LogisticRegression
#blank model
lr = LogisticRegression(penalty='l2', max_iter=1000, C=1)
##python chunk
##Logistic Regression Bag of Words
# fit the model
lr_bow_model = lr.fit(cv_train_features, train_sentiments)
# grab the predictions
lr_bow_predictions = lr_bow_model.predict(cv_test_features)
#model performance
display_model_performance_metrics(true_labels=test_sentiments,
predicted_labels=lr_bow_predictions,
classes=['positive', 'negative'])
## Model Performance metrics:
## ------------------------------
## Accuracy: 0.7188
## Precision: 0.7196
## Recall: 0.7188
## F1 Score: 0.7189
##
## Model Classification report:
## ------------------------------
## precision recall f1-score support
##
## positive 0.69 0.72 0.71 378
## negative 0.74 0.72 0.73 422
##
## accuracy 0.72 800
## macro avg 0.72 0.72 0.72 800
## weighted avg 0.72 0.72 0.72 800
##
##
## Prediction Confusion Matrix:
## ------------------------------
## Predicted:
## positive negative
## Actual: positive 273 105
## negative 120 302
##Logistic Regression TF-IDF
# fit the model
lr_tfidf_model = lr.fit(tv_train_features, train_sentiments)
# grab the predictions
lr_tfidf_predictions = lr_tfidf_model.predict(tv_test_features)
#model performance
display_model_performance_metrics(true_labels=test_sentiments,
predicted_labels=lr_tfidf_predictions,
classes=['positive', 'negative'])
## Model Performance metrics:
## ------------------------------
## Accuracy: 0.7175
## Precision: 0.7178
## Recall: 0.7175
## F1 Score: 0.7176
##
## Model Classification report:
## ------------------------------
## precision recall f1-score support
##
## positive 0.70 0.71 0.70 378
## negative 0.74 0.73 0.73 422
##
## accuracy 0.72 800
## macro avg 0.72 0.72 0.72 800
## weighted avg 0.72 0.72 0.72 800
##
##
## Prediction Confusion Matrix:
## ------------------------------
## Predicted:
## positive negative
## Actual: positive 268 110
## negative 116 306
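The plot_model_roc_curve helper defined earlier is never called above; a hedged sketch of how it could be pointed at the fitted TF-IDF model (not run here, so no plot is shown):
# hedged sketch: reuses the helper and the objects defined above; would draw a binary ROC curve
plot_model_roc_curve(lr_tfidf_model, tv_test_features, test_sentiments)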
##python chunk
import pyLDAvis
## C:\Users\punthakur\Documents\R\win-library\3.6\reticulate\python\rpytools\loader.py:24: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
## level=level
import pyLDAvis.gensim # don't skip this
import matplotlib.pyplot as plt
import gensim
## C:\Users\PUNTHA~1\AppData\Local\Programs\Python\Python36\lib\site-packages\scipy\sparse\sparsetools.py:21: DeprecationWarning: `scipy.sparse.sparsetools` is deprecated!
## scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
## _deprecated()
import gensim.corpora as corpora
dataset.head()
## sentiment tweet
## 0 negative worried adara.
## 1 negative german television program boring.
## 2 negative headache...
## 3 negative birthday party tomorrow early, need sleep mayb...
## 4 negative got slow internet connection exchange rain
positive = dataset[dataset['sentiment']=="positive"][0:1001]
positive_tweets = positive['tweet'].apply(nltk.word_tokenize)
negative = dataset[dataset['sentiment']=="negative"][0:1001]
negative_tweets = negative['tweet'].apply(nltk.word_tokenize)
#create a dictionary of the words
dictionary_positive = corpora.Dictionary(positive_tweets)
dictionary_negative = corpora.Dictionary(negative_tweets)
#create a doc term matrix
pos_doc_term_matrix = [dictionary_positive.doc2bow(doc) for doc in positive_tweets]
neg_doc_term_matrix = [dictionary_negative.doc2bow(doc) for doc in negative_tweets]
##python chunk
lda_model_pos = gensim.models.ldamodel.LdaModel(
    corpus=pos_doc_term_matrix,    #TDM
    id2word=dictionary_positive,   #Dictionary
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True)
lda_model_neg = gensim.models.ldamodel.LdaModel(
    corpus=neg_doc_term_matrix,    #TDM
    id2word=dictionary_negative,   #Dictionary
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True)
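A hedged sketch of how either fitted model can be queried for the topic mixture of a single tweet, using gensim's get_document_topics:
# hedged sketch: first positive tweet as a bag of words, then its (topic, probability) pairs
bow = dictionary_positive.doc2bow(positive_tweets.iloc[0])
lda_model_pos.get_document_topics(bow, minimum_probability=0.05)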
##python chunk
print(lda_model_pos.print_topics())
## [(0, '0.017*"thought" + 0.011*"funny" + 0.010*"quote" + 0.009*"years" + 0.008*"live" + 0.008*"place" + 0.008*"pics" + 0.008*"figure" + 0.007*"10" + 0.005*"ago"'), (1, '0.061*"." + 0.047*"@" + 0.034*"," + 0.025*"time" + 0.016*"see" + 0.015*"hope" + 0.015*"know" + 0.015*"hey" + 0.013*"not" + 0.013*"\'"'), (2, '0.027*"." + 0.025*"got" + 0.020*"," + 0.018*"go" + 0.017*"nice" + 0.016*"happy" + 0.014*"that" + 0.013*"<" + 0.013*"i" + 0.013*"man"'), (3, '0.057*"." + 0.056*"@" + 0.029*"," + 0.022*"!" + 0.015*"lol" + 0.014*"``" + 0.013*"\'\'" + 0.012*"-" + 0.011*"one" + 0.011*"new"'), (4, '0.015*"up" + 0.015*"let" + 0.014*"watched" + 0.013*"?" + 0.010*"hi" + 0.010*"free" + 0.009*"point" + 0.009*"even" + 0.007*"x" + 0.007*"idea"'), (5, '0.104*"..." + 0.066*"!" + 0.053*":" + 0.049*"http" + 0.045*"@" + 0.042*"cute" + 0.042*"necklace" + 0.042*"form" + 0.042*"cupcake" + 0.042*"//tinyurl.com/djbec4"'), (6, '0.059*"." + 0.042*"@" + 0.026*"thanks" + 0.022*"," + 0.021*"not" + 0.020*"!" + 0.020*"morning" + 0.018*"*" + 0.016*"it" + 0.009*"thing"'), (7, '0.023*"getting" + 0.018*"around" + 0.015*"amazing" + 0.014*"till" + 0.012*"listening" + 0.009*"all" + 0.009*"coffee" + 0.008*"music" + 0.008*"finally" + 0.008*"asleep"'), (8, '0.072*"night" + 0.008*"--" + 0.007*"feeling" + 0.007*"berlin" + 0.007*"waiting" + 0.006*"ice" + 0.006*"chat" + 0.005*"dream" + 0.005*"eating" + 0.004*"healthy"'), (9, '0.041*"\'s" + 0.025*"2" + 0.017*"." + 0.015*"always" + 0.014*"week" + 0.012*"post" + 0.011*"world" + 0.011*"enough" + 0.009*"4" + 0.009*"here"')]
print(lda_model_neg.print_topics())
## [(0, '0.036*"*" + 0.019*"made" + 0.016*"cry" + 0.013*"walk" + 0.012*"-" + 0.010*"movie" + 0.010*"girls" + 0.010*"awww" + 0.008*"ok" + 0.008*"call"'), (1, '0.020*"morning" + 0.017*"soon" + 0.017*"week" + 0.013*"day" + 0.012*"baby" + 0.011*"like" + 0.010*"sun" + 0.010*"make" + 0.010*"another" + 0.010*"today"'), (2, '0.132*"!" + 0.096*"going" + 0.093*"get" + 0.089*"\'s" + 0.081*"dammit" + 0.081*"adam" + 0.081*"melindaed" + 0.009*"miss" + 0.008*"home" + 0.008*"...."'), (3, '0.031*"good" + 0.026*"still" + 0.024*"no" + 0.024*"one" + 0.023*"tomorrow" + 0.016*"weekend" + 0.015*"gone" + 0.013*"find" + 0.013*"missed" + 0.012*"though"'), (4, '0.029*"sad" + 0.016*"sorry" + 0.014*"&" + 0.012*"waiting" + 0.011*")" + 0.010*"though" + 0.010*"(" + 0.009*"hard" + 0.009*"also" + 0.009*"play"'), (5, '0.076*":" + 0.040*"http" + 0.037*"like" + 0.022*"now" + 0.021*"you" + 0.018*"right" + 0.014*"look" + 0.012*"first" + 0.011*"x" + 0.008*"friends"'), (6, '0.050*".." + 0.029*"really" + 0.022*"day" + 0.020*"today" + 0.019*"no" + 0.017*"think" + 0.016*"well" + 0.013*"sleep" + 0.013*"feel" + 0.012*"feeling"'), (7, '0.033*"come" + 0.016*"2" + 0.016*"best" + 0.014*"house" + 0.013*"up" + 0.012*"sure" + 0.012*"hey" + 0.011*"found" + 0.010*"cool" + 0.008*"old"'), (8, '0.226*"." + 0.085*"@" + 0.060*"not" + 0.054*"," + 0.041*"?" + 0.031*"..." + 0.030*"I" + 0.020*"but" + 0.012*"want" + 0.009*"work"'), (9, '0.038*"hate" + 0.018*"next" + 0.018*"sick" + 0.016*"life" + 0.016*"little" + 0.013*"left" + 0.011*"two" + 0.010*"cold" + 0.009*"end" + 0.009*"hours"')]
##Unable to get the below code to work
##vis = pyLDAvis.gensim.prepare(lda_model_pos, pos_doc_term_matrix, dictionary_positive)
##pyLDAvis.save_html(vis, 'LDA_Visualization_Positive.html') ##saves the file
##vis = pyLDAvis.gensim.prepare(lda_model_neg, neg_doc_term_matrix, dictionary_negative)
##pyLDAvis.save_html(vis, 'LDA_Visualization_Negative.html') ##saves the file
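One hedged guess at the pyLDAvis failure: in pyLDAvis 3.x the gensim bridge was renamed to pyLDAvis.gensim_models, so the call would look like this (untested here):
# hedged alternative, not run in this report
#import pyLDAvis.gensim_models
#vis = pyLDAvis.gensim_models.prepare(lda_model_pos, pos_doc_term_matrix, dictionary_positive)
#pyLDAvis.save_html(vis, 'LDA_Visualization_Positive.html')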
Which model best represented the polarity in the dataset? The logistic regression model with bag-of-words features performed best (accuracy 0.7188, just ahead of the TF-IDF model at 0.7175 and well ahead of the TextBlob and AFINN lexicon approaches at roughly 0.63).
Looking at the topic analysis, what are the main positive components in the data? The positive-tweet topics are dominated by terms such as "thought" (0.017), "funny" (0.011), and "happy" (0.016), along with social words like "thanks", "hope", and "nice".