Load the libraries + functions

##r chunk

library(reticulate)
py_config()
## python:         C:/Users/punthakur/AppData/Local/Programs/Python/Python36/python.exe
## libpython:      C:/Users/punthakur/AppData/Local/Programs/Python/Python36/python36.dll
## pythonhome:     C:/Users/punthakur/AppData/Local/Programs/Python/Python36
## version:        3.6.0 (v3.6.0:41df79263a11, Dec 23 2016, 08:06:12) [MSC v.1900 64 bit (AMD64)]
## Architecture:   64bit
## numpy:          C:/Users/punthakur/AppData/Local/Programs/Python/Python36/Lib/site-packages/numpy
## numpy_version:  1.19.2
## 
## python versions found: 
##  C:/Users/punthakur/AppData/Local/Programs/Python/Python36/python.exe
##  C:/Program Files (x86)/Microsoft Visual Studio/Shared/Python37_64/python.exe
library(plyr)
## Warning: package 'plyr' was built under R version 3.6.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(gbm)
## Warning: package 'gbm' was built under R version 3.6.3
## Loaded gbm 2.1.8
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.6.3
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
## 
## Attaching package: 'corrgram'
## The following object is masked from 'package:lattice':
## 
##     panel.fill
## The following object is masked from 'package:plyr':
## 
##     baseball
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.6.3
sentiments
## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ... with 6,776 more rows

##python chunk

# -*- coding: utf-8 -*-

from sklearn import metrics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc


def get_metrics(true_labels, predicted_labels):
    
    print('Accuracy:', np.round(
                        metrics.accuracy_score(true_labels, 
                                               predicted_labels),
                        4))
    print('Precision:', np.round(
                        metrics.precision_score(true_labels, 
                                               predicted_labels,
                                               average='weighted'),
                        4))
    print('Recall:', np.round(
                        metrics.recall_score(true_labels, 
                                               predicted_labels,
                                               average='weighted'),
                        4))
    print('F1 Score:', np.round(
                        metrics.f1_score(true_labels, 
                                               predicted_labels,
                                               average='weighted'),
                        4))
                        

def train_predict_model(classifier, 
                        train_features, train_labels, 
                        test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features) 
    return predictions    


def display_confusion_matrix(true_labels, predicted_labels, classes=[1,0]):
    
    total_classes = len(classes)
    level_labels = [total_classes*[0], list(range(total_classes))]

    cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels, 
                                  labels=classes)
    cm_frame = pd.DataFrame(data=cm, 
                            columns=pd.MultiIndex(levels=[['Predicted:'], classes], 
                                                  codes=level_labels), 
                            index=pd.MultiIndex(levels=[['Actual:'], classes], 
                                                codes=level_labels)) 
    print(cm_frame) 


def display_confusion_matrix_pretty(true_labels, predicted_labels, classes=[1,0]):
    
    total_classes = len(classes)
    level_labels = [total_classes*[0], list(range(total_classes))]

    cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels, 
                                  labels=classes)
    cm_frame = pd.DataFrame(data=cm, 
                            columns=pd.MultiIndex(levels=[['Predicted:'], classes], 
                                                  codes=level_labels), 
                            index=pd.MultiIndex(levels=[['Actual:'], classes], 
                                                codes=level_labels)) 
    return cm_frame
    
def display_classification_report(true_labels, predicted_labels, classes=[1,0]):

    report = metrics.classification_report(y_true=true_labels, 
                                           y_pred=predicted_labels, 
                                           labels=classes) 
    print(report)
    
    
    
def display_model_performance_metrics(true_labels, predicted_labels, classes=[1,0]):
    print('Model Performance metrics:')
    print('-'*30)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
    print('\nModel Classification report:')
    print('-'*30)
    display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels, 
                                  classes=classes)
    print('\nPrediction Confusion Matrix:')
    print('-'*30)
    display_confusion_matrix(true_labels=true_labels, predicted_labels=predicted_labels, 
                             classes=classes)


def plot_model_decision_surface(clf, train_features, train_labels,
                                plot_step=0.02, cmap=plt.cm.RdYlBu,
                                markers=None, alphas=None, colors=None):
    
    if train_features.shape[1] != 2:
        raise ValueError("X_train should have exactly 2 columnns!")
    
    x_min, x_max = train_features[:, 0].min() - plot_step, train_features[:, 0].max() + plot_step
    y_min, y_max = train_features[:, 1].min() - plot_step, train_features[:, 1].max() + plot_step
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    clf_est = clone(clf)
    clf_est.fit(train_features,train_labels)
    if hasattr(clf_est, 'predict_proba'):
        Z = clf_est.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:,1]
    else:
        Z = clf_est.predict(np.c_[xx.ravel(), yy.ravel()])    
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=cmap)
    
    le = LabelEncoder()
    y_enc = le.fit_transform(train_labels)
    n_classes = len(le.classes_)
    plot_colors = ''.join(colors) if colors else [None] * n_classes
    label_names = le.classes_
    markers = markers if markers else [None] * n_classes
    alphas = alphas if alphas else [None] * n_classes
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y_enc == i)
        plt.scatter(train_features[idx, 0], train_features[idx, 1], c=color,
                    label=label_names[i], cmap=cmap, edgecolors='black', 
                    marker=markers[i], alpha=alphas[i])
    plt.legend()
    plt.show()


def plot_model_roc_curve(clf, features, true_labels, label_encoder=None, class_names=None):
    
    ## Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    if hasattr(clf, 'classes_'):
        class_labels = clf.classes_
    elif label_encoder:
        class_labels = label_encoder.classes_
    elif class_names:
        class_labels = class_names
    else:
        raise ValueError('Unable to derive prediction classes, please specify class_names!')
    n_classes = len(class_labels)
    y_test = label_binarize(true_labels, classes=class_labels)
    if n_classes == 2:
        if hasattr(clf, 'predict_proba'):
            prob = clf.predict_proba(features)
            y_score = prob[:, prob.shape[1]-1] 
        elif hasattr(clf, 'decision_function'):
            # decision_function returns a 1-D score array for binary problems
            y_score = clf.decision_function(features)
        else:
            raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")
        
        fpr, tpr, _ = roc_curve(y_test, y_score)      
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label='ROC curve (area = {0:0.2f})'
                                 ''.format(roc_auc),
                 linewidth=2.5)
        
    elif n_classes > 2:
        if hasattr(clf, 'predict_proba'):
            y_score = clf.predict_proba(features)
        elif hasattr(clf, 'decision_function'):
            y_score = clf.decision_function(features)
        else:
            raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")

        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        ## Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        ## Compute macro-average ROC curve and ROC area
        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
        # Then interpolate all ROC curves at these points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])  # np.interp replaces the deprecated scipy.interp
        # Finally average it and compute AUC
        mean_tpr /= n_classes
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        ## Plot ROC curves
        plt.figure(figsize=(6, 4))
        plt.plot(fpr["micro"], tpr["micro"],
                 label='micro-average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["micro"]), linewidth=3)

        plt.plot(fpr["macro"], tpr["macro"],
                 label='macro-average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["macro"]), linewidth=3)

        for i, label in enumerate(class_labels):
            plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
                                           ''.format(label, roc_auc[i]), 
                     linewidth=2, linestyle=':')
    else:
        raise ValueError('Number of classes should be at least 2')
        
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()
##python chunk

from textsearch import TextSearch
import spacy
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
from bs4 import BeautifulSoup 
from nltk.stem import PorterStemmer
ps = PorterStemmer()
import nltk
stopwords = nltk.corpus.stopwords.words('english')
import unicodedata
from contractions import contractions_dict

import pandas as pd

import textblob
import contractions
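
The NLTK stopword list above (and nltk.word_tokenize used later for the topic models) assumes the NLTK corpora are already installed locally; if they are not, a one-time download along these lines should work (a sketch, not part of the original run):

# hedged sketch: one-time NLTK resource downloads (only needed if missing)
nltk.download('stopwords')
nltk.download('punkt')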

The Data

##python chunk

dataset = pd.read_csv("twitter_small.csv")
dataset.shape
## (4000, 2)
dataset.head()
##   sentiment                                              tweet
## 0  negative                              worried about adara. 
## 1  negative              German television program is boring. 
## 2  negative                              I have a headache... 
## 3  negative  birthday party tomorrow early, i need to sleep...
## 4  negative  i got slow internet connection in exchange of ...
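
Before modeling, it can help to confirm the two sentiment classes are roughly balanced; a quick check (not shown in the original output) might look like:

# sketch: class counts for the 4,000 tweets
print(dataset['sentiment'].value_counts())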

Clean up the data (text normalization)

##python chunk

STOPWORDS = set(nltk.corpus.stopwords.words('english')) #stopwords
STOPWORDS.remove('no')
STOPWORDS.remove('but')
STOPWORDS.remove('not')

def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text() #html
    text = text.lower() #lower case
    text = contractions.fix(text) #contractions
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore') #symbols
    #text = ' '.join([ps.stem(word) for word in text.split()]) #stem
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # stopwords
    return text
    
dataset['tweet'] = dataset['tweet'].apply(clean_text)
dataset.head()
##   sentiment                                              tweet
## 0  negative                                     worried adara.
## 1  negative                  german television program boring.
## 2  negative                                        headache...
## 3  negative  birthday party tomorrow early, need sleep mayb...
## 4  negative         got slow internet connection exchange rain

TextBlob

##python chunk


tweets = np.array(dataset['tweet'])
sentiments = np.array(dataset['sentiment'])

from sklearn.model_selection import train_test_split

train_tweets, test_tweets, train_sentiments, test_sentiments = train_test_split(tweets, sentiments, test_size=0.20, random_state = 42)
train_tweets.shape
## (3200,)
test_tweets.shape
## (800,)




##Calculate Sentiment Score for all Tweets
example_test_tweets = test_tweets[0:1001]
example_test_sentiments = test_sentiments[0:1001]


#calculate sentiment for smaller example set
sentiment_polarity = [textblob.TextBlob(tweet).sentiment.polarity for tweet in example_test_tweets]


#convert to categorical labels
predicted_sentiments = ['positive' if score >= 0.1 else 'negative' for score in sentiment_polarity]


display_model_performance_metrics(true_labels=example_test_sentiments, predicted_labels=predicted_sentiments, 
                                  classes=['positive', 'negative'])
## Model Performance metrics:
## ------------------------------
## Accuracy: 0.6262
## Precision: 0.6267
## Recall: 0.6262
## F1 Score: 0.6212
## 
## Model Classification report:
## ------------------------------
##               precision    recall  f1-score   support
## 
##     positive       0.63      0.51      0.56       378
##     negative       0.62      0.73      0.67       422
## 
##     accuracy                           0.63       800
##    macro avg       0.63      0.62      0.62       800
## weighted avg       0.63      0.63      0.62       800
## 
## 
## Prediction Confusion Matrix:
## ------------------------------
##                  Predicted:         
##                    positive negative
## Actual: positive        192      186
##         negative        113      309
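
The 0.1 polarity cutoff above is a judgment call; TextBlob also reports subjectivity, and the cutoff can be varied to trade precision against recall. A hedged sketch of that kind of inspection, reusing the objects defined above:

# sketch: look at the raw TextBlob scores before thresholding
print(textblob.TextBlob(example_test_tweets[0]).sentiment)  # polarity and subjectivity
# try a neutral cutoff of 0.0 instead of 0.1 and recompute the summary metrics
alt_predictions = ['positive' if score >= 0.0 else 'negative' for score in sentiment_polarity]
get_metrics(true_labels=example_test_sentiments, predicted_labels=alt_predictions)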

AFINN

##python chunk

from afinn import Afinn
#load the model 
afn = Afinn(emoticons=True)


#predict the polarity
sentiment_polarity = [afn.score(tweet) for tweet in example_test_tweets]

#decide how to categorize
predicted_sentiments = ['positive' if score >= 1.0 else 'negative' for score in sentiment_polarity]


display_model_performance_metrics(true_labels=example_test_sentiments, predicted_labels=predicted_sentiments, 
                                  classes=['positive', 'negative'])
## Model Performance metrics:
## ------------------------------
## Accuracy: 0.63
## Precision: 0.6293
## Recall: 0.63
## F1 Score: 0.6293
## 
## Model Classification report:
## ------------------------------
##               precision    recall  f1-score   support
## 
##     positive       0.61      0.58      0.60       378
##     negative       0.64      0.67      0.66       422
## 
##     accuracy                           0.63       800
##    macro avg       0.63      0.63      0.63       800
## weighted avg       0.63      0.63      0.63       800
## 
## 
## Prediction Confusion Matrix:
## ------------------------------
##                  Predicted:         
##                    positive negative
## Actual: positive        220      158
##         negative        138      284
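
AFINN returns a signed lexicon score per tweet rather than a probability, so the >= 1.0 cutoff is likewise tunable; a quick sketch for inspecting the raw scores:

# sketch: a single-string example and the distribution of scores just computed
print(afn.score('this is utterly excellent!'))
print(pd.Series(sentiment_polarity).describe())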

Split the dataset

##python chunk

tweets = np.array(dataset['tweet'])
sentiments = np.array(dataset['sentiment'])

from sklearn.model_selection import train_test_split

train_tweets, test_tweets, train_sentiments, test_sentiments = train_test_split(tweets, sentiments, test_size=0.20, random_state = 42)
train_tweets.shape
## (3200,)
test_tweets.shape
## (800,)

TF-IDF

##python chunk

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# build BOW features on train tweets
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(train_tweets)
cv_test_features = cv.transform(test_tweets)

# build TF-IDF features on train tweets
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2),
                     sublinear_tf=True)
tv_train_features = tv.fit_transform(train_tweets)
tv_test_features = tv.transform(test_tweets)
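
Both vectorizers produce sparse document-term matrices over unigrams and bigrams, with the test matrices reusing the vocabulary learned on the training tweets; a quick sanity check on the dimensions (a sketch, not in the original output):

# sketch: feature-matrix shapes and vocabulary size
print(cv_train_features.shape, cv_test_features.shape)
print(tv_train_features.shape, tv_test_features.shape)
print(len(cv.vocabulary_))   # number of BOW n-gram features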

Logistic Regression Classifier

##python chunk

from sklearn.linear_model import LogisticRegression

#blank model
lr = LogisticRegression(penalty='l2', max_iter=1000, C=1)

Accuracy and Classification Report

##python chunk


##Logistic Regression Bag of Words

# fit the model
lr_bow_model = lr.fit(cv_train_features, train_sentiments)

# grab the predictions
lr_bow_predictions = lr_bow_model.predict(cv_test_features)

#model performance
display_model_performance_metrics(true_labels=test_sentiments, 
  predicted_labels=lr_bow_predictions,
  classes=['positive', 'negative'])
  

## Model Performance metrics:
## ------------------------------
## Accuracy: 0.7188
## Precision: 0.7196
## Recall: 0.7188
## F1 Score: 0.7189
## 
## Model Classification report:
## ------------------------------
##               precision    recall  f1-score   support
## 
##     positive       0.69      0.72      0.71       378
##     negative       0.74      0.72      0.73       422
## 
##     accuracy                           0.72       800
##    macro avg       0.72      0.72      0.72       800
## weighted avg       0.72      0.72      0.72       800
## 
## 
## Prediction Confusion Matrix:
## ------------------------------
##                  Predicted:         
##                    positive negative
## Actual: positive        273      105
##         negative        120      302


##Logistic Regression TF-IDF

# fit the model
lr_tfidf_model = lr.fit(tv_train_features, train_sentiments)

# grab the predictions
lr_tfidf_predictions = lr_tfidf_model.predict(tv_test_features)

#model performance
display_model_performance_metrics(true_labels=test_sentiments,
  predicted_labels=lr_tfidf_predictions,
  classes=['positive', 'negative'])
## Model Performance metrics:
## ------------------------------
## Accuracy: 0.7175
## Precision: 0.7178
## Recall: 0.7175
## F1 Score: 0.7176
## 
## Model Classification report:
## ------------------------------
##               precision    recall  f1-score   support
## 
##     positive       0.70      0.71      0.70       378
##     negative       0.74      0.73      0.73       422
## 
##     accuracy                           0.72       800
##    macro avg       0.72      0.72      0.72       800
## weighted avg       0.72      0.72      0.72       800
## 
## 
## Prediction Confusion Matrix:
## ------------------------------
##                  Predicted:         
##                    positive negative
## Actual: positive        268      110
##         negative        116      306
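
Since logistic regression exposes predict_proba and classes_, the plot_model_roc_curve helper defined at the top of this document can also be used here; an optional sketch for the TF-IDF model:

# optional sketch: ROC curve for the TF-IDF logistic regression model
plot_model_roc_curve(lr_tfidf_model, tv_test_features, test_sentiments)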

Topic Model Positive Reviews

##python chunk

import pyLDAvis
## C:\Users\punthakur\Documents\R\win-library\3.6\reticulate\python\rpytools\loader.py:24: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
##   level=level
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
import gensim
## C:\Users\PUNTHA~1\AppData\Local\Programs\Python\Python36\lib\site-packages\scipy\sparse\sparsetools.py:21: DeprecationWarning: `scipy.sparse.sparsetools` is deprecated!
## scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
##   _deprecated()
import gensim.corpora as corpora

dataset.head()
##   sentiment                                              tweet
## 0  negative                                     worried adara.
## 1  negative                  german television program boring.
## 2  negative                                        headache...
## 3  negative  birthday party tomorrow early, need sleep mayb...
## 4  negative         got slow internet connection exchange rain
positive = dataset[dataset['sentiment']=="positive"][0:1001]
positive_tweets = positive['tweet'].apply(nltk.word_tokenize)

negative = dataset[dataset['sentiment']=="negative"][0:1001]
negative_tweets = negative['tweet'].apply(nltk.word_tokenize)


#create a dictionary of the words
dictionary_positive = corpora.Dictionary(positive_tweets)
dictionary_negative = corpora.Dictionary(negative_tweets)

#create a doc term matrix
pos_doc_term_matrix = [dictionary_positive.doc2bow(doc) for doc in positive_tweets]
neg_doc_term_matrix = [dictionary_negative.doc2bow(doc) for doc in negative_tweets]
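
Each row of these document-term matrices is a list of (token_id, count) pairs; mapping the ids back through the dictionary is a quick way to verify the tokenization lines up (a sketch, not in the original output):

# sketch: decode the first positive tweet's bag-of-words representation
bow = pos_doc_term_matrix[0]
print([(dictionary_positive[token_id], count) for token_id, count in bow])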

Topic Model

##python chunk


lda_model_pos = gensim.models.ldamodel.LdaModel(
  corpus = pos_doc_term_matrix, #TDM
  id2word = dictionary_positive, #Dictionary
  num_topics = 10, 
  random_state = 100,
  update_every = 1,
  chunksize = 100,
  passes = 10,
  alpha = 'auto',
  per_word_topics = True)
  
  

lda_model_neg = gensim.models.ldamodel.LdaModel(
  corpus = neg_doc_term_matrix, #TDM
  id2word = dictionary_negative, #Dictionary
  num_topics = 10, 
  random_state = 100,
  update_every = 1,
  chunksize = 100,
  passes = 10,
  alpha = 'auto',
  per_word_topics = True)
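
One optional way to judge whether 10 topics is a reasonable choice is gensim's topic coherence score; a hedged sketch using the objects defined above:

# sketch: c_v coherence for both topic models (higher is generally better)
from gensim.models import CoherenceModel
coherence_pos = CoherenceModel(model=lda_model_pos, texts=list(positive_tweets),
                               dictionary=dictionary_positive, coherence='c_v').get_coherence()
coherence_neg = CoherenceModel(model=lda_model_neg, texts=list(negative_tweets),
                               dictionary=dictionary_negative, coherence='c_v').get_coherence()
print(coherence_pos, coherence_neg)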

Terms for the Topics

##python chunk


print(lda_model_pos.print_topics())
## [(0, '0.017*"thought" + 0.011*"funny" + 0.010*"quote" + 0.009*"years" + 0.008*"live" + 0.008*"place" + 0.008*"pics" + 0.008*"figure" + 0.007*"10" + 0.005*"ago"'), (1, '0.061*"." + 0.047*"@" + 0.034*"," + 0.025*"time" + 0.016*"see" + 0.015*"hope" + 0.015*"know" + 0.015*"hey" + 0.013*"not" + 0.013*"\'"'), (2, '0.027*"." + 0.025*"got" + 0.020*"," + 0.018*"go" + 0.017*"nice" + 0.016*"happy" + 0.014*"that" + 0.013*"<" + 0.013*"i" + 0.013*"man"'), (3, '0.057*"." + 0.056*"@" + 0.029*"," + 0.022*"!" + 0.015*"lol" + 0.014*"``" + 0.013*"\'\'" + 0.012*"-" + 0.011*"one" + 0.011*"new"'), (4, '0.015*"up" + 0.015*"let" + 0.014*"watched" + 0.013*"?" + 0.010*"hi" + 0.010*"free" + 0.009*"point" + 0.009*"even" + 0.007*"x" + 0.007*"idea"'), (5, '0.104*"..." + 0.066*"!" + 0.053*":" + 0.049*"http" + 0.045*"@" + 0.042*"cute" + 0.042*"necklace" + 0.042*"form" + 0.042*"cupcake" + 0.042*"//tinyurl.com/djbec4"'), (6, '0.059*"." + 0.042*"@" + 0.026*"thanks" + 0.022*"," + 0.021*"not" + 0.020*"!" + 0.020*"morning" + 0.018*"*" + 0.016*"it" + 0.009*"thing"'), (7, '0.023*"getting" + 0.018*"around" + 0.015*"amazing" + 0.014*"till" + 0.012*"listening" + 0.009*"all" + 0.009*"coffee" + 0.008*"music" + 0.008*"finally" + 0.008*"asleep"'), (8, '0.072*"night" + 0.008*"--" + 0.007*"feeling" + 0.007*"berlin" + 0.007*"waiting" + 0.006*"ice" + 0.006*"chat" + 0.005*"dream" + 0.005*"eating" + 0.004*"healthy"'), (9, '0.041*"\'s" + 0.025*"2" + 0.017*"." + 0.015*"always" + 0.014*"week" + 0.012*"post" + 0.011*"world" + 0.011*"enough" + 0.009*"4" + 0.009*"here"')]
print(lda_model_neg.print_topics())
## [(0, '0.036*"*" + 0.019*"made" + 0.016*"cry" + 0.013*"walk" + 0.012*"-" + 0.010*"movie" + 0.010*"girls" + 0.010*"awww" + 0.008*"ok" + 0.008*"call"'), (1, '0.020*"morning" + 0.017*"soon" + 0.017*"week" + 0.013*"day" + 0.012*"baby" + 0.011*"like" + 0.010*"sun" + 0.010*"make" + 0.010*"another" + 0.010*"today"'), (2, '0.132*"!" + 0.096*"going" + 0.093*"get" + 0.089*"\'s" + 0.081*"dammit" + 0.081*"adam" + 0.081*"melindaed" + 0.009*"miss" + 0.008*"home" + 0.008*"...."'), (3, '0.031*"good" + 0.026*"still" + 0.024*"no" + 0.024*"one" + 0.023*"tomorrow" + 0.016*"weekend" + 0.015*"gone" + 0.013*"find" + 0.013*"missed" + 0.012*"though"'), (4, '0.029*"sad" + 0.016*"sorry" + 0.014*"&" + 0.012*"waiting" + 0.011*")" + 0.010*"though" + 0.010*"(" + 0.009*"hard" + 0.009*"also" + 0.009*"play"'), (5, '0.076*":" + 0.040*"http" + 0.037*"like" + 0.022*"now" + 0.021*"you" + 0.018*"right" + 0.014*"look" + 0.012*"first" + 0.011*"x" + 0.008*"friends"'), (6, '0.050*".." + 0.029*"really" + 0.022*"day" + 0.020*"today" + 0.019*"no" + 0.017*"think" + 0.016*"well" + 0.013*"sleep" + 0.013*"feel" + 0.012*"feeling"'), (7, '0.033*"come" + 0.016*"2" + 0.016*"best" + 0.014*"house" + 0.013*"up" + 0.012*"sure" + 0.012*"hey" + 0.011*"found" + 0.010*"cool" + 0.008*"old"'), (8, '0.226*"." + 0.085*"@" + 0.060*"not" + 0.054*"," + 0.041*"?" + 0.031*"..." + 0.030*"I" + 0.020*"but" + 0.012*"want" + 0.009*"work"'), (9, '0.038*"hate" + 0.018*"next" + 0.018*"sick" + 0.016*"life" + 0.016*"little" + 0.013*"left" + 0.011*"two" + 0.010*"cold" + 0.009*"end" + 0.009*"hours"')]

##Unable to get the pyLDAvis code below to work
##vis = pyLDAvis.gensim.prepare(lda_model_pos, pos_doc_term_matrix, dictionary_positive)
##pyLDAvis.save_html(vis, 'LDA_Visualization_Positive.html') ##saves the file

##vis = pyLDAvis.gensim.prepare(lda_model_neg, neg_doc_term_matrix, dictionary_negative)
##pyLDAvis.save_html(vis, 'LDA_Visualization_Negative.html') ##saves the file
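
One possible reason the pyLDAvis call above fails is that newer pyLDAvis releases (3.x) moved the gensim helper to pyLDAvis.gensim_models; if that is the issue, something along these lines may work (an untested sketch):

# untested sketch: newer pyLDAvis versions renamed pyLDAvis.gensim to pyLDAvis.gensim_models
import pyLDAvis.gensim_models
vis = pyLDAvis.gensim_models.prepare(lda_model_pos, pos_doc_term_matrix, dictionary_positive)
pyLDAvis.save_html(vis, 'LDA_Visualization_Positive.html')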

Interpretation