Load Data

library(tidyverse)

twitter_sentiment<-read_csv('./data/twitter_training.csv', col_names = FALSE)
#twitter_sentiment<-sample_n(twitter_sentiment, size = 10000)

#add column names
colnames(twitter_sentiment)<-c('tweet_id', 'entity','sentiment', 'content')

Missing Data

Let's check our missing data.

library(naniar)

vis_miss(twitter_sentiment)
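
For exact counts rather than a picture, naniar can also summarise missingness per column; a small addition, not in the original:

# per-variable missing value counts and percentages (naniar)
miss_var_summary(twitter_sentiment)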

It looks like such a small amount of data is missing that we can simply drop those rows.

twitter_sentiment<-twitter_sentiment%>%na.omit()

EDA

Let's take a first look at the data.

glimpse(twitter_sentiment)
## Rows: 73,824
## Columns: 4
## $ tweet_id  <dbl> 2401, 2401, 2401, 2401, 2401, 2401, 2402, 2402, 2402, 2402, ~
## $ entity    <chr> "Borderlands", "Borderlands", "Borderlands", "Borderlands", ~
## $ sentiment <chr> "Positive", "Positive", "Positive", "Positive", "Positive", ~
## $ content   <chr> "im getting on borderlands and i will murder you all ,", "I ~

There are 4 variables; the target is sentiment.

tweet_id

It isn’t clear what this variable represents. Let's take a look at one example tweet_id.

twitter_sentiment%>%filter(tweet_id==2403)
## # A tibble: 6 x 4
##   tweet_id entity      sentiment content                                        
##      <dbl> <chr>       <chr>     <chr>                                          
## 1     2403 Borderlands Neutral   Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM~
## 2     2403 Borderlands Neutral   Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM~
## 3     2403 Borderlands Neutral   Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM~
## 4     2403 Borderlands Neutral   Rock-Hard La Vita, RARE BUT POWERFUL, HANDSOME~
## 5     2403 Borderlands Neutral   Live Rock - Hard music La la Varlope, RARE & t~
## 6     2403 Borderlands Neutral   I-Hard like me, RARE LONDON DE, HANDSOME 2011,~

It seems like tweet_id could identify the user rather than a single tweet. The rows that share a tweet_id are very close copies of each other, yet not identical, so they don't all appear to come from the same tweet. I will start off by assuming that tweet_id is the ID of the Twitter user.
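
As a quick check on that assumption (a small addition, not in the original analysis), we can count how many rows and how many distinct tweet texts share each tweet_id:

# rows per tweet_id vs. distinct tweet texts per tweet_id
twitter_sentiment%>%
  group_by(tweet_id)%>%
  summarise(n_rows = n(), n_distinct_tweets = n_distinct(content))%>%
  arrange(desc(n_rows))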

entity

In this case, entity is probably the product the user is talking about in their tweet. Let's explore.

ggplot(twitter_sentiment, aes(x = as.factor(entity))) +
  geom_bar() +
  scale_x_discrete(guide = guide_axis(angle = 90)) +
  ggtitle('Counts of unique `entity` values')
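
If the bar chart is hard to read, the same information as a sorted table (a small addition, not in the original):

# how many tweets mention each entity, most frequent first
twitter_sentiment%>%count(entity, sort = TRUE)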

sentiment

This variable is our target: the labeled sentiment of the tweets we will be modeling.

Let's look at its distribution.

ggplot(twitter_sentiment, aes(x = as.factor(sentiment))) +
  geom_bar() +
  scale_x_discrete(guide = guide_axis(angle = 90)) +
  ggtitle('Distribution of sentiment')

To keep this prediction task simple, I will filter to only the positive and negative tweets.

twitter_sentiment<-twitter_sentiment%>%filter(sentiment == 'Positive' | sentiment == 'Negative')
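
As a quick check of the resulting class balance (a small addition, not in the original):

# counts of the two remaining sentiment classes
twitter_sentiment%>%count(sentiment)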

content

This looks about right; it resembles a chi-square distribution with no major gaps.
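
The original plot is not reproduced here; as a sketch, one way to look at the distribution being described is to plot tweet length in characters (tweet_length is a hypothetical helper column):

# hypothetical sketch: distribution of tweet lengths in characters
twitter_sentiment%>%
  mutate(tweet_length = nchar(content))%>%
  ggplot(aes(tweet_length))+
  geom_histogram(bins = 50)+
  ggtitle('Distribution of tweet length (characters)')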

Let's make a word cloud to visualize the most common words.

library(tidytext)
library(wordcloud)
words<-twitter_sentiment%>%
  unnest_tokens(word, content)
words%>%
  anti_join(stop_words)%>%
  count(word)%>%
  with(wordcloud(word,n,max.words = 100))

Model building

Let's start by preparing our data for modeling.

library(tidymodels)
set.seed(123)
twit_split<-initial_split(twitter_sentiment, strata = sentiment)
twit_train<-training(twit_split)
twit_test<- testing(twit_split)
library(tidylo)
content_log_odds<-
  twitter_sentiment%>%
  unnest_tokens(word, content)%>%
  count(sentiment, word, sort = TRUE)%>%
  bind_log_odds(sentiment, word, n)
content_log_odds%>%
  group_by(sentiment)%>%
  slice_max(log_odds_weighted, n = 10)%>%
  ungroup()%>%
  ggplot(aes(log_odds_weighted,
             fct_reorder(word, log_odds_weighted),
             fill = sentiment
             ))+
  geom_col(show.legend = FALSE)+
  facet_wrap(vars(sentiment), scales = 'free_y')+
  labs(x = 'Log odds (weighted)', y = NULL)

These characteristic words make me feel like we are on the right track! 'Love' has the highest weighted log odds among positive tweets, and 'fix' among negative tweets.

Colorful language in the negative class aside, I feel like we can all relate to verizonfios being on the list…


set.seed(234)
twit_folds<-vfold_cv(twit_train, strata = sentiment)
twit_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 x 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [28977/3221]> Fold01
##  2 <split [28977/3221]> Fold02
##  3 <split [28977/3221]> Fold03
##  4 <split [28977/3221]> Fold04
##  5 <split [28979/3219]> Fold05
##  6 <split [28979/3219]> Fold06
##  7 <split [28979/3219]> Fold07
##  8 <split [28979/3219]> Fold08
##  9 <split [28979/3219]> Fold09
## 10 <split [28979/3219]> Fold10

I will use a recipe to transform the text data into features for modeling.

library(themis)
library(textrecipes)

twit_rec<-
  recipe(sentiment~content, data = twit_train)%>%
  step_tokenize(content)%>%
  step_tokenfilter(content, max_tokens = 200)%>%
  step_tfidf(content)%>%
  step_downsample(sentiment)

twit_rec
## Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          1
## 
## Operations:
## 
## Tokenization for content
## Text filtering for content
## Term frequency-inverse document frequency with content
## Down-sampling based on sentiment
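
To see what the engineered features actually look like, one could prep and bake the recipe on the training data; a quick sketch, not part of the original workflow:

# prep() estimates the recipe on twit_train; bake(new_data = NULL) returns the
# processed training set: sentiment plus 200 tf-idf columns
twit_rec%>%
  prep()%>%
  bake(new_data = NULL)%>%
  glimpse()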

Create the model specification for a lasso model.

multi_spec<-
  multinom_reg(penalty = tune(), mixture = 1)%>%
  set_mode('classification')%>%
  set_engine('glmnet')
multi_spec
## Multinomial Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet
twit_wf<-workflow(twit_rec, multi_spec)
twit_wf
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: multinom_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 4 Recipe Steps
## 
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_downsample()
## 
## -- Model -----------------------------------------------------------------------
## Multinomial Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet
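
As an aside, since the data was filtered down to two classes, a binary logistic regression spec would work just as well here; a hedged alternative sketch, not what is used above:

# hypothetical two-class alternative to multinom_reg()
bin_spec<-
  logistic_reg(penalty = tune(), mixture = 1)%>%
  set_engine('glmnet')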

Now I will tune the model over a grid to find the best penalty.

twit_grid<-grid_regular(penalty(range = c(-2, 0)), levels = 10)

set.seed(2022)
twit_rs<-
  tune_grid(
    twit_wf,
    twit_folds,
    grid= twit_grid
  )

twit_rs
## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 x 4
##    splits               id     .metrics          .notes          
##    <list>               <chr>  <list>            <list>          
##  1 <split [28977/3221]> Fold01 <tibble [20 x 5]> <tibble [0 x 3]>
##  2 <split [28977/3221]> Fold02 <tibble [20 x 5]> <tibble [0 x 3]>
##  3 <split [28977/3221]> Fold03 <tibble [20 x 5]> <tibble [0 x 3]>
##  4 <split [28977/3221]> Fold04 <tibble [20 x 5]> <tibble [0 x 3]>
##  5 <split [28979/3219]> Fold05 <tibble [20 x 5]> <tibble [0 x 3]>
##  6 <split [28979/3219]> Fold06 <tibble [20 x 5]> <tibble [0 x 3]>
##  7 <split [28979/3219]> Fold07 <tibble [20 x 5]> <tibble [0 x 3]>
##  8 <split [28979/3219]> Fold08 <tibble [20 x 5]> <tibble [0 x 3]>
##  9 <split [28979/3219]> Fold09 <tibble [20 x 5]> <tibble [0 x 3]>
## 10 <split [28979/3219]> Fold10 <tibble [20 x 5]> <tibble [0 x 3]>
autoplot(twit_rs)

It looks like the best penalty is around 0.01. Let's look at the best models.

show_best(twit_rs)
## # A tibble: 5 x 7
##   penalty .metric .estimator  mean     n std_err .config              
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1  0.01   roc_auc binary     0.813    10 0.00294 Preprocessor1_Model01
## 2  0.0167 roc_auc binary     0.800    10 0.00332 Preprocessor1_Model02
## 3  0.0278 roc_auc binary     0.765    10 0.00286 Preprocessor1_Model03
## 4  0.0464 roc_auc binary     0.645    10 0.00369 Preprocessor1_Model04
## 5  0.0774 roc_auc binary     0.5      10 0       Preprocessor1_Model05

Evaluate final model

I will use the “one standard error” rule to select the simplest model within one standard error of the optimal model, since a simpler model is often preferable.

final_penalty <-
  twit_rs %>%
  select_by_one_std_err(metric = "roc_auc", desc(penalty))

final_penalty
## # A tibble: 1 x 9
##   penalty .metric .estimator  mean     n std_err .config            .best .bound
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>              <dbl>  <dbl>
## 1    0.01 roc_auc binary     0.813    10 0.00294 Preprocessor1_Mod~ 0.813  0.810
final_rs <-
  twit_wf %>%
  finalize_workflow(final_penalty) %>%
  last_fit(twit_split)

collect_metrics(final_rs)
## # A tibble: 2 x 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.739 Preprocessor1_Model1
## 2 roc_auc  binary         0.815 Preprocessor1_Model1

Let's visualize the cross-class performance with a confusion matrix.

collect_predictions(final_rs)%>%
  conf_mat(sentiment, .pred_class)%>%
  autoplot()

Interpret this plot by reading down each column along the ‘Truth’ axis: the boxes within a column show how many observations of that true class were predicted as each class.
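
To put numbers on the same information, the confusion matrix can be summarised into per-class metrics; a quick sketch using yardstick's summary() method:

# accuracy, sensitivity, specificity, etc. derived from the confusion matrix
collect_predictions(final_rs)%>%
  conf_mat(sentiment, .pred_class)%>%
  summary()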

Now let's look at the ROC curve.

collect_predictions(final_rs)%>%
  roc_curve(truth = sentiment, .pred_Negative)%>%
  ggplot(aes(1-specificity, sensitivity))+
  geom_abline(slope =1, color = 'gray', lty = 2)+
  geom_path(size = 1.5, alpha = .7)+
  labs(color = NULL)+
  coord_fixed()

Our ROC curve is looking good. We are able to classify a tweet as positive or negative with roughly 74% accuracy and an ROC AUC of about 0.82!
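
Finally, a sketch of how the finished model could score new tweets: extract_workflow() pulls the fitted workflow out of the last_fit() result, and predict() runs the recipe and model on new text (the example tweet is made up):

# score a new, made-up tweet with the fitted workflow
final_wf<-extract_workflow(final_rs)
predict(final_wf, new_data = tibble(content = 'I love this game but please fix the servers'))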