library(tidyverse)
twitter_sentiment<-read_csv('./data/twitter_training.csv', col_names = FALSE)
#twitter_sentiment<-sample_n(twitter_sentiment, size = 10000)
#add column names
colnames(twitter_sentiment)<-c('tweet_id', 'entity','sentiment', 'content')
Let’s check our missing data.
library(naniar)
vis_miss(twitter_sentiment)
It looks like such a small amount of data is missing that we can simply drop those rows.
twitter_sentiment<-twitter_sentiment%>%na.omit()
Let’s take a first look at the data.
glimpse(twitter_sentiment)
## Rows: 73,824
## Columns: 4
## $ tweet_id <dbl> 2401, 2401, 2401, 2401, 2401, 2401, 2402, 2402, 2402, 2402, ~
## $ entity <chr> "Borderlands", "Borderlands", "Borderlands", "Borderlands", ~
## $ sentiment <chr> "Positive", "Positive", "Positive", "Positive", "Positive", ~
## $ content <chr> "im getting on borderlands and i will murder you all ,", "I ~
There are 4 variables; the target is sentiment.
tweet_id
It isn’t clear what this variable is. Let’s take a look at one example tweet_id.
twitter_sentiment%>%filter(tweet_id==2403)
## # A tibble: 6 x 4
## tweet_id entity sentiment content
## <dbl> <chr> <chr> <chr>
## 1 2403 Borderlands Neutral Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM~
## 2 2403 Borderlands Neutral Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM~
## 3 2403 Borderlands Neutral Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM~
## 4 2403 Borderlands Neutral Rock-Hard La Vita, RARE BUT POWERFUL, HANDSOME~
## 5 2403 Borderlands Neutral Live Rock - Hard music La la Varlope, RARE & t~
## 6 2403 Borderlands Neutral I-Hard like me, RARE LONDON DE, HANDSOME 2011,~
It seems like tweet_id could identify the Twitter user rather than a single tweet. The rows sharing an id don’t all appear to come from the same tweet, since they are close copies of one another but not identical. I will start off by assuming that tweet_id is the ID of the Twitter user.
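As a quick sanity check on that assumption (an extra sketch on my part, not output from the original analysis), we can compare the number of rows to the number of distinct ids:
# sketch: if n_ids is much smaller than n_rows, each id groups several
# near-duplicate tweets, consistent with an id per user (or per prompt)
# rather than per tweet
twitter_sentiment %>%
  summarise(
    n_rows      = n(),
    n_ids       = n_distinct(tweet_id),
    n_id_entity = n_distinct(tweet_id, entity)
  )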
entity
In this case, entity probably refers to the product the user is talking about in their tweet. Let’s explore.
ggplot(twitter_sentiment, aes(x = as.factor(entity))) +
  geom_bar() +
  scale_x_discrete(guide = guide_axis(angle = 90)) +
  ggtitle('Counts of unique `entity` values')
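If exact counts are easier to read than the bar chart, a quick table works too (an optional sketch, not from the original post):
# sketch: tabulate the most frequent entities
twitter_sentiment %>%
  count(entity, sort = TRUE) %>%
  head(10)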
sentiment
This variable is our target: the labeled sentiment of the tweets we will be modeling. Let’s look at its distribution.
ggplot(twitter_sentiment, aes(x = as.factor(sentiment))) +
  geom_bar() +
  scale_x_discrete(guide = guide_axis(angle = 90)) +
  ggtitle('Distribution of Sentiment')
To keep this prediction simple, I will keep only the positive and negative tweets.
twitter_sentiment <- twitter_sentiment %>%
  filter(sentiment %in% c('Positive', 'Negative'))
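A quick count of the two remaining classes (another optional sketch) shows how balanced they are, which is useful context for the downsampling step in the recipe later on:
# sketch: check class balance after filtering
twitter_sentiment %>%
  count(sentiment)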
content
This looks about right; it resembles a chi-square distribution with no major gaps.
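The plot being described isn’t shown above; assuming it refers to the distribution of tweet lengths, a minimal sketch to reproduce it could look like this (binwidth is an arbitrary choice):
# sketch: distribution of tweet lengths in characters
twitter_sentiment %>%
  mutate(content_length = nchar(content)) %>%
  ggplot(aes(content_length)) +
  geom_histogram(binwidth = 10) +
  labs(x = 'Characters per tweet', y = 'Count')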
Let’s make a word cloud to visualize the most common words used.
library(tidytext)
library(wordcloud)
words <- twitter_sentiment %>%
  unnest_tokens(word, content)

words %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
Let’s start preparing our data for modeling by splitting it into training and test sets.
library(tidymodels)
set.seed(123)
twit_split<-initial_split(twitter_sentiment, strata = sentiment)
twit_train<-training(twit_split)
twit_test<- testing(twit_split)
library(tidylo)
content_log_odds <-
  twitter_sentiment %>%
  unnest_tokens(word, content) %>%
  count(sentiment, word, sort = TRUE) %>%
  bind_log_odds(sentiment, word, n)
content_log_odds %>%
  group_by(sentiment) %>%
  slice_max(log_odds_weighted, n = 10) %>%
  ungroup() %>%
  ggplot(aes(log_odds_weighted,
             fct_reorder(word, log_odds_weighted),
             fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = 'free_y') +
  labs(x = 'Log odds (weighted)', y = NULL)
These words make me feel like we are on the right track! Love has the highest weighted log odds among positive tweets, and fix among negative tweets. Colorful language in the negative class aside, I feel like we can all relate to verizonfios being on the list…

Next, let’s create cross-validation folds from the training data so we can tune the model.
set.seed(234)
twit_folds<-vfold_cv(twit_train, strata = sentiment)
twit_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 x 2
## splits id
## <list> <chr>
## 1 <split [28977/3221]> Fold01
## 2 <split [28977/3221]> Fold02
## 3 <split [28977/3221]> Fold03
## 4 <split [28977/3221]> Fold04
## 5 <split [28979/3219]> Fold05
## 6 <split [28979/3219]> Fold06
## 7 <split [28979/3219]> Fold07
## 8 <split [28979/3219]> Fold08
## 9 <split [28979/3219]> Fold09
## 10 <split [28979/3219]> Fold10
I will use a recipe to transform the text data into features for modeling.
library(themis)
library(textrecipes)
twit_rec <-
  recipe(sentiment ~ content, data = twit_train) %>%
  step_tokenize(content) %>%
  step_tokenfilter(content, max_tokens = 200) %>%
  step_tfidf(content) %>%
  step_downsample(sentiment)
twit_rec
## Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Operations:
##
## Tokenization for content
## Text filtering for content
## Term frequency-inverse document frequency with content
## Down-sampling based on sentiment
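To see what the recipe actually produces, we can prep it on the training data and bake it; this peek is an extra sketch, not part of the original workflow.
# sketch: estimate the recipe steps and inspect the resulting tf-idf features
twit_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()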
Now I will create the model specification for a lasso model, leaving the penalty to be tuned.
multi_spec <-
  multinom_reg(penalty = tune(), mixture = 1) %>%
  set_mode('classification') %>%
  set_engine('glmnet')
multi_spec
## Multinomial Regression Model Specification (classification)
##
## Main Arguments:
## penalty = tune()
## mixture = 1
##
## Computational engine: glmnet
twit_wf<-workflow(twit_rec, multi_spec)
twit_wf
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: multinom_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 4 Recipe Steps
##
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
## * step_downsample()
##
## -- Model -----------------------------------------------------------------------
## Multinomial Regression Model Specification (classification)
##
## Main Arguments:
## penalty = tune()
## mixture = 1
##
## Computational engine: glmnet
Now I will tune the model over a regular grid to find the best penalty.
twit_grid <- grid_regular(penalty(range = c(-2, 0)), levels = 10)

set.seed(2022)
twit_rs <-
  tune_grid(
    twit_wf,
    twit_folds,
    grid = twit_grid
  )
twit_rs
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 x 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [28977/3221]> Fold01 <tibble [20 x 5]> <tibble [0 x 3]>
## 2 <split [28977/3221]> Fold02 <tibble [20 x 5]> <tibble [0 x 3]>
## 3 <split [28977/3221]> Fold03 <tibble [20 x 5]> <tibble [0 x 3]>
## 4 <split [28977/3221]> Fold04 <tibble [20 x 5]> <tibble [0 x 3]>
## 5 <split [28979/3219]> Fold05 <tibble [20 x 5]> <tibble [0 x 3]>
## 6 <split [28979/3219]> Fold06 <tibble [20 x 5]> <tibble [0 x 3]>
## 7 <split [28979/3219]> Fold07 <tibble [20 x 5]> <tibble [0 x 3]>
## 8 <split [28979/3219]> Fold08 <tibble [20 x 5]> <tibble [0 x 3]>
## 9 <split [28979/3219]> Fold09 <tibble [20 x 5]> <tibble [0 x 3]>
## 10 <split [28979/3219]> Fold10 <tibble [20 x 5]> <tibble [0 x 3]>
autoplot(twit_rs)
It looks like the best penalty is around 0.01. Let’s look at the best models:
show_best(twit_rs)
## # A tibble: 5 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.01 roc_auc binary 0.813 10 0.00294 Preprocessor1_Model01
## 2 0.0167 roc_auc binary 0.800 10 0.00332 Preprocessor1_Model02
## 3 0.0278 roc_auc binary 0.765 10 0.00286 Preprocessor1_Model03
## 4 0.0464 roc_auc binary 0.645 10 0.00369 Preprocessor1_Model04
## 5 0.0774 roc_auc binary 0.5 10 0 Preprocessor1_Model05
I will use the “one standard error” rule to select the simplest model within one standard error of the numerically best model, since a simpler model is often preferable.
final_penalty <-
twit_rs %>%
select_by_one_std_err(metric = "roc_auc", desc(penalty))
final_penalty
## # A tibble: 1 x 9
## penalty .metric .estimator mean n std_err .config .best .bound
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.01 roc_auc binary 0.813 10 0.00294 Preprocessor1_Mod~ 0.813 0.810
final_rs <-
twit_wf %>%
finalize_workflow(final_penalty) %>%
last_fit(twit_split)
collect_metrics(final_rs)
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.739 Preprocessor1_Model1
## 2 roc_auc binary 0.815 Preprocessor1_Model1
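To see which tokens drive these predictions, we can pull the fitted glmnet model out of the final workflow and look at the largest coefficients (an extra sketch, assuming parsnip’s tidy() method for glmnet fits):
# sketch: largest-magnitude coefficients from the final fitted model
final_rs %>%
  extract_workflow() %>%
  extract_fit_parsnip() %>%
  tidy() %>%
  filter(term != '(Intercept)') %>%
  slice_max(abs(estimate), n = 20)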
Let’s visualize the per-class performance with a confusion matrix.
collect_predictions(final_rs) %>%
  conf_mat(sentiment, .pred_class) %>%
  autoplot()
Interpret this plot by reading down each column along the ‘Truth’ axis: the boxes in a column show how many tweets of that true class were predicted as each class.
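The same predictions can also be summarised as class-level metrics (an optional sketch; yardstick treats the first factor level, here Negative, as the event by default):
# sketch: sensitivity, specificity, and precision on the test-set predictions
class_metrics <- metric_set(sensitivity, specificity, precision)
collect_predictions(final_rs) %>%
  class_metrics(truth = sentiment, estimate = .pred_class)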
Now let’s look at the ROC curve.
collect_predictions(final_rs) %>%
  roc_curve(truth = sentiment, .pred_Negative) %>%
  ggplot(aes(1 - specificity, sensitivity)) +
  geom_abline(slope = 1, color = 'gray', lty = 2) +
  geom_path(size = 1.5, alpha = 0.7) +
  labs(color = NULL) +
  coord_fixed()
Our ROC curve is looking good. On the held-out test set, we classify tweets as positive or negative with about 74% accuracy and an ROC AUC of about 0.82!