For this project, we will be using a dataset of coronavirus-related tweets. The full dataset is available here: https://www.kaggle.com/datatattle/covid-19-nlp-text-classification?select=Corona_NLP_train.csv
Note that these are actual tweets, so some of them may contain offensive words or language.
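The analysis below leans on several packages. A minimal setup chunk might look like the following; the exact list is inferred from the functions used later in this document, so treat it as an assumption rather than the original setup:
library(readr)        # read_csv()
library(dplyr)        # anti_join(), count(), mutate()
library(stringr)      # str_replace_all()
library(tidytext)     # unnest_tokens(), cast_dtm(), stop_words, tidy() for LDA objects
library(textstem)     # lemmatize_words()
library(tm)           # weightTf, weightTfIdf, removeSparseTerms()
library(topicmodels)  # LDA(), perplexity()
library(randomForest) # randomForest()
library(caret)        # confusionMatrix()
library(ggplot2)      # ggplot()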
The following R chunk reads the data:
tweet=read_csv("https://unh.box.com/shared/static/b3iz7j2d33ju568vi0yc7842s01pypjg.csv")
## Rows: 41157 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): TweetAt, OriginalTweet, Sentiment
## dbl (2): UserName, ScreenName
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(tweet)
## # A tibble: 6 × 5
## UserName ScreenName TweetAt OriginalTweet Sentiment
## <dbl> <dbl> <chr> <chr> <chr>
## 1 3799 48751 16-03-2020 "@MeNyrbie @Phil_Gahan @Chrisitv h… Neutral
## 2 3800 48752 16-03-2020 "advice Talk to your neighbours fa… Positive
## 3 3801 48753 16-03-2020 "Coronavirus Australia: Woolworths… Positive
## 4 3802 48754 16-03-2020 "My food stock is not the only one… Positive
## 5 3803 48755 16-03-2020 "Me, ready to go at supermarket du… Extremely …
## 6 3804 48756 16-03-2020 "As news of the region\u0092s firs… Positive
nrow(tweet)
## [1] 41157
set.seed(123)
index = sample(1:nrow(tweet), size = 10000)
new_tweet = tweet[index,]
nrow(new_tweet)
## [1] 10000
head(new_tweet)
## # A tibble: 6 × 5
## UserName ScreenName TweetAt OriginalTweet Sentiment
## <dbl> <dbl> <chr> <chr> <chr>
## 1 6784 51736 18-03-2020 "\"As the #coronavirus spreads acr… Negative
## 2 33723 78675 6/4/2020 "PRIORITY APPEAL 1 If you or any… Extremely …
## 3 33508 78460 5/4/2020 "@ShalluKapoor2 @GuptaPradeepkr @D… Negative
## 4 41327 86279 10/4/2020 "Just Yesterday in Orlando Warehou… Positive
## 5 6555 51507 18-03-2020 "There is enough for everyone if w… Neutral
## 6 42736 87688 12/4/2020 "Gas prices is looking lovely. I j… Positive
text<- new_tweet$OriginalTweet
#removing non-ASCII characters
text<- gsub('[^\x20-\x7F]', "", text)
#removing retweets
text<- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", text)
#removing mentions
text<- gsub('@\\w+', "", text)
#removing hashtags
text<- str_replace_all(text, '#[a-z,A-Z]*', "")
#replacing "&" with "and"
text<- gsub('&', "and", text)
#removing punctuation
text<- gsub('[[:punct:]]', "", text)
#removing numbers
text<- gsub('[[:digit:]]', "", text)
#removing URLs
text<- gsub('http\\w+', "", text)
#removing runs of tabs
text<- gsub('[\t]{2,}', "", text)
#trimming leading and trailing whitespace
text<- gsub('^\\s+|\\s+$', "", text)
text[1:5]
## [1] "As the spreads across America many workers are being directed to work from home but staff at and are being squeezed to keep up with increasing demand caused by Americans stockpiling food and household products"
## [2] "PRIORITY APPEAL If you or anyone you are self isolating with are NOT in a high risk vulnerable elderly health conditions category PLEASE do not deprive those who are by booking online supermarket shopping slots"
## [3] "We can control Covid effects by adoting ayurvedsocial distancingcleanliness and eating vegetarian foodAvoid panic stay at home and stay safeWe Request To PM"
## [4] "Just Yesterday in Orlando Warehouse Distributed Million Masks Face Shields Shoe Covers Gowns Gloves Containers of Hand Sanitizer Florida is the second highest state in testing for"
## [5] "There is enough for everyone if we all work together UK supermarkets are besieged by customers"
#collapsing repeated spaces into a single space
text<- str_replace_all(text, " +", " ")
#Converting all words to lower case
cleaned_tweet<- tolower(text)
cleaned_tweet[1:5]
## [1] "as the spreads across america many workers are being directed to work from home but staff at and are being squeezed to keep up with increasing demand caused by americans stockpiling food and household products"
## [2] "priority appeal if you or anyone you are self isolating with are not in a high risk vulnerable elderly health conditions category please do not deprive those who are by booking online supermarket shopping slots"
## [3] "we can control covid effects by adoting ayurvedsocial distancingcleanliness and eating vegetarian foodavoid panic stay at home and stay safewe request to pm"
## [4] "just yesterday in orlando warehouse distributed million masks face shields shoe covers gowns gloves containers of hand sanitizer florida is the second highest state in testing for"
## [5] "there is enough for everyone if we all work together uk supermarkets are besieged by customers"
new_tweet['cleaned_tweet'] <- cleaned_tweet
head(new_tweet)
## # A tibble: 6 × 6
## UserName ScreenName TweetAt OriginalTweet Sentiment cleaned_tweet
## <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 6784 51736 18-03-2020 "\"As the #corona… Negative as the spreads …
## 2 33723 78675 6/4/2020 "PRIORITY APPEAL … Extremely… priority appeal …
## 3 33508 78460 5/4/2020 "@ShalluKapoor2 @… Negative we can control c…
## 4 41327 86279 10/4/2020 "Just Yesterday i… Positive just yesterday i…
## 5 6555 51507 18-03-2020 "There is enough … Neutral there is enough …
## 6 42736 87688 12/4/2020 "Gas prices is lo… Positive gas prices is lo…
tweet_count<- new_tweet%>%
unnest_tokens(word, cleaned_tweet)%>%
anti_join(stop_words)%>%
filter(!word %in% c("dont", "im", "rt", "oz", "lol", "loo"))%>%
count(word, sort = TRUE)
## Joining, by = "word"
tweet_count
## # A tibble: 18,268 × 2
## word n
## <chr> <int>
## 1 covid 2485
## 2 prices 1863
## 3 store 1597
## 4 food 1584
## 5 supermarket 1562
## 6 grocery 1478
## 7 people 1356
## 8 consumer 1034
## 9 shopping 801
## 10 online 754
## # … with 18,258 more rows
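Before moving to a document-term matrix, it can help to eyeball the most frequent terms. A quick bar chart of the top 15 words could be drawn as follows (an illustrative sketch, not part of the original output):
tweet_count %>%
  slice_max(n, n = 15) %>%                 # keep the 15 most frequent words
  mutate(word = reorder(word, n)) %>%      # order bars by frequency
  ggplot(aes(word, n)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = NULL, y = "Number of occurrences")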
Next, we build a document-term matrix with cast_dtm(), using UserName as the document identifier and word as the term. Do we need to remove sparse terms? We build the full matrix first and then check.
tweet_sentence <- new_tweet%>%
unnest_tokens(sentence, token = "sentences", input = cleaned_tweet)
tweet_token <- tweet_sentence%>%
unnest_tokens(word, token = "words", input = sentence)%>%
anti_join(stop_words)%>%
mutate(word = lemmatize_words(word, dictionary=lexicon::hash_lemmas))
## Joining, by = "word"
doc_matrix<- tweet_token%>%
count(UserName, word)%>%
cast_dtm(UserName, term = word, value = n, weighting = tm::weightTfIdf)
doc_matrix
## <<DocumentTermMatrix (documents: 9971, terms: 14543)>>
## Non-/sparse entries: 117343/144890910
## Sparsity : 100%
## Maximal term length: 113
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
doc_matrix = removeSparseTerms(doc_matrix, sparse=.99)
doc_matrix
## <<DocumentTermMatrix (documents: 9971, terms: 174)>>
## Non-/sparse entries: 48900/1686054
## Sparsity : 97%
## Maximal term length: 11
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
doc_matrixdf <- as.data.frame(as.matrix(doc_matrix)) # tf-idf weighted term matrix as a data frame
doc_matrixdf2 <- as.matrix((doc_matrixdf > 0) + 0) # binary instance matrix
doc_matrixdf2 <- as.data.frame(doc_matrixdf2)
doc_matrixdf2 <- cbind(doc_matrixdf2, new_tweet$Sentiment[1:9971]) # append sentiment labels from the original data (assumes the DTM rows keep the order of new_tweet)
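One caveat about the cbind() above: it assumes the rows of the document-term matrix are in the same order as new_tweet, even though the matrix ends up with 9,971 documents out of the 10,000 sampled tweets. A more defensive (hypothetical) alternative would be to match the labels on the document names instead:
#hypothetical alternative: align labels by UserName (the DTM document names)
#rather than relying on row order
sent_labels <- new_tweet$Sentiment[match(rownames(doc_matrixdf2),
                                         as.character(new_tweet$UserName))]
#doc_matrixdf2 <- cbind(doc_matrixdf2, Sentiment = sent_labels)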
doc_matrixdf[1:10,]
## family list online shop supply do empty
## 3800 0.2320532 0.2709234 0.1600241 0.1362874 0.1928703 0.0000000 0.0000000
## 3803 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.3903008 0.4761389
## 3805 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3808 0.0000000 0.0000000 0.4267310 0.1817166 0.0000000 0.0000000 0.0000000
## 3814 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3816 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3817 0.0000000 0.0000000 0.1745718 0.1486772 0.0000000 0.0000000 0.0000000
## 3826 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3828 0.0000000 0.0000000 0.0000000 0.0000000 0.3560682 0.0000000 0.0000000
## 3829 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.7142084
## food im panic shortage stock supermarket grocery
## 3800 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3803 0.2374079 0.4361395 0.3446931 0.5365027 0.3430087 0.2146803 0.0000000
## 3805 0.0000000 0.8051806 0.0000000 0.0000000 0.0000000 0.0000000 0.2106583
## 3808 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3814 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.3042842
## 3816 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3817 0.1294952 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.2489598
## 3826 0.0000000 0.3078632 0.2433128 0.0000000 0.0000000 0.0000000 0.0000000
## 3828 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.1981664 0.0000000
## 3829 0.3561118 0.0000000 0.0000000 0.0000000 0.5145130 0.0000000 0.0000000
## share store buy covid fight home spread
## 3800 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.0000000
## 3803 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.0000000
## 3805 0.4628258 0.1937424 0.0000000 0.00000000 0.0000000 0.0000000 0.0000000
## 3808 0.0000000 0.0000000 0.2100936 0.11305566 0.3379917 0.2329672 0.2808169
## 3814 0.0000000 0.2798501 0.0000000 0.00000000 0.0000000 0.0000000 0.0000000
## 3816 0.0000000 0.0000000 0.0000000 0.18500017 0.0000000 0.0000000 0.0000000
## 3817 0.0000000 0.0000000 0.0000000 0.09250009 0.0000000 0.0000000 0.2297593
## 3826 0.0000000 0.0000000 0.0000000 0.11970599 0.0000000 0.2466712 0.0000000
## 3828 0.0000000 0.1937424 0.0000000 0.00000000 0.0000000 0.0000000 0.0000000
## 3829 0.0000000 0.0000000 0.4727107 0.00000000 0.0000000 0.0000000 0.0000000
## stop time avoid line restaurant safe consumer
## 3800 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3803 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3805 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3808 0.2843675 0.2121087 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3814 0.0000000 0.0000000 0.7361457 0.6365482 0.7195742 0.6035304 0.0000000
## 3816 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.2868662
## 3817 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3826 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3828 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 3829 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## report affect impact increase people feel morning
## 3800 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.000000 0.0000000
## 3803 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.000000 0.0000000
## 3805 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.000000 0.0000000
## 3808 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.000000 0.0000000
## 3814 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.000000 0.0000000
## 3816 0.4904345 0.0000000 0.0000000 0.0000000 0.0000000 0.000000 0.0000000
## 3817 0.0000000 0.2915072 0.2306395 0.2308614 0.1385231 0.000000 0.0000000
## 3826 0.0000000 0.0000000 0.0000000 0.0000000 0.1792652 0.369619 0.3856236
## 3828 0.0000000 0.0000000 0.0000000 0.0000000 0.2344237 0.000000 0.0000000
## 3829 0.0000000 0.0000000 0.0000000 0.0000000 0.3809385 0.000000 0.0000000
## stay test update virus close crisis country
## 3800 0.0000000 0.000000 0.0000000 0.0000000 0.00000 0.0000000 0.0000000
## 3803 0.0000000 0.000000 0.0000000 0.0000000 0.00000 0.0000000 0.0000000
## 3805 0.0000000 0.000000 0.0000000 0.0000000 0.00000 0.0000000 0.0000000
## 3808 0.0000000 0.000000 0.0000000 0.0000000 0.00000 0.0000000 0.0000000
## 3814 0.0000000 0.000000 0.0000000 0.0000000 0.00000 0.0000000 0.0000000
## 3816 0.0000000 0.000000 0.0000000 0.0000000 0.00000 0.0000000 0.0000000
## 3817 0.0000000 0.000000 0.0000000 0.0000000 0.00000 0.0000000 0.0000000
## 3826 0.2804602 0.332598 0.3644741 0.3248218 0.00000 0.0000000 0.0000000
## 3828 0.0000000 0.000000 0.0000000 0.0000000 0.39648 0.3580128 0.0000000
## 3829 0.0000000 0.000000 0.0000000 0.0000000 0.00000 0.0000000 0.7209778
## shelf money pandemic customer provide check coronavirus die due market
## 3800 0.0000000 0 0 0 0 0 0 0 0 0
## 3803 0.0000000 0 0 0 0 0 0 0 0 0
## 3805 0.0000000 0 0 0 0 0 0 0 0 0
## 3808 0.0000000 0 0 0 0 0 0 0 0 0
## 3814 0.0000000 0 0 0 0 0 0 0 0 0
## 3816 0.0000000 0 0 0 0 0 0 0 0 0
## 3817 0.0000000 0 0 0 0 0 0 0 0 0
## 3826 0.0000000 0 0 0 0 0 0 0 0 0
## 3828 0.0000000 0 0 0 0 0 0 0 0 0
## 3829 0.6009736 0 0 0 0 0 0 0 0 0
## delivery free issue live local offer come day retail run emergency worker
## 3800 0 0 0 0 0 0 0 0 0 0 0 0
## 3803 0 0 0 0 0 0 0 0 0 0 0 0
## 3805 0 0 0 0 0 0 0 0 0 0 0 0
## 3808 0 0 0 0 0 0 0 0 0 0 0 0
## 3814 0 0 0 0 0 0 0 0 0 0 0 0
## 3816 0 0 0 0 0 0 0 0 0 0 0 0
## 3817 0 0 0 0 0 0 0 0 0 0 0 0
## 3826 0 0 0 0 0 0 0 0 0 0 0 0
## 3828 0 0 0 0 0 0 0 0 0 0 0 0
## 3829 0 0 0 0 0 0 0 0 0 0 0 0
## continue demand global month price bad paper toilet wash youre bank
## 3800 0 0 0 0 0 0 0 0 0 0 0
## 3803 0 0 0 0 0 0 0 0 0 0 0
## 3805 0 0 0 0 0 0 0 0 0 0 0
## 3808 0 0 0 0 0 0 0 0 0 0 0
## 3814 0 0 0 0 0 0 0 0 0 0 0
## 3816 0 0 0 0 0 0 0 0 0 0 0
## 3817 0 0 0 0 0 0 0 0 0 0 0
## 3826 0 0 0 0 0 0 0 0 0 0 0
## 3828 0 0 0 0 0 0 0 0 0 0 0
## 3829 0 0 0 0 0 0 0 0 0 0 0
## include lose support government item amid hour news hope low plan business
## 3800 0 0 0 0 0 0 0 0 0 0 0 0
## 3803 0 0 0 0 0 0 0 0 0 0 0 0
## 3805 0 0 0 0 0 0 0 0 0 0 0 0
## 3808 0 0 0 0 0 0 0 0 0 0 0 0
## 3814 0 0 0 0 0 0 0 0 0 0 0 0
## 3816 0 0 0 0 0 0 0 0 0 0 0 0
## 3817 0 0 0 0 0 0 0 0 0 0 0 0
## 3826 0 0 0 0 0 0 0 0 0 0 0 0
## 3828 0 0 0 0 0 0 0 0 0 0 0 0
## 3829 0 0 0 0 0 0 0 0 0 0 0 0
## staff hand world distance social essential house read job call product
## 3800 0 0 0 0 0 0 0 0 0 0 0
## 3803 0 0 0 0 0 0 0 0 0 0 0
## 3805 0 0 0 0 0 0 0 0 0 0 0
## 3808 0 0 0 0 0 0 0 0 0 0 0
## 3814 0 0 0 0 0 0 0 0 0 0 0
## 3816 0 0 0 0 0 0 0 0 0 0 0
## 3817 0 0 0 0 0 0 0 0 0 0 0
## 3826 0 0 0 0 0 0 0 0 0 0 0
## 3828 0 0 0 0 0 0 0 0 0 0 0
## 3829 0 0 0 0 0 0 0 0 0 0 0
## scam service community friend protection public care health take company
## 3800 0 0 0 0 0 0 0 0 0 0
## 3803 0 0 0 0 0 0 0 0 0 0
## 3805 0 0 0 0 0 0 0 0 0 0
## 3808 0 0 0 0 0 0 0 0 0 0
## 3814 0 0 0 0 0 0 0 0 0 0
## 3816 0 0 0 0 0 0 0 0 0 0
## 3817 0 0 0 0 0 0 0 0 0 0
## 3826 0 0 0 0 0 0 0 0 0 0
## 3828 0 0 0 0 0 0 0 0 0 0
## 3829 0 0 0 0 0 0 0 0 0 0
## sell week american response rise hoard start drop roll uk driver march
## 3800 0 0 0 0 0 0 0 0 0 0 0 0
## 3803 0 0 0 0 0 0 0 0 0 0 0 0
## 3805 0 0 0 0 0 0 0 0 0 0 0 0
## 3808 0 0 0 0 0 0 0 0 0 0 0 0
## 3814 0 0 0 0 0 0 0 0 0 0 0 0
## 3816 0 0 0 0 0 0 0 0 0 0 0 0
## 3817 0 0 0 0 0 0 0 0 0 0 0 0
## 3826 0 0 0 0 0 0 0 0 0 0 0 0
## 3828 0 0 0 0 0 0 0 0 0 0 0 0
## 3829 0 0 0 0 0 0 0 0 0 0 0 0
## shopper real sanitizer deal leave lockdown learn situation employee
## 3800 0 0 0 0 0 0 0 0 0
## 3803 0 0 0 0 0 0 0 0 0
## 3805 0 0 0 0 0 0 0 0 0
## 3808 0 0 0 0 0 0 0 0 0
## 3814 0 0 0 0 0 0 0 0 0
## 3816 0 0 0 0 0 0 0 0 0
## 3817 0 0 0 0 0 0 0 0 0
## 3826 0 0 0 0 0 0 0 0 0
## 3828 0 0 0 0 0 0 0 0 0
## 3829 0 0 0 0 0 0 0 0 0
## measure hard life outbreak industry change daily limit amp protect chain
## 3800 0 0 0 0 0 0 0 0 0 0 0
## 3803 0 0 0 0 0 0 0 0 0 0 0
## 3805 0 0 0 0 0 0 0 0 0 0 0
## 3808 0 0 0 0 0 0 0 0 0 0 0
## 3814 0 0 0 0 0 0 0 0 0 0 0
## 3816 0 0 0 0 0 0 0 0 0 0 0
## 3817 0 0 0 0 0 0 0 0 0 0 0
## 3826 0 0 0 0 0 0 0 0 0 0 0
## 3828 0 0 0 0 0 0 0 0 0 0 0
## 3829 0 0 0 0 0 0 0 0 0 0 0
## lot tell spend clean cut reduce deliver fall retailer sale drive wait hit
## 3800 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3803 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3805 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3808 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3814 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3816 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3817 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3826 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3828 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3829 0 0 0 0 0 0 0 0 0 0 0 0 0
## mask post vulnerable economy meet help risk pay person ive glove nurse
## 3800 0 0 0 0 0 0 0 0 0 0 0 0
## 3803 0 0 0 0 0 0 0 0 0 0 0 0
## 3805 0 0 0 0 0 0 0 0 0 0 0 0
## 3808 0 0 0 0 0 0 0 0 0 0 0 0
## 3814 0 0 0 0 0 0 0 0 0 0 0 0
## 3816 0 0 0 0 0 0 0 0 0 0 0 0
## 3817 0 0 0 0 0 0 0 0 0 0 0 0
## 3826 0 0 0 0 0 0 0 0 0 0 0 0
## 3828 0 0 0 0 0 0 0 0 0 0 0 0
## 3829 0 0 0 0 0 0 0 0 0 0 0 0
## wear queue behavior gas oil purchase expect shift medical
## 3800 0 0 0 0 0 0 0 0 0
## 3803 0 0 0 0 0 0 0 0 0
## 3805 0 0 0 0 0 0 0 0 0
## 3808 0 0 0 0 0 0 0 0 0
## 3814 0 0 0 0 0 0 0 0 0
## 3816 0 0 0 0 0 0 0 0 0
## 3817 0 0 0 0 0 0 0 0 0
## 3826 0 0 0 0 0 0 0 0 0
## 3828 0 0 0 0 0 0 0 0 0
## 3829 0 0 0 0 0 0 0 0 0
set.seed(123)
s <- sample(1:nrow(doc_matrixdf2), nrow(doc_matrixdf2)*(0.70), replace = FALSE) # random sampling
trainData <- doc_matrixdf2[s,] # training set
testData <- doc_matrixdf2[-s,] # testing set
#note: trainData[-10] drops only the 10th column, so the appended sentiment
#column (the last column) is still among the predictors
rf_tweet=randomForest(x=trainData[-10],
y=factor(trainData$`new_tweet$Sentiment[1:9971]`),
ntree=30)
rf_tweet
##
## Call:
## randomForest(x = trainData[-10], y = factor(trainData$`new_tweet$Sentiment[1:9971]`), ntree = 30)
## Type of random forest: classification
## Number of trees: 30
## No. of variables tried at each split: 13
##
## OOB estimate of error rate: 3.83%
## Confusion matrix:
## Extremely Negative Extremely Positive Negative Neutral
## Extremely Negative 850 62 25 3
## Extremely Positive 59 1015 41 10
## Negative 9 18 1629 10
## Neutral 1 5 16 1304
## Positive 0 0 2 3
## Positive class.error
## Extremely Negative 1 0.096705632
## Extremely Positive 0 0.097777778
## Negative 1 0.022795441
## Neutral 1 0.017332329
## Positive 1914 0.002605524
tweet_pred=predict(rf_tweet, newdata=testData)
confusionMatrix(reference=factor(testData$`new_tweet$Sentiment[1:9971]`), tweet_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Extremely Negative Extremely Positive Negative Neutral
## Extremely Negative 377 9 0 0
## Extremely Positive 10 448 1 0
## Negative 7 7 718 0
## Neutral 1 0 1 568
## Positive 0 0 0 0
## Reference
## Prediction Positive
## Extremely Negative 0
## Extremely Positive 0
## Negative 0
## Neutral 0
## Positive 845
##
## Overall Statistics
##
## Accuracy : 0.988
## 95% CI : (0.9834, 0.9916)
## No Information Rate : 0.2824
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9847
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Extremely Negative Class: Extremely Positive
## Sensitivity 0.9544 0.9655
## Specificity 0.9965 0.9956
## Pos Pred Value 0.9767 0.9760
## Neg Pred Value 0.9931 0.9937
## Prevalence 0.1320 0.1551
## Detection Rate 0.1260 0.1497
## Detection Prevalence 0.1290 0.1534
## Balanced Accuracy 0.9755 0.9806
## Class: Negative Class: Neutral Class: Positive
## Sensitivity 0.9972 1.0000 1.0000
## Specificity 0.9938 0.9992 1.0000
## Pos Pred Value 0.9809 0.9965 1.0000
## Neg Pred Value 0.9991 1.0000 1.0000
## Prevalence 0.2406 0.1898 0.2824
## Detection Rate 0.2400 0.1898 0.2824
## Detection Prevalence 0.2447 0.1905 0.2824
## Balanced Accuracy 0.9955 0.9996 1.0000
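To see which terms the forest relies on most, we could also inspect variable importance. This was not part of the original output; the sketch below uses randomForest's built-in Gini importance:
imp <- importance(rf_tweet)                              # MeanDecreaseGini per term
head(imp[order(imp[, 1], decreasing = TRUE), , drop = FALSE], 10)
varImpPlot(rf_tweet, n.var = 15, main = "Top 15 terms by importance")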
For topic modeling, we rebuild the document-term matrix with cast_dtm(), this time using raw term-frequency weighting (weighting = tm::weightTf), since LDA() expects counts rather than tf-idf weights.
TF_matrix<- tweet_token%>%
count(UserName, word)%>%
cast_dtm(UserName, term = word, value = n, weighting = tm::weightTf)
TF_matrix
## <<DocumentTermMatrix (documents: 9971, terms: 14543)>>
## Non-/sparse entries: 117343/144890910
## Sparsity : 100%
## Maximal term length: 113
## Weighting : term frequency (tf)
LDA_model = LDA(TF_matrix, k=6, control = list(seed=123))
LDA_model
## A LDA_VEM topic model with 6 topics.
top10_words = tidy(LDA_model, matrix="beta")
top10_words
## # A tibble: 87,258 × 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 account 0.000225
## 2 2 account 0.0000474
## 3 3 account 0.000481
## 4 4 account 0.000356
## 5 5 account 0.0000555
## 6 6 account 0.000394
## 7 1 adequate 0.00000668
## 8 2 adequate 0.0000194
## 9 3 adequate 0.000191
## 10 4 adequate 0.0000258
## # … with 87,248 more rows
top5_topics = top10_words %>%
group_by(topic) %>%
top_n(10, beta)%>%
ungroup() %>%
arrange(topic, beta)
top5_topics %>%
mutate(term=reorder(term, beta)) %>%
ggplot(aes(term, beta, fill=factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~topic, scales = "free") +
coord_flip()
gamma_matrix = tidy(LDA_model, matrix="gamma")
gamma_matrix
## # A tibble: 59,826 × 3
## document topic gamma
## <chr> <int> <dbl>
## 1 3800 1 0.163
## 2 3803 1 0.169
## 3 3805 1 0.169
## 4 3808 1 0.170
## 5 3814 1 0.165
## 6 3816 1 0.168
## 7 3817 1 0.172
## 8 3826 1 0.167
## 9 3828 1 0.167
## 10 3829 1 0.165
## # … with 59,816 more rows
gamma_matrix%>%
filter(document==3800)
## # A tibble: 6 × 3
## document topic gamma
## <chr> <int> <dbl>
## 1 3800 1 0.163
## 2 3800 2 0.160
## 3 3800 3 0.171
## 4 3800 4 0.166
## 5 3800 5 0.179
## 6 3800 6 0.161
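The gamma matrix can also be collapsed to a single dominant topic per tweet. A small sketch (not part of the original analysis) using dplyr:
#assign each document (UserName) to its highest-probability topic
dominant_topic <- gamma_matrix %>%
  group_by(document) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup()
head(dominant_topic)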
samp_size=floor(.9*nrow(TF_matrix))
set.seed(1234)
train_i=sample(nrow(TF_matrix), size=samp_size)
train_lda=TF_matrix[train_i,]
test_lda=TF_matrix[-train_i,]
values=c()
#fit an LDA model for each k from 2 to 10 on the training matrix and
#record the perplexity on the held-out test matrix
for(i in c(2:10)){
lda_model2=LDA(train_lda, k=i, control=list(seed=123))
values =c(values, perplexity(lda_model2, newdata=test_lda))
}
plot(c(2:10), values, main="Perplexity for k",
xlab="Number of Topics", ylab="Perplexity")