library(e1071)   # naiveBayes()
library(dplyr)   # case_when() and general data wrangling
library(ggplot2) # plotting
library(gmodels) # CrossTable()
library(psych)   # descriptive statistics
library(tm)      # text mining: corpora and document-term matrices
youtube <- read.csv("tut3b.youtube.csv", header = TRUE)
## Preparing the outcome variable
# Preparing the outcome variable
# 1. Recoding to a binary/dummy variable
youtube$anger_dummy <- case_when((youtube$anger > 0) ~ 1, TRUE ~ 0)
# 2. Transforming to a factor
# Factors are used to categorize and store data; their string or numeric values
# correspond to levels within a dataset.
# They're used to represent categorical variables - in our case, the classes.
youtube$anger_dummy <- factor(youtube$anger_dummy)
table(youtube$anger_dummy)
##
## 0 1
## 6446 3187
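Since the explanation above leans on the idea of levels, a quick optional check (output not shown) makes it concrete:
levels(youtube$anger_dummy)            # the two levels: "0" and "1"
prop.table(table(youtube$anger_dummy)) # class shares, roughly 0.67 / 0.33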
# Building a corpus from the cleaned comment text
ytcorpus <- VCorpus(VectorSource(youtube$cleaner_text))
inspect(ytcorpus[1:5])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 5
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 193
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 80
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 44
##
## [[4]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 25
##
## [[5]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 10
print(ytcorpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 9633
# Looking at some comments in the corpus
as.character(ytcorpus[[1200]])
## [1] "democrats republicans unite standing ovation juan guaido people hate guy venezuela check https skeptical youtubewnwspqlo"
as.character(ytcorpus[[2222]])
## [1] "trump orator skills average middle schooler honesty grave thieves feaking lies https skeptical wcnncomfebruary fourth thousand twentypoliticsfactchecktrumpstateoftheunionindexhtml"
as.character(ytcorpus[[3399]])
## [1] "stupidly wrong president millions americans private health care happy underinsured straddled thousands dollars copayments deductables fiddle healthcare country treating healthcare capitalist venture bound millions americans fall cracks bankrupt medical condition economy steroids falls steeper recession president precious stock market falls millions healthcare straddled college debt president economy inevitably runs steroids playing golf mansion grandma bankrupt cancer medicare tulsi gabbard president"
as.character(ytcorpus[[6699]])
## [1] "lies clown president"
as.character(ytcorpus[[6817]])
## [1] "holland netherlands problemthe majority lies statistics middle class shrinking country millionaires record increases salaries prescription drugs continued excuse tongue sticking lode insurance companies deny people preexisting conditions obamathe truth admin middle court battle insurance companies deny people preexisting conditions half country despise lie filled speech true onreally talks god religion church goer pronounce books bible racist companies tons lawsuits paying bills wife found lady multiple payoffs mistresses talk affairs running president documents real yea honest honorable yea speaker woman shaking head shocked half country insane amount pure bull shit spewing disgusting mouth"
# Create Document-Term Matrix
yt_dtm <- DocumentTermMatrix(ytcorpus)
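Before splitting, it can help to glance at the matrix just built; a minimal peek (output not shown) using tm's inspect():
# Rows are comments, columns are terms, cells hold term frequencies
dim(yt_dtm)            # 9633 documents by a large, sparse vocabulary
inspect(yt_dtm[1:3, ]) # the first three documents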
# This will split the data 3:1 (Training:Testing)
train_proportion <- 0.75
test_proportion <- 1 - train_proportion
# Calculate the sizes based on proportions
total_size <- nrow(yt_dtm)
train_size <- round(train_proportion * total_size)
test_size <- total_size - train_size
# Generate random indices for train and test sets
set.seed(123) # Set seed for reproducibility
indices <- sample(nrow(yt_dtm)) # Create a shuffled sequence of row indices
# Split the data based on the shuffled indices and calculated sizes
train_indices <- indices[1:train_size]
test_indices <- indices[(train_size + 1):(train_size + test_size)]
# Create train and test dataframes using the selected indices
train <- yt_dtm[train_indices, ]
test <- yt_dtm[test_indices, ]
# Pulling the outcome labels (0 and 1) for the corresponding train and test rows
trainlabs <- youtube[train_indices, ]$anger_dummy
testlabs <- youtube[test_indices, ]$anger_dummy
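A small sanity check on the split is sketched below (output not shown): the two index sets should be disjoint and together cover every document.
length(intersect(train_indices, test_indices)) == 0        # disjoint: TRUE
length(train_indices) + length(test_indices) == total_size # exhaustive: TRUE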
# The resulting datasets' structures
str(train)
## List of 6
## $ i : int [1:68104] 869 869 869 869 869 869 869 869 869 869 ...
## $ j : int [1:68104] 432 841 1415 1460 2915 7956 8057 8572 8651 8971 ...
## $ v : num [1:68104] 1 1 1 1 1 1 1 1 1 1 ...
## $ nrow : int 7225
## $ ncol : int 15392
## $ dimnames:List of 2
## ..$ Docs : chr [1:7225] "2463" "2511" "8718" "2986" ...
## ..$ Terms: chr [1:15392] "aamazing" "aaplauing" "aaron" "aases" ...
## - attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
## - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
str(test)
## List of 6
## $ i : int [1:23003] 993 993 993 1969 1969 1969 1969 1969 1969 1969 ...
## $ j : int [1:23003] 1536 8971 11108 2701 3446 3816 8501 9697 9849 12266 ...
## $ v : num [1:23003] 1 1 1 1 1 1 1 1 1 1 ...
## $ nrow : int 2408
## $ ncol : int 15392
## $ dimnames:List of 2
## ..$ Docs : chr [1:2408] "7094" "3328" "406" "4533" ...
## ..$ Terms: chr [1:15392] "aamazing" "aaplauing" "aaron" "aases" ...
## - attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
## - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
# Check if the anger_dummy distributions are similar in both datasets
summarytools::freq(trainlabs)
## Frequencies
## trainlabs
## Type: Factor
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 0 4828 66.82 66.82 66.82 66.82
## 1 2397 33.18 100.00 33.18 100.00
## <NA> 0 0.00 100.00
## Total 7225 100.00 100.00 100.00 100.00
summarytools::freq(testlabs)
## Frequencies
## testlabs
## Type: Factor
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 0 1618 67.19 67.19 67.19 67.19
## 1 790 32.81 100.00 32.81 100.00
## <NA> 0 0.00 100.00
## Total 2408 100.00 100.00 100.00 100.00
# Setting the bar: only words that appear 5 times or more in the training data will be included in the analysis
yt_freq_words <- findFreqTerms(train, 5)
str(yt_freq_words)
## chr [1:2811] "aaron" "abc" "ability" "abortion" "abraham" "absolute" ...
# Keep only the frequent terms in both datasets (shrinking the vocabulary from 15392 to 2811 terms)
train_freq <- train[, yt_freq_words]
test_freq <- test[, yt_freq_words]
# A function to create a dummy for whether a word appears in a comment
wordhits <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
train <- apply(train_freq, MARGIN = 2, wordhits)
test <- apply(test_freq, MARGIN = 2, wordhits)
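e1071's naiveBayes() treats non-numeric features as categorical, which is why the counts are recoded to "Yes"/"No" strings above. A quick optional peek at the recoded matrix (output not shown):
train[1:3, 1:5] # every cell is now "Yes" or "No" rather than a count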
# Create the model
yt_bayes <- e1071::naiveBayes(train, trainlabs)
# Test the model
yt_predict <- predict(yt_bayes, test)
summary(yt_bayes)
## Length Class Mode
## apriori 2 table numeric
## tables 2811 -none- list
## levels 2 -none- character
## isnumeric 2811 -none- logical
## call 3 -none- call
# Raw class counts behind the prior
yt_bayes$apriori
## trainlabs
## 0 1
## 4828 2397
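The apriori slot holds raw class counts; dividing by their total gives the prior probabilities P(class) that the classifier combines with the per-word likelihoods. They should match the 66.82%/33.18% split of the training labels seen earlier (output not shown):
yt_bayes$apriori / sum(yt_bayes$apriori) # priors: ~0.668 and ~0.332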
# A random sample of 10 per-word conditional probability tables
yt_bayes$tables[sample(length(yt_bayes$tables), 10)]
## $house
## house
## trainlabs No Yes
## 0 0.992543496 0.007456504
## 1 0.971631206 0.028368794
##
## $poison
## poison
## trainlabs No Yes
## 0 1.000000000 0.000000000
## 1 0.996245307 0.003754693
##
## $wrote
## wrote
## trainlabs No Yes
## 0 0.9993786247 0.0006213753
## 1 0.9970796829 0.0029203171
##
## $swamp
## swamp
## trainlabs No Yes
## 0 0.998342999 0.001657001
## 1 0.997496871 0.002503129
##
## $jfk
## jfk
## trainlabs No Yes
## 0 0.9995857498 0.0004142502
## 1 0.9991656237 0.0008343763
##
## $light
## light
## trainlabs No Yes
## 0 0.9991714996 0.0008285004
## 1 0.9974968711 0.0025031289
##
## $denials
## denials
## trainlabs No Yes
## 0 1.000000000 0.000000000
## 1 0.998331247 0.001668753
##
## $jerry
## jerry
## trainlabs No Yes
## 0 0.9985501243 0.0014498757
## 1 0.9991656237 0.0008343763
##
## $minorities
## minorities
## trainlabs No Yes
## 0 0.997307374 0.002692626
## 1 0.996662495 0.003337505
##
## $devil
## devil
## trainlabs No Yes
## 0 1.000000000 0.000000000
## 1 0.993742178 0.006257822
# Some summary stats, based on the confusion matrix for the first model:
conf_matrix1 <- table(yt_predict, testlabs)
conf_matrix1
## testlabs
## yt_predict 0 1
## 0 1498 404
## 1 120 386
conf_matrix1[2, 2] / sum(conf_matrix1[2, ]) # Precision (TP / predicted positives)
## [1] 0.7628458
conf_matrix1[1, 1] / sum(conf_matrix1[1, ]) # Negative predictive value (TN / predicted negatives)
## [1] 0.787592
conf_matrix1[2, 2] / sum(conf_matrix1[, 2]) # Sensitivity/recall (TP / actual positives)
## [1] 0.4886076
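A few more figures can be sketched from the same confusion matrix (output not shown): overall accuracy, the true specificity, and the F1 score - the harmonic mean of precision and sensitivity.
sum(diag(conf_matrix1)) / sum(conf_matrix1) # Accuracy
conf_matrix1[1, 1] / sum(conf_matrix1[, 1]) # Specificity (TN / actual negatives)
prec1 <- conf_matrix1[2, 2] / sum(conf_matrix1[2, ])
sens1 <- conf_matrix1[2, 2] / sum(conf_matrix1[, 2])
2 * prec1 * sens1 / (prec1 + sens1)         # F1 score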
# Improve the model with Laplace smoothing - replacing the zero probabilities
# assigned to word/class pairs never seen in training
# Create the model
yt_bayes2 <- e1071::naiveBayes(train, trainlabs, laplace = 1)
yt_predict2 <- predict(yt_bayes2, test)
summary(yt_bayes2)
## Length Class Mode
## apriori 2 table numeric
## tables 2811 -none- list
## levels 2 -none- character
## isnumeric 2811 -none- logical
## call 4 -none- call
# Raw class counts behind the prior (unchanged by smoothing)
yt_bayes2$apriori
## trainlabs
## 0 1
## 4828 2397
# A random sample of 10 per-word conditional probability tables
yt_bayes2$tables[sample(length(yt_bayes2$tables), 10)]
## $reply
## reply
## trainlabs No Yes
## 0 0.9995857498 0.0008285004
## 1 0.9987484355 0.0020859408
##
## $flynn
## flynn
## trainlabs No Yes
## 0 1.0002071251 0.0002071251
## 1 0.9987484355 0.0020859408
##
## $elect
## elect
## trainlabs No Yes
## 0 1.0000000000 0.0004142502
## 1 0.9970796829 0.0037546934
##
## $surprised
## surprised
## trainlabs No Yes
## 0 0.998964374 0.001449876
## 1 0.994576554 0.006257822
##
## $returns
## returns
## trainlabs No Yes
## 0 0.9997928749 0.0006213753
## 1 0.9983312474 0.0025031289
##
## $wife
## wife
## trainlabs No Yes
## 0 0.997721624 0.002692626
## 1 0.991656237 0.009178139
##
## $helps
## helps
## trainlabs No Yes
## 0 0.9997928749 0.0006213753
## 1 0.9979140592 0.0029203171
##
## $taylor
## taylor
## trainlabs No Yes
## 0 0.998964374 0.001449876
## 1 0.998748436 0.002085941
##
## $sean
## sean
## trainlabs No Yes
## 0 1.0000000000 0.0004142502
## 1 0.9987484355 0.0020859408
##
## $peoples
## peoples
## trainlabs No Yes
## 0 0.999378625 0.001035626
## 1 0.996662495 0.004171882
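What laplace = 1 actually did: every word/class count is incremented by one before the conditional probabilities are normalised, so a word never observed in a class gets a small positive probability instead of a hard zero. A rough hand check against the $flynn table above - the word never appears in a class-0 comment, and there are 4828 class-0 training comments:
(0 + 1) / 4828 # ~0.0002071, matching flynn's "Yes" cell for class 0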
# Some summary stats again, based on the confusion matrix for the second model
conf_matrix2 <- table(yt_predict2, testlabs)
conf_matrix2
## testlabs
## yt_predict2 0 1
## 0 1480 292
## 1 138 498
conf_matrix2[2, 2] / sum(conf_matrix2[2, ]) # Precision (TP / predicted positives)
## [1] 0.7830189
conf_matrix2[1, 1] / sum(conf_matrix2[1, ]) # Negative predictive value (TN / predicted negatives)
## [1] 0.8352144
conf_matrix2[2, 2] / sum(conf_matrix2[, 2]) # Sensitivity/recall (TP / actual positives)
## [1] 0.6303797
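Comparing the two models side by side (output not shown): smoothing lifts sensitivity on the angry class from roughly 0.49 to 0.63, and overall accuracy improves as well.
sum(diag(conf_matrix1)) / sum(conf_matrix1) # accuracy without smoothing
sum(diag(conf_matrix2)) / sum(conf_matrix2) # accuracy with Laplace smoothing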