
Naïve Bayes Classifier



Housekeeping

Loading needed libraries
library(e1071)    # naiveBayes()
library(dplyr)    # case_when()
library(ggplot2)  # plotting
library(gmodels)  # CrossTable()
library(psych)    # descriptive statistics
library(tm)       # corpus handling and document-term matrices


Read YouTube data from a CSV file

youtube <- read.csv("tut3b.youtube.csv", header = TRUE)
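Before recoding anything, a quick sanity check on the raw data is worthwhile. A minimal sketch, assuming the columns anger and cleaner_text used below (output not shown):

# Quick look at the columns used in this tutorial
str(youtube[, c("anger", "cleaner_text")])
summary(youtube$anger)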

Preparing the outcome variable

# Preparing the outcome variable

# 1. Recoding to a binary/dummy variable
youtube$anger_dummy <- case_when((youtube$anger > 0) ~ 1, TRUE ~ 0)

# 2. Transforming to a factor
# Factors categorize and store data; their values (string or numeric)
# correspond to the levels of a variable.
# They represent categorical variables, or in our case - classes.
youtube$anger_dummy <- factor(youtube$anger_dummy)

table(youtube$anger_dummy)
## 
##    0    1 
## 6446 3187
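To see the recode-then-factor pattern in isolation, here is a toy example on a made-up vector (not the tutorial data):

# Toy illustration of the recode-then-factor pattern
x <- c(0, 2, 5, 0)
dplyr::case_when(x > 0 ~ 1, TRUE ~ 0)          # 0 1 1 0
factor(dplyr::case_when(x > 0 ~ 1, TRUE ~ 0))  # same values, now with levels "0" and "1"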

Converting the text into a corpus

ytcorpus <- VCorpus(VectorSource(youtube$cleaner_text))
inspect(ytcorpus[1:5])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 5
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 193
## 
## [[2]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 80
## 
## [[3]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 44
## 
## [[4]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 25
## 
## [[5]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 10
print(ytcorpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 9633
# Looking at some comments in the corpus

as.character(ytcorpus[[1200]])
## [1] "democrats republicans unite standing ovation juan guaido people hate guy venezuela check https skeptical youtubewnwspqlo"
as.character(ytcorpus[[2222]])
## [1] "trump orator skills average middle schooler honesty grave thieves feaking lies https skeptical wcnncomfebruary fourth thousand twentypoliticsfactchecktrumpstateoftheunionindexhtml"
as.character(ytcorpus[[3399]])
## [1] "stupidly wrong president millions americans private health care happy underinsured straddled thousands dollars copayments deductables fiddle healthcare country treating healthcare capitalist venture bound millions americans fall cracks bankrupt medical condition economy steroids falls steeper recession president precious stock market falls millions healthcare straddled college debt president economy inevitably runs steroids playing golf mansion grandma bankrupt cancer medicare tulsi gabbard president"
as.character(ytcorpus[[6699]])
## [1] "lies clown president"
as.character(ytcorpus[[6817]])
## [1] "holland netherlands problemthe majority lies statistics middle class shrinking country millionaires record increases salaries prescription drugs continued excuse tongue sticking lode insurance companies deny people preexisting conditions obamathe truth admin middle court battle insurance companies deny people preexisting conditions half country despise lie filled speech true onreally talks god religion church goer pronounce books bible racist companies tons lawsuits paying bills wife found lady multiple payoffs mistresses talk affairs running president documents real yea honest honorable yea speaker woman shaking head shocked half country insane amount pure bull shit spewing disgusting mouth"

Document-Term Matrix

# Create Document-Term Matrix

yt_dtm <- DocumentTermMatrix(ytcorpus)
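A document-term matrix has one row per comment and one column per unique term, with word counts in the cells; the vast majority of cells are zero. A quick way to get a feel for its size and sparsity (a suggested check, output not shown):

# Check the matrix dimensions and peek at a small corner of it
dim(yt_dtm)                # documents x unique terms
inspect(yt_dtm[1:3, 1:6])  # mostly zeros: most words appear in few comments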

Splitting the data

# This will split the data 3:1 (Training:Testing)
train_proportion <- 0.75 
test_proportion <- 1 - train_proportion

# Calculate the sizes based on proportions
total_size <- nrow(yt_dtm)
train_size <- round(train_proportion * total_size)
test_size <- total_size - train_size

# Generate random indices for train and test sets
set.seed(123)  # Set seed for reproducibility
indices <- sample(nrow(yt_dtm))  # Create a shuffled sequence of row indices

# Split the data based on the shuffled indices and calculated sizes
train_indices <- indices[1:train_size]
test_indices <- indices[(train_size + 1):(train_size + test_size)]

# Create train and test dataframes using the selected indices
train <- yt_dtm[train_indices, ]
test <- yt_dtm[test_indices, ]

# Extracting the outcome labels (0 and 1 in this case) for the training and test rows
trainlabs <- youtube[train_indices, ]$anger_dummy
testlabs <- youtube[test_indices, ]$anger_dummy

# The resulting datasets' structures
str(train)
## List of 6
##  $ i       : int [1:68104] 869 869 869 869 869 869 869 869 869 869 ...
##  $ j       : int [1:68104] 432 841 1415 1460 2915 7956 8057 8572 8651 8971 ...
##  $ v       : num [1:68104] 1 1 1 1 1 1 1 1 1 1 ...
##  $ nrow    : int 7225
##  $ ncol    : int 15392
##  $ dimnames:List of 2
##   ..$ Docs : chr [1:7225] "2463" "2511" "8718" "2986" ...
##   ..$ Terms: chr [1:15392] "aamazing" "aaplauing" "aaron" "aases" ...
##  - attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
##  - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
str(test)
## List of 6
##  $ i       : int [1:23003] 993 993 993 1969 1969 1969 1969 1969 1969 1969 ...
##  $ j       : int [1:23003] 1536 8971 11108 2701 3446 3816 8501 9697 9849 12266 ...
##  $ v       : num [1:23003] 1 1 1 1 1 1 1 1 1 1 ...
##  $ nrow    : int 2408
##  $ ncol    : int 15392
##  $ dimnames:List of 2
##   ..$ Docs : chr [1:2408] "7094" "3328" "406" "4533" ...
##   ..$ Terms: chr [1:15392] "aamazing" "aaplauing" "aaron" "aases" ...
##  - attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
##  - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
# Check if the anger_dummy distributions are similar in both datasets
summarytools::freq(trainlabs)
## Frequencies  
## trainlabs  
## Type: Factor  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           0   4828     66.82          66.82     66.82          66.82
##           1   2397     33.18         100.00     33.18         100.00
##        <NA>      0                               0.00         100.00
##       Total   7225    100.00         100.00    100.00         100.00
summarytools::freq(testlabs)
## Frequencies  
## testlabs  
## Type: Factor  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           0   1618     67.19          67.19     67.19          67.19
##           1    790     32.81         100.00     32.81         100.00
##        <NA>      0                               0.00         100.00
##       Total   2408    100.00         100.00    100.00         100.00
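The class shares come out similar (roughly 67/33 in both sets), but only by the luck of the random shuffle. A stratified split would guarantee it; here is a sketch using caret::createDataPartition, which samples within each level of the outcome (caret is an extra dependency, and strat_train_idx is a name introduced here, not used below):

# Stratified alternative to the manual shuffle above (sketch, not used below)
library(caret)
set.seed(123)
strat_train_idx <- createDataPartition(youtube$anger_dummy, p = 0.75, list = FALSE)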
# Setting the bar: only words that appear 5 or more times in the training data will be included in the analysis
yt_freq_words <- findFreqTerms(train, 5)
str(yt_freq_words)
##  chr [1:2811] "aaron" "abc" "ability" "abortion" "abraham" "absolute" ...
# Keep only the frequent terms in both datasets
train_freq <- train[, yt_freq_words]
test_freq <- test[, yt_freq_words]
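A quick check that the vocabulary actually shrank: based on the structures printed above, both matrices should now have 2,811 columns instead of 15,392 (a suggested check, output not shown):

dim(train_freq)  # expected: 7225 2811
dim(test_freq)   # expected: 2408 2811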

The model

# A function that flags whether a word appears in a comment:
# counts above zero become "Yes", zeros become "No".
# Categorical predictors make naiveBayes() build a conditional
# probability table per term instead of assuming Gaussian features.
wordhits <- function(x) {
  ifelse(x > 0, "Yes", "No")
}

# MARGIN = 2 applies wordhits() column by column, i.e. once per term
train <- apply(train_freq, MARGIN = 2, wordhits)
test <- apply(test_freq, MARGIN = 2, wordhits)

# Create the model
yt_bayes <- e1071::naiveBayes(train, trainlabs)

# Test the model
yt_predict <- predict(yt_bayes, test)
summary(yt_bayes)
##           Length Class  Mode     
## apriori      2   table  numeric  
## tables    2811   -none- list     
## levels       2   -none- character
## isnumeric 2811   -none- logical  
## call         3   -none- call
yt_bayes$apriori
## trainlabs
##    0    1 
## 4828 2397
yt_bayes$tables[sample(length(yt_bayes$tables), 10)]
## $house
##          house
## trainlabs          No         Yes
##         0 0.992543496 0.007456504
##         1 0.971631206 0.028368794
## 
## $poison
##          poison
## trainlabs          No         Yes
##         0 1.000000000 0.000000000
##         1 0.996245307 0.003754693
## 
## $wrote
##          wrote
## trainlabs           No          Yes
##         0 0.9993786247 0.0006213753
##         1 0.9970796829 0.0029203171
## 
## $swamp
##          swamp
## trainlabs          No         Yes
##         0 0.998342999 0.001657001
##         1 0.997496871 0.002503129
## 
## $jfk
##          jfk
## trainlabs           No          Yes
##         0 0.9995857498 0.0004142502
##         1 0.9991656237 0.0008343763
## 
## $light
##          light
## trainlabs           No          Yes
##         0 0.9991714996 0.0008285004
##         1 0.9974968711 0.0025031289
## 
## $denials
##          denials
## trainlabs          No         Yes
##         0 1.000000000 0.000000000
##         1 0.998331247 0.001668753
## 
## $jerry
##          jerry
## trainlabs           No          Yes
##         0 0.9985501243 0.0014498757
##         1 0.9991656237 0.0008343763
## 
## $minorities
##          minorities
## trainlabs          No         Yes
##         0 0.997307374 0.002692626
##         1 0.996662495 0.003337505
## 
## $devil
##          devil
## trainlabs          No         Yes
##         0 1.000000000 0.000000000
##         1 0.993742178 0.006257822
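To see how the model combines these pieces, here is the posterior for a comment treated as if "house" were its only word, computed by hand from the $house table and the apriori counts printed above. This is a single-feature illustration; the real model multiplies likelihoods across all 2,811 terms:

# Bayes' rule with a single feature, using the numbers printed above
prior_1 <- 2397 / 7225    # P(anger = 1), from the apriori counts
prior_0 <- 4828 / 7225    # P(anger = 0)
lik_1 <- 0.028368794      # P(house = Yes | anger = 1)
lik_0 <- 0.007456504      # P(house = Yes | anger = 0)
prior_1 * lik_1 / (prior_1 * lik_1 + prior_0 * lik_0)  # ~0.65, tips toward anger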
# Some summary stats, based on the confusion matrix for the first model
conf_matrix1 <- table(yt_predict, testlabs)
conf_matrix1
##           testlabs
## yt_predict    0    1
##          0 1498  404
##          1  120  386
conf_matrix1[2, 2] / sum(conf_matrix1[2, ]) # Precision: TP / (TP + FP)
## [1] 0.7628458
conf_matrix1[1, 1] / sum(conf_matrix1[1, ]) # Negative predictive value: TN / (TN + FN)
## [1] 0.787592
conf_matrix1[2, 2] / sum(conf_matrix1[, 2]) # Sensitivity (recall): TP / (TP + FN)
## [1] 0.4886076
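The weak spot of this first model is zero probabilities: tables like $poison and $denials above assign P(Yes | 0) = 0, so a single such word forces the whole product for that class to zero, no matter what the other 2,810 terms say. Laplace smoothing adds a pseudo-count so that no cell is exactly zero. In the textbook add-one form for a binary feature (e1071's internal convention may differ slightly):

# Textbook add-one smoothing for a binary feature (illustration, not e1071's exact code)
count_yes <- 0     # e.g. "poison" was never seen in a class-0 comment
n_class <- 4828    # training comments in class 0
(count_yes + 1) / (n_class + 2)  # about 0.000207: small, but no longer zero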
# Improve the model with Laplace smoothing, specifically: removing zero probabilities

# Create the model
yt_bayes2 <- e1071::naiveBayes(train, trainlabs, laplace = 1)
yt_predict2 <- predict(yt_bayes2, test)
summary(yt_bayes2)
##           Length Class  Mode     
## apriori      2   table  numeric  
## tables    2811   -none- list     
## levels       2   -none- character
## isnumeric 2811   -none- logical  
## call         4   -none- call
yt_bayes2$apriori
## trainlabs
##    0    1 
## 4828 2397
yt_bayes2$tables[sample(length(yt_bayes2$tables), 10)]
## $reply
##          reply
## trainlabs           No          Yes
##         0 0.9995857498 0.0008285004
##         1 0.9987484355 0.0020859408
## 
## $flynn
##          flynn
## trainlabs           No          Yes
##         0 1.0002071251 0.0002071251
##         1 0.9987484355 0.0020859408
## 
## $elect
##          elect
## trainlabs           No          Yes
##         0 1.0000000000 0.0004142502
##         1 0.9970796829 0.0037546934
## 
## $surprised
##          surprised
## trainlabs          No         Yes
##         0 0.998964374 0.001449876
##         1 0.994576554 0.006257822
## 
## $returns
##          returns
## trainlabs           No          Yes
##         0 0.9997928749 0.0006213753
##         1 0.9983312474 0.0025031289
## 
## $wife
##          wife
## trainlabs          No         Yes
##         0 0.997721624 0.002692626
##         1 0.991656237 0.009178139
## 
## $helps
##          helps
## trainlabs           No          Yes
##         0 0.9997928749 0.0006213753
##         1 0.9979140592 0.0029203171
## 
## $taylor
##          taylor
## trainlabs          No         Yes
##         0 0.998964374 0.001449876
##         1 0.998748436 0.002085941
## 
## $sean
##          sean
## trainlabs           No          Yes
##         0 1.0000000000 0.0004142502
##         1 0.9987484355 0.0020859408
## 
## $peoples
##          peoples
## trainlabs          No         Yes
##         0 0.999378625 0.001035626
##         1 0.996662495 0.004171882
# Some summary stats again, based on the confusion matrix for the second model
conf_matrix2 <- table(yt_predict2, testlabs)
conf_matrix2
##            testlabs
## yt_predict2    0    1
##           0 1480  292
##           1  138  498
conf_matrix2[2, 2] / sum(conf_matrix2[2, ]) # Precision: TP / (TP + FP)
## [1] 0.7830189
conf_matrix2[1, 1] / sum(conf_matrix2[1, ]) # Negative predictive value: TN / (TN + FN)
## [1] 0.8352144
conf_matrix2[2, 2] / sum(conf_matrix2[, 2]) # Sensitivity (recall): TP / (TP + FN)
## [1] 0.6303797
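Since the same metrics get computed twice, a small helper makes the model comparison easier to read. This is an addition to the tutorial code, and it assumes the predicted-by-actual layout of the confusion matrices above:

# Helper: derive the usual metrics from a predicted-by-actual confusion matrix
nb_metrics <- function(cm) {
  tn <- cm[1, 1]; fn <- cm[1, 2]; fp <- cm[2, 1]; tp <- cm[2, 2]
  c(accuracy    = (tp + tn) / sum(cm),
    sensitivity = tp / (tp + fn),
    specificity = tn / (tn + fp),
    precision   = tp / (tp + fp))
}
rbind(model1 = nb_metrics(conf_matrix1), model2 = nb_metrics(conf_matrix2))

The smoothed model wins on accuracy, sensitivity, and precision - it catches 498 of the 790 angry comments instead of 386 - at a negligible cost in specificity.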