In this lab, we are focussing on the use of Bayesian methods, specifically Naïve Bayesian Classifiers . In Machine learning, we focus on improving the performance of a model,whether that is improving the performance of an algorithm or increasing learning, than it (machine learning) does on the specific results obtained from running an algorithm (as does data mining).

Hence, in this assignment we will look at two ways of improving a model(using brute force).The data used is the Online News popularity data , that can be found here: https://archive.ics.uci.edu/ml/datasets/Online+News+Popularity

Method 1:(Original Unimproved Application)

Step1 :Data Exploration and Preparation

library(readxl)
OnlineNews <- read_excel("Copy of OnlineNewsPopularity.xlsx")
OnlineNews<-as.data.frame(OnlineNews)
news<-OnlineNews

##check missing data
str(news)
## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : chr  "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
##  $ timedelta                    : num  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num  219 255 211 531 1072 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : num  593 711 1500 1200 505 855 556 891 3600 710 ...
sum(is.na(news))
## [1] 0
##reduced dataframe to 17 variables
##renamed some columns
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")

We introduce a new column called “popular” and it will be a categorical binary column with values(Yes and No). This will be based on values in the Shares column .1.e >1400 = “Yes”.

##introduced new column with nas
newsShort$popular = rep('na', nrow(newsShort))
summary(newsShort$shares)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1     946    1400    3395    2800  843300
##Split the popular column into "yes" and "no" based on the values on the Shares column 
for(i in 1:39644) {
  if(newsShort$shares[i] >= 1400) {
    newsShort$popular[i] = "yes"} 
  else {newsShort$popular[i] = "no"}
}

The data is then randomized before split in a 75%/25% fashion.

##SPlit the data for training and testing
news_rand <- newsShort[order(runif(10000)), ]
set.seed(12345)


##compare the distribution of randomized dataset from the original datset and 
#there is no significant difference in the distribution
summary(news_rand$shares)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       4     973    1500    3310    2800  843300
summary(news$shares)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1     946    1400    3395    2800  843300
#Split the data into training and test datasets
news_train <- news_rand[1:7500, ]
news_test <- news_rand[7501:10000, ]

Distribution is checked between the test and training dataset just to ensure that there is no signifcant differences

##check for distribution between the initial,test and train dataset
##they have same distribution which is good since the difference is not high
prop.table(table(news_train$popular))
## 
##     no    yes 
## 0.4308 0.5692
prop.table(table(news_test$popular))
## 
##     no    yes 
## 0.4244 0.5756

Step 2: Training a Model

#model design
nb_model <- naive_bayes(popular ~ ., data=news_train)
nb_model
## ===================== Naive Bayes ===================== 
## Call: 
## naive_bayes.formula(formula = popular ~ ., data = news_train)
## 
## A priori probabilities: 
## 
##     no    yes 
## 0.4308 0.5692 
## 
## Tables: 
##               
## n_tokens_title       no      yes
##           mean 9.818942 9.712579
##           sd   1.954995 1.987917
## 
##                 
## n_tokens_content       no      yes
##             mean 456.3463 509.9862
##             sd   348.6891 446.9994
## 
##                
## n_unique_tokens        no       yes
##            mean 0.5686352 0.5547841
##            sd   0.1116416 0.1247281
## 
##                 
## n_non_stop_words         no        yes
##             mean 0.99442896 0.99039587
##             sd   0.07444266 0.09754031
## 
##          
## num_hrefs        no       yes
##      mean  9.224389 10.528929
##      sd    8.855362 11.567897
## 
## # ... and 12 more tables

From the results above, 56.8% of the articles are popular.

Step 3:Model evaluation

news_Pred <- predict(nb_model, newdata = news_test)
(conf_nat <- table(news_Pred, news_test$popular))
##          
## news_Pred   no  yes
##       no  1024  435
##       yes   37 1004
summary(conf_nat)
## Number of cases in table: 2500 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 1104.1, df = 1, p-value = 4.184e-242
conf_nat
##          
## news_Pred   no  yes
##       no  1024  435
##       yes   37 1004
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 81.12
Accuracy
## [1] 81.12

From the results above, the model is 82.6% accurate meaning that 82 out of 100 times, our model predicts the class of a record correctly. From the confusion matrix, the number of true positive is (1027) +true negatives(1038), while the number of false positives is (28) and false negatives(1038)

Method 2: Improving the Classifier by removing highly correlated varibles - Feature Selection

We will look at the varibales that are highly correlated to each other and remove them from our model and check how different is the modified model’s performance and accuracy. The method is also called feature selection.We hope that it will improve the accuracy of the model.

Step 1: Data Preparation and Exploration

Data is first randomized - 10,000rows selected

news_rand2<-newsShort[order(runif(10000)), ]

set.seed(12345)

str(news_rand2)
## 'data.frame':    10000 obs. of  18 variables:
##  $ n_tokens_title              : num  8 10 9 10 12 9 11 9 11 10 ...
##  $ n_tokens_content            : num  152 555 457 958 254 ...
##  $ n_unique_tokens             : num  0.722 0.557 0.493 0.465 0.683 ...
##  $ n_non_stop_words            : num  1 1 1 1 1 ...
##  $ num_hrefs                   : num  4 20 2 8 4 10 4 14 3 4 ...
##  $ num_imgs                    : num  0 1 1 1 0 1 1 25 1 0 ...
##  $ num_videos                  : num  0 0 0 0 2 0 0 0 0 0 ...
##  $ average_token_length        : num  4.72 4.73 4.57 4.74 4.53 ...
##  $ num_keywords                : num  10 9 5 8 9 6 8 9 5 9 ...
##  $ kw_max_max                  : num  690400 690400 80400 51900 690400 ...
##  $ global_sentiment_polarity   : num  0.0282 0.1571 0.0336 0.0714 0.1241 ...
##  $ avg_positive_polarity       : num  0.234 0.461 0.319 0.377 0.4 ...
##  $ title_subjectivity          : num  0.455 0 0 0 1 ...
##  $ title_sentiment_polarity    : num  0.136 0 0 0 0.5 ...
##  $ abs_title_subjectivity      : num  0.0455 0.5 0.5 0.5 0.5 ...
##  $ abs_title_sentiment_polarity: num  0.136 0 0 0 0.5 ...
##  $ shares                      : num  1100 6200 750 661 1500 1900 1100 899 2300 1800 ...
##  $ popular                     : chr  "no" "yes" "no" "no" ...

Data is then scaled in order to calculate and find the highly correlated variables. The categorical variable is dropped from the dataset before the scaling process.

##remove the factor variables first from the dataset then scale it
##scaling the dataset credit_rand before using it as we want to bring all variables to a same scale inorder to identify correlations.
news_scale1 <- news_rand2[, -c(18)]
News_Scaled <- scale(news_scale1, center=TRUE, scale = TRUE)

A corrrelation matrix is then used to perform feature : variable selection/ identify variables that are highly correlated. Using the findCorrelation() function, we identify features of a data set are correlated above/below a cutoff value. Our value is 0.3

x<- cor(News_Scaled)
newscor<- findCorrelation(x, 0.30)
newscor
## [1]  3 16  2  4 12 13
#Remove highly correlated columns from the original dataset and then subdivide train and tests
filterednews<- news_rand2[, -(newscor)]
str(filterednews)
## 'data.frame':    10000 obs. of  12 variables:
##  $ n_tokens_title           : num  8 10 9 10 12 9 11 9 11 10 ...
##  $ num_hrefs                : num  4 20 2 8 4 10 4 14 3 4 ...
##  $ num_imgs                 : num  0 1 1 1 0 1 1 25 1 0 ...
##  $ num_videos               : num  0 0 0 0 2 0 0 0 0 0 ...
##  $ average_token_length     : num  4.72 4.73 4.57 4.74 4.53 ...
##  $ num_keywords             : num  10 9 5 8 9 6 8 9 5 9 ...
##  $ kw_max_max               : num  690400 690400 80400 51900 690400 ...
##  $ global_sentiment_polarity: num  0.0282 0.1571 0.0336 0.0714 0.1241 ...
##  $ title_sentiment_polarity : num  0.136 0 0 0 0.5 ...
##  $ abs_title_subjectivity   : num  0.0455 0.5 0.5 0.5 0.5 ...
##  $ shares                   : num  1100 6200 750 661 1500 1900 1100 899 2300 1800 ...
##  $ popular                  : chr  "no" "yes" "no" "no" ...
Step2:Model Training
filteredNewsTrain <- filterednews[1:7500, ]
filteredNewsTest <- filterednews[7501:10000, ]

##Train the Data 
library(naivebayes)
newstrain_model <- naive_bayes(popular ~ ., data=filteredNewsTrain)
Step 3:Model Evaluation
## Evaluate the model on the test dataset
newspred <- predict(newstrain_model, newdata = filteredNewsTest)

table(newspred, filteredNewsTest$popular)
##         
## newspred   no  yes
##      no  1038  410
##      yes   17 1035
(conf_nat <- table(newspred, filteredNewsTest$popular))
##         
## newspred   no  yes
##      no  1038  410
##      yes   17 1035
##ACcuracy didnt improve much . It is 82.9% it improved by 0.3% points.
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 82.92

The accuracy improved by 0.3% to 82.9%. Removing the highly correlated variables didnt increase the accruacy that much.