### PART 4
# Load the news popularity data from the local CSV file.
news <- read.csv("C:/Users/Priya/Desktop/ANLY 500/news.csv")
# Count the missing values across the whole data frame.
sum(is.na(news))
## [1] 0
# Drop the first two columns, which are non-predictive variables.
news <- news[, -c(1, 2)]
# Outlier handling: discard every row whose n_unique_tokens equals 701.
news <- news[news$n_unique_tokens != 701, ]
# Keep only the variables that are meaningful for the model. The response
# (shares) is listed last.
model_columns <- c(
  "n_tokens_title", "n_tokens_content", "n_unique_tokens",
  "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos",
  "average_token_length", "num_keywords", "kw_max_max",
  "global_sentiment_polarity", "avg_positive_polarity",
  "title_subjectivity", "title_sentiment_polarity",
  "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares"
)
newsShort <- news[, model_columns]
# Renumber the rows 1..n, as a data frame built from bare vectors would be.
rownames(newsShort) <- NULL
# Standardize the predictors used for modelling.
# The previous loop, `for (i in ncol(news) - 1)`, visited only the single index
# ncol(news) - 1 and wrote to `news` after `newsShort` had already been copied
# from it, so the modelling data was never standardized. Every column of
# newsShort except the response (shares) is now centred to mean 0 and scaled
# to unit standard deviation.
# as.numeric() drops the one-column matrix returned by scale() so that each
# column stays a plain numeric vector.
# NOTE(review): `news` itself is no longer modified here, and the recorded
# outputs further down were captured before this fix -- re-run to refresh them.
predictor_cols <- setdiff(names(newsShort), "shares")
newsShort[predictor_cols] <- lapply(
  newsShort[predictor_cols],
  function(column) as.numeric(scale(column, center = TRUE, scale = TRUE))
)
# Articles with more than 1400 shares are labelled popular (1); all others are
# labelled 0. The label replaces the raw share count and is stored as a factor.
popular_flag <- ifelse(newsShort$shares > 1400, 1, 0)
newsShort$shares <- as.factor(popular_flag)
### Apply Naive Bayes to the news popularity data set after the pre-processing
### above.
# Shuffle the rows reproducibly, then split them into a training and a test
# set. The sizes are derived from the data instead of being hard-coded; for
# the 39643-row data set this gives the same 35678 / 3965 split as before,
# and the script no longer mis-indexes if the number of rows changes.
set.seed(12345)
n_rows <- nrow(newsShort)
n_train <- floor(0.9 * n_rows)
news_rand <- newsShort[order(runif(n_rows)), ]
news_train <- news_rand[seq_len(n_train), ]
news_test <- news_rand[seq_len(n_rows - n_train) + n_train, ]
# Class balance of the training set.
prop.table(table(news_train$shares))
##
## 0 1
## 0.506867 0.493133
# Class balance of the test set.
prop.table(table(news_test$shares))
##
## 0 1
## 0.5039092 0.4960908
# Load the naivebayes package, which provides naive_bayes().
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.5.3
## naivebayes 0.9.6 loaded
# Fit a Naive Bayes classifier on the training set. The response is shares
# converted to character; `.` stands for the remaining columns of news_train.
naive_modelNews <- naive_bayes(as.character(shares) ~ ., data= news_train)
# Print the fitted model. The recorded output below shows the class priors and
# the per-class Gaussian mean/sd tables for the first five predictors (the
# print method reports 11 further tables without showing them).
naive_modelNews
##
## ================================ Naive Bayes =================================
##
## Call:
## naive_bayes.formula(formula = as.character(shares) ~ ., data = news_train)
##
## ------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ------------------------------------------------------------------------------
##
## A priori probabilities:
##
## 0 1
## 0.506867 0.493133
##
## ------------------------------------------------------------------------------
##
## Tables:
##
## ------------------------------------------------------------------------------
## ::: n_tokens_title (Gaussian)
## ------------------------------------------------------------------------------
##
## n_tokens_title 0 1
## mean 10.491650 10.305047
## sd 2.088112 2.131096
##
## ------------------------------------------------------------------------------
## ::: n_tokens_content (Gaussian)
## ------------------------------------------------------------------------------
##
## n_tokens_content 0 1
## mean 527.6581 565.0161
## sd 426.8231 509.6529
##
## ------------------------------------------------------------------------------
## ::: n_unique_tokens (Gaussian)
## ------------------------------------------------------------------------------
##
## n_unique_tokens 0 1
## mean 0.5378909 0.5237544
## sd 0.1309307 0.1421511
##
## ------------------------------------------------------------------------------
## ::: n_non_stop_words (Gaussian)
## ------------------------------------------------------------------------------
##
## n_non_stop_words 0 1
## mean 0.9745631 0.9664658
## sd 0.1574522 0.1800318
##
## ------------------------------------------------------------------------------
## ::: num_hrefs (Gaussian)
## ------------------------------------------------------------------------------
##
## num_hrefs 0 1
## mean 9.889847 11.861487
## sd 9.795279 12.502676
##
## ------------------------------------------------------------------------------
##
## # ... and 11 more tables
##
## ------------------------------------------------------------------------------
### Check the accuracy of the Naive Bayes model on the news test set.
# Predict from the feature columns only. Passing the whole of news_test, which
# still contains the shares column, made predict() warn that newdata holds
# more features than the model has probability tables.
nb_features <- news_test[, names(news_test) != "shares", drop = FALSE]
# Cross-tabulate predicted class (rows) against actual class (columns).
conf_natNews <- table(predict(naive_modelNews, nb_features), news_test$shares)
# Accuracy in percent: correctly classified cases (the diagonal) over all
# test cases.
Accuracy <- sum(diag(conf_natNews))/sum(conf_natNews)*100
Accuracy
## [1] 53.97226
### Naive Bayes gives a lower accuracy of about 54% on the news popularity
### data set. Let's try SVM models next.
library(kernlab)
## Warning: package 'kernlab' was built under R version 3.5.2
# Train an SVM with the "vanilladot" kernel (kernlab's linear kernel).
news_classifier <- ksvm(
  shares ~ .,
  data = news_train,
  kernel = "vanilladot"
)
## Setting default kernel parameters
summary(news_classifier)
## Length Class Mode
## 1 ksvm S4
# Classify the test set and cross-tabulate predictions (rows) against the
# actual labels (columns).
news_predictions <- predict(news_classifier, news_test)
p <- table(news_predictions, news_test$shares)
p
##
## news_predictions 0 1
## 0 1401 1167
## 1 597 800
# Accuracy in percent: diagonal count divided by the total count.
n_correct <- sum(diag(p))
n_total <- sum(p)
accuracy <- n_correct / n_total * 100
accuracy
## [1] 55.51072
### With vanilladot as the kernel the accuracy improves slightly to 55.51%.
### Let's try other kernels.
library(kernlab)
# Train an SVM with the "rbfdot" kernel (kernlab's Gaussian radial basis
# kernel).
news_classifier <- ksvm(
  shares ~ .,
  data = news_train,
  kernel = "rbfdot"
)
summary(news_classifier)
## Length Class Mode
## 1 ksvm S4
# Classify the test set and cross-tabulate predictions (rows) against the
# actual labels (columns).
news_predictions <- predict(news_classifier, news_test)
p <- table(news_predictions, news_test$shares)
p
##
## news_predictions 0 1
## 0 1326 949
## 1 672 1018
# Accuracy in percent: diagonal count divided by the total count.
n_correct <- sum(diag(p))
n_total <- sum(p)
accuracy <- n_correct / n_total * 100
accuracy
## [1] 59.11728
### With rbfdot as the kernel the accuracy is 59%, an improvement over the
### earlier models. The final model for this lab uses the polydot kernel.
library(kernlab)
# Train an SVM with the "polydot" (polynomial) kernel and default parameters.
# NOTE(review): kernlab's polydot defaults to degree 1, which would explain
# why the results are almost identical to vanilladot -- confirm, and consider
# setting kpar explicitly.
news_classifier <- ksvm(
  shares ~ .,
  data = news_train,
  kernel = "polydot"
)
## Setting default kernel parameters
summary(news_classifier)
## Length Class Mode
## 1 ksvm S4
# Classify the test set and cross-tabulate predictions (rows) against the
# actual labels (columns).
news_predictions <- predict(news_classifier, news_test)
p <- table(news_predictions, news_test$shares)
p
##
## news_predictions 0 1
## 0 1401 1166
## 1 597 801
# Accuracy in percent: diagonal count divided by the total count.
n_correct <- sum(diag(p))
n_total <- sum(p)
accuracy <- n_correct / n_total * 100
accuracy
## [1] 55.53594