###PART 4

# Load the raw news data from the local CSV export.
news <- read.csv("C:/Users/Priya/Desktop/ANLY 500/news.csv")

# Count missing cells across the whole data frame; 0 means nothing is missing.
sum(is.na(news))
## [1] 0

# Drop the first two columns, which are not used as predictors.
news <- news[, -c(1, 2)]

# Remove the outlying record(s) whose n_unique_tokens value is exactly 701.
news <- news[news$n_unique_tokens != 701, ]

# Restrict the data to the predictors used for modelling, plus the response
# column `shares` (kept last).
model_columns <- c(
  "n_tokens_title", "n_tokens_content", "n_unique_tokens",
  "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos",
  "average_token_length", "num_keywords", "kw_max_max",
  "global_sentiment_polarity", "avg_positive_polarity",
  "title_subjectivity", "title_sentiment_polarity",
  "abs_title_subjectivity", "abs_title_sentiment_polarity",
  "shares"
)
newsShort <- news[, model_columns]

# Renumber the rows 1..n so the result matches a freshly built data frame
# (the outlier removal above leaves a gap in the row names of `news`).
rownames(newsShort) <- NULL

# Standardize the predictors to z-scores (mean 0, standard deviation 1).
#
# `for (i in ncol(news) - 1)` iterates over a single number, so it rescaled
# only one column, and it modified `news` rather than `newsShort`, the data
# frame the models are actually trained on. Every predictor of `newsShort` is
# standardized here instead. `shares` is left on its raw scale because it is
# thresholded into the class label right after this step.
# NOTE: the Naive Bayes tables recorded further down show raw-scale means and
# were captured before this fix; re-run the script to refresh them.
predictor_cols <- setdiff(names(newsShort), "shares")
newsShort[predictor_cols] <- lapply(
  newsShort[predictor_cols],
  # scale() returns a one-column matrix; as.numeric() keeps plain vectors.
  function(column) as.numeric(scale(column, center = TRUE, scale = TRUE))
)

# Turn `shares` into the binary class label: 1 for popular articles (more
# than 1400 shares), 0 for all others, stored as a factor.
newsShort$shares <- factor(as.integer(newsShort$shares > 1400))

### Applying Naive Bayes to the News Popularity dataset after pre-processing the original dataset as above.

# Shuffle the rows reproducibly and hold out the last 10% as the test set.
# Sizes are derived from the data instead of being hard-coded, so the split
# stays valid if the number of rows changes (for 39643 rows this yields the
# same 35678 / 3965 split as before).
set.seed(12345)
n_articles <- nrow(newsShort)
n_train <- floor(0.9 * n_articles)

news_rand <- newsShort[order(runif(n_articles)), ]
news_train <- news_rand[seq_len(n_train), ]
# A logical mask avoids the `(n_train + 1):n_articles` pitfall when the
# test set would be empty.
news_test <- news_rand[seq_len(n_articles) > n_train, ]

# Class balance in the training and test sets.
prop.table(table(news_train$shares))
## 
##        0        1 
## 0.506867 0.493133
prop.table(table(news_test$shares))
## 
##         0         1 
## 0.5039092 0.4960908
# Fit a Naive Bayes classifier on the training set, using every remaining
# column as a predictor of the `shares` label.
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.5.3
## naivebayes 0.9.6 loaded
naive_modelNews <- naive_bayes(
  as.character(shares) ~ .,
  data = news_train
)

# Show the fitted priors and per-feature tables.
print(naive_modelNews)
## 
## ================================ Naive Bayes ================================= 
##  
##  Call: 
## naive_bayes.formula(formula = as.character(shares) ~ ., data = news_train)
## 
## ------------------------------------------------------------------------------ 
##  
## Laplace smoothing: 0
## 
## ------------------------------------------------------------------------------ 
##  
##  A priori probabilities: 
## 
##        0        1 
## 0.506867 0.493133 
## 
## ------------------------------------------------------------------------------ 
##  
##  Tables: 
## 
## ------------------------------------------------------------------------------ 
##  ::: n_tokens_title (Gaussian) 
## ------------------------------------------------------------------------------ 
##               
## n_tokens_title         0         1
##           mean 10.491650 10.305047
##           sd    2.088112  2.131096
## 
## ------------------------------------------------------------------------------ 
##  ::: n_tokens_content (Gaussian) 
## ------------------------------------------------------------------------------ 
##                 
## n_tokens_content        0        1
##             mean 527.6581 565.0161
##             sd   426.8231 509.6529
## 
## ------------------------------------------------------------------------------ 
##  ::: n_unique_tokens (Gaussian) 
## ------------------------------------------------------------------------------ 
##                
## n_unique_tokens         0         1
##            mean 0.5378909 0.5237544
##            sd   0.1309307 0.1421511
## 
## ------------------------------------------------------------------------------ 
##  ::: n_non_stop_words (Gaussian) 
## ------------------------------------------------------------------------------ 
##                 
## n_non_stop_words         0         1
##             mean 0.9745631 0.9664658
##             sd   0.1574522 0.1800318
## 
## ------------------------------------------------------------------------------ 
##  ::: num_hrefs (Gaussian) 
## ------------------------------------------------------------------------------ 
##          
## num_hrefs         0         1
##      mean  9.889847 11.861487
##      sd    9.795279 12.502676
## 
## ------------------------------------------------------------------------------
## 
## # ... and 11 more tables
## 
## ------------------------------------------------------------------------------

### Let's check the accuracy of the Naive Bayes model on the News dataset.

# Score the held-out articles with the Naive Bayes model.
# Only the predictor columns are handed to predict(): passing the `shares`
# response column as well made predict.naive_bayes() warn that newdata
# contained more features than the model has probability tables.
news_test_predictors <- news_test[setdiff(names(news_test), "shares")]

# Confusion matrix: rows are predicted classes, columns are actual classes.
conf_natNews <- table(
  predict(naive_modelNews, news_test_predictors),
  news_test$shares
)

# Accuracy = share of test articles on the diagonal, as a percentage.
Accuracy <- sum(diag(conf_natNews)) / sum(conf_natNews) * 100
Accuracy
## [1] 53.97226

### We get a lower accuracy of about 54% using Naive Bayes on the News Popularity dataset. Let's try SVM models.

# Support vector machine with kernlab's "vanilladot" kernel, trained and
# evaluated on the same train/test split as the Naive Bayes model.
library(kernlab)
## Warning: package 'kernlab' was built under R version 3.5.2
news_classifier <- ksvm(
  shares ~ .,
  data = news_train,
  kernel = "vanilladot"
)
##  Setting default kernel parameters

# For this object summary() only reports the generic length/class/mode line.
summary(news_classifier)
## Length  Class   Mode 
##      1   ksvm     S4

# Predict the held-out articles and cross-tabulate predicted (rows) against
# actual (columns) classes.
news_predictions <- predict(news_classifier, news_test)
p <- table(news_predictions, news_test$shares)
print(p)
##                 
## news_predictions    0    1
##                0 1401 1167
##                1  597  800

# Accuracy = share of test articles on the diagonal, as a percentage.
accuracy <- (sum(diag(p)) / sum(p)) * 100
print(accuracy)
## [1] 55.51072

### Using the vanilladot kernel, we get a slightly improved accuracy of 55.51%. Let's try other kernels.

# Support vector machine with kernlab's "rbfdot" kernel on the same split.
# kernlab is attached again here, which is harmless if it is already loaded.
library(kernlab)

news_classifier <- ksvm(
  shares ~ .,
  data = news_train,
  kernel = "rbfdot"
)

# For this object summary() only reports the generic length/class/mode line.
summary(news_classifier)
## Length  Class   Mode 
##      1   ksvm     S4

# Predict the held-out articles and cross-tabulate predicted (rows) against
# actual (columns) classes.
news_predictions <- predict(news_classifier, news_test)
p <- table(news_predictions, news_test$shares)
print(p)
##                 
## news_predictions    0    1
##                0 1326  949
##                1  672 1018

# Accuracy = share of test articles on the diagonal, as a percentage.
accuracy <- (sum(diag(p)) / sum(p)) * 100
print(accuracy)
## [1] 59.11728

### Using the rbfdot kernel, we get an accuracy of about 59%, which is an improvement over the earlier models. Let's try the final model for this lab, using the polydot kernel.

# Support vector machine with kernlab's "polydot" kernel on the same split.
# kernlab is attached again here, which is harmless if it is already loaded.
library(kernlab)

news_classifier <- ksvm(
  shares ~ .,
  data = news_train,
  kernel = "polydot"
)
##  Setting default kernel parameters

# For this object summary() only reports the generic length/class/mode line.
summary(news_classifier)
## Length  Class   Mode 
##      1   ksvm     S4

# Predict the held-out articles and cross-tabulate predicted (rows) against
# actual (columns) classes.
news_predictions <- predict(news_classifier, news_test)
p <- table(news_predictions, news_test$shares)
print(p)
##                 
## news_predictions    0    1
##                0 1401 1166
##                1  597  801

# Accuracy = share of test articles on the diagonal, as a percentage.
accuracy <- (sum(diag(p)) / sum(p)) * 100
print(accuracy)
## [1] 55.53594