library(rpart)
library(rpart.plot)
library(ca)
library(ggplot2)
library(C50)
library(MASS)

News Popularity

Import the Data:

news <- read.csv("C:/Users/charl/Downloads/ANLY 530--OnlineNewsPopularity.csv")
str(news)
## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ timedelta                    : int  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : int  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : int  219 255 211 531 1072 370 960 989 97 231 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : int  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : int  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : int  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : int  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: int  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : int  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : int  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : int  593 711 1500 1200 505 855 556 891 3600 710 ...

Subset the Data:

newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens,
                        news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos,
                        news$average_token_length, news$num_keywords, news$kw_max_max,
                        news$global_sentiment_polarity, news$avg_positive_polarity,
                        news$title_subjectivity, news$title_sentiment_polarity,
                        news$abs_title_subjectivity, news$abs_title_sentiment_polarity,
                        news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens",
                         "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos",
                         "average_token_length", "num_keywords", "kw_max_max",
                         "global_sentiment_polarity", "avg_positive_polarity",
                         "title_subjectivity", "title_sentiment_polarity",
                         "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
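
The same 17-column subset can also be built by indexing the original data frame by column name, which avoids repeating the news$ prefix and the separate colnames() call. A sketch that should produce an identical newsShort:

keep <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words",
          "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords",
          "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity",
          "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity",
          "abs_title_sentiment_polarity", "shares")
newsShort <- news[, keep]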

Pre-Processing the Data

# Label an article "yes" (popular) if it received at least 1,400 shares, "no" otherwise
newsShort$popular <- ifelse(newsShort$shares >= 1400, "yes", "no")
# Replace the numeric share counts with the yes/no labels
newsShort$shares <- newsShort$popular

newsShort$shares <- as.factor(newsShort$shares)
set.seed(12345)
# order(runif(10000)) is a random permutation of 1:10000, so this keeps
# the first 10,000 rows of newsShort in shuffled order
news_rand <- newsShort[order(runif(10000)), ]
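
If the goal was instead to draw a random sample of 10,000 articles from all 39,644 rows (rather than shuffle only the first 10,000), a sample()-based sketch would be the following; note that using it would change the proportions and results reported below:

set.seed(12345)
news_rand <- newsShort[sample(nrow(newsShort), 10000), ]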

Let us start the classification analysis:

#Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]

Check the class proportions after randomization:

prop.table(table(news_train$shares))
## 
##        no       yes 
## 0.4308889 0.5691111
prop.table(table(news_test$shares))
## 
##    no   yes 
## 0.414 0.586
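
The class proportions differ slightly between training (0.431/0.569) and test (0.414/0.586). If an exactly stratified split were preferred, one option is createDataPartition() from the caret package (an extra dependency not loaded above); a sketch:

library(caret)
set.seed(12345)
# send 90% of the rows of each class to training, the rest to testing
train_idx <- createDataPartition(news_rand$shares, p = 0.9, list = FALSE)
news_train_strat <- news_rand[train_idx, ]
news_test_strat  <- news_rand[-train_idx, ]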

Let us Train the Model:

# Drop column 17 (shares, the outcome); note that column 18 (popular) is still among the predictors
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)
## 
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Jul 01 19:20:32 2018
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 9000 cases (18 attributes) from undefined.data
## 
## Decision tree:
## 
## popular = no: no (3878)
## popular = yes: yes (5122)
## 
## 
## Evaluation on training data (9000 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       2    0( 0.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    3878          (a): class no
##          5122    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% popular
## 
## 
## Time: 0.1 secs

Evaluate the Model:

Before scoring the test set, note what the tree above actually learned: its only split is on popular, which is an exact copy of the relabeled shares outcome. Since news_train[-17] removes only the shares column, popular remains among the predictors, so the 0% training error reflects label leakage rather than genuine predictive power.

news_pred <- predict(news_model, news_test)
(p <- table(news_pred, news_test$shares))
##          
## news_pred  no yes
##       no  414   0
##       yes   0 586
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 100
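
The perfect accuracy above is an artifact of the leaked popular column noted earlier. A leakage-free sketch would drop popular from the predictors before training; retraining this way would give different (and more realistic) accuracy figures:

# keep every column except the outcome and its duplicate label
predictors <- setdiff(names(news_train), c("shares", "popular"))
news_model_clean <- C5.0(news_train[, predictors], news_train$shares)
news_pred_clean <- predict(news_model_clean, news_test[, predictors])
(p_clean <- table(news_pred_clean, news_test$shares))
sum(diag(p_clean)) / sum(p_clean) * 100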

Let’s Visualize the Class Distribution:

plot(newsShort$shares)

The bar chart shows the yes/no dichotomy across the full dataset; in the test set the counts are:

summary(news_test$shares)
##  no yes 
## 414 586

Finally, let us re-examine the predictions with a cross-tabulation:

library(gmodels)
CrossTable(news_test$shares, news_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual shares', 'predicted shares'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1000 
## 
##  
##               | predicted shares 
## actual shares |        no |       yes | Row Total | 
## --------------|-----------|-----------|-----------|
##            no |       414 |         0 |       414 | 
##               |     0.414 |     0.000 |           | 
## --------------|-----------|-----------|-----------|
##           yes |         0 |       586 |       586 | 
##               |     0.000 |     0.586 |           | 
## --------------|-----------|-----------|-----------|
##  Column Total |       414 |       586 |      1000 | 
## --------------|-----------|-----------|-----------|
## 
##
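
Because the confusion matrix p stores predictions in the rows and actual classes in the columns, other summary rates can be read off directly. A short sketch (here both rates equal 1 because of the leakage discussed above):

TP <- p["yes", "yes"]; FN <- p["no", "yes"]
TN <- p["no", "no"];   FP <- p["yes", "no"]
(sensitivity <- TP / (TP + FN))  # share of truly popular articles predicted "yes"
(specificity <- TN / (TN + FP))  # share of unpopular articles predicted "no"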