Predictive model for online news popularity (number of shares) using a linear regression model
data <- read.csv("OnlineNewsPopularityShort.csv")
data <- na.omit(data)
str(data)
## 'data.frame': 4000 obs. of 62 variables:
## $ X : int 38003 19500 1454 2778 295 25639 34775 9529 31349 30609 ...
## $ url : chr "http://mashable.com/2014/12/01/tony-blair-christmas-card/" "http://mashable.com/2014/01/28/nate-silver-staffs-up-as-the-relaunch-of-fivethirtyeight-nears/" "http://mashable.com/2013/01/31/nfl-star-jj-watt-proposes-fan/" "http://mashable.com/2013/02/24/tesla-nyt-comic/" ...
## $ timedelta : int 36 345 707 683 727 231 84 549 137 150 ...
## $ n_tokens_title : int 9 8 12 12 9 11 11 11 12 11 ...
## $ n_tokens_content : int 193 618 1045 95 328 1157 264 626 717 174 ...
## $ n_unique_tokens : num 0.683 0.478 0.506 0.789 0.61 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.841 0.774 0.691 0.867 0.794 ...
## $ num_hrefs : int 2 4 5 4 11 8 7 8 8 2 ...
## $ num_self_hrefs : int 1 3 3 2 0 4 2 5 6 2 ...
## $ num_imgs : int 1 1 1 0 1 11 1 1 1 1 ...
## $ num_videos : int 0 0 26 0 0 1 2 0 2 2 ...
## $ average_token_length : num 4.98 4.25 4.41 4.62 4.95 ...
## $ num_keywords : int 6 7 5 10 7 8 10 4 6 7 ...
## $ data_channel_is_lifestyle : int 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_entertainment: int 0 0 0 0 0 1 1 0 0 1 ...
## $ data_channel_is_bus : int 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : int 0 0 0 0 0 0 0 1 0 0 ...
## $ data_channel_is_tech : int 0 1 0 0 1 0 0 0 0 0 ...
## $ data_channel_is_world : int 1 0 0 0 0 0 0 0 1 0 ...
## $ kw_min_min : int -1 -1 217 217 217 -1 -1 4 -1 -1 ...
## $ kw_max_min : num 502 709 341 3100 593 731 1100 213 385 407 ...
## $ kw_avg_min : num 167 214 291 1014 356 ...
## $ kw_min_max : int 0 0 5000 0 0 4300 0 24900 6700 0 ...
## $ kw_max_max : int 843300 843300 69100 80400 28000 843300 843300 843300 843300 843300 ...
## $ kw_avg_max : num 216200 161986 27800 28370 15986 ...
## $ kw_min_avg : num 0 0 1286 0 0 ...
## $ kw_max_avg : num 3409 3574 2546 8600 5150 ...
## $ kw_avg_avg : num 2037 2429 2055 3490 2072 ...
## $ self_reference_min_shares : num 1800 0 0 1400 0 10700 7600 2400 1400 2500 ...
## $ self_reference_max_shares : int 1800 0 0 1400 0 16300 7600 2400 23100 2500 ...
## $ self_reference_avg_sharess : num 1800 0 0 1400 0 ...
## $ weekday_is_monday : int 0 0 0 0 0 0 0 1 0 1 ...
## $ weekday_is_tuesday : int 0 1 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : int 1 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : int 0 0 1 0 0 1 1 0 0 0 ...
## $ weekday_is_friday : int 0 0 0 0 1 0 0 0 0 0 ...
## $ weekday_is_saturday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : int 0 0 0 1 0 0 0 0 1 0 ...
## $ is_weekend : int 0 0 0 1 0 0 0 0 1 0 ...
## $ LDA_00 : num 0.366 0.316 0.04 0.12 0.31 ...
## $ LDA_01 : num 0.0333 0.4563 0.0405 0.7192 0.0286 ...
## $ LDA_02 : num 0.5341 0.0286 0.2576 0.02 0.0286 ...
## $ LDA_03 : num 0.0333 0.0286 0.6219 0.0222 0.0286 ...
## $ LDA_04 : num 0.0333 0.1708 0.04 0.1186 0.6039 ...
## $ global_subjectivity : num 0.353 0.435 0.452 0.397 0.522 ...
## $ global_sentiment_polarity : num -0.1104 0.1648 -0.0246 0.2197 0.1801 ...
## $ global_rate_positive_words : num 0.0104 0.0502 0.0555 0.0842 0.0457 ...
## $ global_rate_negative_words : num 0.0207 0.0162 0.0517 0.0211 0.0152 ...
## $ rate_positive_words : num 0.333 0.756 0.518 0.8 0.75 ...
## $ rate_negative_words : num 0.667 0.244 0.482 0.2 0.25 ...
## $ avg_positive_polarity : num 0.157 0.423 0.395 0.384 0.409 ...
## $ min_positive_polarity : num 0.1 0.1 0.05 0.136 0.1 ...
## $ max_positive_polarity : num 0.214 1 1 0.8 1 ...
## $ avg_negative_polarity : num -0.438 -0.343 -0.437 -0.328 -0.327 ...
## $ min_negative_polarity : num -0.6 -0.5 -1 -0.5 -0.5 -1 -0.25 -0.4 -0.4 -0.6 ...
## $ max_negative_polarity : num -0.25 -0.167 -0.05 -0.156 -0.167 ...
## $ title_subjectivity : num 0 0.833 0.2 0.244 0 ...
## $ title_sentiment_polarity : num 0 -0.5 0.1 0.0222 0 ...
## $ abs_title_subjectivity : num 0.5 0.333 0.3 0.256 0.5 ...
## $ abs_title_sentiment_polarity : num 0 0.5 0.1 0.0222 0 ...
## $ shares : int 393 1200 1700 2400 1600 22500 4700 6400 1300 1700 ...
data2 <- data[4:62] # drop the non-predictive columns X (row id), url, and timedelta
Getting the correlation coefficients among the independent variables
cor_mat <- cor(data2[, 1:58])
#cor_mat
The following loop prints the indices of the "cor_mat" matrix (upper triangle only, so each pair is reported once) where the correlation coefficient is greater than 0.7
for(i in 1:58){
  for(j in i:58){
    # note: only strong positive correlations are flagged here
    if(i != j && cor_mat[i, j] > 0.7){
      print(paste0("i =", i, " & j = ", j))
    }
  }
}
## [1] "i =3 & j = 4"
## [1] "i =3 & j = 5"
## [1] "i =4 & j = 5"
## [1] "i =14 & j = 38"
## [1] "i =16 & j = 42"
## [1] "i =17 & j = 40"
## [1] "i =19 & j = 20"
## [1] "i =25 & j = 26"
## [1] "i =27 & j = 29"
## [1] "i =28 & j = 29"
## [1] "i =44 & j = 47"
## [1] "i =46 & j = 48"
## [1] "i =49 & j = 51"
## [1] "i =52 & j = 53"
## [1] "i =55 & j = 58"
Now removing from "data2" the independent variables that have a high linear relationship with another independent variable (i.e. cor > 0.7)
data2.filtered <- data2[ , -c(4,5,20,26,29,38,40,42,47,48,51,53,58)] # drop one variable from each highly correlated pair found above
set.seed(105)
train_index <- sample(1:4000, (0.8*4000))
train_data2.filtered <- data2.filtered[train_index, ]
test_data2.filtered <- data2.filtered[-train_index, ]
lm1 <- lm(shares ~., data = train_data2.filtered)
summary(lm1) # Adjusted R-squared: 0.111 , p-value: < 2.2e-16
##
## Call:
## lm(formula = shares ~ ., data = train_data2.filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168243 -2369 -879 696 566979
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.563e+02 3.312e+03 -0.228 0.819398
## n_tokens_title -2.064e+01 1.121e+02 -0.184 0.853897
## n_tokens_content 3.385e-01 6.882e-01 0.492 0.622858
## n_unique_tokens 1.114e+01 1.882e+01 0.592 0.553844
## num_hrefs 3.063e+01 2.593e+01 1.181 0.237584
## num_self_hrefs -6.356e+01 7.462e+01 -0.852 0.394418
## num_imgs 9.094e-02 3.205e+01 0.003 0.997736
## num_videos -1.600e+01 5.730e+01 -0.279 0.780063
## average_token_length -8.804e+02 4.076e+02 -2.160 0.030857 *
## num_keywords 9.950e+01 1.469e+02 0.677 0.498309
## data_channel_is_lifestyle 9.569e+02 1.524e+03 0.628 0.530160
## data_channel_is_entertainment 1.317e+03 9.643e+02 1.366 0.172092
## data_channel_is_bus 3.235e+03 1.366e+03 2.368 0.017933 *
## data_channel_is_socmed 1.822e+03 1.487e+03 1.226 0.220469
## data_channel_is_tech 2.201e+03 1.375e+03 1.601 0.109494
## data_channel_is_world 1.334e+03 1.383e+03 0.965 0.334765
## kw_min_min 6.284e+00 6.953e+00 0.904 0.366197
## kw_max_min -1.092e-01 9.451e-02 -1.155 0.247989
## kw_min_max -6.664e-03 4.676e-03 -1.425 0.154255
## kw_max_max -1.192e-03 2.423e-03 -0.492 0.622676
## kw_avg_max 6.073e-03 3.129e-03 1.941 0.052356 .
## kw_min_avg 2.578e-02 2.354e-01 0.109 0.912820
## kw_max_avg 2.361e-01 6.520e-02 3.621 0.000299 ***
## self_reference_min_shares 2.454e-01 1.622e-02 15.128 < 2e-16 ***
## self_reference_max_shares 1.362e-03 9.364e-03 0.145 0.884364
## weekday_is_monday 9.523e+02 1.080e+03 0.882 0.378104
## weekday_is_tuesday -7.744e+02 1.070e+03 -0.724 0.469328
## weekday_is_wednesday -2.473e+02 1.070e+03 -0.231 0.817255
## weekday_is_thursday -9.405e+02 1.075e+03 -0.875 0.381696
## weekday_is_friday -6.620e+02 1.104e+03 -0.600 0.548723
## weekday_is_saturday 5.529e+02 1.292e+03 0.428 0.668683
## weekday_is_sunday NA NA NA NA
## is_weekend NA NA NA NA
## LDA_01 1.132e+03 1.740e+03 0.651 0.515265
## LDA_03 2.158e+02 1.633e+03 0.132 0.894881
## global_subjectivity 3.332e+03 3.253e+03 1.024 0.305766
## global_sentiment_polarity 6.117e+03 6.619e+03 0.924 0.355448
## global_rate_positive_words -1.772e+04 2.347e+04 -0.755 0.450235
## global_rate_negative_words 3.545e+04 3.777e+04 0.939 0.347965
## avg_positive_polarity -7.855e+03 4.722e+03 -1.663 0.096328 .
## min_positive_polarity 4.413e+03 4.384e+03 1.007 0.314202
## avg_negative_polarity -7.386e+03 3.346e+03 -2.208 0.027335 *
## max_negative_polarity 1.213e+02 3.491e+03 0.035 0.972282
## title_subjectivity 6.805e+02 8.269e+02 0.823 0.410555
## title_sentiment_polarity 1.465e+03 9.115e+02 1.608 0.108015
## abs_title_subjectivity 2.815e+03 1.404e+03 2.005 0.045059 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12950 on 3156 degrees of freedom
## Multiple R-squared: 0.1229, Adjusted R-squared: 0.111
## F-statistic: 10.29 on 43 and 3156 DF, p-value: < 2.2e-16
prediction1 <- predict(lm1, test_data2.filtered)
## Warning in predict.lm(lm1, test_data2.filtered): prediction from a rank-
## deficient fit may be misleading
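The warning (and the two NA coefficient rows above) arise because weekday_is_sunday and is_weekend are exact linear combinations of the other weekday dummies, so lm() drops them. A small sketch to confirm which terms are aliased, using alias() from base R:
# show the coefficients that were dropped as exact linear combinations of other predictors
alias(lm1)$Complete
# refitting without the redundant dummies would remove the rank deficiency, e.g.
# lm1b <- lm(shares ~ . - weekday_is_sunday - is_weekend, data = train_data2.filtered)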
In the following five lines of code, I extract the independent variables whose p-value is less than 0.05 (i.e. the significant variables) in "lm1"
lm1.summary <- summary(lm1) # avoid masking the base summary() function
coeff <- lm1.summary$coefficients
str(coeff)
## num [1:44, 1:4] -756.254 -20.644 0.338 11.142 30.626 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:44] "(Intercept)" "n_tokens_title" "n_tokens_content" "n_unique_tokens" ...
## ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
P_value <- coeff[,4]
which(P_value < 0.05)
## average_token_length data_channel_is_bus kw_max_avg
## 9 13 23
## self_reference_min_shares avg_negative_polarity abs_title_subjectivity
## 24 40 44
Now, using the output of the above code, subsetting the previous training and test data so that they contain only the significant variables
train_data2.reduced <- train_data2.filtered[c("average_token_length", "data_channel_is_bus", "kw_max_avg", "self_reference_min_shares", "avg_negative_polarity", "abs_title_subjectivity", "shares")]
test_data2.reduced <- test_data2.filtered[c("average_token_length", "data_channel_is_bus", "kw_max_avg", "self_reference_min_shares", "avg_negative_polarity", "abs_title_subjectivity", "shares")]
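The same subsets can also be built programmatically from P_value instead of typing the names by hand; a sketch (the helper object sig_vars is introduced here, not part of the original code):
# names of the significant predictors, dropping the intercept if it were ever flagged
sig_vars <- setdiff(names(which(P_value < 0.05)), "(Intercept)")
train_data2.reduced <- train_data2.filtered[, c(sig_vars, "shares")]
test_data2.reduced <- test_data2.filtered[, c(sig_vars, "shares")]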
lm2 <- lm(shares ~., data = train_data2.reduced)
summary(lm2)
##
## Call:
## lm(formula = shares ~ ., data = train_data2.reduced)
##
## Residuals:
## Min 1Q Median 3Q Max
## -169205 -2228 -978 294 569055
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.758e+03 1.336e+03 2.065 0.038984 *
## average_token_length -9.041e+02 2.877e+02 -3.143 0.001690 **
## data_channel_is_bus 1.969e+03 6.418e+02 3.068 0.002170 **
## kw_max_avg 1.909e-01 4.809e-02 3.969 7.39e-05 ***
## self_reference_min_shares 2.481e-01 1.322e-02 18.769 < 2e-16 ***
## avg_negative_polarity -7.087e+03 1.880e+03 -3.769 0.000167 ***
## abs_title_subjectivity 1.791e+03 1.195e+03 1.499 0.133856
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12930 on 3193 degrees of freedom
## Multiple R-squared: 0.1146, Adjusted R-squared: 0.1129
## F-statistic: 68.88 on 6 and 3193 DF, p-value: < 2.2e-16
In this model we get Adjusted R-squared: 0.1129 and p-value < 2.2e-16
Prediction on test data
prediction2 <- predict(lm2, test_data2.reduced)
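Neither set of predictions is scored numerically above; a minimal sketch of test-set RMSE (the rmse helper is defined here for illustration, not part of the original code) makes the comparison between lm1 and lm2 concrete:
# root mean squared error of each model on the held-out test set
rmse <- function(actual, predicted) sqrt(mean((actual - predicted)^2))
rmse(test_data2.filtered$shares, prediction1)
rmse(test_data2.reduced$shares, prediction2)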
From the summaries of lm1 and lm2 we can see that the p-value of each model is less than 0.05, so the models are statistically meaningful. However, a linear model is not suitable for this prediction analysis since the adjusted R-squared is very low (and the residuals are very large).
Predictive model for online news popularity (number of shares) using a CHAID model
Data <- read.csv("OnlineNewsPopularityShort.csv")
Data <- na.omit(Data)
str(Data)
## 'data.frame': 4000 obs. of 62 variables:
## $ X : int 38003 19500 1454 2778 295 25639 34775 9529 31349 30609 ...
## $ url : chr "http://mashable.com/2014/12/01/tony-blair-christmas-card/" "http://mashable.com/2014/01/28/nate-silver-staffs-up-as-the-relaunch-of-fivethirtyeight-nears/" "http://mashable.com/2013/01/31/nfl-star-jj-watt-proposes-fan/" "http://mashable.com/2013/02/24/tesla-nyt-comic/" ...
## $ timedelta : int 36 345 707 683 727 231 84 549 137 150 ...
## $ n_tokens_title : int 9 8 12 12 9 11 11 11 12 11 ...
## $ n_tokens_content : int 193 618 1045 95 328 1157 264 626 717 174 ...
## $ n_unique_tokens : num 0.683 0.478 0.506 0.789 0.61 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.841 0.774 0.691 0.867 0.794 ...
## $ num_hrefs : int 2 4 5 4 11 8 7 8 8 2 ...
## $ num_self_hrefs : int 1 3 3 2 0 4 2 5 6 2 ...
## $ num_imgs : int 1 1 1 0 1 11 1 1 1 1 ...
## $ num_videos : int 0 0 26 0 0 1 2 0 2 2 ...
## $ average_token_length : num 4.98 4.25 4.41 4.62 4.95 ...
## $ num_keywords : int 6 7 5 10 7 8 10 4 6 7 ...
## $ data_channel_is_lifestyle : int 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_entertainment: int 0 0 0 0 0 1 1 0 0 1 ...
## $ data_channel_is_bus : int 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : int 0 0 0 0 0 0 0 1 0 0 ...
## $ data_channel_is_tech : int 0 1 0 0 1 0 0 0 0 0 ...
## $ data_channel_is_world : int 1 0 0 0 0 0 0 0 1 0 ...
## $ kw_min_min : int -1 -1 217 217 217 -1 -1 4 -1 -1 ...
## $ kw_max_min : num 502 709 341 3100 593 731 1100 213 385 407 ...
## $ kw_avg_min : num 167 214 291 1014 356 ...
## $ kw_min_max : int 0 0 5000 0 0 4300 0 24900 6700 0 ...
## $ kw_max_max : int 843300 843300 69100 80400 28000 843300 843300 843300 843300 843300 ...
## $ kw_avg_max : num 216200 161986 27800 28370 15986 ...
## $ kw_min_avg : num 0 0 1286 0 0 ...
## $ kw_max_avg : num 3409 3574 2546 8600 5150 ...
## $ kw_avg_avg : num 2037 2429 2055 3490 2072 ...
## $ self_reference_min_shares : num 1800 0 0 1400 0 10700 7600 2400 1400 2500 ...
## $ self_reference_max_shares : int 1800 0 0 1400 0 16300 7600 2400 23100 2500 ...
## $ self_reference_avg_sharess : num 1800 0 0 1400 0 ...
## $ weekday_is_monday : int 0 0 0 0 0 0 0 1 0 1 ...
## $ weekday_is_tuesday : int 0 1 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : int 1 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : int 0 0 1 0 0 1 1 0 0 0 ...
## $ weekday_is_friday : int 0 0 0 0 1 0 0 0 0 0 ...
## $ weekday_is_saturday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : int 0 0 0 1 0 0 0 0 1 0 ...
## $ is_weekend : int 0 0 0 1 0 0 0 0 1 0 ...
## $ LDA_00 : num 0.366 0.316 0.04 0.12 0.31 ...
## $ LDA_01 : num 0.0333 0.4563 0.0405 0.7192 0.0286 ...
## $ LDA_02 : num 0.5341 0.0286 0.2576 0.02 0.0286 ...
## $ LDA_03 : num 0.0333 0.0286 0.6219 0.0222 0.0286 ...
## $ LDA_04 : num 0.0333 0.1708 0.04 0.1186 0.6039 ...
## $ global_subjectivity : num 0.353 0.435 0.452 0.397 0.522 ...
## $ global_sentiment_polarity : num -0.1104 0.1648 -0.0246 0.2197 0.1801 ...
## $ global_rate_positive_words : num 0.0104 0.0502 0.0555 0.0842 0.0457 ...
## $ global_rate_negative_words : num 0.0207 0.0162 0.0517 0.0211 0.0152 ...
## $ rate_positive_words : num 0.333 0.756 0.518 0.8 0.75 ...
## $ rate_negative_words : num 0.667 0.244 0.482 0.2 0.25 ...
## $ avg_positive_polarity : num 0.157 0.423 0.395 0.384 0.409 ...
## $ min_positive_polarity : num 0.1 0.1 0.05 0.136 0.1 ...
## $ max_positive_polarity : num 0.214 1 1 0.8 1 ...
## $ avg_negative_polarity : num -0.438 -0.343 -0.437 -0.328 -0.327 ...
## $ min_negative_polarity : num -0.6 -0.5 -1 -0.5 -0.5 -1 -0.25 -0.4 -0.4 -0.6 ...
## $ max_negative_polarity : num -0.25 -0.167 -0.05 -0.156 -0.167 ...
## $ title_subjectivity : num 0 0.833 0.2 0.244 0 ...
## $ title_sentiment_polarity : num 0 -0.5 0.1 0.0222 0 ...
## $ abs_title_subjectivity : num 0.5 0.333 0.3 0.256 0.5 ...
## $ abs_title_sentiment_polarity : num 0 0.5 0.1 0.0222 0 ...
## $ shares : int 393 1200 1700 2400 1600 22500 4700 6400 1300 1700 ...
Data2 <- Data[4:62] # drop X, url, and timedelta as before
Data2$Popularity <- ifelse(Data2$shares > 50000, 1, 0)
hist(Data2$Popularity)

table(Data2$Popularity)
##
## 0 1
## 3983 17
cart.Data <- Data2 ## This "cart.Data" will be used in CART model
Data2$Popularity <- as.factor(Data2$Popularity)
The following function converts a numeric/integer vector into a categorical variable. If the vector contains more than 5 distinct values, it is broken into five categories based on quantiles.
to_categorical <- function(x){ # renamed so it does not mask base R's cat()
  if(length(unique(x)) > 5){
    # quintile breakpoints: values are mapped to categories 1-5
    temp <- quantile(x, c(0.2, 0.4, 0.6, 0.8, 1))
    temp2 <- as.factor(ifelse(x <= temp[1], 1,
                       ifelse(x <= temp[2], 2,
                       ifelse(x <= temp[3], 3,
                       ifelse(x <= temp[4], 4, 5)))))
  }
  else{
    # 5 or fewer distinct values: just treat them as factor levels
    temp2 <- as.factor(x)
  }
  temp2
}
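For example (a hypothetical check, not part of the original run), applying it to a single numeric column and inspecting the resulting bin sizes:
# num_hrefs has more than 5 distinct values, so it is binned into quintile-based categories
table(to_categorical(Data2$num_hrefs))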
Converting the first 59 columns of "Data2" (all attributes except Popularity, which is already a factor) into categorical variables
for(i in 1:59){
  Data2[,i] <- to_categorical(Data2[,i])
}
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
set.seed(105)
train.index <- sample(1:nrow(Data2), (0.8*nrow(Data2)))
train.Data2 <- Data2[train.index, ]
test.Data2 <- Data2[-train.index, ]
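With only 17 positive cases, a plain random split can leave the test set with almost no Popularity = 1 rows. A hedged alternative is a stratified split via caret::createDataPartition, which preserves the class ratio in both sets (shown as a sketch; the rest of the analysis keeps the random split above):
# stratified 80/20 split that keeps the 0/1 proportion of Popularity in both sets
set.seed(105)
strat.index <- createDataPartition(Data2$Popularity, p = 0.8, list = FALSE)
table(Data2[strat.index, ]$Popularity)
table(Data2[-strat.index, ]$Popularity)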
library(caret)
library(CHAID)
## Loading required package: partykit
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
ch2 <- chaid(Popularity ~., train.Data2)
plot(ch2)

prediction2 <- predict(ch2, test.Data2)
confusionMatrix(prediction2, test.Data2$Popularity)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 796 3
## 1 1 0
##
## Accuracy : 0.995
## 95% CI : (0.9872, 0.9986)
## No Information Rate : 0.9962
## P-Value [Acc > NIR] : 0.8156
##
## Kappa : -0.0019
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.9987
## Specificity : 0.0000
## Pos Pred Value : 0.9962
## Neg Pred Value : 0.0000
## Prevalence : 0.9962
## Detection Rate : 0.9950
## Detection Prevalence : 0.9988
## Balanced Accuracy : 0.4994
##
## 'Positive' Class : 0
##
Now creating a CHAID model after removing the "shares" variable, as it is directly related to "Popularity"
Data3 <- Data2[,-59]
train.Data3 <- Data3[train.index, ]
test.Data3 <- Data3[-train.index, ]
ch3 <- chaid(Popularity ~., train.Data3) # fit on the training split only; fitting on all of Data3 would leak the test rows
plot(ch3)

prediction3 <- predict(ch3, test.Data3)
confusionMatrix(prediction3, test.Data3$Popularity)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 797 3
## 1 0 0
##
## Accuracy : 0.9962
## 95% CI : (0.9891, 0.9992)
## No Information Rate : 0.9962
## P-Value [Acc > NIR] : 0.6472
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.2482
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9962
## Neg Pred Value : NaN
## Prevalence : 0.9962
## Detection Rate : 0.9962
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
Neither CHAID model (ch2 or ch3) ever predicts Popularity correctly when it is 1, because there are very few observations with Popularity = 1. On the other hand, both CHAID models predict Popularity = 0 very well.
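One hedged way to mitigate the imbalance (not attempted in this analysis) is to downsample the majority class with caret::downSample before fitting; with only a handful of positive cases in the training split the balanced set would be very small, so this is illustrative rather than a fix:
# build a balanced training set by sampling the class-0 rows down to the class-1 count
balanced.train <- downSample(x = train.Data3[, setdiff(names(train.Data3), "Popularity")],
                             y = train.Data3$Popularity, yname = "Popularity")
table(balanced.train$Popularity)
# ch3.balanced <- chaid(Popularity ~ ., data = balanced.train)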
Predictive model for online news popularity (number of shares) using a CART (Classification and Regression Tree) model
set.seed(105)
train.index <- sample(1:nrow(Data2), (0.8*nrow(Data2)))
cart.Data <- cart.Data[, -59] # removing "shares" variable from data
str(cart.Data)
## 'data.frame': 4000 obs. of 59 variables:
## $ n_tokens_title : int 9 8 12 12 9 11 11 11 12 11 ...
## $ n_tokens_content : int 193 618 1045 95 328 1157 264 626 717 174 ...
## $ n_unique_tokens : num 0.683 0.478 0.506 0.789 0.61 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.841 0.774 0.691 0.867 0.794 ...
## $ num_hrefs : int 2 4 5 4 11 8 7 8 8 2 ...
## $ num_self_hrefs : int 1 3 3 2 0 4 2 5 6 2 ...
## $ num_imgs : int 1 1 1 0 1 11 1 1 1 1 ...
## $ num_videos : int 0 0 26 0 0 1 2 0 2 2 ...
## $ average_token_length : num 4.98 4.25 4.41 4.62 4.95 ...
## $ num_keywords : int 6 7 5 10 7 8 10 4 6 7 ...
## $ data_channel_is_lifestyle : int 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_entertainment: int 0 0 0 0 0 1 1 0 0 1 ...
## $ data_channel_is_bus : int 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : int 0 0 0 0 0 0 0 1 0 0 ...
## $ data_channel_is_tech : int 0 1 0 0 1 0 0 0 0 0 ...
## $ data_channel_is_world : int 1 0 0 0 0 0 0 0 1 0 ...
## $ kw_min_min : int -1 -1 217 217 217 -1 -1 4 -1 -1 ...
## $ kw_max_min : num 502 709 341 3100 593 731 1100 213 385 407 ...
## $ kw_avg_min : num 167 214 291 1014 356 ...
## $ kw_min_max : int 0 0 5000 0 0 4300 0 24900 6700 0 ...
## $ kw_max_max : int 843300 843300 69100 80400 28000 843300 843300 843300 843300 843300 ...
## $ kw_avg_max : num 216200 161986 27800 28370 15986 ...
## $ kw_min_avg : num 0 0 1286 0 0 ...
## $ kw_max_avg : num 3409 3574 2546 8600 5150 ...
## $ kw_avg_avg : num 2037 2429 2055 3490 2072 ...
## $ self_reference_min_shares : num 1800 0 0 1400 0 10700 7600 2400 1400 2500 ...
## $ self_reference_max_shares : int 1800 0 0 1400 0 16300 7600 2400 23100 2500 ...
## $ self_reference_avg_sharess : num 1800 0 0 1400 0 ...
## $ weekday_is_monday : int 0 0 0 0 0 0 0 1 0 1 ...
## $ weekday_is_tuesday : int 0 1 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : int 1 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : int 0 0 1 0 0 1 1 0 0 0 ...
## $ weekday_is_friday : int 0 0 0 0 1 0 0 0 0 0 ...
## $ weekday_is_saturday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : int 0 0 0 1 0 0 0 0 1 0 ...
## $ is_weekend : int 0 0 0 1 0 0 0 0 1 0 ...
## $ LDA_00 : num 0.366 0.316 0.04 0.12 0.31 ...
## $ LDA_01 : num 0.0333 0.4563 0.0405 0.7192 0.0286 ...
## $ LDA_02 : num 0.5341 0.0286 0.2576 0.02 0.0286 ...
## $ LDA_03 : num 0.0333 0.0286 0.6219 0.0222 0.0286 ...
## $ LDA_04 : num 0.0333 0.1708 0.04 0.1186 0.6039 ...
## $ global_subjectivity : num 0.353 0.435 0.452 0.397 0.522 ...
## $ global_sentiment_polarity : num -0.1104 0.1648 -0.0246 0.2197 0.1801 ...
## $ global_rate_positive_words : num 0.0104 0.0502 0.0555 0.0842 0.0457 ...
## $ global_rate_negative_words : num 0.0207 0.0162 0.0517 0.0211 0.0152 ...
## $ rate_positive_words : num 0.333 0.756 0.518 0.8 0.75 ...
## $ rate_negative_words : num 0.667 0.244 0.482 0.2 0.25 ...
## $ avg_positive_polarity : num 0.157 0.423 0.395 0.384 0.409 ...
## $ min_positive_polarity : num 0.1 0.1 0.05 0.136 0.1 ...
## $ max_positive_polarity : num 0.214 1 1 0.8 1 ...
## $ avg_negative_polarity : num -0.438 -0.343 -0.437 -0.328 -0.327 ...
## $ min_negative_polarity : num -0.6 -0.5 -1 -0.5 -0.5 -1 -0.25 -0.4 -0.4 -0.6 ...
## $ max_negative_polarity : num -0.25 -0.167 -0.05 -0.156 -0.167 ...
## $ title_subjectivity : num 0 0.833 0.2 0.244 0 ...
## $ title_sentiment_polarity : num 0 -0.5 0.1 0.0222 0 ...
## $ abs_title_subjectivity : num 0.5 0.333 0.3 0.256 0.5 ...
## $ abs_title_sentiment_polarity : num 0 0.5 0.1 0.0222 0 ...
## $ Popularity : num 0 0 0 0 0 0 0 0 0 0 ...
hist(cart.Data$Popularity)

train_cart.Data <- cart.Data[train.index, ]
test_cart.Data <- cart.Data[-train.index, ]
library(rpart)
rpart_fit <- rpart(Popularity ~., data = train_cart.Data) # named rpart_fit so it does not mask the rpart() function
rpart_fit
## n= 3200
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 3200 13.938750 0.0043750000
## 2) self_reference_avg_sharess< 179157.1 3193 11.954900 0.0037582210
## 4) title_sentiment_polarity< 0.6583333 3094 6.984163 0.0022624430
## 8) kw_avg_avg< 5637.242 2996 3.994660 0.0013351130
## 16) global_sentiment_polarity>=-0.08537296 2940 1.998639 0.0006802721
## 32) num_imgs< 25.5 2858 0.000000 0.0000000000 *
## 33) num_imgs>=25.5 82 1.951220 0.0243902400
## 66) global_subjectivity< 0.5738459 75 0.000000 0.0000000000 *
## 67) global_subjectivity>=0.5738459 7 1.428571 0.2857143000 *
## 17) global_sentiment_polarity< -0.08537296 56 1.928571 0.0357142900
## 34) avg_negative_polarity< -0.2775871 49 0.000000 0.0000000000 *
## 35) avg_negative_polarity>=-0.2775871 7 1.428571 0.2857143000 *
## 9) kw_avg_avg>=5637.242 98 2.908163 0.0306122400
## 18) kw_avg_max< 559342.9 89 0.988764 0.0112359600 *
## 19) kw_avg_max>=559342.9 9 1.555556 0.2222222000 *
## 5) title_sentiment_polarity>=0.6583333 99 4.747475 0.0505050500
## 10) LDA_00< 0.6423025 91 1.956044 0.0219780200
## 20) kw_max_min< 1750 78 0.000000 0.0000000000 *
## 21) kw_max_min>=1750 13 1.692308 0.1538462000 *
## 11) LDA_00>=0.6423025 8 1.875000 0.3750000000 *
## 3) self_reference_avg_sharess>=179157.1 7 1.428571 0.2857143000 *
library(rpart.plot)
rpart.plot(rpart_fit, digits = 3)

prediction.rpart <- predict(rpart_fit, test_cart.Data)
table(prediction.rpart)
## prediction.rpart
## 0 0.0112359550561798 0.153846153846154 0.222222222222222
## 771 22 1 1
## 0.285714285714286 0.375
## 3 2
Since rpart was fit as a regression tree on a 0/1 target, its predictions are node means; assigning the prediction as 1 if it is greater than 0.5, and 0 otherwise.
temp <- ifelse(prediction.rpart > 0.5, 1, 0)
confusionMatrix(as.factor(temp), as.factor(test_cart.Data$Popularity))
## Warning in confusionMatrix.default(as.factor(temp),
## as.factor(test_cart.Data$Popularity)): Levels are not in the same order for
## reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 797 3
## 1 0 0
##
## Accuracy : 0.9962
## 95% CI : (0.9891, 0.9992)
## No Information Rate : 0.9962
## P-Value [Acc > NIR] : 0.6472
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.2482
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9962
## Neg Pred Value : NaN
## Prevalence : 0.9962
## Detection Rate : 0.9962
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
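Since Popularity is binary, an alternative (a sketch under the same train/test split, not part of the analysis above) is to fit rpart as a classification tree with method = "class", which predicts classes directly and makes the manual 0.5 cutoff unnecessary:
# classification tree on the binary target; the severe class imbalance is unchanged,
# so it will most likely still predict the majority class everywhere
train_cls <- train_cart.Data
test_cls <- test_cart.Data
train_cls$Popularity <- as.factor(train_cls$Popularity)
test_cls$Popularity <- as.factor(test_cls$Popularity)
rpart_cls <- rpart(Popularity ~ ., data = train_cls, method = "class")
pred_cls <- predict(rpart_cls, test_cls, type = "class")
confusionMatrix(pred_cls, test_cls$Popularity)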