Predictive model for online news popularity (number of shares) using a linear regression model

data <- read.csv("OnlineNewsPopularityShort.csv")
data <- na.omit(data)
str(data)
## 'data.frame':    4000 obs. of  62 variables:
##  $ X                            : int  38003 19500 1454 2778 295 25639 34775 9529 31349 30609 ...
##  $ url                          : chr  "http://mashable.com/2014/12/01/tony-blair-christmas-card/" "http://mashable.com/2014/01/28/nate-silver-staffs-up-as-the-relaunch-of-fivethirtyeight-nears/" "http://mashable.com/2013/01/31/nfl-star-jj-watt-proposes-fan/" "http://mashable.com/2013/02/24/tesla-nyt-comic/" ...
##  $ timedelta                    : int  36 345 707 683 727 231 84 549 137 150 ...
##  $ n_tokens_title               : int  9 8 12 12 9 11 11 11 12 11 ...
##  $ n_tokens_content             : int  193 618 1045 95 328 1157 264 626 717 174 ...
##  $ n_unique_tokens              : num  0.683 0.478 0.506 0.789 0.61 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.841 0.774 0.691 0.867 0.794 ...
##  $ num_hrefs                    : int  2 4 5 4 11 8 7 8 8 2 ...
##  $ num_self_hrefs               : int  1 3 3 2 0 4 2 5 6 2 ...
##  $ num_imgs                     : int  1 1 1 0 1 11 1 1 1 1 ...
##  $ num_videos                   : int  0 0 26 0 0 1 2 0 2 2 ...
##  $ average_token_length         : num  4.98 4.25 4.41 4.62 4.95 ...
##  $ num_keywords                 : int  6 7 5 10 7 8 10 4 6 7 ...
##  $ data_channel_is_lifestyle    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_entertainment: int  0 0 0 0 0 1 1 0 0 1 ...
##  $ data_channel_is_bus          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ data_channel_is_tech         : int  0 1 0 0 1 0 0 0 0 0 ...
##  $ data_channel_is_world        : int  1 0 0 0 0 0 0 0 1 0 ...
##  $ kw_min_min                   : int  -1 -1 217 217 217 -1 -1 4 -1 -1 ...
##  $ kw_max_min                   : num  502 709 341 3100 593 731 1100 213 385 407 ...
##  $ kw_avg_min                   : num  167 214 291 1014 356 ...
##  $ kw_min_max                   : int  0 0 5000 0 0 4300 0 24900 6700 0 ...
##  $ kw_max_max                   : int  843300 843300 69100 80400 28000 843300 843300 843300 843300 843300 ...
##  $ kw_avg_max                   : num  216200 161986 27800 28370 15986 ...
##  $ kw_min_avg                   : num  0 0 1286 0 0 ...
##  $ kw_max_avg                   : num  3409 3574 2546 8600 5150 ...
##  $ kw_avg_avg                   : num  2037 2429 2055 3490 2072 ...
##  $ self_reference_min_shares    : num  1800 0 0 1400 0 10700 7600 2400 1400 2500 ...
##  $ self_reference_max_shares    : int  1800 0 0 1400 0 16300 7600 2400 23100 2500 ...
##  $ self_reference_avg_sharess   : num  1800 0 0 1400 0 ...
##  $ weekday_is_monday            : int  0 0 0 0 0 0 0 1 0 1 ...
##  $ weekday_is_tuesday           : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : int  0 0 1 0 0 1 1 0 0 0 ...
##  $ weekday_is_friday            : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ weekday_is_saturday          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : int  0 0 0 1 0 0 0 0 1 0 ...
##  $ is_weekend                   : int  0 0 0 1 0 0 0 0 1 0 ...
##  $ LDA_00                       : num  0.366 0.316 0.04 0.12 0.31 ...
##  $ LDA_01                       : num  0.0333 0.4563 0.0405 0.7192 0.0286 ...
##  $ LDA_02                       : num  0.5341 0.0286 0.2576 0.02 0.0286 ...
##  $ LDA_03                       : num  0.0333 0.0286 0.6219 0.0222 0.0286 ...
##  $ LDA_04                       : num  0.0333 0.1708 0.04 0.1186 0.6039 ...
##  $ global_subjectivity          : num  0.353 0.435 0.452 0.397 0.522 ...
##  $ global_sentiment_polarity    : num  -0.1104 0.1648 -0.0246 0.2197 0.1801 ...
##  $ global_rate_positive_words   : num  0.0104 0.0502 0.0555 0.0842 0.0457 ...
##  $ global_rate_negative_words   : num  0.0207 0.0162 0.0517 0.0211 0.0152 ...
##  $ rate_positive_words          : num  0.333 0.756 0.518 0.8 0.75 ...
##  $ rate_negative_words          : num  0.667 0.244 0.482 0.2 0.25 ...
##  $ avg_positive_polarity        : num  0.157 0.423 0.395 0.384 0.409 ...
##  $ min_positive_polarity        : num  0.1 0.1 0.05 0.136 0.1 ...
##  $ max_positive_polarity        : num  0.214 1 1 0.8 1 ...
##  $ avg_negative_polarity        : num  -0.438 -0.343 -0.437 -0.328 -0.327 ...
##  $ min_negative_polarity        : num  -0.6 -0.5 -1 -0.5 -0.5 -1 -0.25 -0.4 -0.4 -0.6 ...
##  $ max_negative_polarity        : num  -0.25 -0.167 -0.05 -0.156 -0.167 ...
##  $ title_subjectivity           : num  0 0.833 0.2 0.244 0 ...
##  $ title_sentiment_polarity     : num  0 -0.5 0.1 0.0222 0 ...
##  $ abs_title_subjectivity       : num  0.5 0.333 0.3 0.256 0.5 ...
##  $ abs_title_sentiment_polarity : num  0 0.5 0.1 0.0222 0 ...
##  $ shares                       : int  393 1200 1700 2400 1600 22500 4700 6400 1300 1700 ...
data2 <- data[4:62]   # drop X, url, and timedelta; keep the 58 predictors plus shares

Computing the correlation matrix of the independent variables:

cor_mat <- cor(data2[, 1:58])   # renamed from "cor" to avoid masking base R's cor()
#cor_mat

The following loop prints the index pairs of the correlation matrix whose coefficient exceeds 0.7. Only the upper triangle is scanned, so each pair is reported once.

for(i in 1:57){
    for(j in (i + 1):58){            # upper triangle only: j > i
        if(cor_mat[i, j] > 0.7){
            print(paste0("i =", i, " & j = ", j))
        }
    }
}
## [1] "i =3 & j = 4"
## [1] "i =3 & j = 5"
## [1] "i =4 & j = 5"
## [1] "i =14 & j = 38"
## [1] "i =16 & j = 42"
## [1] "i =17 & j = 40"
## [1] "i =19 & j = 20"
## [1] "i =25 & j = 26"
## [1] "i =27 & j = 29"
## [1] "i =28 & j = 29"
## [1] "i =44 & j = 47"
## [1] "i =46 & j = 48"
## [1] "i =49 & j = 51"
## [1] "i =52 & j = 53"
## [1] "i =55 & j = 58"

Now removing from "data2" the independent variables that have a strong linear relationship with another independent variable (i.e. cor > 0.7):

data2.filtered <- data2[ , -c(4,5,20,26,29,38,40,42,47,48,51,53,58)]
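As a sanity check (a sketch, not in the original output), the dropped indices can be mapped back to column names:

colnames(data2)[c(4, 5, 20, 26, 29, 38, 40, 42, 47, 48, 51, 53, 58)]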

set.seed(105)
train_index <- sample(1:nrow(data2.filtered), 0.8 * nrow(data2.filtered))   # 80/20 split
train_data2.filtered <- data2.filtered[train_index, ]
test_data2.filtered <- data2.filtered[-train_index, ]
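An alternative worth noting (a sketch assuming the caret package, which this analysis loads later anyway): createDataPartition() stratifies the split on the response, so train and test get similar shares distributions.

library(caret)
strat_index <- createDataPartition(data2.filtered$shares, p = 0.8, list = FALSE)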

lm1 <- lm(shares ~., data = train_data2.filtered)
summary(lm1) # Adjusted R-squared:  0.111 , p-value: < 2.2e-16
## 
## Call:
## lm(formula = shares ~ ., data = train_data2.filtered)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -168243   -2369    -879     696  566979 
## 
## Coefficients: (2 not defined because of singularities)
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -7.563e+02  3.312e+03  -0.228 0.819398    
## n_tokens_title                -2.064e+01  1.121e+02  -0.184 0.853897    
## n_tokens_content               3.385e-01  6.882e-01   0.492 0.622858    
## n_unique_tokens                1.114e+01  1.882e+01   0.592 0.553844    
## num_hrefs                      3.063e+01  2.593e+01   1.181 0.237584    
## num_self_hrefs                -6.356e+01  7.462e+01  -0.852 0.394418    
## num_imgs                       9.094e-02  3.205e+01   0.003 0.997736    
## num_videos                    -1.600e+01  5.730e+01  -0.279 0.780063    
## average_token_length          -8.804e+02  4.076e+02  -2.160 0.030857 *  
## num_keywords                   9.950e+01  1.469e+02   0.677 0.498309    
## data_channel_is_lifestyle      9.569e+02  1.524e+03   0.628 0.530160    
## data_channel_is_entertainment  1.317e+03  9.643e+02   1.366 0.172092    
## data_channel_is_bus            3.235e+03  1.366e+03   2.368 0.017933 *  
## data_channel_is_socmed         1.822e+03  1.487e+03   1.226 0.220469    
## data_channel_is_tech           2.201e+03  1.375e+03   1.601 0.109494    
## data_channel_is_world          1.334e+03  1.383e+03   0.965 0.334765    
## kw_min_min                     6.284e+00  6.953e+00   0.904 0.366197    
## kw_max_min                    -1.092e-01  9.451e-02  -1.155 0.247989    
## kw_min_max                    -6.664e-03  4.676e-03  -1.425 0.154255    
## kw_max_max                    -1.192e-03  2.423e-03  -0.492 0.622676    
## kw_avg_max                     6.073e-03  3.129e-03   1.941 0.052356 .  
## kw_min_avg                     2.578e-02  2.354e-01   0.109 0.912820    
## kw_max_avg                     2.361e-01  6.520e-02   3.621 0.000299 ***
## self_reference_min_shares      2.454e-01  1.622e-02  15.128  < 2e-16 ***
## self_reference_max_shares      1.362e-03  9.364e-03   0.145 0.884364    
## weekday_is_monday              9.523e+02  1.080e+03   0.882 0.378104    
## weekday_is_tuesday            -7.744e+02  1.070e+03  -0.724 0.469328    
## weekday_is_wednesday          -2.473e+02  1.070e+03  -0.231 0.817255    
## weekday_is_thursday           -9.405e+02  1.075e+03  -0.875 0.381696    
## weekday_is_friday             -6.620e+02  1.104e+03  -0.600 0.548723    
## weekday_is_saturday            5.529e+02  1.292e+03   0.428 0.668683    
## weekday_is_sunday                     NA         NA      NA       NA    
## is_weekend                            NA         NA      NA       NA    
## LDA_01                         1.132e+03  1.740e+03   0.651 0.515265    
## LDA_03                         2.158e+02  1.633e+03   0.132 0.894881    
## global_subjectivity            3.332e+03  3.253e+03   1.024 0.305766    
## global_sentiment_polarity      6.117e+03  6.619e+03   0.924 0.355448    
## global_rate_positive_words    -1.772e+04  2.347e+04  -0.755 0.450235    
## global_rate_negative_words     3.545e+04  3.777e+04   0.939 0.347965    
## avg_positive_polarity         -7.855e+03  4.722e+03  -1.663 0.096328 .  
## min_positive_polarity          4.413e+03  4.384e+03   1.007 0.314202    
## avg_negative_polarity         -7.386e+03  3.346e+03  -2.208 0.027335 *  
## max_negative_polarity          1.213e+02  3.491e+03   0.035 0.972282    
## title_subjectivity             6.805e+02  8.269e+02   0.823 0.410555    
## title_sentiment_polarity       1.465e+03  9.115e+02   1.608 0.108015    
## abs_title_subjectivity         2.815e+03  1.404e+03   2.005 0.045059 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12950 on 3156 degrees of freedom
## Multiple R-squared:  0.1229, Adjusted R-squared:  0.111 
## F-statistic: 10.29 on 43 and 3156 DF,  p-value: < 2.2e-16
prediction1 <- predict(lm1, test_data2.filtered)
## Warning in predict.lm(lm1, test_data2.filtered): prediction from a rank-
## deficient fit may be misleading
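The warning comes from the two NA coefficients above: weekday_is_sunday and is_weekend are linear combinations of the other weekday dummies. A quick way to confirm this (a sketch using base R's alias()):

alias(lm1)   # lists the aliased (linearly dependent) terms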

In the following five lines of code, I extract the independent variables in "lm1" whose p-value is less than 0.05 (i.e. the significant variables):

lm1_summary <- summary(lm1)   # renamed from "summary" to avoid masking base R's summary()
coeff <- lm1_summary$coefficients
str(coeff)
##  num [1:44, 1:4] -756.254 -20.644 0.338 11.142 30.626 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:44] "(Intercept)" "n_tokens_title" "n_tokens_content" "n_unique_tokens" ...
##   ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
P_value <- coeff[,4]
which(P_value < 0.05)
##      average_token_length       data_channel_is_bus                kw_max_avg 
##                         9                        13                        23 
## self_reference_min_shares     avg_negative_polarity    abs_title_subjectivity 
##                        24                        40                        44
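The same names can also be pulled programmatically; a minimal sketch (sig_vars is a helper name not used in the original):

sig_vars <- names(which(P_value < 0.05))
sig_vars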

Now, using the output of the above code, subset the training and test data so that they contain only the significant variables (plus the response, shares):

keep.vars <- c("average_token_length", "data_channel_is_bus", "kw_max_avg",
               "self_reference_min_shares", "avg_negative_polarity",
               "abs_title_subjectivity", "shares")
train_data2.reduced <- train_data2.filtered[keep.vars]
test_data2.reduced  <- test_data2.filtered[keep.vars]
lm2 <- lm(shares ~., data = train_data2.reduced)
summary(lm2)
## 
## Call:
## lm(formula = shares ~ ., data = train_data2.reduced)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -169205   -2228    -978     294  569055 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                2.758e+03  1.336e+03   2.065 0.038984 *  
## average_token_length      -9.041e+02  2.877e+02  -3.143 0.001690 ** 
## data_channel_is_bus        1.969e+03  6.418e+02   3.068 0.002170 ** 
## kw_max_avg                 1.909e-01  4.809e-02   3.969 7.39e-05 ***
## self_reference_min_shares  2.481e-01  1.322e-02  18.769  < 2e-16 ***
## avg_negative_polarity     -7.087e+03  1.880e+03  -3.769 0.000167 ***
## abs_title_subjectivity     1.791e+03  1.195e+03   1.499 0.133856    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12930 on 3193 degrees of freedom
## Multiple R-squared:  0.1146, Adjusted R-squared:  0.1129 
## F-statistic: 68.88 on 6 and 3193 DF,  p-value: < 2.2e-16

This model gives Adjusted R-squared: 0.1129 and p-value < 2.2e-16, essentially the same fit as lm1 with far fewer predictors.

Prediction on test data

prediction2 <- predict(lm2, test_data2.reduced)
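To quantify the error on the held-out data (a sketch; the rmse helper is not part of the original analysis):

rmse <- function(actual, predicted) sqrt(mean((actual - predicted)^2))
rmse(test_data2.reduced$shares, prediction2)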

From the summaries of lm1 and lm2 we can see that the model p-values are below 0.05, so both models are statistically significant. However, a linear model is not well suited to this prediction task: the adjusted R-squared is very low and the residuals are very large.

Predictive model for online news popularity (popular = shares > 50,000) using a CHAID model

Data <- read.csv("OnlineNewsPopularityShort.csv")
Data <- na.omit(Data)
# str(Data) output omitted: identical to the str(data) listing shown above.
Data2 <- Data[4:62]   # drop X, url, and timedelta, as before
Data2$Popularity <- ifelse(Data2$shares > 50000, 1, 0)   # flag articles with more than 50,000 shares as popular
hist(Data2$Popularity)

table(Data2$Popularity)
## 
##    0    1 
## 3983   17
cart.Data <- Data2   # snapshot before factor conversion; this numeric copy is used later for the CART model
Data2$Popularity <- as.factor(Data2$Popularity)
Only 17 of the 4000 articles are popular, so the classes are heavily imbalanced. The following function converts a numeric/integer vector into a categorical variable: if the vector has more than 5 distinct values, it is binned into quintiles; otherwise each distinct value becomes its own factor level. (Renamed from "cat" to avoid masking base R's cat().)

to_categorical <- function(x){
    if(length(unique(x)) > 5){
        breaks <- quantile(x, c(0.2, 0.4, 0.6, 0.8, 1))   # quintile cut points
        as.factor(ifelse(x <= breaks[1], 1,
                  ifelse(x <= breaks[2], 2,
                  ifelse(x <= breaks[3], 3,
                  ifelse(x <= breaks[4], 4, 5)))))
    } else {
        as.factor(x)
    }
}
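A quick illustration on toy vectors (assumed values, just for demonstration):

table(to_categorical(1:100))     # more than 5 distinct values: five quintile bins of 20
to_categorical(c(1, 2, 2, 3))    # 5 or fewer distinct values: factor with levels 1, 2, 3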

Converting the first 59 columns of "Data2" (all predictors plus shares) into categorical attributes:

for(i in 1:59){
  Data2[, i] <- to_categorical(Data2[, i])
}
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
set.seed(105)
train.index <- sample(1:nrow(Data2), (0.8*nrow(Data2)))
train.Data2 <- Data2[train.index, ]
test.Data2 <- Data2[-train.index, ]

library(CHAID)
## Loading required package: partykit
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
ch2 <- chaid(Popularity ~ ., data = train.Data2)
plot(ch2)

prediction2 <- predict(ch2, test.Data2)
confusionMatrix(prediction2, test.Data2$Popularity)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 796   3
##          1   1   0
##                                           
##                Accuracy : 0.995           
##                  95% CI : (0.9872, 0.9986)
##     No Information Rate : 0.9962          
##     P-Value [Acc > NIR] : 0.8156          
##                                           
##                   Kappa : -0.0019         
##                                           
##  Mcnemar's Test P-Value : 0.6171          
##                                           
##             Sensitivity : 0.9987          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9962          
##          Neg Pred Value : 0.0000          
##              Prevalence : 0.9962          
##          Detection Rate : 0.9950          
##    Detection Prevalence : 0.9988          
##       Balanced Accuracy : 0.4994          
##                                           
##        'Positive' Class : 0               
## 

Now creating a second CHAID model after removing the "shares" variable, since "Popularity" is derived directly from it:

Data3 <- Data2[,-59]
train.Data3 <- Data3[train.index, ]
test.Data3 <- Data3[-train.index, ]
ch3 <- chaid(Popularity ~ ., data = train.Data3)   # fit on the training split only, then evaluate on the held-out test split
plot(ch3)

prediction3 <- predict(ch3, test.Data3)
confusionMatrix(prediction3, test.Data3$Popularity)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 797   3
##          1   0   0
##                                           
##                Accuracy : 0.9962          
##                  95% CI : (0.9891, 0.9992)
##     No Information Rate : 0.9962          
##     P-Value [Acc > NIR] : 0.6472          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.2482          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9962          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.9962          
##          Detection Rate : 0.9962          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
## 
Neither CHAID model (ch2 or ch3) ever predicts Popularity correctly when it is 1. This is because the data corresponding to Popularity = 1 is extremely scarce (17 of 4000 articles). When Popularity = 0, both models predict very well, but that is no better than the no-information rate.
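A standard next step for such imbalance (a sketch, not part of the original analysis) is to rebalance the training data before refitting, for example with caret's upSample():

balanced <- upSample(x = train.Data3[, names(train.Data3) != "Popularity"],
                     y = train.Data3$Popularity, yname = "Popularity")
table(balanced$Popularity)   # both classes now equal in size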

Predictive model for online news popularity using a CART (Classification and Regression Tree) model

set.seed(105)
train.index <- sample(1:nrow(Data2), (0.8*nrow(Data2)))   # same seed and call as before, so the split matches the CHAID models
cart.Data <- cart.Data[, -59]   # remove the "shares" variable, as it directly determines Popularity
str(cart.Data)
## 'data.frame':    4000 obs. of  59 variables:
##  (the 58 predictor columns listed in the str() output above, all still numeric/integer, plus:)
##  $ Popularity                   : num  0 0 0 0 0 0 0 0 0 0 ...
hist(cart.Data$Popularity)

train_cart.Data <- cart.Data[train.index, ]
test_cart.Data <- cart.Data[-train.index, ]
library(rpart)
rpart.fit <- rpart(Popularity ~ ., data = train_cart.Data)   # renamed from "rpart" to avoid masking the function; Popularity is numeric here, so this grows a regression tree
rpart.fit
## n= 3200 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3200 13.938750 0.0043750000  
##    2) self_reference_avg_sharess< 179157.1 3193 11.954900 0.0037582210  
##      4) title_sentiment_polarity< 0.6583333 3094  6.984163 0.0022624430  
##        8) kw_avg_avg< 5637.242 2996  3.994660 0.0013351130  
##         16) global_sentiment_polarity>=-0.08537296 2940  1.998639 0.0006802721  
##           32) num_imgs< 25.5 2858  0.000000 0.0000000000 *
##           33) num_imgs>=25.5 82  1.951220 0.0243902400  
##             66) global_subjectivity< 0.5738459 75  0.000000 0.0000000000 *
##             67) global_subjectivity>=0.5738459 7  1.428571 0.2857143000 *
##         17) global_sentiment_polarity< -0.08537296 56  1.928571 0.0357142900  
##           34) avg_negative_polarity< -0.2775871 49  0.000000 0.0000000000 *
##           35) avg_negative_polarity>=-0.2775871 7  1.428571 0.2857143000 *
##        9) kw_avg_avg>=5637.242 98  2.908163 0.0306122400  
##         18) kw_avg_max< 559342.9 89  0.988764 0.0112359600 *
##         19) kw_avg_max>=559342.9 9  1.555556 0.2222222000 *
##      5) title_sentiment_polarity>=0.6583333 99  4.747475 0.0505050500  
##       10) LDA_00< 0.6423025 91  1.956044 0.0219780200  
##         20) kw_max_min< 1750 78  0.000000 0.0000000000 *
##         21) kw_max_min>=1750 13  1.692308 0.1538462000 *
##       11) LDA_00>=0.6423025 8  1.875000 0.3750000000 *
##    3) self_reference_avg_sharess>=179157.1 7  1.428571 0.2857143000 *
library(rpart.plot)
rpart.plot(rpart.fit, digits = 3)

prediction.rpart <- predict(rpart.fit, test_cart.Data)
table(prediction.rpart)
## prediction.rpart
##                  0 0.0112359550561798  0.153846153846154  0.222222222222222 
##                771                 22                  1                  1 
##  0.285714285714286              0.375 
##                  3                  2

Because the tree was fit to a numeric 0/1 response, the predictions above are fitted means (i.e. estimated probabilities). Assign a prediction of 1 if it is greater than 0.5, otherwise 0.

temp <- ifelse(prediction.rpart > 0.5, 1, 0)
confusionMatrix(as.factor(temp), as.factor(test_cart.Data$Popularity))
## Warning in confusionMatrix.default(as.factor(temp),
## as.factor(test_cart.Data$Popularity)): Levels are not in the same order for
## reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 797   3
##          1   0   0
##                                           
##                Accuracy : 0.9962          
##                  95% CI : (0.9891, 0.9992)
##     No Information Rate : 0.9962          
##     P-Value [Acc > NIR] : 0.6472          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.2482          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9962          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.9962          
##          Detection Rate : 0.9962          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
##
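Like the CHAID models, the CART model never predicts the minority class on the test set, again reflecting the severe class imbalance. Alternatively (a sketch, not in the original analysis; rpart.cls and pred.cls are hypothetical names), Popularity can be treated as a factor so that rpart grows a classification tree directly, removing the need for a manual 0.5 cutoff:

rpart.cls <- rpart(as.factor(Popularity) ~ ., data = train_cart.Data, method = "class")
pred.cls <- predict(rpart.cls, test_cart.Data, type = "class")
confusionMatrix(pred.cls, as.factor(test_cart.Data$Popularity))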