Summary

The dataset is obtained from https://archive.ics.uci.edu/ml/datasets/Online+News+Popularity

This dataset summarizes a heterogeneous set of features about articles published by Mashable News in a period of two years (Fernandes, et al., 2015). The goal is to predict the number of shares in social networks which indicates the popularity of news.

After considering various models, the Lasso model proved to be the best model because of its high accuracy and straightforward interpretation. By running the selected variables in quantile regression, Mashable will be able to not only predict how many shares an article will likely receive but also how journalists can increase the number of shares based on the quantile regression information. Moreover, Mashable could run all pre-published articles through this model to predict shares and also analyze where it will fall in the quantiles, and therefore change the article based on relevant features to receive more shares. The causality analysis also provides important information on which variables directly cause shares to increase; Mashable can focus on factors such as the number of images in an article or publishing on a Sunday in the future to increase popularity.

You can check my code for this project at https://github.com/XuebinZhu/Projects/tree/master/Mashable%20News%20Article%20Sharing%20Project

More interesting projects could be found here https://github.com/XuebinZhu/Projects

                            #############################################
                            ### Mashable News Article Sharing Project ###
                            #############################################
rm(list=ls())

#load all the required library
library(factoextra)
library(usmap)
library(randomForest)
library(quantreg)
library(rpart)
library(rpart.plot)
library(ggplot2)
library(tidyverse)
library(dplyr)
library(MASS)
library(usmap)
library(glmnet)
library(MLmetrics)
library(scales)
library(plfm)
library(caTools)
library(psych)
library(VIM)

#set seed for spliting data set
set.seed(123)

Data Cleaning

I will check for null or erroneous values in the data, evaluate each variable, and remove those were not beneficial to the analysis. Then, I will do some data transformation for feature engineering.

###################
###Data Cleaning###
###################

#read the data
dat0 <- read.csv("OnlineNewsPopularity.csv", header = T)

#combine and create new variables
dat0$weekday <-  as.numeric(dat0$weekday_is_monday) * 1 + 
  as.numeric(dat0$weekday_is_tuesday) * 2 + 
  as.numeric(dat0$weekday_is_wednesday) * 3 + 
  as.numeric(dat0$weekday_is_thursday) * 4 + 
  as.numeric(dat0$weekday_is_friday) * 5 + 
  as.numeric(dat0$weekday_is_saturday) * 6 +
  as.numeric(dat0$weekday_is_sunday) * 7

dat0$channel <-  as.numeric(dat0$data_channel_is_lifestyle) * 1 + 
  as.numeric(dat0$data_channel_is_entertainment) * 2 + 
  as.numeric(dat0$data_channel_is_bus) * 3 + 
  as.numeric(dat0$data_channel_is_socmed) * 4 + 
  as.numeric(dat0$data_channel_is_tech) * 5 + 
  as.numeric(dat0$data_channel_is_world) * 6
dat0$channel <- (dat0$channel +1)

#rename the variables
dat0$weekday <- factor(dat0$weekday, levels = 1:7, labels = c("Mon", "Tues", "Wed", 
                                                              "Thurs", "Fri", "Sat", "Sun"))
dat0$channel <- factor(dat0$channel, levels = 1:7, labels = c("Other", "Lifestyle", "Entertainment", "Business", 
                                                              "Social_Media", "Technology", "World"))
#use log transformation for shares
hist(dat0$shares)
dat0$log_shares <- log(dat0$shares)
hist(dat0$log_shares)

#drop meaningless variables
drop <- c("url","n_tokens_content", "n_unique_tokens", "num_hrefs", " kw_min_min", "kw_max_min","kw_avg_min", 
          "kw_min_max","kw_max_max", "kw_avg_max", "kw_min_avg"," kw_max_avg", "kw_avg_avg", "self_reference_min_shares", 
          "self_reference_max_shares", "abs_title_subjectivity", "abs_title_sentiment_polarity", "min_positive_polarity",
          "max_positive_polarity", "min_negative_polarity", "max_negative_polarity", "rate_positive_words", "rate_negative_words", 
          "kw_min_min", "kw_max_avg", "weekday_is_monday","weekday_is_tuesday" , "weekday_is_wednesday", "weekday_is_thursday", 
          "weekday_is_friday","weekday_is_saturday","weekday_is_sunday", "data_channel_is_lifestyle", "data_channel_is_entertainment", 
          "data_channel_is_bus","data_channel_is_socmed", "data_channel_is_tech","data_channel_is_world", "shares", 
          "LDA_00", "LDA_01", "LDA_02", "LDA_03", "LDA_04")

dat0 <- dat0[,!(names(dat0) %in% drop)]

#convert data type
dat0$is_weekend <- as.factor(dat0$is_weekend)

#create full data set for this project
news <- dat0

#remove outliers
n_non_stop_words_Q1SP <- summary(news$n_non_stop_words)[2]
n_non_stop_words_Q3SP <- summary(news$n_non_stop_words)[5]
n_non_stop_words_IQRSP <- n_non_stop_words_Q3SP - n_non_stop_words_Q1SP
min_n_non_stop_words <- as.numeric(n_non_stop_words_Q1SP - 1.5*n_non_stop_words_IQRSP)
max_n_non_stop_words <- as.numeric(n_non_stop_words_Q3SP + 1.5*n_non_stop_words_IQRSP)
news <- news %>% filter(n_non_stop_words < max_n_non_stop_words)
news <- news %>% filter(n_non_stop_words > min_n_non_stop_words)

summary(dat0$n_non_stop_words)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    0.0000    1.0000    1.0000    0.9965    1.0000 1042.0000
boxplot(news$n_non_stop_words)
hist(news$n_non_stop_words)#this variable seems to have lots of errors, so I decided to remove it

#"is_weekend" is highly co-related with weekday, so I decided to remove it
#"timedelta" gives no predictive information for our y variable, so I decided to remove it

news <- dat0
drop1 <- c("n_non_stop_words", "is_weekend", "timedelta")
news <- news[,!(names(news) %in% drop1)]

#remove outliers
n_non_stop_unique_tokens_Q1SP <- summary(news$n_non_stop_unique_tokens)[2]
n_non_stop_unique_tokens_Q3SP <- summary(news$n_non_stop_unique_tokens)[5]
n_non_stop_unique_tokens_IQRSP <- n_non_stop_unique_tokens_Q3SP - n_non_stop_unique_tokens_Q1SP
min_n_non_stop_unique_tokens <- as.numeric(n_non_stop_unique_tokens_Q1SP - 1.5*n_non_stop_unique_tokens_IQRSP)
max_n_non_stop_unique_tokens <- as.numeric(n_non_stop_unique_tokens_Q3SP + 1.5*n_non_stop_unique_tokens_IQRSP)
news <- news %>% filter(n_non_stop_unique_tokens < max_n_non_stop_unique_tokens)
news <- news %>% filter(n_non_stop_unique_tokens > min_n_non_stop_unique_tokens)

summary(news$n_non_stop_unique_tokens)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.4324  0.6346  0.6946  0.6962  0.7569  0.9474
boxplot(news$n_non_stop_unique_tokens)
hist(news$n_non_stop_unique_tokens)#keep

#num_self_hrefs, num_imgs, num_videos, average_token_length, self_reference_avg_sharess should do log transformation
news$num_self_hrefs <- log(1 + news$num_self_hrefs)
news$num_imgs <- log(1 + news$num_imgs)
news$num_videos <- log(1 + news$num_videos)
news$average_token_length <- log(1 + news$average_token_length)
news$self_reference_avg_sharess <- log(1 + news$self_reference_avg_sharess)

summary(news)
##  n_tokens_title  n_non_stop_unique_tokens num_self_hrefs      num_imgs     
##  Min.   : 2.00   Min.   :0.4324           Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 9.00   1st Qu.:0.6346           1st Qu.:0.6931   1st Qu.:0.6931  
##  Median :10.00   Median :0.6946           Median :1.3863   Median :0.6931  
##  Mean   :10.38   Mean   :0.6962           Mean   :1.2448   Mean   :1.1105  
##  3rd Qu.:12.00   3rd Qu.:0.7569           3rd Qu.:1.6094   3rd Qu.:1.6094  
##  Max.   :23.00   Max.   :0.9474           Max.   :4.3175   Max.   :4.6250  
##                                                                            
##    num_videos     average_token_length  num_keywords   
##  Min.   :0.0000   Min.   :1.526        Min.   : 1.000  
##  1st Qu.:0.0000   1st Qu.:1.704        1st Qu.: 6.000  
##  Median :0.0000   Median :1.736        Median : 7.000  
##  Mean   :0.3973   Mean   :1.737        Mean   : 7.206  
##  3rd Qu.:0.6931   3rd Qu.:1.768        3rd Qu.: 9.000  
##  Max.   :4.5218   Max.   :2.202        Max.   :10.000  
##                                                        
##  self_reference_avg_sharess global_subjectivity global_sentiment_polarity
##  Min.   : 0.000             Min.   :0.0000      Min.   :-0.39375         
##  1st Qu.: 7.004             1st Qu.:0.4030      1st Qu.: 0.06461         
##  Median : 7.741             Median :0.4568      Median : 0.12253         
##  Mean   : 6.867             Mean   :0.4574      Mean   : 0.12316         
##  3rd Qu.: 8.576             3rd Qu.:0.5104      3rd Qu.: 0.17973         
##  Max.   :13.645             Max.   :1.0000      Max.   : 0.72784         
##                                                                          
##  global_rate_positive_words global_rate_negative_words avg_positive_polarity
##  Min.   :0.00000            Min.   :0.00000            Min.   :0.0000       
##  1st Qu.:0.02961            1st Qu.:0.01024            1st Qu.:0.3120       
##  Median :0.03968            Median :0.01571            Median :0.3618       
##  Mean   :0.04092            Mean   :0.01711            Mean   :0.3646       
##  3rd Qu.:0.05072            3rd Qu.:0.02200            3rd Qu.:0.4130       
##  Max.   :0.15549            Max.   :0.18493            Max.   :1.0000       
##                                                                             
##  avg_negative_polarity title_subjectivity title_sentiment_polarity  weekday    
##  Min.   :-1.0000       Min.   :0.000      Min.   :-1.00000         Mon  :6365  
##  1st Qu.:-0.3312       1st Qu.:0.000      1st Qu.: 0.00000         Tues :7078  
##  Median :-0.2579       Median :0.125      Median : 0.00000         Wed  :7128  
##  Mean   :-0.2677       Mean   :0.280      Mean   : 0.07048         Thurs:6960  
##  3rd Qu.:-0.1937       3rd Qu.:0.500      3rd Qu.: 0.13636         Fri  :5451  
##  Max.   : 0.0000       Max.   :1.000      Max.   : 1.00000         Sat  :2331  
##                                                                    Sun  :2598  
##           channel       log_shares    
##  Other        :5335   Min.   : 0.000  
##  Lifestyle    :2060   1st Qu.: 6.851  
##  Entertainment:6698   Median : 7.244  
##  Business     :6212   Mean   : 7.468  
##  Social_Media :2279   3rd Qu.: 7.901  
##  Technology   :7231   Max.   :13.645  
##  World        :8096
#impute NA value in Channel Column
sum(is.na(news$channel))
## [1] 0
summary(news$channel)
##         Other     Lifestyle Entertainment      Business  Social_Media 
##          5335          2060          6698          6212          2279 
##    Technology         World 
##          7231          8096
# Get levels and add "Unknown"
levels <- levels(news$channel)
levels[length(levels) + 1] <- "Unknown"
# refactor to include "Unknown" as a factor level
# and replace NA with "Unknown"
news$channel <- factor(news$channel, levels = levels)
news$channel[is.na(news$channel)] <- "Unknown"
#split the data
split <- sample.split(news, SplitRatio = 0.8)
data_train <- subset(news, split==T)
data_test <- subset(news, split==F)

#the final train data set
head(data_train)
##   n_tokens_title n_non_stop_unique_tokens num_self_hrefs  num_imgs num_videos
## 1             12                0.8153846      1.0986123 0.6931472          0
## 2              9                0.7919463      0.6931472 0.6931472          0
## 3              9                0.6638655      0.6931472 0.6931472          0
## 4              9                0.6656347      0.0000000 0.6931472          0
## 6             10                0.6981982      1.0986123 0.0000000          0
## 7              8                0.5498339      3.0445224 3.0445224          0
##   average_token_length num_keywords self_reference_avg_sharess
## 1             1.737016            5                   6.208590
## 2             1.777276            4                   0.000000
## 3             1.685169            6                   6.823286
## 4             1.687305            7                   0.000000
## 6             1.678863            9                   9.047939
## 7             1.732393           10                   8.055843
##   global_subjectivity global_sentiment_polarity global_rate_positive_words
## 1           0.5216171                0.09256198                 0.04566210
## 2           0.3412458                0.14894781                 0.04313725
## 3           0.7022222                0.32333333                 0.05687204
## 4           0.4298497                0.10070467                 0.04143126
## 6           0.4374086                0.07118419                 0.02972973
## 7           0.5144803                0.26830272                 0.08020833
##   global_rate_negative_words avg_positive_polarity avg_negative_polarity
## 1                0.013698630             0.3786364            -0.3500000
## 2                0.015686275             0.2869146            -0.1187500
## 3                0.009478673             0.4958333            -0.4666667
## 4                0.020715631             0.3859652            -0.3696970
## 6                0.027027027             0.3506100            -0.1950000
## 7                0.016666667             0.4020386            -0.2244792
##   title_subjectivity title_sentiment_polarity weekday       channel log_shares
## 1          0.5000000               -0.1875000     Mon Entertainment   6.385194
## 2          0.0000000                0.0000000     Mon      Business   6.566672
## 3          0.0000000                0.0000000     Mon      Business   7.313220
## 4          0.0000000                0.0000000     Mon Entertainment   7.090077
## 6          0.6428571                0.2142857     Mon    Technology   6.751101
## 7          0.0000000                0.0000000     Mon     Lifestyle   6.320768
#the final test data set
head(data_test)
##    n_tokens_title n_non_stop_unique_tokens num_self_hrefs  num_imgs num_videos
## 5              13                0.5408895      2.9957323 3.0445224          0
## 8              12                0.5721078      3.0445224 3.0445224          0
## 11              9                0.7316384      0.0000000 0.6931472          0
## 16             12                0.6349614      0.0000000 0.6931472          0
## 24             11                0.7974683      0.6931472 0.6931472          0
## 27              8                0.7244898      1.3862944 0.6931472          0
##    average_token_length num_keywords self_reference_avg_sharess
## 5              1.737450            7                   8.055843
## 8              1.725939            9                   8.055843
## 11             1.725938            8                   0.000000
## 16             1.726373            6                   0.000000
## 24             1.761987            6                   9.686637
## 27             1.754832            9                   7.824446
##    global_subjectivity global_sentiment_polarity global_rate_positive_words
## 5            0.5135021                0.28100348                 0.07462687
## 8            0.5434742                0.29861347                 0.08392315
## 11           0.4820598                0.10235015                 0.03846154
## 16           0.4732850                0.06222666                 0.04985337
## 24           0.3964015                0.21079545                 0.04800000
## 27           0.5606957                0.29541435                 0.06042296
##    global_rate_negative_words avg_positive_polarity avg_negative_polarity
## 5                 0.012126866             0.4111274            -0.2201923
## 8                 0.015166835             0.4277205            -0.2427778
## 11                0.020833333             0.4044801            -0.4150641
## 16                0.039589443             0.3430704            -0.2201499
## 24                0.000000000             0.2810606             0.0000000
## 27                0.009063444             0.5565584            -0.2638889
##    title_subjectivity title_sentiment_polarity weekday    channel log_shares
## 5           0.4545455                0.1363636     Mon Technology   6.224558
## 8           1.0000000                0.5000000     Mon Technology   6.792344
## 11          0.0000000                0.0000000     Mon      World   7.696213
## 16          0.7500000               -0.2500000     Mon      World   7.377759
## 24          0.4500000                0.4000000     Mon      World   7.313220
## 27          0.0000000                0.0000000     Mon Technology   7.313220

Data Visualization

I will build several plots using ggplot to conduct exploratory analysis.

########################
###Data Visualization###
########################

ggplot(data_train, aes(x = weekday, fill = channel)) +
  geom_bar(show.legend = T) +
  theme_minimal() +
  labs(title = 'Number of Articles in Different Weekday', 
       subtitle = 'divided by different channel',
       x = 'Weekday', y = 'Number of Articles') +
  theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 1))

ggplot(data_train, aes(x = channel, fill = channel)) +
  geom_bar(show.legend = F) +
  theme_minimal() +
  labs(title = 'Number of Articles in Different Channel',
       x = 'Channel', y = 'Number of Articles') + 
  theme(plot.title = element_text(hjust = 0.5)) + 
  theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1))

ggplot(dat0, aes(x = channel, y = log_shares, fill = as.factor(is_weekend))) +
  geom_boxplot(show.legend = T) +
  theme_minimal() +
  labs(title = 'Log Number of Shares versus Different Channel', 
       subtitle = 'divided by weekday and weekend',
       x = 'Channel', y = 'Log Number of Shares') + 
  scale_fill_discrete(name="Weekday or Weekend",
                      breaks=c("0", "1"),
                      labels=c("Weekday", "Weekend"))+
  scale_x_discrete(labels=c("Other", "Lifestyle", "Entertainment", "Business", "Social Media", "Technology", "World")) +
  theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 1))+ 
  theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1))

ggplot(data_train, aes(x = global_subjectivity, y = log_shares, color = channel)) +
  geom_point(show.legend = T) +
  theme_minimal() +
  labs(title = 'Log Number of Shares versus Article Subjectivity', 
       subtitle = 'divided by different channel',
       x = 'Article Subjectivity', y = 'Log Number of Shares') + 
  theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 1))

ggplot(data_train, aes(x = global_sentiment_polarity, y = log_shares, color = channel)) +
  geom_point(show.legend = T) +
  theme_minimal() +
  labs(title = 'Log Number of Shares versus Article Sentiment Polarity', 
       subtitle = 'divided by different channel',
       x = 'Article Sentiment Polarity', y = 'Log Number of Shares') + 
  theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 1))

Modeling

Core Task: Unsupervised Learning

Data Mining Method: PCA and K-Means Clustering

#PCA
#Computing the full PCA
clustering_set <- data_train[,1:16]
clustering_set <- scale(clustering_set)
#Computing the full PCA
pca.news <- prcomp(clustering_set, scale=TRUE)
###Plotting the variance that each component explains
plot(pca.news,main="PCA: Variance Explained by Factors")
mtext(side=1, "Factors",  line=1, font=2)
screeplot(pca.news,type = "line", main = "Scree Plot for PCA", npcs = length(pca.news$sdev))
mtext(side=3, "Number of Factors",  line= -18, font=2)
abline(v = 5)

#get the scores and the loadings of the five selected PC
pcScore <- predict(pca.news)
pcScore_selected <- pcScore[,1:2]
pcScore_selected2 <- pcScore[,3:4]
loadings <- pca.news$rotation[,1:5]
nrow(loadings)
## [1] 16
#### PC1
# Positive sentiment articles
v1 <- loadings[order(abs(loadings[,1]), decreasing=TRUE)[1:19],1]
loadingfit1 <- lapply(1:27, function(k) ( t(v1[1:k])%*%v1[1:k] - 3/4 )^2)
v1[1:which.min(loadingfit1)]
##  global_sentiment_polarity      avg_positive_polarity 
##                  0.5340055                  0.3985148 
## global_rate_positive_words        global_subjectivity 
##                  0.3977888                  0.3675426
#### PC2
# Negative Sentiment aritcles
v2 <- loadings[order(abs(loadings[,2]), decreasing=TRUE)[1:19],2]
loadingfit2 <- lapply(1:27, function(k) ( t(v2[1:k])%*%v2[1:k] - 3/4 )^2)
v2[1:which.min(loadingfit2)]
## self_reference_avg_sharess             num_self_hrefs 
##                  0.4093628                  0.3816014 
##      avg_negative_polarity global_rate_negative_words 
##                 -0.3779098                  0.3531464 
##                   num_imgs  global_sentiment_polarity 
##                  0.3304523                 -0.3189810
#### PC3
# short articles with more links and images
v3 <- loadings[order(abs(loadings[,3]), decreasing=TRUE)[1:19],3]
loadingfit3 <- lapply(1:27, function(k) ( t(v3[1:k])%*%v3[1:k] - 3/4 )^2)
v3[1:which.min(loadingfit3)]
## global_rate_negative_words   n_non_stop_unique_tokens 
##                 -0.3809297                 -0.3710102 
##                   num_imgs      avg_negative_polarity 
##                  0.3419505                  0.3293635 
##             num_self_hrefs        global_subjectivity 
##                  0.3184503                 -0.3161948 
##                 num_videos 
##                 -0.2600131
#### PC4
# Long articles Less image but use more popular links
v4 <- loadings[order(abs(loadings[,4]), decreasing=TRUE)[1:19],4]
loadingfit4 <- lapply(1:27, function(k) ( t(v4[1:k])%*%v4[1:k] - 3/4 )^2)
v4[1:which.min(loadingfit4)]
## self_reference_avg_sharess                   num_imgs 
##                  0.4151198                 -0.3905513 
##             num_self_hrefs   n_non_stop_unique_tokens 
##                  0.3662411                  0.3604998 
##                 num_videos       average_token_length 
##                  0.3319199                 -0.3119188
#### PC5
v5 <- loadings[order(abs(loadings[,4]), decreasing=TRUE)[1:19],5]
loadingfit5 <- lapply(1:27, function(k) ( t(v5[1:k])%*%v5[1:k] - 3/4 )^2)
v5[1:which.min(loadingfit5)]
## self_reference_avg_sharess                   num_imgs 
##                0.075425116               -0.017136496 
##             num_self_hrefs   n_non_stop_unique_tokens 
##                0.081664837                0.125430474 
##                 num_videos       average_token_length 
##               -0.037025280                0.019432338 
##             n_tokens_title      avg_negative_polarity 
##               -0.409804085               -0.098834041 
##        global_subjectivity               num_keywords 
##                0.199209652                0.078995546 
## global_rate_negative_words         title_subjectivity 
##               -0.013396554               -0.604340824 
##      avg_positive_polarity  global_sentiment_polarity 
##                0.240303354                0.106171824 
## global_rate_positive_words 
##                0.005825282

#K-mean clustering
SpcScore3 <- scale(pcScore, scale=TRUE)
kfit <- lapply(1:30, function(k) kmeans(SpcScore3,k,nstart=10))

## Selects the Number of Clusters via an Information Criteria
## get AIC (option "A"), BIC (option "B"), HDIC (option "C") for the output of kmeans
kIC <- function(fit, rule=c("A","B","C")){
  df <- length(fit$centers) # K*dim
  n <- sum(fit$size)
  D <- fit$tot.withinss # deviance
  rule=match.arg(rule)
  if(rule=="A")
    #return(D + 2*df*n/max(1,n-df-1))
    return(D + 2*df)
  else if(rule=="B") 
    return(D + log(n)*df)
  else 
    return(D +  sqrt( n * log(df) )*df)
}

kaic <- sapply(kfit,kIC)
kbic  <- sapply(kfit,kIC,"B")
kcic  <- sapply(kfit,kIC,"C")
SpcScore<- scale(pcScore_selected, scale=TRUE)
K_clustering <- kmeans(as.data.frame(SpcScore),which.min(kcic),nstart=10)
K_clustering
fviz_cluster(K_clustering, data = as.data.frame(SpcScore), geom = "point", 
             show.clust.cent = FALSE, ellipse = FALSE,palette = "Set5", main = "K-means clustering with PC1 and PC2")

Core Task: Supervised Learning: Regression

Data Mining Method: Linear Regression, Stepwise Selection, CART, Random Forest, Lasso, Post Lasso

#full model
full_fit <- lm(log_shares ~ . , data = data_train)
summary(full_fit)
## 
## Call:
## lm(formula = log_shares ~ ., data = data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7460 -0.5672 -0.1786  0.4066  5.9923 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 7.7641324  0.2195487  35.364  < 2e-16 ***
## n_tokens_title             -0.0012955  0.0024951  -0.519 0.603596    
## n_non_stop_unique_tokens   -0.2370074  0.0666601  -3.555 0.000378 ***
## num_self_hrefs             -0.1162882  0.0100241 -11.601  < 2e-16 ***
## num_imgs                    0.0529891  0.0070219   7.546 4.61e-14 ***
## num_videos                  0.0670971  0.0082127   8.170 3.21e-16 ***
## average_token_length       -0.2652572  0.1132977  -2.341 0.019226 *  
## num_keywords                0.0173763  0.0028118   6.180 6.50e-10 ***
## self_reference_avg_sharess  0.0389144  0.0021084  18.457  < 2e-16 ***
## global_subjectivity         0.5866778  0.0731207   8.023 1.07e-15 ***
## global_sentiment_polarity  -0.1099720  0.1382201  -0.796 0.426254    
## global_rate_positive_words  0.3441118  0.5142576   0.669 0.503409    
## global_rate_negative_words -1.7157769  0.8387466  -2.046 0.040800 *  
## avg_positive_polarity      -0.1060966  0.0925714  -1.146 0.251760    
## avg_negative_polarity      -0.1350650  0.0544898  -2.479 0.013191 *  
## title_subjectivity          0.0456180  0.0167662   2.721 0.006516 ** 
## title_sentiment_polarity    0.0765887  0.0206930   3.701 0.000215 ***
## weekdayTues                -0.0733780  0.0172023  -4.266 2.00e-05 ***
## weekdayWed                 -0.0723752  0.0171706  -4.215 2.50e-05 ***
## weekdayThurs               -0.0598058  0.0172737  -3.462 0.000536 ***
## weekdayFri                  0.0005677  0.0183814   0.031 0.975362    
## weekdaySat                  0.2649675  0.0242289  10.936  < 2e-16 ***
## weekdaySun                  0.2322852  0.0233014   9.969  < 2e-16 ***
## channelLifestyle           -0.1771966  0.0269101  -6.585 4.63e-11 ***
## channelEntertainment       -0.4290908  0.0191382 -22.421  < 2e-16 ***
## channelBusiness            -0.2148456  0.0210432 -10.210  < 2e-16 ***
## channelSocial_Media         0.0935417  0.0263201   3.554 0.000380 ***
## channelTechnology          -0.1134975  0.0201425  -5.635 1.77e-08 ***
## channelWorld               -0.4686965  0.0201523 -23.258  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8841 on 29901 degrees of freedom
## Multiple R-squared:  0.09445,    Adjusted R-squared:  0.0936 
## F-statistic: 111.4 on 28 and 29901 DF,  p-value: < 2.2e-16
AIC(full_fit)
## [1] 77592.87
#full model with interaction
fit_interaction <- lm(log_shares ~ .^2, data = data_train)
#summary(fit_interaction)
AIC(fit_interaction)
#stepwise regression
#step(full_fit, trace = 1, direction = "both")
step_fit <- lm(log_shares ~ n_non_stop_unique_tokens + num_self_hrefs + 
                 num_imgs + num_videos + num_keywords + self_reference_avg_sharess + 
                 global_subjectivity + global_rate_negative_words + avg_positive_polarity + 
                 avg_negative_polarity + title_subjectivity + title_sentiment_polarity + 
                 weekday + channel, data = data_train)
summary(step_fit)
AIC(step_fit)
#PCA
pca_y <- data_train$log_shares
pca_x <- data.frame(pcScore)[,1:5]
pca_fit <- lm(pca_y ~ ., data = pca_x)
summary(pca_fit)
## 
## Call:
## lm(formula = pca_y ~ ., data = pca_x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6598 -0.6014 -0.1922  0.4416  5.8654 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  7.471547   0.005282 1414.534   <2e-16 ***
## PC1          0.084642   0.003433   24.654   <2e-16 ***
## PC2          0.071612   0.003725   19.227   <2e-16 ***
## PC3         -0.001528   0.003931   -0.389   0.6975    
## PC4         -0.005910   0.004482   -1.319   0.1872    
## PC5          0.008469   0.004920    1.721   0.0852 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9138 on 29924 degrees of freedom
## Multiple R-squared:  0.03178,    Adjusted R-squared:  0.03162 
## F-statistic: 196.5 on 5 and 29924 DF,  p-value: < 2.2e-16
#CART
cart_fit <- rpart(log_shares ~ . , data = data_train, method = "anova", control = rpart.control(cp = .0001))
bestcp <- cart_fit$cptable[which.min(cart_fit$cptable[,"xerror"]),"CP"]
best_cart_fit <- prune(cart_fit, cp = bestcp)
rpart.plot(best_cart_fit, type = 3, digits = 3, fallen.leaves = T)
printcp(best_cart_fit)
## 
## Regression tree:
## rpart(formula = log_shares ~ ., data = data_train, method = "anova", 
##     control = rpart.control(cp = 1e-04))
## 
## Variables actually used in tree construction:
## [1] channel                    global_subjectivity       
## [3] n_non_stop_unique_tokens   num_imgs                  
## [5] num_keywords               self_reference_avg_sharess
## [7] weekday                   
## 
## Root node error: 25808/29930 = 0.86227
## 
## n= 29930 
## 
##           CP nsplit rel error  xerror     xstd
## 1  0.0445863      0   1.00000 1.00003 0.012098
## 2  0.0140354      1   0.95541 0.95549 0.011848
## 3  0.0136494      2   0.94138 0.94158 0.011634
## 4  0.0066960      3   0.92773 0.92811 0.011431
## 5  0.0037318      4   0.92103 0.92152 0.011415
## 6  0.0026249      5   0.91730 0.91791 0.011338
## 7  0.0022728      6   0.91468 0.91589 0.011336
## 8  0.0022174      7   0.91240 0.91326 0.011332
## 9  0.0022087      8   0.91019 0.91305 0.011331
## 10 0.0018070      9   0.90798 0.91053 0.011307
## 11 0.0016526     10   0.90617 0.90915 0.011282
## 12 0.0015984     11   0.90452 0.90829 0.011280
## 13 0.0015448     12   0.90292 0.90772 0.011272
## 14 0.0015408     13   0.90137 0.90711 0.011263
## 15 0.0013608     14   0.89983 0.90571 0.011243
## 16 0.0012490     15   0.89847 0.90488 0.011235
## 17 0.0011461     16   0.89722 0.90423 0.011239
## 18 0.0010946     17   0.89608 0.90368 0.011219
## 19 0.0010281     18   0.89498 0.90315 0.011214
plotcp(best_cart_fit)
tot_count <- function(x, labs, digits, varlen) {paste(labs, "\n\nn = ", x$frame$n)}
prp(best_cart_fit, faclen = 0, cex = 0.8, node.fun = tot_count)
mtext(side=1, "Weekday: 1 = Mon, 2 = Tues, \n 3 = Wed, 4 = Thurs, 5 = Fri", line = 3, font=1, adj = 1)
#summary(best_cart_fit)

#Random Forest
#use which.min(regForest$mse) to get ntree
#it may take a while to run
forest_fit <- randomForest(log_shares ~ ., data = data_train, ntree=495, na.action=na.roughfix)
#lasso
model_train_matrix <- model.matrix(log_shares ~ ., data = data_train)
model_train_y <- data_train$log_shares

#lasso selection
lasso_model1 <- glmnet(model_train_matrix, model_train_y, alpha = 1, family = 'gaussian')
plot(lasso_model1, xvar = 'lambda', label = T)
lasso_model2 <- cv.glmnet(model_train_matrix, model_train_y, alpha = 1, family = 'gaussian', type.measure = 'mse')
lasso_lam_1se <- lasso_model2$lambda.1se
lasso_coef_1se <- coef(lasso_model1, lasso_lam_1se)
summary(lasso_coef_1se)
## 31 x 1 sparse Matrix of class "dgCMatrix", with 18 entries 
##     i j            x
## 1   1 1  6.968776462
## 2   5 1 -0.036913631
## 3   6 1  0.051128359
## 4   7 1  0.049523288
## 5   9 1  0.008979073
## 6  10 1  0.027569509
## 7  11 1  0.629684819
## 8  16 1 -0.046078665
## 9  17 1  0.022313879
## 10 18 1  0.027782519
## 11 19 1 -0.004097269
## 12 20 1 -0.003930313
## 13 23 1  0.237280848
## 14 24 1  0.209896529
## 15 26 1 -0.272958139
## 16 27 1 -0.069372148
## 17 28 1  0.131438093
## 18 30 1 -0.311906486
plot(lasso_model2)

lasso_model_best <- glmnet(model_train_matrix, model_train_y, alpha = 1, family = 'gaussian', lambda = lasso_lam_1se)
### Returns the indices for which |x[i]| > tr
support<- function(x, tr = 10e-6) {
  m<- rep(0, length(x))
  for (i in 1:length(x)) if( abs(x[i])> tr ) m[i]<- i
  m <- m[m>0]
  m
}
## get the support
lasso_supp <- support(lasso_model_best$beta)
length(lasso_supp)
## [1] 17
colnames(model_train_matrix[,lasso_supp])#get the features for lasso model
##  [1] "num_self_hrefs"             "num_imgs"                  
##  [3] "num_videos"                 "num_keywords"              
##  [5] "self_reference_avg_sharess" "global_subjectivity"       
##  [7] "avg_negative_polarity"      "title_subjectivity"        
##  [9] "title_sentiment_polarity"   "weekdayTues"               
## [11] "weekdayWed"                 "weekdaySat"                
## [13] "weekdaySun"                 "channelEntertainment"      
## [15] "channelBusiness"            "channelSocial_Media"       
## [17] "channelWorld"
inthemodel_train <- unique(c(lasso_supp))# unique grabs union
lasso_train_data <- cBind(model_train_matrix[,inthemodel_train]) 
## Warning: 'cBind' is deprecated.
##  Since R version 3.2.0, base's cbind() should work fine with S4 objects
lasso_train_data <- as.data.frame(as.matrix(lasso_train_data))# make it a data.frame
dim(lasso_train_data) ## p about half n
## [1] 29930    17
#fit the final model
lasso_fit <- lm(model_train_y ~., data=lasso_train_data)
summary(lasso_fit)
## 
## Call:
## lm(formula = model_train_y ~ ., data = lasso_train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7404 -0.5694 -0.1780  0.4098  5.9785 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 6.928537   0.039323 176.197  < 2e-16 ***
## num_self_hrefs             -0.114896   0.009757 -11.775  < 2e-16 ***
## num_imgs                    0.068475   0.005890  11.625  < 2e-16 ***
## num_videos                  0.078153   0.007889   9.906  < 2e-16 ***
## num_keywords                0.016148   0.002793   5.782 7.45e-09 ***
## self_reference_avg_sharess  0.039632   0.002067  19.170  < 2e-16 ***
## global_subjectivity         0.573377   0.065297   8.781  < 2e-16 ***
## avg_negative_polarity      -0.169516   0.045607  -3.717 0.000202 ***
## title_subjectivity          0.048054   0.016557   2.902 0.003708 ** 
## title_sentiment_polarity    0.078060   0.020202   3.864 0.000112 ***
## weekdayTues                -0.051140   0.013898  -3.680 0.000234 ***
## weekdayWed                 -0.050414   0.013859  -3.638 0.000276 ***
## weekdaySat                  0.285209   0.021994  12.968  < 2e-16 ***
## weekdaySun                  0.255872   0.020946  12.216  < 2e-16 ***
## channelEntertainment       -0.352930   0.015172 -23.262  < 2e-16 ***
## channelBusiness            -0.123422   0.016052  -7.689 1.53e-14 ***
## channelSocial_Media         0.182459   0.022780   8.010 1.19e-15 ***
## channelWorld               -0.380933   0.014762 -25.805  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8852 on 29912 degrees of freedom
## Multiple R-squared:  0.0919, Adjusted R-squared:  0.09139 
## F-statistic: 178.1 on 17 and 29912 DF,  p-value: < 2.2e-16
AIC(lasso_fit)
## [1] 77654.97

#post lasso
pl_train_matrix <- model.matrix(log_shares ~ ., data = data_train)
pl_train_y <- data_train$log_shares
pl_model1 <- glmnet(pl_train_matrix, pl_train_y, alpha = 1, family = 'gaussian')
plot(pl_model1, xvar = 'lambda', label = T)
pl_model2 <- cv.glmnet(pl_train_matrix, pl_train_y, alpha = 1, family = 'gaussian', type.measure = 'mse')
pl_lam_1se <- pl_model2$lambda.1se
pl_coef_1se <- coef(pl_model1, pl_lam_1se)
summary(pl_coef_1se)
## 31 x 1 sparse Matrix of class "dgCMatrix", with 16 entries 
##     i j            x
## 1   1 1  6.979439918
## 2   5 1 -0.020821091
## 3   6 1  0.047601859
## 4   7 1  0.043720526
## 5   9 1  0.007505051
## 6  10 1  0.025090320
## 7  11 1  0.641029572
## 8  16 1 -0.020683435
## 9  17 1  0.017048762
## 10 18 1  0.017549375
## 11 23 1  0.225082099
## 12 24 1  0.198107258
## 13 26 1 -0.256664988
## 14 27 1 -0.058287491
## 15 28 1  0.120993301
## 16 30 1 -0.297817589
plot(pl_model2)
pl_model_best <- glmnet(pl_train_matrix, pl_train_y, alpha = 1, family = 'gaussian', lambda = pl_lam_1se)

## get the support
pl_supp <- support(pl_model_best$beta)
length(pl_supp)
## [1] 15
colnames(pl_train_matrix[,pl_supp])#get the features for post lasso model
##  [1] "num_self_hrefs"             "num_imgs"                  
##  [3] "num_videos"                 "num_keywords"              
##  [5] "self_reference_avg_sharess" "global_subjectivity"       
##  [7] "avg_negative_polarity"      "title_subjectivity"        
##  [9] "title_sentiment_polarity"   "weekdaySat"                
## [11] "weekdaySun"                 "channelEntertainment"      
## [13] "channelBusiness"            "channelSocial_Media"       
## [15] "channelWorld"
data_pl_1se <- data.frame(pl_train_matrix[,pl_supp],pl_train_y)
#fit the pl
post_lasso <- lm(pl_train_y~., data=data_pl_1se)
summary(post_lasso)
## 
## Call:
## lm(formula = pl_train_y ~ ., data = data_pl_1se)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7179 -0.5701 -0.1758  0.4093  6.0012 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 6.904692   0.038989 177.092  < 2e-16 ***
## num_self_hrefs             -0.114914   0.009759 -11.775  < 2e-16 ***
## num_imgs                    0.068388   0.005892  11.607  < 2e-16 ***
## num_videos                  0.077785   0.007891   9.858  < 2e-16 ***
## num_keywords                0.016200   0.002794   5.799 6.74e-09 ***
## self_reference_avg_sharess  0.039587   0.002068  19.143  < 2e-16 ***
## global_subjectivity         0.576647   0.065313   8.829  < 2e-16 ***
## avg_negative_polarity      -0.170595   0.045621  -3.739 0.000185 ***
## title_subjectivity          0.048122   0.016563   2.905 0.003670 ** 
## title_sentiment_polarity    0.077653   0.020207   3.843 0.000122 ***
## weekdaySat                  0.307065   0.021478  14.297  < 2e-16 ***
## weekdaySun                  0.277690   0.020404  13.610  < 2e-16 ***
## channelEntertainment       -0.352011   0.015175 -23.196  < 2e-16 ***
## channelBusiness            -0.123099   0.016057  -7.666 1.82e-14 ***
## channelSocial_Media         0.182829   0.022787   8.023 1.07e-15 ***
## channelWorld               -0.380107   0.014766 -25.743  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8854 on 29914 degrees of freedom
## Multiple R-squared:  0.09127,    Adjusted R-squared:  0.09081 
## F-statistic: 200.3 on 15 and 29914 DF,  p-value: < 2.2e-16
AIC(post_lasso)
## [1] 77671.99

Prediction

Let’s get the prediction based on the models above.

################
###Prediction###
################

MAE <- function(actual, predicted) {mean(abs(actual - predicted))}
R2 <- function(actual, predicted) {1 - (sum((predicted - actual)^2))/(sum((mean(actual) - actual)^2))}

test_with_prediction <- data_test
#Full
test_with_prediction$pred_full <- predict.lm(full_fit, newdata = test_with_prediction)
#Interaction
test_with_prediction$pred_interaction <- predict.lm(fit_interaction, newdata = test_with_prediction)
#Stepwise
test_with_prediction$pred_step <- predict.lm(step_fit, newdata = test_with_prediction)
#PCA
pca_test <- data_test[,1:16]
pca_test <- scale(pca_test)
#Computing the full PCA
pca_test_fit <- prcomp(pca_test, scale=TRUE)
#build PCA model
pca_test_scores <- predict(pca_test_fit)
pca_test_loadings <- pca_test_fit$rotation[,1:5]
pca_test_y <- data_test$log_shares
pca_test_x <- data.frame(pca_test_scores)[,1:5]
pca_test_lmfit <- lm(pca_test_y ~ ., data = pca_test_x)
test_with_prediction$pred_pca <- predict(pca_test_lmfit, data = pca_test_x)
#CART
test_with_prediction$pred_cart <- predict(best_cart_fit, newdata = data_test, type="vector")
#Random Forest
test_with_prediction$pred_forest <- predict(forest_fit, newdata = data_test, type="response")
#Lasso
model_test <- data_test
model_test_matrix <- model.matrix( ~ ., data=model_test)
inthemodel_test <- unique(c(lasso_supp))# unique grabs union
lasso_test_data <- cBind(model_test_matrix[,inthemodel_test]) 
lasso_test_data <- as.data.frame(as.matrix(lasso_test_data))# make it a data.frame
lasso_pred <- predict.lm(lasso_fit, newdata = lasso_test_data,type="response")
model_test_with_prediction <- model_test
model_test_with_prediction$lasso_pred <- lasso_pred
test_with_prediction$pred_lasso <- predict.lm(lasso_fit, newdata = lasso_test_data,type="response")
#Post Lasso
pl_model_test <- data_test
pl_model_test_matrix <- model.matrix( ~ ., data=model_test)
pl_inthemodel_test <- unique(c(pl_supp))# unique grabs union
pl_test_data <- cBind(pl_model_test_matrix[,pl_inthemodel_test]) 
pl_test_data <- as.data.frame(as.matrix(pl_test_data))# make it a data.frame
test_with_prediction$pred_pl <- predict.lm(post_lasso, newdata = pl_test_data,type="response")

Evaluation

I have chosen two different evaluation measures – out-of-sample mean squares error (MSE) and R2. MSE is an easy to understand and communicate measure, best suited to explain to the digital platform stakeholders and business decision makers. The lower the MSE, the better the model. R2 is an effective measure to explain how much the prediction can explain the variability of the actual log shares. The higher the R2, the better the model.

################
###Evaluation###
################
aic_full <- AIC(full_fit)
coef_full <- data.frame(full_fit$coefficients)
ncoef_full <- nrow(coef_full)
mae_full <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_full)
mse_full <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_full)
R2_full <- R2(test_with_prediction$log_shares, test_with_prediction$pred_full)

coef_interaction <- data.frame(fit_interaction$coefficients)
ncoef_interaction <- nrow(coef_interaction)
aic_interaction <- AIC(fit_interaction)
mae_interaction <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_interaction)
mse_interaction <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_interaction)
R2_interaction <- R2(test_with_prediction$log_shares, test_with_prediction$pred_interaction)

coef_step <- data.frame(step_fit$coefficients)
ncoef_step <- nrow(coef_step)
aic_step <- AIC(step_fit)
mae_step <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_step)
mse_step <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_step)
R2_step <- R2(test_with_prediction$log_shares, test_with_prediction$pred_step)

coef_pca <- data.frame(pca_fit$coefficients)
ncoef_pca <- nrow(coef_pca)
aic_pca <- AIC(pca_fit) 
mae_pca <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_pca)
mse_pca <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_pca)
R2_pca <- R2(test_with_prediction$log_shares, test_with_prediction$pred_pca)

coef_cart <- data.frame(best_cart_fit$cptable)
ncoef_cart <- nrow(coef_cart)
#aic_cart <- AIC(best_cart_fit) #no aic
mae_cart <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_cart)
mse_cart <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_cart)
R2_cart <- R2(test_with_prediction$log_shares, test_with_prediction$pred_cart)

coef_forest <- data.frame(forest_fit$cptable)#can't make it a data frame
ncoef_forest <- forest_fit$ntree
#aic_forest <- AIC(forest_fit) #no aic
mae_forest <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_forest)
mse_forest <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_forest)
R2_forest <- R2(test_with_prediction$log_shares, test_with_prediction$pred_forest)

coef_lasso <- data.frame(lasso_fit$coefficients)
ncoef_lasso <- nrow(coef_lasso)
aic_lasso <- AIC(lasso_fit) 
mae_lasso <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_lasso)
mse_lasso <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_lasso)
R2_lasso <- R2(test_with_prediction$log_shares, test_with_prediction$pred_lasso)

coef_pl <- data.frame(post_lasso$coefficients)
ncoef_pl <- nrow(coef_pl)
aic_pl <- AIC(post_lasso)
mae_pl <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_pl)
mse_pl <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_pl)
R2_pl <- R2(test_with_prediction$log_shares, test_with_prediction$pred_pl)

#create evaluation metrics data frame
mae_data <- data.frame(cbind(mae_full, mae_interaction, mae_step, mae_pca, mae_cart, mae_forest, mae_lasso, mae_pl))
mse_data <- data.frame(cbind(mse_full, mse_interaction, mse_step, mse_pca, mse_cart, mse_forest, mse_lasso, mse_pl))
aic_data <- data.frame(cbind(aic_full, aic_interaction, aic_step, aic_pca, aic_lasso, aic_pl))
R2_data <- data.frame(cbind(R2_full, R2_interaction, R2_step, R2_pca, R2_cart, R2_forest, R2_lasso, R2_pl))
num_coef <- data.frame(cbind(ncoef_full, ncoef_interaction, ncoef_step, ncoef_pca, ncoef_cart, ncoef_forest, ncoef_lasso, ncoef_pl))

#visualize OOS performance
barplot(height = cbind(mae_full, mae_interaction, mae_step, mae_pca, mae_cart, mae_forest, mae_lasso, mae_pl), space = 0.1, ylim =c(0, 0.8), 
        names.arg = c("Full", "INT", "Step", "PCA", "CART", "Forest", "Lasso", "PL"), ylab = "OOS Mean Absolute Error",
        main = "OOS MAE Across Different Models")
abline(h = sum(mae_data[1,])/ncol(mae_data), col = "red", lwd = 2)
mtext(side=1, "Line represents the average", col = "red", line = 4, font=1, adj = 1)

barplot(height = cbind(mse_full, mse_interaction, mse_step, mse_pca, mse_cart, mae_forest, mse_lasso, mse_pl), space = 0.1, ylim =c(0, 0.8), 
        names.arg = c("Full", "INT", "Step", "PCA", "CART", "Forest", "Lasso", "PL"), ylab = "OOS Mean Square Error",
        main = "OOS MSE Across Different Models")
abline(h = sum(mse_data[1,])/ncol(mse_data), col = "red", lwd = 2)
mtext(side=1, "Line represents the average", col = "red", line = 4, font=1, adj = 1)

barplot(height = cbind(aic_full, aic_interaction, aic_step, aic_pca, aic_lasso, aic_pl), space = 0.1, 
        names.arg = c("Full", "INT", "Step", "PCA", "Lasso", "PL"), ylab = "AIC Value", ylim =c(0, 80000),
        main = "AIC Values Across Different Models")
abline(h = sum(aic_data[1,])/ncol(aic_data), col = "red", lwd = 2)

barplot(height = cbind(R2_full, R2_interaction, R2_step, R2_pca, R2_cart, R2_forest, R2_lasso, R2_pl), space = 0.1, ylim =c(0, 0.14), 
        names.arg = c("Full", "INT", "Step", "PCA", "CART", "Forest", "Lasso", "PL"), ylab = "OOS R Square",
        main = "OOS R2 Across Different Models")
abline(h = sum(R2_data[1,])/ncol(R2_data), col = "red", lwd = 2)
mtext(side=1, "Line represents the average", col = "red", line = 4, font=1, adj = 1)

barplot(height = cbind(ncoef_full, ncoef_interaction, ncoef_step, ncoef_pca, ncoef_cart, ncoef_forest, ncoef_lasso, ncoef_pl), space = 0.1, ylim =c(0, 500), 
        names.arg = c("Full", "INT", "Step", "PCA", "CART", "Forest", "Lasso", "PL"), ylab = "Number of Coefficients",
        main = "Number of Coefficients Across Different Models")
abline(h = sum(num_coef[1,])/ncol(num_coef), col = "red", lwd = 2)
mtext(side=1, " Number of splits for CART \n Number of trees for Random Forest", line = 4, font=1, adj = 0)
mtext(side=1, "Line represents the average", col = "red", line = 4, font=1, adj = 1)

barplot(height = cbind(ncoef_full, ncoef_step, ncoef_pca, ncoef_cart, ncoef_lasso, ncoef_pl), space = 0.1, ylim =c(0, 30), 
        names.arg = c("Full", "Step", "PCA", "CART", "Lasso", "PL"), ylab = "Number of Coefficients",
        main = "Number of Coefficients Across Different Models")
abline(h = (sum(num_coef[1,])-num_coef[1,2]-num_coef[1,6])/(ncol(num_coef)-2), col = "red", lwd = 2)

Further Investigation

Quantile regression enabled us to provide decision makers with information to optimize articles that received very few shares and those that received the most shares, in order to improve the features of a low-share articles and to further boost the shares of high-share articles.

#########################
###Quantile Regression###
#########################

data_quan <- lasso_train_data
data_quan$log_shares <- data_train$log_shares
### Look at the lower tail say tau = 0.05
lower.tail <- rq(log_shares ~ ., tau = 0.05, data = data_quan) 
coef(lower.tail)
##                (Intercept)             num_self_hrefs 
##                6.087359807               -0.015601806 
##                   num_imgs                 num_videos 
##                0.021442596                0.018427350 
##               num_keywords self_reference_avg_sharess 
##                0.008774502                0.042859417 
##        global_subjectivity      avg_negative_polarity 
##               -0.027662853                0.009750629 
##         title_subjectivity   title_sentiment_polarity 
##                0.026853212                0.011745622 
##                weekdayTues                 weekdayWed 
##               -0.036166449               -0.026117106 
##                 weekdaySat                 weekdaySun 
##                0.378346900                0.307290861 
##       channelEntertainment            channelBusiness 
##               -0.197466432               -0.021798876 
##        channelSocial_Media               channelWorld 
##                0.207189058               -0.171960893
### Look at the upper tail say tau = 0.95
upper.tail <- rq(log_shares ~ ., tau = 0.95, data = data_quan) 
coef(upper.tail)
##                (Intercept)             num_self_hrefs 
##               7.9924650511              -0.2926611937 
##                   num_imgs                 num_videos 
##               0.0931196590               0.2093122312 
##               num_keywords self_reference_avg_sharess 
##               0.0008896307               0.0690336155 
##        global_subjectivity      avg_negative_polarity 
##               1.8821370105              -0.7273687290 
##         title_subjectivity   title_sentiment_polarity 
##               0.0522939074               0.1558140529 
##                weekdayTues                 weekdayWed 
##              -0.0634786415              -0.0585003226 
##                 weekdaySat                 weekdaySun 
##               0.2254553195               0.1469037495 
##       channelEntertainment            channelBusiness 
##              -0.2444643967              -0.3444075481 
##        channelSocial_Media               channelWorld 
##               0.0671010580              -0.5070925192
###taus from 0.05 to 0.95
taus <- seq(from=0.05,to=0.95,by=0.05)

rq_taus <-rq(log_shares ~ ., tau = taus, data = data_quan)
fittaus_rq <- summary(rq_taus)


#visualize the different variables' effects across different taus
plot(fittaus_rq)
mtext("Values of taus", side=1, line = 4.2, font=1, col = 'blue')
mtext("Values of coefficients", side=2, line = 3.3, font=1, col = 'blue')

plot(fittaus_rq, parm = 1, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for Intercept ")
plot(fittaus_rq, parm = 2, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for num_self_hrefs ")
plot(fittaus_rq, parm = 3, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for num_imgs ")
plot(fittaus_rq, parm = 4, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for num_videos ")
plot(fittaus_rq, parm = 5, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for num_keywords ")
plot(fittaus_rq, parm = 6, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for self_reference_avg_sharess ")
plot(fittaus_rq, parm = 7, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for global_subjectivity ")
plot(fittaus_rq, parm = 8, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for avg_negative_polarity ")
plot(fittaus_rq, parm = 9, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for title_subjectivity ")
plot(fittaus_rq, parm = 10, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for title_sentiment_polarity ")
plot(fittaus_rq, parm = 11, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for Saturday ")
plot(fittaus_rq, parm = 12, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for Sunday ")
plot(fittaus_rq, parm = 13, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for Entertainment ")
plot(fittaus_rq, parm = 14, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for Business ")
plot(fittaus_rq, parm = 15, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for Social_Media ")
plot(fittaus_rq, parm = 16, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for World ")

Causality

Double Selection Procedure for Robust confidence Intervals

3 steps: 1. run Lasso of Y on controls X 2. run Lasso of d on controls X 3. Take all selectedX (Step 1 and 2) and run Linear regression of Y on treatment d and selectedX

causality_data_train <- lasso_train_data
causality_y <- data_train$log_shares
### treatment is num_imgs
causality_x_num_imgs <- model.matrix(causality_y ~ .-num_imgs, data = causality_data_train )
treatment_num_imgs <- causality_data_train$num_imgs

## Step 1.Select controls that are good to predict the outcome
causality_model1_num_imgs <- cv.glmnet(causality_x_num_imgs, causality_y, alpha = 1, family = 'gaussian', type.measure = 'mse')
causality_lam_1se_num_imgs <- causality_model1_num_imgs$lambda.1se
# Call Lasso 
causality_lasso_1se_num_imgs <- glmnet(causality_x_num_imgs, causality_y,alpha = 1, family = 'gaussian',lambda = causality_lam_1se_num_imgs)
# Get the support
causality_supp1_num_imgs <- support(causality_lasso_1se_num_imgs$beta)
# Step 1 selected
length(causality_supp1_num_imgs)
## [1] 14
### controls
colnames(causality_x_num_imgs[,causality_supp1_num_imgs])
##  [1] "num_self_hrefs"             "num_videos"                
##  [3] "num_keywords"               "self_reference_avg_sharess"
##  [5] "global_subjectivity"        "avg_negative_polarity"     
##  [7] "title_subjectivity"         "title_sentiment_polarity"  
##  [9] "weekdaySat"                 "weekdaySun"                
## [11] "channelEntertainment"       "channelBusiness"           
## [13] "channelSocial_Media"        "channelWorld"
###
### Step 2.Select controls that are good to predict the treatment
causality_model2_num_imgs <- cv.glmnet(causality_x_num_imgs, treatment_num_imgs, alpha = 1, family = 'gaussian', type.measure = 'mse')
causality_lam_d_1se_num_imgs <- causality_model2_num_imgs$lambda.1se
# Call Lasso 
causality_lam_d_1se_num_imgs <- glmnet(causality_x_num_imgs, treatment_num_imgs, alpha = 1, family = 'gaussian',lambda = causality_lam_d_1se_num_imgs)
# Get the support
causality_supp2_num_imgs <-support(causality_lam_d_1se_num_imgs$beta)
### Step 2 selected
length(causality_supp2_num_imgs)
## [1] 11
### controls
colnames(causality_x_num_imgs[,causality_supp2_num_imgs])
##  [1] "num_self_hrefs"             "num_videos"                
##  [3] "num_keywords"               "self_reference_avg_sharess"
##  [5] "global_subjectivity"        "avg_negative_polarity"     
##  [7] "title_subjectivity"         "title_sentiment_polarity"  
##  [9] "channelBusiness"            "channelSocial_Media"       
## [11] "channelWorld"
###
### Step 3.Combine all selected and refit linear regression
causality_inthemodel_num_imgs <- unique(c(causality_supp1_num_imgs, causality_supp2_num_imgs)) # unique grabs union
selectdata_num_imgs <- cBind(treatment_num_imgs, causality_x_num_imgs[,causality_inthemodel_num_imgs]) 
selectdata_num_imgs <- as.data.frame(as.matrix(selectdata_num_imgs)) # make it a data.frame
dim(selectdata_num_imgs) ## p about half n
## [1] 29930    15
causality_lm_num_imgs <- lm(causality_y ~ ., data = selectdata_num_imgs)
summary(causality_lm_num_imgs)$coef["treatment_num_imgs",]
##     Estimate   Std. Error      t value     Pr(>|t|) 
## 6.838849e-02 5.891901e-03 1.160720e+01 4.416458e-31
####
#### Compare/contrast the results with the following models
#### 
simple_num_imgs <- lm(causality_y ~ num_imgs, data = causality_data_train)
summary(simple_num_imgs)
## 
## Call:
## lm(formula = causality_y ~ num_imgs, data = causality_data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3564 -0.6100 -0.1863  0.4379  6.0886 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.356441   0.008266  889.97   <2e-16 ***
## num_imgs    0.103193   0.005658   18.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9235 on 29928 degrees of freedom
## Multiple R-squared:  0.01099,    Adjusted R-squared:  0.01096 
## F-statistic: 332.6 on 1 and 29928 DF,  p-value: < 2.2e-16
summary(full_fit)
## 
## Call:
## lm(formula = log_shares ~ ., data = data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7460 -0.5672 -0.1786  0.4066  5.9923 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 7.7641324  0.2195487  35.364  < 2e-16 ***
## n_tokens_title             -0.0012955  0.0024951  -0.519 0.603596    
## n_non_stop_unique_tokens   -0.2370074  0.0666601  -3.555 0.000378 ***
## num_self_hrefs             -0.1162882  0.0100241 -11.601  < 2e-16 ***
## num_imgs                    0.0529891  0.0070219   7.546 4.61e-14 ***
## num_videos                  0.0670971  0.0082127   8.170 3.21e-16 ***
## average_token_length       -0.2652572  0.1132977  -2.341 0.019226 *  
## num_keywords                0.0173763  0.0028118   6.180 6.50e-10 ***
## self_reference_avg_sharess  0.0389144  0.0021084  18.457  < 2e-16 ***
## global_subjectivity         0.5866778  0.0731207   8.023 1.07e-15 ***
## global_sentiment_polarity  -0.1099720  0.1382201  -0.796 0.426254    
## global_rate_positive_words  0.3441118  0.5142576   0.669 0.503409    
## global_rate_negative_words -1.7157769  0.8387466  -2.046 0.040800 *  
## avg_positive_polarity      -0.1060966  0.0925714  -1.146 0.251760    
## avg_negative_polarity      -0.1350650  0.0544898  -2.479 0.013191 *  
## title_subjectivity          0.0456180  0.0167662   2.721 0.006516 ** 
## title_sentiment_polarity    0.0765887  0.0206930   3.701 0.000215 ***
## weekdayTues                -0.0733780  0.0172023  -4.266 2.00e-05 ***
## weekdayWed                 -0.0723752  0.0171706  -4.215 2.50e-05 ***
## weekdayThurs               -0.0598058  0.0172737  -3.462 0.000536 ***
## weekdayFri                  0.0005677  0.0183814   0.031 0.975362    
## weekdaySat                  0.2649675  0.0242289  10.936  < 2e-16 ***
## weekdaySun                  0.2322852  0.0233014   9.969  < 2e-16 ***
## channelLifestyle           -0.1771966  0.0269101  -6.585 4.63e-11 ***
## channelEntertainment       -0.4290908  0.0191382 -22.421  < 2e-16 ***
## channelBusiness            -0.2148456  0.0210432 -10.210  < 2e-16 ***
## channelSocial_Media         0.0935417  0.0263201   3.554 0.000380 ***
## channelTechnology          -0.1134975  0.0201425  -5.635 1.77e-08 ***
## channelWorld               -0.4686965  0.0201523 -23.258  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8841 on 29901 degrees of freedom
## Multiple R-squared:  0.09445,    Adjusted R-squared:  0.0936 
## F-statistic: 111.4 on 28 and 29901 DF,  p-value: < 2.2e-16
summary(step_fit)
## 
## Call:
## lm(formula = log_shares ~ n_non_stop_unique_tokens + num_self_hrefs + 
##     num_imgs + num_videos + num_keywords + self_reference_avg_sharess + 
##     global_subjectivity + global_rate_negative_words + avg_positive_polarity + 
##     avg_negative_polarity + title_subjectivity + title_sentiment_polarity + 
##     weekday + channel, data = data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7492 -0.5675 -0.1788  0.4064  5.9928 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 7.2838030  0.0714406 101.956  < 2e-16 ***
## n_non_stop_unique_tokens   -0.2303441  0.0665819  -3.460 0.000542 ***
## num_self_hrefs             -0.1160584  0.0099967 -11.610  < 2e-16 ***
## num_imgs                    0.0521318  0.0070055   7.442 1.02e-13 ***
## num_videos                  0.0680752  0.0081815   8.321  < 2e-16 ***
## num_keywords                0.0171564  0.0028031   6.120 9.45e-10 ***
## self_reference_avg_sharess  0.0390412  0.0020856  18.719  < 2e-16 ***
## global_subjectivity         0.5896006  0.0695995   8.471  < 2e-16 ***
## global_rate_negative_words -1.0899775  0.5290109  -2.060 0.039368 *  
## avg_positive_polarity      -0.1526391  0.0672672  -2.269 0.023267 *  
## avg_negative_polarity      -0.1560336  0.0471303  -3.311 0.000932 ***
## title_subjectivity          0.0457695  0.0166566   2.748 0.006003 ** 
## title_sentiment_polarity    0.0764401  0.0204928   3.730 0.000192 ***
## weekdayTues                -0.0734032  0.0172020  -4.267 1.99e-05 ***
## weekdayWed                 -0.0724128  0.0171707  -4.217 2.48e-05 ***
## weekdayThurs               -0.0599418  0.0172725  -3.470 0.000520 ***
## weekdayFri                  0.0005836  0.0183815   0.032 0.974674    
## weekdaySat                  0.2644808  0.0242208  10.920  < 2e-16 ***
## weekdaySun                  0.2316073  0.0232899   9.945  < 2e-16 ***
## channelLifestyle           -0.1714681  0.0267797  -6.403 1.55e-10 ***
## channelEntertainment       -0.4245561  0.0189213 -22.438  < 2e-16 ***
## channelBusiness            -0.2125001  0.0209687 -10.134  < 2e-16 ***
## channelSocial_Media         0.0983462  0.0261616   3.759 0.000171 ***
## channelTechnology          -0.1059221  0.0198937  -5.324 1.02e-07 ***
## channelWorld               -0.4722948  0.0199803 -23.638  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8841 on 29905 degrees of freedom
## Multiple R-squared:  0.09426,    Adjusted R-squared:  0.09353 
## F-statistic: 129.7 on 24 and 29905 DF,  p-value: < 2.2e-16
summary(lasso_fit)
## 
## Call:
## lm(formula = model_train_y ~ ., data = lasso_train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7404 -0.5694 -0.1780  0.4098  5.9785 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 6.928537   0.039323 176.197  < 2e-16 ***
## num_self_hrefs             -0.114896   0.009757 -11.775  < 2e-16 ***
## num_imgs                    0.068475   0.005890  11.625  < 2e-16 ***
## num_videos                  0.078153   0.007889   9.906  < 2e-16 ***
## num_keywords                0.016148   0.002793   5.782 7.45e-09 ***
## self_reference_avg_sharess  0.039632   0.002067  19.170  < 2e-16 ***
## global_subjectivity         0.573377   0.065297   8.781  < 2e-16 ***
## avg_negative_polarity      -0.169516   0.045607  -3.717 0.000202 ***
## title_subjectivity          0.048054   0.016557   2.902 0.003708 ** 
## title_sentiment_polarity    0.078060   0.020202   3.864 0.000112 ***
## weekdayTues                -0.051140   0.013898  -3.680 0.000234 ***
## weekdayWed                 -0.050414   0.013859  -3.638 0.000276 ***
## weekdaySat                  0.285209   0.021994  12.968  < 2e-16 ***
## weekdaySun                  0.255872   0.020946  12.216  < 2e-16 ***
## channelEntertainment       -0.352930   0.015172 -23.262  < 2e-16 ***
## channelBusiness            -0.123422   0.016052  -7.689 1.53e-14 ***
## channelSocial_Media         0.182459   0.022780   8.010 1.19e-15 ***
## channelWorld               -0.380933   0.014762 -25.805  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8852 on 29912 degrees of freedom
## Multiple R-squared:  0.0919, Adjusted R-squared:  0.09139 
## F-statistic: 178.1 on 17 and 29912 DF,  p-value: < 2.2e-16
summary(post_lasso)
## 
## Call:
## lm(formula = pl_train_y ~ ., data = data_pl_1se)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7179 -0.5701 -0.1758  0.4093  6.0012 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 6.904692   0.038989 177.092  < 2e-16 ***
## num_self_hrefs             -0.114914   0.009759 -11.775  < 2e-16 ***
## num_imgs                    0.068388   0.005892  11.607  < 2e-16 ***
## num_videos                  0.077785   0.007891   9.858  < 2e-16 ***
## num_keywords                0.016200   0.002794   5.799 6.74e-09 ***
## self_reference_avg_sharess  0.039587   0.002068  19.143  < 2e-16 ***
## global_subjectivity         0.576647   0.065313   8.829  < 2e-16 ***
## avg_negative_polarity      -0.170595   0.045621  -3.739 0.000185 ***
## title_subjectivity          0.048122   0.016563   2.905 0.003670 ** 
## title_sentiment_polarity    0.077653   0.020207   3.843 0.000122 ***
## weekdaySat                  0.307065   0.021478  14.297  < 2e-16 ***
## weekdaySun                  0.277690   0.020404  13.610  < 2e-16 ***
## channelEntertainment       -0.352011   0.015175 -23.196  < 2e-16 ***
## channelBusiness            -0.123099   0.016057  -7.666 1.82e-14 ***
## channelSocial_Media         0.182829   0.022787   8.023 1.07e-15 ***
## channelWorld               -0.380107   0.014766 -25.743  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8854 on 29914 degrees of freedom
## Multiple R-squared:  0.09127,    Adjusted R-squared:  0.09081 
## F-statistic: 200.3 on 15 and 29914 DF,  p-value: < 2.2e-16
coef(lower.tail)
##                (Intercept)             num_self_hrefs 
##                6.087359807               -0.015601806 
##                   num_imgs                 num_videos 
##                0.021442596                0.018427350 
##               num_keywords self_reference_avg_sharess 
##                0.008774502                0.042859417 
##        global_subjectivity      avg_negative_polarity 
##               -0.027662853                0.009750629 
##         title_subjectivity   title_sentiment_polarity 
##                0.026853212                0.011745622 
##                weekdayTues                 weekdayWed 
##               -0.036166449               -0.026117106 
##                 weekdaySat                 weekdaySun 
##                0.378346900                0.307290861 
##       channelEntertainment            channelBusiness 
##               -0.197466432               -0.021798876 
##        channelSocial_Media               channelWorld 
##                0.207189058               -0.171960893
coef(upper.tail)
##                (Intercept)             num_self_hrefs 
##               7.9924650511              -0.2926611937 
##                   num_imgs                 num_videos 
##               0.0931196590               0.2093122312 
##               num_keywords self_reference_avg_sharess 
##               0.0008896307               0.0690336155 
##        global_subjectivity      avg_negative_polarity 
##               1.8821370105              -0.7273687290 
##         title_subjectivity   title_sentiment_polarity 
##               0.0522939074               0.1558140529 
##                weekdayTues                 weekdayWed 
##              -0.0634786415              -0.0585003226 
##                 weekdaySat                 weekdaySun 
##               0.2254553195               0.1469037495 
##       channelEntertainment            channelBusiness 
##              -0.2444643967              -0.3444075481 
##        channelSocial_Media               channelWorld 
##               0.0671010580              -0.5070925192

More

According to the output from quantile regression, we can do the same for variables like num_videos, global_subjectivity, Sunday, Business, World, etc. to explore the causality relationship between each variable and the predictive variable.