The dataset is obtained from https://archive.ics.uci.edu/ml/datasets/Online+News+Popularity
This dataset summarizes a heterogeneous set of features about articles published by Mashable over a period of two years (Fernandes et al., 2015). The goal is to predict the number of shares in social networks, which indicates an article's popularity.
After comparing several models, the Lasso model proved to be the best because of its high accuracy and straightforward interpretation. By running the selected variables through quantile regression, Mashable can not only predict how many shares an article is likely to receive but also see how journalists can increase that number. Moreover, Mashable could run every pre-publication article through this model to predict its shares, see where it would fall in the quantiles, and adjust the relevant features to earn more shares. The causality analysis also identifies which variables directly cause shares to increase; Mashable can focus on factors such as the number of images in an article or publishing on a Sunday to increase popularity.
You can find my code for this project at https://github.com/XuebinZhu/Projects/tree/master/Mashable%20News%20Article%20Sharing%20Project
More projects can be found at https://github.com/XuebinZhu/Projects
#############################################
### Mashable News Article Sharing Project ###
#############################################
rm(list=ls())
#load all the required libraries
library(factoextra)
library(usmap)
library(randomForest)
library(quantreg)
library(rpart)
library(rpart.plot)
library(ggplot2)
library(tidyverse)
library(dplyr)
library(MASS)
library(glmnet)
library(MLmetrics)
library(scales)
library(plfm)
library(caTools)
library(psych)
library(VIM)
#set seed for splitting the data set
set.seed(123)
I will check for null or erroneous values in the data, evaluate each variable, and remove those that are not beneficial to the analysis. Then I will do some data transformation for feature engineering.
###################
###Data Cleaning###
###################
#read the data
dat0 <- read.csv("OnlineNewsPopularity.csv", header = T)
#combine and create new variables
dat0$weekday <- as.numeric(dat0$weekday_is_monday) * 1 +
as.numeric(dat0$weekday_is_tuesday) * 2 +
as.numeric(dat0$weekday_is_wednesday) * 3 +
as.numeric(dat0$weekday_is_thursday) * 4 +
as.numeric(dat0$weekday_is_friday) * 5 +
as.numeric(dat0$weekday_is_saturday) * 6 +
as.numeric(dat0$weekday_is_sunday) * 7
dat0$channel <- as.numeric(dat0$data_channel_is_lifestyle) * 1 +
as.numeric(dat0$data_channel_is_entertainment) * 2 +
as.numeric(dat0$data_channel_is_bus) * 3 +
as.numeric(dat0$data_channel_is_socmed) * 4 +
as.numeric(dat0$data_channel_is_tech) * 5 +
as.numeric(dat0$data_channel_is_world) * 6
dat0$channel <- dat0$channel + 1 #shift so articles with no channel flag map to level 1 ("Other")
#rename the variables
dat0$weekday <- factor(dat0$weekday, levels = 1:7, labels = c("Mon", "Tues", "Wed",
"Thurs", "Fri", "Sat", "Sun"))
dat0$channel <- factor(dat0$channel, levels = 1:7, labels = c("Other", "Lifestyle", "Entertainment", "Business",
"Social_Media", "Technology", "World"))
#use log transformation for shares
hist(dat0$shares)
dat0$log_shares <- log(dat0$shares)
hist(dat0$log_shares)
#drop meaningless variables
drop <- c("url","n_tokens_content", "n_unique_tokens", "num_hrefs", " kw_min_min", "kw_max_min","kw_avg_min",
"kw_min_max","kw_max_max", "kw_avg_max", "kw_min_avg"," kw_max_avg", "kw_avg_avg", "self_reference_min_shares",
"self_reference_max_shares", "abs_title_subjectivity", "abs_title_sentiment_polarity", "min_positive_polarity",
"max_positive_polarity", "min_negative_polarity", "max_negative_polarity", "rate_positive_words", "rate_negative_words",
"kw_min_min", "kw_max_avg", "weekday_is_monday","weekday_is_tuesday" , "weekday_is_wednesday", "weekday_is_thursday",
"weekday_is_friday","weekday_is_saturday","weekday_is_sunday", "data_channel_is_lifestyle", "data_channel_is_entertainment",
"data_channel_is_bus","data_channel_is_socmed", "data_channel_is_tech","data_channel_is_world", "shares",
"LDA_00", "LDA_01", "LDA_02", "LDA_03", "LDA_04")
dat0 <- dat0[,!(names(dat0) %in% drop)]
#convert data type
dat0$is_weekend <- as.factor(dat0$is_weekend)
#create full data set for this project
news <- dat0
#remove outliers
n_non_stop_words_Q1SP <- summary(news$n_non_stop_words)[2]
n_non_stop_words_Q3SP <- summary(news$n_non_stop_words)[5]
n_non_stop_words_IQRSP <- n_non_stop_words_Q3SP - n_non_stop_words_Q1SP
min_n_non_stop_words <- as.numeric(n_non_stop_words_Q1SP - 1.5*n_non_stop_words_IQRSP)
max_n_non_stop_words <- as.numeric(n_non_stop_words_Q3SP + 1.5*n_non_stop_words_IQRSP)
news <- news %>% filter(n_non_stop_words < max_n_non_stop_words)
news <- news %>% filter(n_non_stop_words > min_n_non_stop_words)
summary(dat0$n_non_stop_words)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 1.0000 1.0000 0.9965 1.0000 1042.0000
boxplot(news$n_non_stop_words)
hist(news$n_non_stop_words)#this variable appears to contain many errors, so I decided to remove it
#"is_weekend" is highly correlated with weekday, so I decided to remove it
#"timedelta" gives no predictive information about our y variable, so I decided to remove it
news <- dat0 #reset to the full data, since n_non_stop_words is dropped rather than filtered
drop1 <- c("n_non_stop_words", "is_weekend", "timedelta")
news <- news[,!(names(news) %in% drop1)]
#remove outliers
n_non_stop_unique_tokens_Q1SP <- summary(news$n_non_stop_unique_tokens)[2]
n_non_stop_unique_tokens_Q3SP <- summary(news$n_non_stop_unique_tokens)[5]
n_non_stop_unique_tokens_IQRSP <- n_non_stop_unique_tokens_Q3SP - n_non_stop_unique_tokens_Q1SP
min_n_non_stop_unique_tokens <- as.numeric(n_non_stop_unique_tokens_Q1SP - 1.5*n_non_stop_unique_tokens_IQRSP)
max_n_non_stop_unique_tokens <- as.numeric(n_non_stop_unique_tokens_Q3SP + 1.5*n_non_stop_unique_tokens_IQRSP)
news <- news %>% filter(n_non_stop_unique_tokens < max_n_non_stop_unique_tokens)
news <- news %>% filter(n_non_stop_unique_tokens > min_n_non_stop_unique_tokens)
summary(news$n_non_stop_unique_tokens)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4324 0.6346 0.6946 0.6962 0.7569 0.9474
boxplot(news$n_non_stop_unique_tokens)
hist(news$n_non_stop_unique_tokens)#keep
#num_self_hrefs, num_imgs, num_videos, average_token_length, and self_reference_avg_sharess are right-skewed, so apply a log(1+x) transformation
news$num_self_hrefs <- log(1 + news$num_self_hrefs)
news$num_imgs <- log(1 + news$num_imgs)
news$num_videos <- log(1 + news$num_videos)
news$average_token_length <- log(1 + news$average_token_length)
news$self_reference_avg_sharess <- log(1 + news$self_reference_avg_sharess)
summary(news)
## n_tokens_title n_non_stop_unique_tokens num_self_hrefs num_imgs
## Min. : 2.00 Min. :0.4324 Min. :0.0000 Min. :0.0000
## 1st Qu.: 9.00 1st Qu.:0.6346 1st Qu.:0.6931 1st Qu.:0.6931
## Median :10.00 Median :0.6946 Median :1.3863 Median :0.6931
## Mean :10.38 Mean :0.6962 Mean :1.2448 Mean :1.1105
## 3rd Qu.:12.00 3rd Qu.:0.7569 3rd Qu.:1.6094 3rd Qu.:1.6094
## Max. :23.00 Max. :0.9474 Max. :4.3175 Max. :4.6250
##
## num_videos average_token_length num_keywords
## Min. :0.0000 Min. :1.526 Min. : 1.000
## 1st Qu.:0.0000 1st Qu.:1.704 1st Qu.: 6.000
## Median :0.0000 Median :1.736 Median : 7.000
## Mean :0.3973 Mean :1.737 Mean : 7.206
## 3rd Qu.:0.6931 3rd Qu.:1.768 3rd Qu.: 9.000
## Max. :4.5218 Max. :2.202 Max. :10.000
##
## self_reference_avg_sharess global_subjectivity global_sentiment_polarity
## Min. : 0.000 Min. :0.0000 Min. :-0.39375
## 1st Qu.: 7.004 1st Qu.:0.4030 1st Qu.: 0.06461
## Median : 7.741 Median :0.4568 Median : 0.12253
## Mean : 6.867 Mean :0.4574 Mean : 0.12316
## 3rd Qu.: 8.576 3rd Qu.:0.5104 3rd Qu.: 0.17973
## Max. :13.645 Max. :1.0000 Max. : 0.72784
##
## global_rate_positive_words global_rate_negative_words avg_positive_polarity
## Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.02961 1st Qu.:0.01024 1st Qu.:0.3120
## Median :0.03968 Median :0.01571 Median :0.3618
## Mean :0.04092 Mean :0.01711 Mean :0.3646
## 3rd Qu.:0.05072 3rd Qu.:0.02200 3rd Qu.:0.4130
## Max. :0.15549 Max. :0.18493 Max. :1.0000
##
## avg_negative_polarity title_subjectivity title_sentiment_polarity weekday
## Min. :-1.0000 Min. :0.000 Min. :-1.00000 Mon :6365
## 1st Qu.:-0.3312 1st Qu.:0.000 1st Qu.: 0.00000 Tues :7078
## Median :-0.2579 Median :0.125 Median : 0.00000 Wed :7128
## Mean :-0.2677 Mean :0.280 Mean : 0.07048 Thurs:6960
## 3rd Qu.:-0.1937 3rd Qu.:0.500 3rd Qu.: 0.13636 Fri :5451
## Max. : 0.0000 Max. :1.000 Max. : 1.00000 Sat :2331
## Sun :2598
## channel log_shares
## Other :5335 Min. : 0.000
## Lifestyle :2060 1st Qu.: 6.851
## Entertainment:6698 Median : 7.244
## Business :6212 Mean : 7.468
## Social_Media :2279 3rd Qu.: 7.901
## Technology :7231 Max. :13.645
## World :8096
#impute NA values in the channel column
sum(is.na(news$channel))
## [1] 0
summary(news$channel)
## Other Lifestyle Entertainment Business Social_Media
## 5335 2060 6698 6212 2279
## Technology World
## 7231 8096
# Get levels and add "Unknown"
levels <- levels(news$channel)
levels[length(levels) + 1] <- "Unknown"
# refactor to include "Unknown" as a factor level
# and replace NA with "Unknown"
news$channel <- factor(news$channel, levels = levels)
news$channel[is.na(news$channel)] <- "Unknown"
#split the data; sample.split expects the outcome vector, not the whole data frame
split <- sample.split(news$log_shares, SplitRatio = 0.8)
data_train <- subset(news, split == T)
data_test <- subset(news, split == F)
#the final train data set
head(data_train)
## n_tokens_title n_non_stop_unique_tokens num_self_hrefs num_imgs num_videos
## 1 12 0.8153846 1.0986123 0.6931472 0
## 2 9 0.7919463 0.6931472 0.6931472 0
## 3 9 0.6638655 0.6931472 0.6931472 0
## 4 9 0.6656347 0.0000000 0.6931472 0
## 6 10 0.6981982 1.0986123 0.0000000 0
## 7 8 0.5498339 3.0445224 3.0445224 0
## average_token_length num_keywords self_reference_avg_sharess
## 1 1.737016 5 6.208590
## 2 1.777276 4 0.000000
## 3 1.685169 6 6.823286
## 4 1.687305 7 0.000000
## 6 1.678863 9 9.047939
## 7 1.732393 10 8.055843
## global_subjectivity global_sentiment_polarity global_rate_positive_words
## 1 0.5216171 0.09256198 0.04566210
## 2 0.3412458 0.14894781 0.04313725
## 3 0.7022222 0.32333333 0.05687204
## 4 0.4298497 0.10070467 0.04143126
## 6 0.4374086 0.07118419 0.02972973
## 7 0.5144803 0.26830272 0.08020833
## global_rate_negative_words avg_positive_polarity avg_negative_polarity
## 1 0.013698630 0.3786364 -0.3500000
## 2 0.015686275 0.2869146 -0.1187500
## 3 0.009478673 0.4958333 -0.4666667
## 4 0.020715631 0.3859652 -0.3696970
## 6 0.027027027 0.3506100 -0.1950000
## 7 0.016666667 0.4020386 -0.2244792
## title_subjectivity title_sentiment_polarity weekday channel log_shares
## 1 0.5000000 -0.1875000 Mon Entertainment 6.385194
## 2 0.0000000 0.0000000 Mon Business 6.566672
## 3 0.0000000 0.0000000 Mon Business 7.313220
## 4 0.0000000 0.0000000 Mon Entertainment 7.090077
## 6 0.6428571 0.2142857 Mon Technology 6.751101
## 7 0.0000000 0.0000000 Mon Lifestyle 6.320768
#the final test data set
head(data_test)
## n_tokens_title n_non_stop_unique_tokens num_self_hrefs num_imgs num_videos
## 5 13 0.5408895 2.9957323 3.0445224 0
## 8 12 0.5721078 3.0445224 3.0445224 0
## 11 9 0.7316384 0.0000000 0.6931472 0
## 16 12 0.6349614 0.0000000 0.6931472 0
## 24 11 0.7974683 0.6931472 0.6931472 0
## 27 8 0.7244898 1.3862944 0.6931472 0
## average_token_length num_keywords self_reference_avg_sharess
## 5 1.737450 7 8.055843
## 8 1.725939 9 8.055843
## 11 1.725938 8 0.000000
## 16 1.726373 6 0.000000
## 24 1.761987 6 9.686637
## 27 1.754832 9 7.824446
## global_subjectivity global_sentiment_polarity global_rate_positive_words
## 5 0.5135021 0.28100348 0.07462687
## 8 0.5434742 0.29861347 0.08392315
## 11 0.4820598 0.10235015 0.03846154
## 16 0.4732850 0.06222666 0.04985337
## 24 0.3964015 0.21079545 0.04800000
## 27 0.5606957 0.29541435 0.06042296
## global_rate_negative_words avg_positive_polarity avg_negative_polarity
## 5 0.012126866 0.4111274 -0.2201923
## 8 0.015166835 0.4277205 -0.2427778
## 11 0.020833333 0.4044801 -0.4150641
## 16 0.039589443 0.3430704 -0.2201499
## 24 0.000000000 0.2810606 0.0000000
## 27 0.009063444 0.5565584 -0.2638889
## title_subjectivity title_sentiment_polarity weekday channel log_shares
## 5 0.4545455 0.1363636 Mon Technology 6.224558
## 8 1.0000000 0.5000000 Mon Technology 6.792344
## 11 0.0000000 0.0000000 Mon World 7.696213
## 16 0.7500000 -0.2500000 Mon World 7.377759
## 24 0.4500000 0.4000000 Mon World 7.313220
## 27 0.0000000 0.0000000 Mon Technology 7.313220
I will build several plots with ggplot2 to conduct exploratory analysis.
########################
###Data Visualization###
########################
ggplot(data_train, aes(x = weekday, fill = channel)) +
geom_bar(show.legend = T) +
theme_minimal() +
labs(title = 'Number of Articles by Weekday',
subtitle = 'divided by channel',
x = 'Weekday', y = 'Number of Articles') +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 1))
ggplot(data_train, aes(x = channel, fill = channel)) +
geom_bar(show.legend = F) +
theme_minimal() +
labs(title = 'Number of Articles by Channel',
x = 'Channel', y = 'Number of Articles') +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1))
ggplot(dat0, aes(x = channel, y = log_shares, fill = as.factor(is_weekend))) +
geom_boxplot(show.legend = T) +
theme_minimal() +
labs(title = 'Log Number of Shares by Channel',
subtitle = 'divided by weekday and weekend',
x = 'Channel', y = 'Log Number of Shares') +
scale_fill_discrete(name="Weekday or Weekend",
breaks=c("0", "1"),
labels=c("Weekday", "Weekend"))+
scale_x_discrete(labels=c("Other", "Lifestyle", "Entertainment", "Business", "Social Media", "Technology", "World")) +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 1))+
theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1))
ggplot(data_train, aes(x = global_subjectivity, y = log_shares, color = channel)) +
geom_point(show.legend = T) +
theme_minimal() +
labs(title = 'Log Number of Shares versus Article Subjectivity',
subtitle = 'divided by different channel',
x = 'Article Subjectivity', y = 'Log Number of Shares') +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 1))
ggplot(data_train, aes(x = global_sentiment_polarity, y = log_shares, color = channel)) +
geom_point(show.legend = T) +
theme_minimal() +
labs(title = 'Log Number of Shares versus Article Sentiment Polarity',
subtitle = 'divided by different channel',
x = 'Article Sentiment Polarity', y = 'Log Number of Shares') +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 1))
Data Mining Methods: PCA and K-Means Clustering
#PCA
clustering_set <- data_train[,1:16]
clustering_set <- scale(clustering_set)
#compute the full PCA (the inputs are already standardized, so scale=TRUE is redundant but harmless)
pca.news <- prcomp(clustering_set, scale=TRUE)
###Plotting the variance that each component explains
plot(pca.news,main="PCA: Variance Explained by Factors")
mtext(side=1, "Factors", line=1, font=2)
screeplot(pca.news,type = "line", main = "Scree Plot for PCA", npcs = length(pca.news$sdev))
mtext(side=3, "Number of Factors", line= -18, font=2)
abline(v = 5)
#get the scores and the loadings of the five selected PCs
pcScore <- predict(pca.news)
pcScore_selected <- pcScore[,1:2]
pcScore_selected2 <- pcScore[,3:4]
loadings <- pca.news$rotation[,1:5]
nrow(loadings)
## [1] 16
#### PC1
# Positive-sentiment articles
# keep the smallest set of top loadings whose squared sum reaches ~3/4 of the PC's unit norm
v1 <- loadings[order(abs(loadings[,1]), decreasing=TRUE)[1:nrow(loadings)],1]
loadingfit1 <- lapply(1:nrow(loadings), function(k) ( t(v1[1:k])%*%v1[1:k] - 3/4 )^2)
v1[1:which.min(loadingfit1)]
## global_sentiment_polarity avg_positive_polarity
## 0.5340055 0.3985148
## global_rate_positive_words global_subjectivity
## 0.3977888 0.3675426
#### PC2
# Negative-sentiment articles
v2 <- loadings[order(abs(loadings[,2]), decreasing=TRUE)[1:nrow(loadings)],2]
loadingfit2 <- lapply(1:nrow(loadings), function(k) ( t(v2[1:k])%*%v2[1:k] - 3/4 )^2)
v2[1:which.min(loadingfit2)]
## self_reference_avg_sharess num_self_hrefs
## 0.4093628 0.3816014
## avg_negative_polarity global_rate_negative_words
## -0.3779098 0.3531464
## num_imgs global_sentiment_polarity
## 0.3304523 -0.3189810
#### PC3
# Short articles with more links and images
v3 <- loadings[order(abs(loadings[,3]), decreasing=TRUE)[1:nrow(loadings)],3]
loadingfit3 <- lapply(1:nrow(loadings), function(k) ( t(v3[1:k])%*%v3[1:k] - 3/4 )^2)
v3[1:which.min(loadingfit3)]
## global_rate_negative_words n_non_stop_unique_tokens
## -0.3809297 -0.3710102
## num_imgs avg_negative_polarity
## 0.3419505 0.3293635
## num_self_hrefs global_subjectivity
## 0.3184503 -0.3161948
## num_videos
## -0.2600131
#### PC4
# Long articles with fewer images but more popular links
v4 <- loadings[order(abs(loadings[,4]), decreasing=TRUE)[1:nrow(loadings)],4]
loadingfit4 <- lapply(1:nrow(loadings), function(k) ( t(v4[1:k])%*%v4[1:k] - 3/4 )^2)
v4[1:which.min(loadingfit4)]
## self_reference_avg_sharess num_imgs
## 0.4151198 -0.3905513
## num_self_hrefs n_non_stop_unique_tokens
## 0.3662411 0.3604998
## num_videos average_token_length
## 0.3319199 -0.3119188
#### PC5
v5 <- loadings[order(abs(loadings[,5]), decreasing=TRUE)[1:nrow(loadings)],5]
loadingfit5 <- lapply(1:nrow(loadings), function(k) ( t(v5[1:k])%*%v5[1:k] - 3/4 )^2)
v5[1:which.min(loadingfit5)]
## self_reference_avg_sharess num_imgs
## 0.075425116 -0.017136496
## num_self_hrefs n_non_stop_unique_tokens
## 0.081664837 0.125430474
## num_videos average_token_length
## -0.037025280 0.019432338
## n_tokens_title avg_negative_polarity
## -0.409804085 -0.098834041
## global_subjectivity num_keywords
## 0.199209652 0.078995546
## global_rate_negative_words title_subjectivity
## -0.013396554 -0.604340824
## avg_positive_polarity global_sentiment_polarity
## 0.240303354 0.106171824
## global_rate_positive_words
## 0.005825282
#K-means clustering
SpcScore3 <- scale(pcScore, scale=TRUE)
kfit <- lapply(1:30, function(k) kmeans(SpcScore3,k,nstart=10))
## Select the number of clusters via an information criterion
## get AIC (option "A"), BIC (option "B"), HDIC (option "C") for the output of kmeans
kIC <- function(fit, rule=c("A","B","C")){
df <- length(fit$centers) # K*dim
n <- sum(fit$size)
D <- fit$tot.withinss # deviance
rule=match.arg(rule)
if(rule=="A")
#return(D + 2*df*n/max(1,n-df-1))
return(D + 2*df)
else if(rule=="B")
return(D + log(n)*df)
else
return(D + sqrt( n * log(df) )*df)
}
kaic <- sapply(kfit,kIC)
kbic <- sapply(kfit,kIC,"B")
kcic <- sapply(kfit,kIC,"C")
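#not in the original script: a minimal sketch comparing the three criteria across k,
#since only kcic is used to pick k below
plot(1:30, kaic, type = "l", col = "red", xlab = "k (number of clusters)", ylab = "IC value",
ylim = range(c(kaic, kbic, kcic)), main = "Information Criteria Across k")
lines(1:30, kbic, col = "blue")
lines(1:30, kcic, col = "darkgreen")
legend("topright", legend = c("AIC", "BIC", "HDIC"), col = c("red", "blue", "darkgreen"), lty = 1)
c(AIC = which.min(kaic), BIC = which.min(kbic), HDIC = which.min(kcic))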
SpcScore<- scale(pcScore_selected, scale=TRUE)
K_clustering <- kmeans(as.data.frame(SpcScore),which.min(kcic),nstart=10)
K_clustering
fviz_cluster(K_clustering, data = as.data.frame(SpcScore), geom = "point",
show.clust.cent = FALSE, ellipse = FALSE, palette = "Set2", main = "K-means clustering with PC1 and PC2")
Data Mining Methods: Linear Regression, Stepwise Selection, CART, Random Forest, Lasso, and Post Lasso
#full model
full_fit <- lm(log_shares ~ . , data = data_train)
summary(full_fit)
##
## Call:
## lm(formula = log_shares ~ ., data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7460 -0.5672 -0.1786 0.4066 5.9923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.7641324 0.2195487 35.364 < 2e-16 ***
## n_tokens_title -0.0012955 0.0024951 -0.519 0.603596
## n_non_stop_unique_tokens -0.2370074 0.0666601 -3.555 0.000378 ***
## num_self_hrefs -0.1162882 0.0100241 -11.601 < 2e-16 ***
## num_imgs 0.0529891 0.0070219 7.546 4.61e-14 ***
## num_videos 0.0670971 0.0082127 8.170 3.21e-16 ***
## average_token_length -0.2652572 0.1132977 -2.341 0.019226 *
## num_keywords 0.0173763 0.0028118 6.180 6.50e-10 ***
## self_reference_avg_sharess 0.0389144 0.0021084 18.457 < 2e-16 ***
## global_subjectivity 0.5866778 0.0731207 8.023 1.07e-15 ***
## global_sentiment_polarity -0.1099720 0.1382201 -0.796 0.426254
## global_rate_positive_words 0.3441118 0.5142576 0.669 0.503409
## global_rate_negative_words -1.7157769 0.8387466 -2.046 0.040800 *
## avg_positive_polarity -0.1060966 0.0925714 -1.146 0.251760
## avg_negative_polarity -0.1350650 0.0544898 -2.479 0.013191 *
## title_subjectivity 0.0456180 0.0167662 2.721 0.006516 **
## title_sentiment_polarity 0.0765887 0.0206930 3.701 0.000215 ***
## weekdayTues -0.0733780 0.0172023 -4.266 2.00e-05 ***
## weekdayWed -0.0723752 0.0171706 -4.215 2.50e-05 ***
## weekdayThurs -0.0598058 0.0172737 -3.462 0.000536 ***
## weekdayFri 0.0005677 0.0183814 0.031 0.975362
## weekdaySat 0.2649675 0.0242289 10.936 < 2e-16 ***
## weekdaySun 0.2322852 0.0233014 9.969 < 2e-16 ***
## channelLifestyle -0.1771966 0.0269101 -6.585 4.63e-11 ***
## channelEntertainment -0.4290908 0.0191382 -22.421 < 2e-16 ***
## channelBusiness -0.2148456 0.0210432 -10.210 < 2e-16 ***
## channelSocial_Media 0.0935417 0.0263201 3.554 0.000380 ***
## channelTechnology -0.1134975 0.0201425 -5.635 1.77e-08 ***
## channelWorld -0.4686965 0.0201523 -23.258 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8841 on 29901 degrees of freedom
## Multiple R-squared: 0.09445, Adjusted R-squared: 0.0936
## F-statistic: 111.4 on 28 and 29901 DF, p-value: < 2.2e-16
AIC(full_fit)
## [1] 77592.87
#full model with interaction
fit_interaction <- lm(log_shares ~ .^2, data = data_train)
#summary(fit_interaction)
AIC(fit_interaction)
#stepwise regression
#step(full_fit, trace = 1, direction = "both")
step_fit <- lm(log_shares ~ n_non_stop_unique_tokens + num_self_hrefs +
num_imgs + num_videos + num_keywords + self_reference_avg_sharess +
global_subjectivity + global_rate_negative_words + avg_positive_polarity +
avg_negative_polarity + title_subjectivity + title_sentiment_polarity +
weekday + channel, data = data_train)
summary(step_fit)
AIC(step_fit)
#PCA
pca_y <- data_train$log_shares
pca_x <- data.frame(pcScore)[,1:5]
pca_fit <- lm(pca_y ~ ., data = pca_x)
summary(pca_fit)
##
## Call:
## lm(formula = pca_y ~ ., data = pca_x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6598 -0.6014 -0.1922 0.4416 5.8654
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.471547 0.005282 1414.534 <2e-16 ***
## PC1 0.084642 0.003433 24.654 <2e-16 ***
## PC2 0.071612 0.003725 19.227 <2e-16 ***
## PC3 -0.001528 0.003931 -0.389 0.6975
## PC4 -0.005910 0.004482 -1.319 0.1872
## PC5 0.008469 0.004920 1.721 0.0852 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9138 on 29924 degrees of freedom
## Multiple R-squared: 0.03178, Adjusted R-squared: 0.03162
## F-statistic: 196.5 on 5 and 29924 DF, p-value: < 2.2e-16
#CART
cart_fit <- rpart(log_shares ~ . , data = data_train, method = "anova", control = rpart.control(cp = .0001))
bestcp <- cart_fit$cptable[which.min(cart_fit$cptable[,"xerror"]),"CP"]
best_cart_fit <- prune(cart_fit, cp = bestcp)
rpart.plot(best_cart_fit, type = 3, digits = 3, fallen.leaves = T)
printcp(best_cart_fit)
##
## Regression tree:
## rpart(formula = log_shares ~ ., data = data_train, method = "anova",
## control = rpart.control(cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] channel global_subjectivity
## [3] n_non_stop_unique_tokens num_imgs
## [5] num_keywords self_reference_avg_sharess
## [7] weekday
##
## Root node error: 25808/29930 = 0.86227
##
## n= 29930
##
## CP nsplit rel error xerror xstd
## 1 0.0445863 0 1.00000 1.00003 0.012098
## 2 0.0140354 1 0.95541 0.95549 0.011848
## 3 0.0136494 2 0.94138 0.94158 0.011634
## 4 0.0066960 3 0.92773 0.92811 0.011431
## 5 0.0037318 4 0.92103 0.92152 0.011415
## 6 0.0026249 5 0.91730 0.91791 0.011338
## 7 0.0022728 6 0.91468 0.91589 0.011336
## 8 0.0022174 7 0.91240 0.91326 0.011332
## 9 0.0022087 8 0.91019 0.91305 0.011331
## 10 0.0018070 9 0.90798 0.91053 0.011307
## 11 0.0016526 10 0.90617 0.90915 0.011282
## 12 0.0015984 11 0.90452 0.90829 0.011280
## 13 0.0015448 12 0.90292 0.90772 0.011272
## 14 0.0015408 13 0.90137 0.90711 0.011263
## 15 0.0013608 14 0.89983 0.90571 0.011243
## 16 0.0012490 15 0.89847 0.90488 0.011235
## 17 0.0011461 16 0.89722 0.90423 0.011239
## 18 0.0010946 17 0.89608 0.90368 0.011219
## 19 0.0010281 18 0.89498 0.90315 0.011214
plotcp(best_cart_fit)
tot_count <- function(x, labs, digits, varlen) {paste(labs, "\n\nn = ", x$frame$n)}
prp(best_cart_fit, faclen = 0, cex = 0.8, node.fun = tot_count)
mtext(side=1, "Weekday: 1 = Mon, 2 = Tues, \n 3 = Wed, 4 = Thurs, 5 = Fri", line = 3, font=1, adj = 1)
#summary(best_cart_fit)
#Random Forest
#ntree = 495 was chosen via which.min on the out-of-bag MSE of an exploratory forest (see the sketch below)
#it may take a while to run
forest_fit <- randomForest(log_shares ~ ., data = data_train, ntree=495, na.action=na.roughfix)
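#not in the original script: a sketch of how ntree = 495 could have been chosen,
#assuming an exploratory forest (the regForest mentioned above) with a generous tree count
regForest <- randomForest(log_shares ~ ., data = data_train, ntree = 1000, na.action = na.roughfix)
which.min(regForest$mse)#tree count that minimizes the out-of-bag MSE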
#lasso
model_train_matrix <- model.matrix(log_shares ~ ., data = data_train)
model_train_y <- data_train$log_shares
#lasso selection
lasso_model1 <- glmnet(model_train_matrix, model_train_y, alpha = 1, family = 'gaussian')
plot(lasso_model1, xvar = 'lambda', label = T)
lasso_model2 <- cv.glmnet(model_train_matrix, model_train_y, alpha = 1, family = 'gaussian', type.measure = 'mse')
lasso_lam_1se <- lasso_model2$lambda.1se
lasso_coef_1se <- coef(lasso_model1, lasso_lam_1se)
summary(lasso_coef_1se)
## 31 x 1 sparse Matrix of class "dgCMatrix", with 18 entries
## i j x
## 1 1 1 6.968776462
## 2 5 1 -0.036913631
## 3 6 1 0.051128359
## 4 7 1 0.049523288
## 5 9 1 0.008979073
## 6 10 1 0.027569509
## 7 11 1 0.629684819
## 8 16 1 -0.046078665
## 9 17 1 0.022313879
## 10 18 1 0.027782519
## 11 19 1 -0.004097269
## 12 20 1 -0.003930313
## 13 23 1 0.237280848
## 14 24 1 0.209896529
## 15 26 1 -0.272958139
## 16 27 1 -0.069372148
## 17 28 1 0.131438093
## 18 30 1 -0.311906486
plot(lasso_model2)
lasso_model_best <- glmnet(model_train_matrix, model_train_y, alpha = 1, family = 'gaussian', lambda = lasso_lam_1se)
### Returns the indices for which |x[i]| > tr
support <- function(x, tr = 1e-5) {
which(abs(as.vector(x)) > tr)
}
## get the support
lasso_supp <- support(lasso_model_best$beta)
length(lasso_supp)
## [1] 17
colnames(model_train_matrix[,lasso_supp])#get the features for lasso model
## [1] "num_self_hrefs" "num_imgs"
## [3] "num_videos" "num_keywords"
## [5] "self_reference_avg_sharess" "global_subjectivity"
## [7] "avg_negative_polarity" "title_subjectivity"
## [9] "title_sentiment_polarity" "weekdayTues"
## [11] "weekdayWed" "weekdaySat"
## [13] "weekdaySun" "channelEntertainment"
## [15] "channelBusiness" "channelSocial_Media"
## [17] "channelWorld"
inthemodel_train <- unique(c(lasso_supp))# unique grabs the union of supports
lasso_train_data <- cbind(model_train_matrix[,inthemodel_train])
lasso_train_data <- as.data.frame(as.matrix(lasso_train_data))# make it a data.frame
dim(lasso_train_data)
## [1] 29930 17
#fit the final model
lasso_fit <- lm(model_train_y ~., data=lasso_train_data)
summary(lasso_fit)
##
## Call:
## lm(formula = model_train_y ~ ., data = lasso_train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7404 -0.5694 -0.1780 0.4098 5.9785
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.928537 0.039323 176.197 < 2e-16 ***
## num_self_hrefs -0.114896 0.009757 -11.775 < 2e-16 ***
## num_imgs 0.068475 0.005890 11.625 < 2e-16 ***
## num_videos 0.078153 0.007889 9.906 < 2e-16 ***
## num_keywords 0.016148 0.002793 5.782 7.45e-09 ***
## self_reference_avg_sharess 0.039632 0.002067 19.170 < 2e-16 ***
## global_subjectivity 0.573377 0.065297 8.781 < 2e-16 ***
## avg_negative_polarity -0.169516 0.045607 -3.717 0.000202 ***
## title_subjectivity 0.048054 0.016557 2.902 0.003708 **
## title_sentiment_polarity 0.078060 0.020202 3.864 0.000112 ***
## weekdayTues -0.051140 0.013898 -3.680 0.000234 ***
## weekdayWed -0.050414 0.013859 -3.638 0.000276 ***
## weekdaySat 0.285209 0.021994 12.968 < 2e-16 ***
## weekdaySun 0.255872 0.020946 12.216 < 2e-16 ***
## channelEntertainment -0.352930 0.015172 -23.262 < 2e-16 ***
## channelBusiness -0.123422 0.016052 -7.689 1.53e-14 ***
## channelSocial_Media 0.182459 0.022780 8.010 1.19e-15 ***
## channelWorld -0.380933 0.014762 -25.805 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8852 on 29912 degrees of freedom
## Multiple R-squared: 0.0919, Adjusted R-squared: 0.09139
## F-statistic: 178.1 on 17 and 29912 DF, p-value: < 2.2e-16
AIC(lasso_fit)
## [1] 77654.97
#post lasso
pl_train_matrix <- model.matrix(log_shares ~ ., data = data_train)
pl_train_y <- data_train$log_shares
pl_model1 <- glmnet(pl_train_matrix, pl_train_y, alpha = 1, family = 'gaussian')
plot(pl_model1, xvar = 'lambda', label = T)
pl_model2 <- cv.glmnet(pl_train_matrix, pl_train_y, alpha = 1, family = 'gaussian', type.measure = 'mse')
pl_lam_1se <- pl_model2$lambda.1se
pl_coef_1se <- coef(pl_model1, pl_lam_1se)
summary(pl_coef_1se)
## 31 x 1 sparse Matrix of class "dgCMatrix", with 16 entries
## i j x
## 1 1 1 6.979439918
## 2 5 1 -0.020821091
## 3 6 1 0.047601859
## 4 7 1 0.043720526
## 5 9 1 0.007505051
## 6 10 1 0.025090320
## 7 11 1 0.641029572
## 8 16 1 -0.020683435
## 9 17 1 0.017048762
## 10 18 1 0.017549375
## 11 23 1 0.225082099
## 12 24 1 0.198107258
## 13 26 1 -0.256664988
## 14 27 1 -0.058287491
## 15 28 1 0.120993301
## 16 30 1 -0.297817589
plot(pl_model2)
pl_model_best <- glmnet(pl_train_matrix, pl_train_y, alpha = 1, family = 'gaussian', lambda = pl_lam_1se)
## get the support
pl_supp <- support(pl_model_best$beta)
length(pl_supp)
## [1] 15
colnames(pl_train_matrix[,pl_supp])#get the features for post lasso model
## [1] "num_self_hrefs" "num_imgs"
## [3] "num_videos" "num_keywords"
## [5] "self_reference_avg_sharess" "global_subjectivity"
## [7] "avg_negative_polarity" "title_subjectivity"
## [9] "title_sentiment_polarity" "weekdaySat"
## [11] "weekdaySun" "channelEntertainment"
## [13] "channelBusiness" "channelSocial_Media"
## [15] "channelWorld"
data_pl_1se <- data.frame(pl_train_matrix[,pl_supp],pl_train_y)
#fit the pl
post_lasso <- lm(pl_train_y~., data=data_pl_1se)
summary(post_lasso)
##
## Call:
## lm(formula = pl_train_y ~ ., data = data_pl_1se)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7179 -0.5701 -0.1758 0.4093 6.0012
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.904692 0.038989 177.092 < 2e-16 ***
## num_self_hrefs -0.114914 0.009759 -11.775 < 2e-16 ***
## num_imgs 0.068388 0.005892 11.607 < 2e-16 ***
## num_videos 0.077785 0.007891 9.858 < 2e-16 ***
## num_keywords 0.016200 0.002794 5.799 6.74e-09 ***
## self_reference_avg_sharess 0.039587 0.002068 19.143 < 2e-16 ***
## global_subjectivity 0.576647 0.065313 8.829 < 2e-16 ***
## avg_negative_polarity -0.170595 0.045621 -3.739 0.000185 ***
## title_subjectivity 0.048122 0.016563 2.905 0.003670 **
## title_sentiment_polarity 0.077653 0.020207 3.843 0.000122 ***
## weekdaySat 0.307065 0.021478 14.297 < 2e-16 ***
## weekdaySun 0.277690 0.020404 13.610 < 2e-16 ***
## channelEntertainment -0.352011 0.015175 -23.196 < 2e-16 ***
## channelBusiness -0.123099 0.016057 -7.666 1.82e-14 ***
## channelSocial_Media 0.182829 0.022787 8.023 1.07e-15 ***
## channelWorld -0.380107 0.014766 -25.743 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8854 on 29914 degrees of freedom
## Multiple R-squared: 0.09127, Adjusted R-squared: 0.09081
## F-statistic: 200.3 on 15 and 29914 DF, p-value: < 2.2e-16
AIC(post_lasso)
## [1] 77671.99
Let's generate predictions from the models above.
################
###Prediction###
################
MAE <- function(actual, predicted) {mean(abs(actual - predicted))}
R2 <- function(actual, predicted) {1 - (sum((predicted - actual)^2))/(sum((mean(actual) - actual)^2))}
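#MSE() used below comes from the MLmetrics package; for reference, a minimal equivalent
#definition (named MSE_manual so it does not mask MLmetrics::MSE) would be:
MSE_manual <- function(actual, predicted) {mean((actual - predicted)^2)}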
test_with_prediction <- data_test
#Full
test_with_prediction$pred_full <- predict.lm(full_fit, newdata = test_with_prediction)
#Interaction
test_with_prediction$pred_interaction <- predict.lm(fit_interaction, newdata = test_with_prediction)
#Stepwise
test_with_prediction$pred_step <- predict.lm(step_fit, newdata = test_with_prediction)
#PCA: score the test set with the training-set PCA (centering and scaling with the
#training means and SDs), then predict with the model fit on the training scores
train_center <- attr(clustering_set, "scaled:center")
train_scale <- attr(clustering_set, "scaled:scale")
pca_test_scaled <- scale(data_test[,1:16], center = train_center, scale = train_scale)
pca_test_scores <- predict(pca.news, newdata = pca_test_scaled)
pca_test_x <- data.frame(pca_test_scores)[,1:5]
test_with_prediction$pred_pca <- predict(pca_fit, newdata = pca_test_x)
#CART
test_with_prediction$pred_cart <- predict(best_cart_fit, newdata = data_test, type="vector")
#Random Forest
test_with_prediction$pred_forest <- predict(forest_fit, newdata = data_test, type="response")
#Lasso: build the test design matrix with the same formula as the training matrix,
#so the column indices in lasso_supp line up
model_test_matrix <- model.matrix(log_shares ~ ., data = data_test)
lasso_test_data <- as.data.frame(as.matrix(model_test_matrix[,lasso_supp]))# make it a data.frame
test_with_prediction$pred_lasso <- predict.lm(lasso_fit, newdata = lasso_test_data, type = "response")
#Post Lasso
pl_model_test_matrix <- model.matrix(log_shares ~ ., data = data_test)
pl_test_data <- as.data.frame(as.matrix(pl_model_test_matrix[,pl_supp]))# make it a data.frame
test_with_prediction$pred_pl <- predict.lm(post_lasso, newdata = pl_test_data, type = "response")
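#not in the original script: predictions are on the log scale, so exponentiate to report
#shares; note exp() of a predicted mean log is a median-type estimate and understates
#the conditional mean of shares
test_with_prediction$pred_lasso_shares <- exp(test_with_prediction$pred_lasso)
head(test_with_prediction$pred_lasso_shares)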
I have chosen two evaluation measures: out-of-sample mean squared error (MSE) and R2. MSE is an easy measure to understand and communicate, best suited to explaining results to the digital platform's stakeholders and business decision makers; the lower the MSE, the better the model. R2 measures how much of the variability in the actual log shares the predictions explain; the higher the R2, the better the model.
################
###Evaluation###
################
aic_full <- AIC(full_fit)
coef_full <- data.frame(full_fit$coefficients)
ncoef_full <- nrow(coef_full)
mae_full <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_full)
mse_full <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_full)
R2_full <- R2(test_with_prediction$log_shares, test_with_prediction$pred_full)
coef_interaction <- data.frame(fit_interaction$coefficients)
ncoef_interaction <- nrow(coef_interaction)
aic_interaction <- AIC(fit_interaction)
mae_interaction <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_interaction)
mse_interaction <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_interaction)
R2_interaction <- R2(test_with_prediction$log_shares, test_with_prediction$pred_interaction)
coef_step <- data.frame(step_fit$coefficients)
ncoef_step <- nrow(coef_step)
aic_step <- AIC(step_fit)
mae_step <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_step)
mse_step <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_step)
R2_step <- R2(test_with_prediction$log_shares, test_with_prediction$pred_step)
coef_pca <- data.frame(pca_fit$coefficients)
ncoef_pca <- nrow(coef_pca)
aic_pca <- AIC(pca_fit)
mae_pca <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_pca)
mse_pca <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_pca)
R2_pca <- R2(test_with_prediction$log_shares, test_with_prediction$pred_pca)
coef_cart <- data.frame(best_cart_fit$cptable)
ncoef_cart <- nrow(coef_cart)
#aic_cart <- AIC(best_cart_fit) #no aic
mae_cart <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_cart)
mse_cart <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_cart)
R2_cart <- R2(test_with_prediction$log_shares, test_with_prediction$pred_cart)
#random forest has no coefficient table; use the number of trees as its complexity measure
ncoef_forest <- forest_fit$ntree
#aic_forest <- AIC(forest_fit) #no aic
mae_forest <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_forest)
mse_forest <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_forest)
R2_forest <- R2(test_with_prediction$log_shares, test_with_prediction$pred_forest)
coef_lasso <- data.frame(lasso_fit$coefficients)
ncoef_lasso <- nrow(coef_lasso)
aic_lasso <- AIC(lasso_fit)
mae_lasso <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_lasso)
mse_lasso <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_lasso)
R2_lasso <- R2(test_with_prediction$log_shares, test_with_prediction$pred_lasso)
coef_pl <- data.frame(post_lasso$coefficients)
ncoef_pl <- nrow(coef_pl)
aic_pl <- AIC(post_lasso)
mae_pl <- MAE(test_with_prediction$log_shares, test_with_prediction$pred_pl)
mse_pl <- MSE(test_with_prediction$log_shares, test_with_prediction$pred_pl)
R2_pl <- R2(test_with_prediction$log_shares, test_with_prediction$pred_pl)
#create evaluation metrics data frame
mae_data <- data.frame(cbind(mae_full, mae_interaction, mae_step, mae_pca, mae_cart, mae_forest, mae_lasso, mae_pl))
mse_data <- data.frame(cbind(mse_full, mse_interaction, mse_step, mse_pca, mse_cart, mse_forest, mse_lasso, mse_pl))
aic_data <- data.frame(cbind(aic_full, aic_interaction, aic_step, aic_pca, aic_lasso, aic_pl))
R2_data <- data.frame(cbind(R2_full, R2_interaction, R2_step, R2_pca, R2_cart, R2_forest, R2_lasso, R2_pl))
num_coef <- data.frame(cbind(ncoef_full, ncoef_interaction, ncoef_step, ncoef_pca, ncoef_cart, ncoef_forest, ncoef_lasso, ncoef_pl))
#visualize OOS performance
barplot(height = cbind(mae_full, mae_interaction, mae_step, mae_pca, mae_cart, mae_forest, mae_lasso, mae_pl), space = 0.1, ylim =c(0, 0.8),
names.arg = c("Full", "INT", "Step", "PCA", "CART", "Forest", "Lasso", "PL"), ylab = "OOS Mean Absolute Error",
main = "OOS MAE Across Different Models")
abline(h = sum(mae_data[1,])/ncol(mae_data), col = "red", lwd = 2)
mtext(side=1, "Line represents the average", col = "red", line = 4, font=1, adj = 1)
barplot(height = cbind(mse_full, mse_interaction, mse_step, mse_pca, mse_cart, mse_forest, mse_lasso, mse_pl), space = 0.1, ylim =c(0, 0.8),
names.arg = c("Full", "INT", "Step", "PCA", "CART", "Forest", "Lasso", "PL"), ylab = "OOS Mean Square Error",
main = "OOS MSE Across Different Models")
abline(h = sum(mse_data[1,])/ncol(mse_data), col = "red", lwd = 2)
mtext(side=1, "Line represents the average", col = "red", line = 4, font=1, adj = 1)
barplot(height = cbind(aic_full, aic_interaction, aic_step, aic_pca, aic_lasso, aic_pl), space = 0.1,
names.arg = c("Full", "INT", "Step", "PCA", "Lasso", "PL"), ylab = "AIC Value", ylim =c(0, 80000),
main = "AIC Values Across Different Models")
abline(h = sum(aic_data[1,])/ncol(aic_data), col = "red", lwd = 2)
barplot(height = cbind(R2_full, R2_interaction, R2_step, R2_pca, R2_cart, R2_forest, R2_lasso, R2_pl), space = 0.1, ylim =c(0, 0.14),
names.arg = c("Full", "INT", "Step", "PCA", "CART", "Forest", "Lasso", "PL"), ylab = "OOS R Square",
main = "OOS R2 Across Different Models")
abline(h = sum(R2_data[1,])/ncol(R2_data), col = "red", lwd = 2)
mtext(side=1, "Line represents the average", col = "red", line = 4, font=1, adj = 1)
barplot(height = cbind(ncoef_full, ncoef_interaction, ncoef_step, ncoef_pca, ncoef_cart, ncoef_forest, ncoef_lasso, ncoef_pl), space = 0.1, ylim =c(0, 500),
names.arg = c("Full", "INT", "Step", "PCA", "CART", "Forest", "Lasso", "PL"), ylab = "Number of Coefficients",
main = "Number of Coefficients Across Different Models")
abline(h = sum(num_coef[1,])/ncol(num_coef), col = "red", lwd = 2)
mtext(side=1, " Number of splits for CART \n Number of trees for Random Forest", line = 4, font=1, adj = 0)
mtext(side=1, "Line represents the average", col = "red", line = 4, font=1, adj = 1)
barplot(height = cbind(ncoef_full, ncoef_step, ncoef_pca, ncoef_cart, ncoef_lasso, ncoef_pl), space = 0.1, ylim =c(0, 30),
names.arg = c("Full", "Step", "PCA", "CART", "Lasso", "PL"), ylab = "Number of Coefficients",
main = "Number of Coefficients Across Different Models")
abline(h = (sum(num_coef[1,])-num_coef[1,2]-num_coef[1,6])/(ncol(num_coef)-2), col = "red", lwd = 2)
Quantile regression lets us give decision makers information about both the articles that receive very few shares and those that receive the most, in order to improve the features of low-share articles and to further boost the shares of high-share articles.
#########################
###Quantile Regression###
#########################
data_quan <- lasso_train_data
data_quan$log_shares <- data_train$log_shares
### Look at the lower tail say tau = 0.05
lower.tail <- rq(log_shares ~ ., tau = 0.05, data = data_quan)
coef(lower.tail)
## (Intercept) num_self_hrefs
## 6.087359807 -0.015601806
## num_imgs num_videos
## 0.021442596 0.018427350
## num_keywords self_reference_avg_sharess
## 0.008774502 0.042859417
## global_subjectivity avg_negative_polarity
## -0.027662853 0.009750629
## title_subjectivity title_sentiment_polarity
## 0.026853212 0.011745622
## weekdayTues weekdayWed
## -0.036166449 -0.026117106
## weekdaySat weekdaySun
## 0.378346900 0.307290861
## channelEntertainment channelBusiness
## -0.197466432 -0.021798876
## channelSocial_Media channelWorld
## 0.207189058 -0.171960893
### Look at the upper tail say tau = 0.95
upper.tail <- rq(log_shares ~ ., tau = 0.95, data = data_quan)
coef(upper.tail)
## (Intercept) num_self_hrefs
## 7.9924650511 -0.2926611937
## num_imgs num_videos
## 0.0931196590 0.2093122312
## num_keywords self_reference_avg_sharess
## 0.0008896307 0.0690336155
## global_subjectivity avg_negative_polarity
## 1.8821370105 -0.7273687290
## title_subjectivity title_sentiment_polarity
## 0.0522939074 0.1558140529
## weekdayTues weekdayWed
## -0.0634786415 -0.0585003226
## weekdaySat weekdaySun
## 0.2254553195 0.1469037495
## channelEntertainment channelBusiness
## -0.2444643967 -0.3444075481
## channelSocial_Media channelWorld
## 0.0671010580 -0.5070925192
###taus from 0.05 to 0.95
taus <- seq(from=0.05,to=0.95,by=0.05)
rq_taus <-rq(log_shares ~ ., tau = taus, data = data_quan)
fittaus_rq <- summary(rq_taus)
#visualize the different variables' effects across different taus
plot(fittaus_rq)
mtext("Values of taus", side=1, line = 4.2, font=1, col = 'blue')
mtext("Values of coefficients", side=2, line = 3.3, font=1, col = 'blue')
plot(fittaus_rq, parm = 1, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for Intercept ")
plot(fittaus_rq, parm = 2, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for num_self_hrefs ")
plot(fittaus_rq, parm = 3, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for num_imgs ")
plot(fittaus_rq, parm = 4, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for num_videos ")
plot(fittaus_rq, parm = 5, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for num_keywords ")
plot(fittaus_rq, parm = 6, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for self_reference_avg_sharess ")
plot(fittaus_rq, parm = 7, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for global_subjectivity ")
plot(fittaus_rq, parm = 8, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for avg_negative_polarity ")
plot(fittaus_rq, parm = 9, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for title_subjectivity ")
plot(fittaus_rq, parm = 10, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for title_sentiment_polarity ")
plot(fittaus_rq, parm = 11, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for Tuesday ")
plot(fittaus_rq, parm = 12, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for Wednesday ")
plot(fittaus_rq, parm = 13, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for Saturday ")
plot(fittaus_rq, parm = 14, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for Sunday ")
plot(fittaus_rq, parm = 15, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for Entertainment ")
plot(fittaus_rq, parm = 16, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for Business ")
plot(fittaus_rq, parm = 17, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for Social_Media ")
plot(fittaus_rq, parm = 18, xlab = "Quantile Index", ylab = "Coefficient Value", main = "Coefficient for World ")
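#not in the original script: a sketch using the two tail fits to bracket one article's
#likely shares; new_article is just the first test row in the lasso design built earlier
new_article <- lasso_test_data[1,]
range_log <- c(low = predict(lower.tail, newdata = new_article),
high = predict(upper.tail, newdata = new_article))
exp(range_log)#rough 5th-95th percentile band on the original shares scale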
The causal analysis uses double selection, in three steps: (1) run a Lasso of Y on the controls X; (2) run a Lasso of the treatment d on the controls X; (3) take all the controls selected in Steps 1 and 2 and run a linear regression of Y on the treatment d and those selected controls. The code below implements this with num_imgs as the treatment.
causality_data_train <- lasso_train_data
causality_y <- data_train$log_shares
### treatment is num_imgs
causality_x_num_imgs <- model.matrix(causality_y ~ .-num_imgs, data = causality_data_train )
treatment_num_imgs <- causality_data_train$num_imgs
## Step 1. Select controls that predict the outcome well
causality_model1_num_imgs <- cv.glmnet(causality_x_num_imgs, causality_y, alpha = 1, family = 'gaussian', type.measure = 'mse')
causality_lam_1se_num_imgs <- causality_model1_num_imgs$lambda.1se
# Call Lasso
causality_lasso_1se_num_imgs <- glmnet(causality_x_num_imgs, causality_y,alpha = 1, family = 'gaussian',lambda = causality_lam_1se_num_imgs)
# Get the support
causality_supp1_num_imgs <- support(causality_lasso_1se_num_imgs$beta)
# Step 1 selected
length(causality_supp1_num_imgs)
## [1] 14
### controls
colnames(causality_x_num_imgs[,causality_supp1_num_imgs])
## [1] "num_self_hrefs" "num_videos"
## [3] "num_keywords" "self_reference_avg_sharess"
## [5] "global_subjectivity" "avg_negative_polarity"
## [7] "title_subjectivity" "title_sentiment_polarity"
## [9] "weekdaySat" "weekdaySun"
## [11] "channelEntertainment" "channelBusiness"
## [13] "channelSocial_Media" "channelWorld"
###
### Step 2. Select controls that predict the treatment well
causality_model2_num_imgs <- cv.glmnet(causality_x_num_imgs, treatment_num_imgs, alpha = 1, family = 'gaussian', type.measure = 'mse')
causality_lam_d_1se_num_imgs <- causality_model2_num_imgs$lambda.1se
# Call Lasso
causality_lasso_d_1se_num_imgs <- glmnet(causality_x_num_imgs, treatment_num_imgs, alpha = 1, family = 'gaussian', lambda = causality_lam_d_1se_num_imgs)
# Get the support
causality_supp2_num_imgs <- support(causality_lasso_d_1se_num_imgs$beta)
### Step 2 selected
length(causality_supp2_num_imgs)
## [1] 11
### controls
colnames(causality_x_num_imgs[,causality_supp2_num_imgs])
## [1] "num_self_hrefs" "num_videos"
## [3] "num_keywords" "self_reference_avg_sharess"
## [5] "global_subjectivity" "avg_negative_polarity"
## [7] "title_subjectivity" "title_sentiment_polarity"
## [9] "channelBusiness" "channelSocial_Media"
## [11] "channelWorld"
###
### Step 3. Combine all selected controls and refit the linear regression
causality_inthemodel_num_imgs <- unique(c(causality_supp1_num_imgs, causality_supp2_num_imgs)) # unique grabs union
selectdata_num_imgs <- cbind(treatment_num_imgs, causality_x_num_imgs[,causality_inthemodel_num_imgs])
selectdata_num_imgs <- as.data.frame(as.matrix(selectdata_num_imgs)) # make it a data.frame
dim(selectdata_num_imgs)
## [1] 29930 15
causality_lm_num_imgs <- lm(causality_y ~ ., data = selectdata_num_imgs)
summary(causality_lm_num_imgs)$coef["treatment_num_imgs",]
## Estimate Std. Error t value Pr(>|t|)
## 6.838849e-02 5.891901e-03 1.160720e+01 4.416458e-31
####
#### Compare/contrast the results with the following models
####
simple_num_imgs <- lm(causality_y ~ num_imgs, data = causality_data_train)
summary(simple_num_imgs)
##
## Call:
## lm(formula = causality_y ~ num_imgs, data = causality_data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3564 -0.6100 -0.1863 0.4379 6.0886
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.356441 0.008266 889.97 <2e-16 ***
## num_imgs 0.103193 0.005658 18.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9235 on 29928 degrees of freedom
## Multiple R-squared: 0.01099, Adjusted R-squared: 0.01096
## F-statistic: 332.6 on 1 and 29928 DF, p-value: < 2.2e-16
summary(full_fit)
##
## Call:
## lm(formula = log_shares ~ ., data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7460 -0.5672 -0.1786 0.4066 5.9923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.7641324 0.2195487 35.364 < 2e-16 ***
## n_tokens_title -0.0012955 0.0024951 -0.519 0.603596
## n_non_stop_unique_tokens -0.2370074 0.0666601 -3.555 0.000378 ***
## num_self_hrefs -0.1162882 0.0100241 -11.601 < 2e-16 ***
## num_imgs 0.0529891 0.0070219 7.546 4.61e-14 ***
## num_videos 0.0670971 0.0082127 8.170 3.21e-16 ***
## average_token_length -0.2652572 0.1132977 -2.341 0.019226 *
## num_keywords 0.0173763 0.0028118 6.180 6.50e-10 ***
## self_reference_avg_sharess 0.0389144 0.0021084 18.457 < 2e-16 ***
## global_subjectivity 0.5866778 0.0731207 8.023 1.07e-15 ***
## global_sentiment_polarity -0.1099720 0.1382201 -0.796 0.426254
## global_rate_positive_words 0.3441118 0.5142576 0.669 0.503409
## global_rate_negative_words -1.7157769 0.8387466 -2.046 0.040800 *
## avg_positive_polarity -0.1060966 0.0925714 -1.146 0.251760
## avg_negative_polarity -0.1350650 0.0544898 -2.479 0.013191 *
## title_subjectivity 0.0456180 0.0167662 2.721 0.006516 **
## title_sentiment_polarity 0.0765887 0.0206930 3.701 0.000215 ***
## weekdayTues -0.0733780 0.0172023 -4.266 2.00e-05 ***
## weekdayWed -0.0723752 0.0171706 -4.215 2.50e-05 ***
## weekdayThurs -0.0598058 0.0172737 -3.462 0.000536 ***
## weekdayFri 0.0005677 0.0183814 0.031 0.975362
## weekdaySat 0.2649675 0.0242289 10.936 < 2e-16 ***
## weekdaySun 0.2322852 0.0233014 9.969 < 2e-16 ***
## channelLifestyle -0.1771966 0.0269101 -6.585 4.63e-11 ***
## channelEntertainment -0.4290908 0.0191382 -22.421 < 2e-16 ***
## channelBusiness -0.2148456 0.0210432 -10.210 < 2e-16 ***
## channelSocial_Media 0.0935417 0.0263201 3.554 0.000380 ***
## channelTechnology -0.1134975 0.0201425 -5.635 1.77e-08 ***
## channelWorld -0.4686965 0.0201523 -23.258 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8841 on 29901 degrees of freedom
## Multiple R-squared: 0.09445, Adjusted R-squared: 0.0936
## F-statistic: 111.4 on 28 and 29901 DF, p-value: < 2.2e-16
summary(step_fit)
##
## Call:
## lm(formula = log_shares ~ n_non_stop_unique_tokens + num_self_hrefs +
## num_imgs + num_videos + num_keywords + self_reference_avg_sharess +
## global_subjectivity + global_rate_negative_words + avg_positive_polarity +
## avg_negative_polarity + title_subjectivity + title_sentiment_polarity +
## weekday + channel, data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7492 -0.5675 -0.1788 0.4064 5.9928
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.2838030 0.0714406 101.956 < 2e-16 ***
## n_non_stop_unique_tokens -0.2303441 0.0665819 -3.460 0.000542 ***
## num_self_hrefs -0.1160584 0.0099967 -11.610 < 2e-16 ***
## num_imgs 0.0521318 0.0070055 7.442 1.02e-13 ***
## num_videos 0.0680752 0.0081815 8.321 < 2e-16 ***
## num_keywords 0.0171564 0.0028031 6.120 9.45e-10 ***
## self_reference_avg_sharess 0.0390412 0.0020856 18.719 < 2e-16 ***
## global_subjectivity 0.5896006 0.0695995 8.471 < 2e-16 ***
## global_rate_negative_words -1.0899775 0.5290109 -2.060 0.039368 *
## avg_positive_polarity -0.1526391 0.0672672 -2.269 0.023267 *
## avg_negative_polarity -0.1560336 0.0471303 -3.311 0.000932 ***
## title_subjectivity 0.0457695 0.0166566 2.748 0.006003 **
## title_sentiment_polarity 0.0764401 0.0204928 3.730 0.000192 ***
## weekdayTues -0.0734032 0.0172020 -4.267 1.99e-05 ***
## weekdayWed -0.0724128 0.0171707 -4.217 2.48e-05 ***
## weekdayThurs -0.0599418 0.0172725 -3.470 0.000520 ***
## weekdayFri 0.0005836 0.0183815 0.032 0.974674
## weekdaySat 0.2644808 0.0242208 10.920 < 2e-16 ***
## weekdaySun 0.2316073 0.0232899 9.945 < 2e-16 ***
## channelLifestyle -0.1714681 0.0267797 -6.403 1.55e-10 ***
## channelEntertainment -0.4245561 0.0189213 -22.438 < 2e-16 ***
## channelBusiness -0.2125001 0.0209687 -10.134 < 2e-16 ***
## channelSocial_Media 0.0983462 0.0261616 3.759 0.000171 ***
## channelTechnology -0.1059221 0.0198937 -5.324 1.02e-07 ***
## channelWorld -0.4722948 0.0199803 -23.638 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8841 on 29905 degrees of freedom
## Multiple R-squared: 0.09426, Adjusted R-squared: 0.09353
## F-statistic: 129.7 on 24 and 29905 DF, p-value: < 2.2e-16
summary(lasso_fit)
##
## Call:
## lm(formula = model_train_y ~ ., data = lasso_train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7404 -0.5694 -0.1780 0.4098 5.9785
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.928537 0.039323 176.197 < 2e-16 ***
## num_self_hrefs -0.114896 0.009757 -11.775 < 2e-16 ***
## num_imgs 0.068475 0.005890 11.625 < 2e-16 ***
## num_videos 0.078153 0.007889 9.906 < 2e-16 ***
## num_keywords 0.016148 0.002793 5.782 7.45e-09 ***
## self_reference_avg_sharess 0.039632 0.002067 19.170 < 2e-16 ***
## global_subjectivity 0.573377 0.065297 8.781 < 2e-16 ***
## avg_negative_polarity -0.169516 0.045607 -3.717 0.000202 ***
## title_subjectivity 0.048054 0.016557 2.902 0.003708 **
## title_sentiment_polarity 0.078060 0.020202 3.864 0.000112 ***
## weekdayTues -0.051140 0.013898 -3.680 0.000234 ***
## weekdayWed -0.050414 0.013859 -3.638 0.000276 ***
## weekdaySat 0.285209 0.021994 12.968 < 2e-16 ***
## weekdaySun 0.255872 0.020946 12.216 < 2e-16 ***
## channelEntertainment -0.352930 0.015172 -23.262 < 2e-16 ***
## channelBusiness -0.123422 0.016052 -7.689 1.53e-14 ***
## channelSocial_Media 0.182459 0.022780 8.010 1.19e-15 ***
## channelWorld -0.380933 0.014762 -25.805 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8852 on 29912 degrees of freedom
## Multiple R-squared: 0.0919, Adjusted R-squared: 0.09139
## F-statistic: 178.1 on 17 and 29912 DF, p-value: < 2.2e-16
summary(post_lasso)
##
## Call:
## lm(formula = pl_train_y ~ ., data = data_pl_1se)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7179 -0.5701 -0.1758 0.4093 6.0012
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.904692 0.038989 177.092 < 2e-16 ***
## num_self_hrefs -0.114914 0.009759 -11.775 < 2e-16 ***
## num_imgs 0.068388 0.005892 11.607 < 2e-16 ***
## num_videos 0.077785 0.007891 9.858 < 2e-16 ***
## num_keywords 0.016200 0.002794 5.799 6.74e-09 ***
## self_reference_avg_sharess 0.039587 0.002068 19.143 < 2e-16 ***
## global_subjectivity 0.576647 0.065313 8.829 < 2e-16 ***
## avg_negative_polarity -0.170595 0.045621 -3.739 0.000185 ***
## title_subjectivity 0.048122 0.016563 2.905 0.003670 **
## title_sentiment_polarity 0.077653 0.020207 3.843 0.000122 ***
## weekdaySat 0.307065 0.021478 14.297 < 2e-16 ***
## weekdaySun 0.277690 0.020404 13.610 < 2e-16 ***
## channelEntertainment -0.352011 0.015175 -23.196 < 2e-16 ***
## channelBusiness -0.123099 0.016057 -7.666 1.82e-14 ***
## channelSocial_Media 0.182829 0.022787 8.023 1.07e-15 ***
## channelWorld -0.380107 0.014766 -25.743 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8854 on 29914 degrees of freedom
## Multiple R-squared: 0.09127, Adjusted R-squared: 0.09081
## F-statistic: 200.3 on 15 and 29914 DF, p-value: < 2.2e-16
coef(lower.tail)
## (Intercept) num_self_hrefs
## 6.087359807 -0.015601806
## num_imgs num_videos
## 0.021442596 0.018427350
## num_keywords self_reference_avg_sharess
## 0.008774502 0.042859417
## global_subjectivity avg_negative_polarity
## -0.027662853 0.009750629
## title_subjectivity title_sentiment_polarity
## 0.026853212 0.011745622
## weekdayTues weekdayWed
## -0.036166449 -0.026117106
## weekdaySat weekdaySun
## 0.378346900 0.307290861
## channelEntertainment channelBusiness
## -0.197466432 -0.021798876
## channelSocial_Media channelWorld
## 0.207189058 -0.171960893
coef(upper.tail)
## (Intercept) num_self_hrefs
## 7.9924650511 -0.2926611937
## num_imgs num_videos
## 0.0931196590 0.2093122312
## num_keywords self_reference_avg_sharess
## 0.0008896307 0.0690336155
## global_subjectivity avg_negative_polarity
## 1.8821370105 -0.7273687290
## title_subjectivity title_sentiment_polarity
## 0.0522939074 0.1558140529
## weekdayTues weekdayWed
## -0.0634786415 -0.0585003226
## weekdaySat weekdaySun
## 0.2254553195 0.1469037495
## channelEntertainment channelBusiness
## -0.2444643967 -0.3444075481
## channelSocial_Media channelWorld
## 0.0671010580 -0.5070925192
According to the quantile regression output, we can repeat the same procedure for variables such as num_videos, global_subjectivity, Sunday, Business, and World to explore the causal relationship between each variable and the response variable (log shares), as sketched below.
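As a hedged sketch (not part of the original analysis), the three double-selection steps can be wrapped in a helper function so each candidate treatment is checked the same way; double_selection and treat_name are hypothetical names, and the treatment must be a column of lasso_train_data:
#not in the original script: a generic double-selection helper under the assumptions above
double_selection <- function(treat_name, data = lasso_train_data, y = data_train$log_shares) {
x <- model.matrix(reformulate(setdiff(names(data), treat_name)), data = data)
d <- data[[treat_name]]
#Step 1: controls that predict the outcome; Step 2: controls that predict the treatment
supp_y <- support(glmnet(x, y, alpha = 1, lambda = cv.glmnet(x, y, alpha = 1)$lambda.1se)$beta)
supp_d <- support(glmnet(x, d, alpha = 1, lambda = cv.glmnet(x, d, alpha = 1)$lambda.1se)$beta)
#Step 3: regress the outcome on the treatment plus the union of selected controls
controls <- as.data.frame(as.matrix(x[, unique(c(supp_y, supp_d))]))
fit <- lm(y ~ d + ., data = controls)
summary(fit)$coef["d",]
}
double_selection("num_videos")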