# Load the raw Online News Popularity CSV into a data frame; the first row of
# the file holds the column names.
popNews <- read.csv(
  file = "D:/MLDS/Datasets/OnlineNewsPopularity(1)/OnlineNewsPopularity/OnlineNewsPopularity.csv",
  header = TRUE
)
# Summary and Structure of data before preprocessing:
summary(object = popNews)
## url
## http://mashable.com/2013/01/07/amazon-instant-video-browser/ : 1
## http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/ : 1
## http://mashable.com/2013/01/07/apple-40-billion-app-downloads/: 1
## http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/ : 1
## http://mashable.com/2013/01/07/att-u-verse-apps/ : 1
## http://mashable.com/2013/01/07/beewi-smart-toys/ : 1
## (Other) :39638
## timedelta n_tokens_title n_tokens_content n_unique_tokens
## Min. : 8.0 Min. : 2.0 Min. : 0.0 Min. : 0.0000
## 1st Qu.:164.0 1st Qu.: 9.0 1st Qu.: 246.0 1st Qu.: 0.4709
## Median :339.0 Median :10.0 Median : 409.0 Median : 0.5392
## Mean :354.5 Mean :10.4 Mean : 546.5 Mean : 0.5482
## 3rd Qu.:542.0 3rd Qu.:12.0 3rd Qu.: 716.0 3rd Qu.: 0.6087
## Max. :731.0 Max. :23.0 Max. :8474.0 Max. :701.0000
##
## n_non_stop_words n_non_stop_unique_tokens num_hrefs
## Min. : 0.0000 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 1.0000 1st Qu.: 0.6257 1st Qu.: 4.00
## Median : 1.0000 Median : 0.6905 Median : 8.00
## Mean : 0.9965 Mean : 0.6892 Mean : 10.88
## 3rd Qu.: 1.0000 3rd Qu.: 0.7546 3rd Qu.: 14.00
## Max. :1042.0000 Max. :650.0000 Max. :304.00
##
## num_self_hrefs num_imgs num_videos average_token_length
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. :0.000
## 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 0.00 1st Qu.:4.478
## Median : 3.000 Median : 1.000 Median : 0.00 Median :4.664
## Mean : 3.294 Mean : 4.544 Mean : 1.25 Mean :4.548
## 3rd Qu.: 4.000 3rd Qu.: 4.000 3rd Qu.: 1.00 3rd Qu.:4.855
## Max. :116.000 Max. :128.000 Max. :91.00 Max. :8.042
##
## num_keywords data_channel_is_lifestyle data_channel_is_entertainment
## Min. : 1.000 Min. :0.00000 Min. :0.000
## 1st Qu.: 6.000 1st Qu.:0.00000 1st Qu.:0.000
## Median : 7.000 Median :0.00000 Median :0.000
## Mean : 7.224 Mean :0.05295 Mean :0.178
## 3rd Qu.: 9.000 3rd Qu.:0.00000 3rd Qu.:0.000
## Max. :10.000 Max. :1.00000 Max. :1.000
##
## data_channel_is_bus data_channel_is_socmed data_channel_is_tech
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1579 Mean :0.0586 Mean :0.1853
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## data_channel_is_world kw_min_min kw_max_min kw_avg_min
## Min. :0.0000 Min. : -1.00 Min. : 0 Min. : -1.0
## 1st Qu.:0.0000 1st Qu.: -1.00 1st Qu.: 445 1st Qu.: 141.8
## Median :0.0000 Median : -1.00 Median : 660 Median : 235.5
## Mean :0.2126 Mean : 26.11 Mean : 1154 Mean : 312.4
## 3rd Qu.:0.0000 3rd Qu.: 4.00 3rd Qu.: 1000 3rd Qu.: 357.0
## Max. :1.0000 Max. :377.00 Max. :298400 Max. :42827.9
##
## kw_min_max kw_max_max kw_avg_max kw_min_avg
## Min. : 0 Min. : 0 Min. : 0 Min. : -1
## 1st Qu.: 0 1st Qu.:843300 1st Qu.:172847 1st Qu.: 0
## Median : 1400 Median :843300 Median :244572 Median :1024
## Mean : 13612 Mean :752324 Mean :259282 Mean :1117
## 3rd Qu.: 7900 3rd Qu.:843300 3rd Qu.:330980 3rd Qu.:2057
## Max. :843300 Max. :843300 Max. :843300 Max. :3613
##
## kw_max_avg kw_avg_avg self_reference_min_shares
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 3562 1st Qu.: 2382 1st Qu.: 639
## Median : 4356 Median : 2870 Median : 1200
## Mean : 5657 Mean : 3136 Mean : 3999
## 3rd Qu.: 6020 3rd Qu.: 3600 3rd Qu.: 2600
## Max. :298400 Max. :43568 Max. :843300
##
## self_reference_max_shares self_reference_avg_sharess weekday_is_monday
## Min. : 0 Min. : 0.0 Min. :0.000
## 1st Qu.: 1100 1st Qu.: 981.2 1st Qu.:0.000
## Median : 2800 Median : 2200.0 Median :0.000
## Mean : 10329 Mean : 6401.7 Mean :0.168
## 3rd Qu.: 8000 3rd Qu.: 5200.0 3rd Qu.:0.000
## Max. :843300 Max. :843300.0 Max. :1.000
##
## weekday_is_tuesday weekday_is_wednesday weekday_is_thursday
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1864 Mean :0.1875 Mean :0.1833
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## weekday_is_friday weekday_is_saturday weekday_is_sunday is_weekend
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.1438 Mean :0.06188 Mean :0.06904 Mean :0.1309
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
##
## LDA_00 LDA_01 LDA_02 LDA_03
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.02505 1st Qu.:0.02501 1st Qu.:0.02857 1st Qu.:0.02857
## Median :0.03339 Median :0.03334 Median :0.04000 Median :0.04000
## Mean :0.18460 Mean :0.14126 Mean :0.21632 Mean :0.22377
## 3rd Qu.:0.24096 3rd Qu.:0.15083 3rd Qu.:0.33422 3rd Qu.:0.37576
## Max. :0.92699 Max. :0.92595 Max. :0.92000 Max. :0.92653
##
## LDA_04 global_subjectivity global_sentiment_polarity
## Min. :0.00000 Min. :0.0000 Min. :-0.39375
## 1st Qu.:0.02857 1st Qu.:0.3962 1st Qu.: 0.05776
## Median :0.04073 Median :0.4535 Median : 0.11912
## Mean :0.23403 Mean :0.4434 Mean : 0.11931
## 3rd Qu.:0.39999 3rd Qu.:0.5083 3rd Qu.: 0.17783
## Max. :0.92719 Max. :1.0000 Max. : 0.72784
##
## global_rate_positive_words global_rate_negative_words rate_positive_words
## Min. :0.00000 Min. :0.000000 Min. :0.0000
## 1st Qu.:0.02838 1st Qu.:0.009615 1st Qu.:0.6000
## Median :0.03902 Median :0.015337 Median :0.7105
## Mean :0.03962 Mean :0.016612 Mean :0.6822
## 3rd Qu.:0.05028 3rd Qu.:0.021739 3rd Qu.:0.8000
## Max. :0.15549 Max. :0.184932 Max. :1.0000
##
## rate_negative_words avg_positive_polarity min_positive_polarity
## Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1852 1st Qu.:0.3062 1st Qu.:0.05000
## Median :0.2800 Median :0.3588 Median :0.10000
## Mean :0.2879 Mean :0.3538 Mean :0.09545
## 3rd Qu.:0.3846 3rd Qu.:0.4114 3rd Qu.:0.10000
## Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## max_positive_polarity avg_negative_polarity min_negative_polarity
## Min. :0.0000 Min. :-1.0000 Min. :-1.0000
## 1st Qu.:0.6000 1st Qu.:-0.3284 1st Qu.:-0.7000
## Median :0.8000 Median :-0.2533 Median :-0.5000
## Mean :0.7567 Mean :-0.2595 Mean :-0.5219
## 3rd Qu.:1.0000 3rd Qu.:-0.1869 3rd Qu.:-0.3000
## Max. :1.0000 Max. : 0.0000 Max. : 0.0000
##
## max_negative_polarity title_subjectivity title_sentiment_polarity
## Min. :-1.0000 Min. :0.0000 Min. :-1.00000
## 1st Qu.:-0.1250 1st Qu.:0.0000 1st Qu.: 0.00000
## Median :-0.1000 Median :0.1500 Median : 0.00000
## Mean :-0.1075 Mean :0.2824 Mean : 0.07143
## 3rd Qu.:-0.0500 3rd Qu.:0.5000 3rd Qu.: 0.15000
## Max. : 0.0000 Max. :1.0000 Max. : 1.00000
##
## abs_title_subjectivity abs_title_sentiment_polarity shares
## Min. :0.0000 Min. :0.0000 Min. : 1
## 1st Qu.:0.1667 1st Qu.:0.0000 1st Qu.: 946
## Median :0.5000 Median :0.0000 Median : 1400
## Mean :0.3418 Mean :0.1561 Mean : 3395
## 3rd Qu.:0.5000 3rd Qu.:0.2500 3rd Qu.: 2800
## Max. :0.5000 Max. :1.0000 Max. :843300
##
str(object = popNews)
## 'data.frame': 39644 obs. of 61 variables:
## $ url : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
# Exploratory Data Analysis and Cleaning Data:
# First, check whether the data has any missing values:
sum(is.na(popNews))
## [1] 0
# There are no missing values.
# The summary shows implausible maxima (701, 1042 and 650) in
# "n_unique_tokens", "n_non_stop_words" and "n_non_stop_unique_tokens",
# while every other quantile of these variables lies within [0, 1].
# The offending observation is dropped by filtering on the valid range rather
# than by exact equality with the magic value 701: `==` on doubles is fragile,
# and a range check also rejects any other out-of-range row.
popNews <- popNews[popNews$n_unique_tokens <= 1, ]
# url and timedelta are non-predictive, and is_weekend repeats what the
# weekday_is_* indicators already encode, so all three are excluded.
# Columns are picked by name (instead of subset(), which relies on
# non-standard evaluation); the remaining columns keep their original order.
dropped_cols <- c("url", "timedelta", "is_weekend")
popNews <- popNews[setdiff(names(popNews), dropped_cols)]
# The weekday_is_* and data_channel_is_* columns are 0/1 indicators stored as
# numeric; convert each of them to a factor so they are treated as categorical.
indicator_cols <- c(
  paste0(
    "weekday_is_",
    c("monday", "wednesday", "thursday", "friday", "tuesday", "saturday",
      "sunday")
  ),
  paste0(
    "data_channel_is_",
    c("lifestyle", "entertainment", "bus", "socmed", "tech", "world")
  )
)
# Assigning back into existing columns keeps the column order unchanged.
popNews[indicator_cols] <- lapply(popNews[indicator_cols], factor)
# Summary and structure of the data after preprocessing:
summary(object = popNews)
## n_tokens_title n_tokens_content n_unique_tokens n_non_stop_words
## Min. : 2.0 Min. : 0.0 Min. :0.0000 Min. :0.0000
## 1st Qu.: 9.0 1st Qu.: 246.0 1st Qu.:0.4709 1st Qu.:1.0000
## Median :10.0 Median : 409.0 Median :0.5392 Median :1.0000
## Mean :10.4 Mean : 546.5 Mean :0.5305 Mean :0.9702
## 3rd Qu.:12.0 3rd Qu.: 716.0 3rd Qu.:0.6087 3rd Qu.:1.0000
## Max. :23.0 Max. :8474.0 Max. :1.0000 Max. :1.0000
## n_non_stop_unique_tokens num_hrefs num_self_hrefs
## Min. :0.0000 Min. : 0.00 Min. : 0.000
## 1st Qu.:0.6257 1st Qu.: 4.00 1st Qu.: 1.000
## Median :0.6905 Median : 8.00 Median : 3.000
## Mean :0.6728 Mean : 10.88 Mean : 3.293
## 3rd Qu.:0.7546 3rd Qu.: 14.00 3rd Qu.: 4.000
## Max. :1.0000 Max. :304.00 Max. :116.000
## num_imgs num_videos average_token_length num_keywords
## Min. : 0.000 Min. : 0.00 Min. :0.000 Min. : 1.000
## 1st Qu.: 1.000 1st Qu.: 0.00 1st Qu.:4.478 1st Qu.: 6.000
## Median : 1.000 Median : 0.00 Median :4.664 Median : 7.000
## Mean : 4.543 Mean : 1.25 Mean :4.548 Mean : 7.224
## 3rd Qu.: 4.000 3rd Qu.: 1.00 3rd Qu.:4.855 3rd Qu.: 9.000
## Max. :128.000 Max. :91.00 Max. :8.042 Max. :10.000
## data_channel_is_lifestyle data_channel_is_entertainment
## 0:37544 0:32587
## 1: 2099 1: 7056
##
##
##
##
## data_channel_is_bus data_channel_is_socmed data_channel_is_tech
## 0:33385 0:37320 0:32297
## 1: 6258 1: 2323 1: 7346
##
##
##
##
## data_channel_is_world kw_min_min kw_max_min kw_avg_min
## 0:31216 Min. : -1.00 Min. : 0 Min. : -1.0
## 1: 8427 1st Qu.: -1.00 1st Qu.: 445 1st Qu.: 141.8
## Median : -1.00 Median : 660 Median : 235.5
## Mean : 26.11 Mean : 1154 Mean : 312.4
## 3rd Qu.: 4.00 3rd Qu.: 1000 3rd Qu.: 357.0
## Max. :377.00 Max. :298400 Max. :42827.9
## kw_min_max kw_max_max kw_avg_max kw_min_avg
## Min. : 0 Min. : 0 Min. : 0 Min. : -1
## 1st Qu.: 0 1st Qu.:843300 1st Qu.:172844 1st Qu.: 0
## Median : 1400 Median :843300 Median :244567 Median :1024
## Mean : 13612 Mean :752322 Mean :259280 Mean :1117
## 3rd Qu.: 7900 3rd Qu.:843300 3rd Qu.:330980 3rd Qu.:2057
## Max. :843300 Max. :843300 Max. :843300 Max. :3613
## kw_max_avg kw_avg_avg self_reference_min_shares
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 3562 1st Qu.: 2382 1st Qu.: 639
## Median : 4356 Median : 2870 Median : 1200
## Mean : 5657 Mean : 3136 Mean : 3999
## 3rd Qu.: 6020 3rd Qu.: 3600 3rd Qu.: 2600
## Max. :298400 Max. :43568 Max. :843300
## self_reference_max_shares self_reference_avg_sharess weekday_is_monday
## Min. : 0 Min. : 0.0 0:32982
## 1st Qu.: 1100 1st Qu.: 981.1 1: 6661
## Median : 2800 Median : 2200.0
## Mean : 10330 Mean : 6401.7
## 3rd Qu.: 8000 3rd Qu.: 5200.0
## Max. :843300 Max. :843300.0
## weekday_is_tuesday weekday_is_wednesday weekday_is_thursday
## 0:32254 0:32208 0:32376
## 1: 7389 1: 7435 1: 7267
##
##
##
##
## weekday_is_friday weekday_is_saturday weekday_is_sunday LDA_00
## 0:33942 0:37190 0:36906 Min. :0.01818
## 1: 5701 1: 2453 1: 2737 1st Qu.:0.02505
## Median :0.03339
## Mean :0.18460
## 3rd Qu.:0.24097
## Max. :0.92699
## LDA_01 LDA_02 LDA_03 LDA_04
## Min. :0.01818 Min. :0.01818 Min. :0.01818 Min. :0.01818
## 1st Qu.:0.02501 1st Qu.:0.02857 1st Qu.:0.02857 1st Qu.:0.02857
## Median :0.03334 Median :0.04000 Median :0.04000 Median :0.04073
## Mean :0.14126 Mean :0.21633 Mean :0.22378 Mean :0.23404
## 3rd Qu.:0.15084 3rd Qu.:0.33422 3rd Qu.:0.37578 3rd Qu.:0.39999
## Max. :0.92595 Max. :0.92000 Max. :0.92653 Max. :0.92719
## global_subjectivity global_sentiment_polarity global_rate_positive_words
## Min. :0.0000 Min. :-0.39375 Min. :0.00000
## 1st Qu.:0.3962 1st Qu.: 0.05776 1st Qu.:0.02839
## Median :0.4535 Median : 0.11912 Median :0.03902
## Mean :0.4434 Mean : 0.11931 Mean :0.03963
## 3rd Qu.:0.5083 3rd Qu.: 0.17784 3rd Qu.:0.05028
## Max. :1.0000 Max. : 0.72784 Max. :0.15549
## global_rate_negative_words rate_positive_words rate_negative_words
## Min. :0.000000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.009615 1st Qu.:0.6000 1st Qu.:0.1852
## Median :0.015337 Median :0.7105 Median :0.2800
## Mean :0.016613 Mean :0.6822 Mean :0.2879
## 3rd Qu.:0.021739 3rd Qu.:0.8000 3rd Qu.:0.3846
## Max. :0.184932 Max. :1.0000 Max. :1.0000
## avg_positive_polarity min_positive_polarity max_positive_polarity
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3062 1st Qu.:0.05000 1st Qu.:0.6000
## Median :0.3588 Median :0.10000 Median :0.8000
## Mean :0.3538 Mean :0.09545 Mean :0.7567
## 3rd Qu.:0.4114 3rd Qu.:0.10000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
## avg_negative_polarity min_negative_polarity max_negative_polarity
## Min. :-1.0000 Min. :-1.000 Min. :-1.0000
## 1st Qu.:-0.3284 1st Qu.:-0.700 1st Qu.:-0.1250
## Median :-0.2533 Median :-0.500 Median :-0.1000
## Mean :-0.2595 Mean :-0.522 Mean :-0.1075
## 3rd Qu.:-0.1869 3rd Qu.:-0.300 3rd Qu.:-0.0500
## Max. : 0.0000 Max. : 0.000 Max. : 0.0000
## title_subjectivity title_sentiment_polarity abs_title_subjectivity
## Min. :0.0000 Min. :-1.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 0.00000 1st Qu.:0.1667
## Median :0.1500 Median : 0.00000 Median :0.5000
## Mean :0.2824 Mean : 0.07143 Mean :0.3419
## 3rd Qu.:0.5000 3rd Qu.: 0.15000 3rd Qu.:0.5000
## Max. :1.0000 Max. : 1.00000 Max. :0.5000
## abs_title_sentiment_polarity shares
## Min. :0.0000 Min. : 1
## 1st Qu.:0.0000 1st Qu.: 946
## Median :0.0000 Median : 1400
## Mean :0.1561 Mean : 3395
## 3rd Qu.:0.2500 3rd Qu.: 2800
## Max. :1.0000 Max. :843300
str(object = popNews)
## 'data.frame': 39643 obs. of 58 variables:
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
## $ data_channel_is_entertainment: Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 1 1 1 ...
## $ data_channel_is_bus : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 1 1 1 ...
## $ data_channel_is_socmed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ data_channel_is_tech : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 1 2 2 1 ...
## $ data_channel_is_world : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ weekday_is_tuesday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_wednesday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_thursday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_friday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_saturday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_sunday : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
# Randomly assign 75% of the rows to the training set and keep the remaining
# 25% as the test set; the seed makes the draw reproducible.
set.seed(174004689)
n_obs <- nrow(popNews)
popNewsTrain <- sample(n_obs, as.integer(n_obs * 0.75))
train.news <- popNews[popNewsTrain, ]
test.news <- popNews[-popNewsTrain, ]
# Baseline linear model: regress shares on every other column of the training
# data, then inspect the coefficient table and fit statistics.
fit_mlm1 <- lm(formula = shares ~ ., data = train.news)
summary(object = fit_mlm1)
##
## Call:
## lm(formula = shares ~ ., data = train.news)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29232 -2190 -1164 -55 837485
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.573e+02 9.709e+02 -0.265 0.791027
## n_tokens_title 6.275e+01 3.122e+01 2.010 0.044410 *
## n_tokens_content 2.350e-01 2.434e-01 0.965 0.334303
## n_unique_tokens 2.892e+03 2.086e+03 1.386 0.165664
## n_non_stop_words -2.597e+03 6.443e+03 -0.403 0.686897
## n_non_stop_unique_tokens 3.559e+02 1.773e+03 0.201 0.840935
## num_hrefs 2.788e+01 7.300e+00 3.819 0.000134 ***
## num_self_hrefs -4.806e+01 1.949e+01 -2.466 0.013679 *
## num_imgs 2.112e+01 9.709e+00 2.176 0.029585 *
## num_videos 1.976e+01 1.685e+01 1.172 0.241037
## average_token_length -5.188e+02 2.638e+02 -1.966 0.049285 *
## num_keywords 6.355e+01 4.045e+01 1.571 0.116148
## data_channel_is_lifestyle1 -6.180e+02 4.297e+02 -1.438 0.150446
## data_channel_is_entertainment1 -1.127e+03 2.783e+02 -4.051 5.11e-05 ***
## data_channel_is_bus1 -7.572e+02 4.169e+02 -1.816 0.069340 .
## data_channel_is_socmed1 -5.402e+02 4.084e+02 -1.323 0.185902
## data_channel_is_tech1 -5.256e+02 4.039e+02 -1.302 0.193097
## data_channel_is_world1 -3.584e+02 4.100e+02 -0.874 0.382060
## kw_min_min 3.234e-01 1.779e+00 0.182 0.855739
## kw_max_min 7.133e-02 5.402e-02 1.320 0.186711
## kw_avg_min -3.732e-01 3.382e-01 -1.103 0.269839
## kw_min_max -2.194e-03 1.297e-03 -1.691 0.090811 .
## kw_max_max -1.043e-03 6.350e-04 -1.643 0.100432
## kw_avg_max 5.755e-04 9.085e-04 0.633 0.526424
## kw_min_avg -4.003e-01 8.275e-02 -4.838 1.32e-06 ***
## kw_max_avg -2.095e-01 2.763e-02 -7.582 3.51e-14 ***
## kw_avg_avg 1.640e+00 1.579e-01 10.390 < 2e-16 ***
## self_reference_min_shares 3.250e-02 7.776e-03 4.179 2.93e-05 ***
## self_reference_max_shares 6.261e-03 4.275e-03 1.465 0.142999
## self_reference_avg_sharess -9.708e-03 1.077e-02 -0.901 0.367519
## weekday_is_monday1 2.837e+02 2.878e+02 0.986 0.324298
## weekday_is_tuesday1 -3.535e+02 2.834e+02 -1.247 0.212245
## weekday_is_wednesday1 -1.731e+02 2.836e+02 -0.610 0.541578
## weekday_is_thursday1 -5.327e+02 2.845e+02 -1.872 0.061181 .
## weekday_is_friday1 -2.698e+02 2.944e+02 -0.916 0.359433
## weekday_is_saturday1 1.397e+02 3.508e+02 0.398 0.690555
## weekday_is_sunday1 NA NA NA NA
## LDA_00 3.678e+02 4.981e+02 0.739 0.460208
## LDA_01 -3.728e+02 5.510e+02 -0.677 0.498700
## LDA_02 -7.676e+02 4.978e+02 -1.542 0.123098
## LDA_03 -1.046e+02 5.230e+02 -0.200 0.841463
## LDA_04 NA NA NA NA
## global_subjectivity 2.651e+03 9.258e+02 2.864 0.004190 **
## global_sentiment_polarity 1.040e+03 1.812e+03 0.574 0.565901
## global_rate_positive_words -1.081e+04 7.842e+03 -1.378 0.168234
## global_rate_negative_words 1.234e+04 1.476e+04 0.836 0.403039
## rate_positive_words 2.035e+03 6.306e+03 0.323 0.746898
## rate_negative_words 1.575e+03 6.354e+03 0.248 0.804195
## avg_positive_polarity -1.415e+03 1.488e+03 -0.951 0.341846
## min_positive_polarity -2.133e+03 1.245e+03 -1.713 0.086701 .
## max_positive_polarity 3.069e+02 4.713e+02 0.651 0.514998
## avg_negative_polarity -1.347e+03 1.373e+03 -0.981 0.326714
## min_negative_polarity 1.903e+01 5.019e+02 0.038 0.969760
## max_negative_polarity 3.454e+02 1.145e+03 0.302 0.762832
## title_subjectivity -1.111e+02 2.999e+02 -0.370 0.711100
## title_sentiment_polarity 3.380e+02 2.721e+02 1.242 0.214168
## abs_title_subjectivity 1.000e+03 3.975e+02 2.516 0.011879 *
## abs_title_sentiment_polarity 5.015e+02 4.325e+02 1.160 0.246190
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10860 on 29676 degrees of freedom
## Multiple R-squared: 0.0262, Adjusted R-squared: 0.02439
## F-statistic: 14.52 on 55 and 29676 DF, p-value: < 2.2e-16
# Predict shares for the held-out rows with the baseline model.
pred_mlm1 <- predict(fit_mlm1, newdata = test.news)
## Warning in predict.lm(fit_mlm1, test.news): prediction from a rank-
## deficient fit may be misleading
# Root mean squared error of the baseline model on the test set.
sqrt(mean((pred_mlm1 - test.news$shares)^2))
## [1] 13242.71
# The raw-scale model has a low R-squared, and shares is strongly right-skewed
# (median 1400 vs. maximum 843300), so the target is log-transformed.
# shares has a minimum of 1, so log() is finite for every row.
# NOTE: this overwrites shares in place; running this line twice would apply
# log() twice.
popNews$shares <- log(popNews$shares)
# Reuse the row indices drawn for the first split instead of calling sample()
# again. A second sample() call continues from the advanced RNG state and
# yields a different partition, so the raw and log models would be trained and
# evaluated on different rows. The model output recorded below was produced
# with a freshly drawn split, so re-running gives slightly different figures.
train.news <- popNews[popNewsTrain, ]
test.news <- popNews[-popNewsTrain, ]
# Refit the full linear model, this time with log(shares) as the response.
fit_m3 <- lm(formula = shares ~ ., data = train.news)
summary(object = fit_m3)
##
## Call:
## lm(formula = shares ~ ., data = train.news)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.1050 -0.5455 -0.1645 0.3860 5.6075
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.906e+00 7.801e-02 88.521 < 2e-16 ***
## n_tokens_title 5.493e-03 2.509e-03 2.189 0.028595 *
## n_tokens_content 3.611e-05 1.934e-05 1.866 0.061988 .
## n_unique_tokens 1.524e-01 1.664e-01 0.916 0.359479
## n_non_stop_words 1.207e-01 5.158e-01 0.234 0.815032
## n_non_stop_unique_tokens -2.271e-01 1.413e-01 -1.607 0.108162
## num_hrefs 3.608e-03 5.938e-04 6.076 1.25e-09 ***
## num_self_hrefs -7.094e-03 1.583e-03 -4.482 7.43e-06 ***
## num_imgs 2.521e-03 7.815e-04 3.225 0.001259 **
## num_videos 2.258e-03 1.373e-03 1.645 0.099989 .
## average_token_length -8.190e-02 2.109e-02 -3.883 0.000103 ***
## num_keywords 1.173e-02 3.238e-03 3.622 0.000293 ***
## data_channel_is_lifestyle1 -9.525e-02 3.441e-02 -2.768 0.005648 **
## data_channel_is_entertainment1 -1.813e-01 2.222e-02 -8.157 3.57e-16 ***
## data_channel_is_bus1 -1.616e-01 3.344e-02 -4.832 1.36e-06 ***
## data_channel_is_socmed1 1.656e-01 3.276e-02 5.054 4.35e-07 ***
## data_channel_is_tech1 1.195e-01 3.243e-02 3.685 0.000229 ***
## data_channel_is_world1 -2.753e-02 3.295e-02 -0.836 0.403366
## kw_min_min 9.604e-04 1.406e-04 6.831 8.61e-12 ***
## kw_max_min 1.937e-05 4.709e-06 4.114 3.89e-05 ***
## kw_avg_min -1.411e-04 2.738e-05 -5.152 2.60e-07 ***
## kw_min_max -4.398e-07 1.027e-07 -4.283 1.85e-05 ***
## kw_max_max 8.330e-09 5.012e-08 0.166 0.867997
## kw_avg_max -2.695e-07 7.278e-08 -3.702 0.000214 ***
## kw_min_avg -4.634e-05 6.584e-06 -7.037 2.00e-12 ***
## kw_max_avg -4.090e-05 2.182e-06 -18.741 < 2e-16 ***
## kw_avg_avg 3.334e-04 1.250e-05 26.664 < 2e-16 ***
## self_reference_min_shares 1.009e-06 7.017e-07 1.438 0.150552
## self_reference_max_shares -8.002e-08 3.916e-07 -0.204 0.838104
## self_reference_avg_sharess 1.470e-06 9.976e-07 1.473 0.140747
## weekday_is_monday1 -2.322e-01 2.287e-02 -10.155 < 2e-16 ***
## weekday_is_tuesday1 -2.972e-01 2.257e-02 -13.166 < 2e-16 ***
## weekday_is_wednesday1 -2.970e-01 2.255e-02 -13.168 < 2e-16 ***
## weekday_is_thursday1 -3.015e-01 2.259e-02 -13.347 < 2e-16 ***
## weekday_is_friday1 -2.277e-01 2.343e-02 -9.717 < 2e-16 ***
## weekday_is_saturday1 -2.830e-03 2.780e-02 -0.102 0.918929
## weekday_is_sunday1 NA NA NA NA
## LDA_00 2.432e-01 3.983e-02 6.106 1.04e-09 ***
## LDA_01 -1.352e-01 4.428e-02 -3.053 0.002266 **
## LDA_02 -2.384e-01 3.984e-02 -5.982 2.22e-09 ***
## LDA_03 -1.138e-01 4.201e-02 -2.709 0.006749 **
## LDA_04 NA NA NA NA
## global_subjectivity 4.147e-01 7.452e-02 5.565 2.64e-08 ***
## global_sentiment_polarity -1.498e-01 1.458e-01 -1.027 0.304200
## global_rate_positive_words -8.402e-01 6.245e-01 -1.345 0.178493
## global_rate_negative_words -4.637e-02 1.205e+00 -0.038 0.969312
## rate_positive_words 1.634e-01 5.048e-01 0.324 0.746127
## rate_negative_words 2.990e-02 5.087e-01 0.059 0.953120
## avg_positive_polarity -5.408e-03 1.195e-01 -0.045 0.963899
## min_positive_polarity -2.643e-01 9.974e-02 -2.650 0.008050 **
## max_positive_polarity -8.057e-03 3.765e-02 -0.214 0.830542
## avg_negative_polarity -1.443e-01 1.099e-01 -1.313 0.189123
## min_negative_polarity -6.796e-03 4.012e-02 -0.169 0.865466
## max_negative_polarity 4.416e-02 9.167e-02 0.482 0.629967
## title_subjectivity 5.839e-02 2.392e-02 2.441 0.014650 *
## title_sentiment_polarity 6.969e-02 2.185e-02 3.189 0.001429 **
## abs_title_subjectivity 1.372e-01 3.181e-02 4.312 1.62e-05 ***
## abs_title_sentiment_polarity -5.147e-03 3.446e-02 -0.149 0.881269
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8696 on 29676 degrees of freedom
## Multiple R-squared: 0.1292, Adjusted R-squared: 0.1276
## F-statistic: 80.04 on 55 and 29676 DF, p-value: < 2.2e-16
# This model gives a better R-square value than the previous model.
# Next, I want a more parsimonious model. step() performs stepwise selection by AIC (not by p-values), dropping terms whose removal lowers the AIC:
fit_mlm2 <- step(object = fit_m3)
## Start: AIC=-8253.38
## shares ~ n_tokens_title + n_tokens_content + n_unique_tokens +
## n_non_stop_words + n_non_stop_unique_tokens + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min +
## kw_min_max + kw_max_max + kw_avg_max + kw_min_avg + kw_max_avg +
## kw_avg_avg + self_reference_min_shares + self_reference_max_shares +
## self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday +
## weekday_is_wednesday + weekday_is_thursday + weekday_is_friday +
## weekday_is_saturday + weekday_is_sunday + LDA_00 + LDA_01 +
## LDA_02 + LDA_03 + LDA_04 + global_subjectivity + global_sentiment_polarity +
## global_rate_positive_words + global_rate_negative_words +
## rate_positive_words + rate_negative_words + avg_positive_polarity +
## min_positive_polarity + max_positive_polarity + avg_negative_polarity +
## min_negative_polarity + max_negative_polarity + title_subjectivity +
## title_sentiment_polarity + abs_title_subjectivity + abs_title_sentiment_polarity
##
##
## Step: AIC=-8253.38
## shares ~ n_tokens_title + n_tokens_content + n_unique_tokens +
## n_non_stop_words + n_non_stop_unique_tokens + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min +
## kw_min_max + kw_max_max + kw_avg_max + kw_min_avg + kw_max_avg +
## kw_avg_avg + self_reference_min_shares + self_reference_max_shares +
## self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday +
## weekday_is_wednesday + weekday_is_thursday + weekday_is_friday +
## weekday_is_saturday + weekday_is_sunday + LDA_00 + LDA_01 +
## LDA_02 + LDA_03 + global_subjectivity + global_sentiment_polarity +
## global_rate_positive_words + global_rate_negative_words +
## rate_positive_words + rate_negative_words + avg_positive_polarity +
## min_positive_polarity + max_positive_polarity + avg_negative_polarity +
## min_negative_polarity + max_negative_polarity + title_subjectivity +
## title_sentiment_polarity + abs_title_subjectivity + abs_title_sentiment_polarity
##
##
## Step: AIC=-8253.38
## shares ~ n_tokens_title + n_tokens_content + n_unique_tokens +
## n_non_stop_words + n_non_stop_unique_tokens + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min +
## kw_min_max + kw_max_max + kw_avg_max + kw_min_avg + kw_max_avg +
## kw_avg_avg + self_reference_min_shares + self_reference_max_shares +
## self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday +
## weekday_is_wednesday + weekday_is_thursday + weekday_is_friday +
## weekday_is_saturday + LDA_00 + LDA_01 + LDA_02 + LDA_03 +
## global_subjectivity + global_sentiment_polarity + global_rate_positive_words +
## global_rate_negative_words + rate_positive_words + rate_negative_words +
## avg_positive_polarity + min_positive_polarity + max_positive_polarity +
## avg_negative_polarity + min_negative_polarity + max_negative_polarity +
## title_subjectivity + title_sentiment_polarity + abs_title_subjectivity +
## abs_title_sentiment_polarity
##
## Df Sum of Sq RSS AIC
## - global_rate_negative_words 1 0.00 22440 -8255.4
## - avg_positive_polarity 1 0.00 22440 -8255.4
## - rate_negative_words 1 0.00 22440 -8255.4
## - weekday_is_saturday 1 0.01 22440 -8255.4
## - abs_title_sentiment_polarity 1 0.02 22441 -8255.4
## - kw_max_max 1 0.02 22441 -8255.4
## - min_negative_polarity 1 0.02 22441 -8255.4
## - self_reference_max_shares 1 0.03 22441 -8255.3
## - max_positive_polarity 1 0.03 22441 -8255.3
## - n_non_stop_words 1 0.04 22441 -8255.3
## - rate_positive_words 1 0.08 22441 -8255.3
## - max_negative_polarity 1 0.18 22441 -8255.1
## - data_channel_is_world 1 0.53 22441 -8254.7
## - n_unique_tokens 1 0.63 22441 -8254.5
## - global_sentiment_polarity 1 0.80 22441 -8254.3
## - avg_negative_polarity 1 1.30 22442 -8253.7
## - global_rate_positive_words 1 1.37 22442 -8253.6
## <none> 22440 -8253.4
## - self_reference_min_shares 1 1.56 22442 -8253.3
## - self_reference_avg_sharess 1 1.64 22442 -8253.2
## - n_non_stop_unique_tokens 1 1.95 22442 -8252.8
## - num_videos 1 2.05 22443 -8252.7
## - n_tokens_content 1 2.63 22443 -8251.9
## - n_tokens_title 1 3.62 22444 -8250.6
## - title_subjectivity 1 4.51 22445 -8249.4
## - min_positive_polarity 1 5.31 22446 -8248.3
## - LDA_03 1 5.55 22446 -8248.0
## - data_channel_is_lifestyle 1 5.79 22446 -8247.7
## - LDA_01 1 7.05 22448 -8246.0
## - title_sentiment_polarity 1 7.69 22448 -8245.2
## - num_imgs 1 7.87 22448 -8245.0
## - num_keywords 1 9.92 22450 -8242.2
## - data_channel_is_tech 1 10.27 22451 -8241.8
## - kw_avg_max 1 10.36 22451 -8241.7
## - average_token_length 1 11.40 22452 -8240.3
## - kw_max_min 1 12.80 22453 -8238.4
## - kw_min_max 1 13.87 22454 -8237.0
## - abs_title_subjectivity 1 14.06 22455 -8236.8
## - num_self_hrefs 1 15.19 22456 -8235.3
## - data_channel_is_bus 1 17.65 22458 -8232.0
## - data_channel_is_socmed 1 19.32 22460 -8229.8
## - kw_avg_min 1 20.07 22461 -8228.8
## - global_subjectivity 1 23.42 22464 -8224.4
## - LDA_02 1 27.06 22468 -8219.5
## - num_hrefs 1 27.92 22468 -8218.4
## - LDA_00 1 28.19 22469 -8218.1
## - kw_min_min 1 35.28 22476 -8208.7
## - kw_min_avg 1 37.45 22478 -8205.8
## - data_channel_is_entertainment 1 50.31 22491 -8188.8
## - weekday_is_friday 1 71.40 22512 -8160.9
## - weekday_is_monday 1 77.98 22518 -8152.2
## - weekday_is_tuesday 1 131.08 22572 -8082.2
## - weekday_is_wednesday 1 131.11 22572 -8082.2
## - weekday_is_thursday 1 134.70 22575 -8077.4
## - kw_max_avg 1 265.58 22706 -7905.6
## - kw_avg_avg 1 537.64 22978 -7551.4
##
## Step: AIC=-8255.38
## shares ~ n_tokens_title + n_tokens_content + n_unique_tokens +
## n_non_stop_words + n_non_stop_unique_tokens + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min +
## kw_min_max + kw_max_max + kw_avg_max + kw_min_avg + kw_max_avg +
## kw_avg_avg + self_reference_min_shares + self_reference_max_shares +
## self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday +
## weekday_is_wednesday + weekday_is_thursday + weekday_is_friday +
## weekday_is_saturday + LDA_00 + LDA_01 + LDA_02 + LDA_03 +
## global_subjectivity + global_sentiment_polarity + global_rate_positive_words +
## rate_positive_words + rate_negative_words + avg_positive_polarity +
## min_positive_polarity + max_positive_polarity + avg_negative_polarity +
## min_negative_polarity + max_negative_polarity + title_subjectivity +
## title_sentiment_polarity + abs_title_subjectivity + abs_title_sentiment_polarity
##
## Df Sum of Sq RSS AIC
## - avg_positive_polarity 1 0.00 22440 -8257.4
## - rate_negative_words 1 0.00 22440 -8257.4
## - weekday_is_saturday 1 0.01 22440 -8257.4
## - abs_title_sentiment_polarity 1 0.02 22441 -8257.4
## - kw_max_max 1 0.02 22441 -8257.4
## - min_negative_polarity 1 0.02 22441 -8257.3
## - self_reference_max_shares 1 0.03 22441 -8257.3
## - max_positive_polarity 1 0.04 22441 -8257.3
## - n_non_stop_words 1 0.04 22441 -8257.3
## - rate_positive_words 1 0.08 22441 -8257.3
## - max_negative_polarity 1 0.18 22441 -8257.1
## - data_channel_is_world 1 0.53 22441 -8256.7
## - n_unique_tokens 1 0.63 22441 -8256.5
## - global_sentiment_polarity 1 0.86 22441 -8256.2
## - avg_negative_polarity 1 1.33 22442 -8255.6
## <none> 22440 -8255.4
## - self_reference_min_shares 1 1.56 22442 -8255.3
## - self_reference_avg_sharess 1 1.64 22442 -8255.2
## - n_non_stop_unique_tokens 1 1.95 22442 -8254.8
## - num_videos 1 2.06 22443 -8254.6
## - global_rate_positive_words 1 2.60 22443 -8253.9
## - n_tokens_content 1 2.64 22443 -8253.9
## - n_tokens_title 1 3.63 22444 -8252.6
## - title_subjectivity 1 4.51 22445 -8251.4
## - min_positive_polarity 1 5.36 22446 -8250.3
## - LDA_03 1 5.55 22446 -8250.0
## - data_channel_is_lifestyle 1 5.79 22446 -8249.7
## - LDA_01 1 7.05 22448 -8248.0
## - title_sentiment_polarity 1 7.71 22448 -8247.2
## - num_imgs 1 7.87 22448 -8247.0
## - num_keywords 1 9.92 22450 -8244.2
## - data_channel_is_tech 1 10.28 22451 -8243.8
## - kw_avg_max 1 10.37 22451 -8243.6
## - average_token_length 1 11.42 22452 -8242.2
## - kw_max_min 1 12.80 22453 -8240.4
## - kw_min_max 1 13.87 22454 -8239.0
## - abs_title_subjectivity 1 14.08 22455 -8238.7
## - num_self_hrefs 1 15.19 22456 -8237.3
## - data_channel_is_bus 1 17.66 22458 -8234.0
## - data_channel_is_socmed 1 19.33 22460 -8231.8
## - kw_avg_min 1 20.07 22461 -8230.8
## - global_subjectivity 1 23.99 22464 -8225.6
## - LDA_02 1 27.06 22468 -8221.5
## - num_hrefs 1 28.02 22469 -8220.3
## - LDA_00 1 28.19 22469 -8220.1
## - kw_min_min 1 35.28 22476 -8210.7
## - kw_min_avg 1 37.45 22478 -8207.8
## - data_channel_is_entertainment 1 50.31 22491 -8190.8
## - weekday_is_friday 1 71.43 22512 -8162.9
## - weekday_is_monday 1 78.02 22519 -8154.2
## - weekday_is_wednesday 1 131.14 22572 -8084.1
## - weekday_is_tuesday 1 131.15 22572 -8084.1
## - weekday_is_thursday 1 134.77 22575 -8079.3
## - kw_max_avg 1 265.59 22706 -7907.6
## - kw_avg_avg 1 537.69 22978 -7553.4
##
## Step: AIC=-8257.38
## shares ~ n_tokens_title + n_tokens_content + n_unique_tokens +
## n_non_stop_words + n_non_stop_unique_tokens + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min +
## kw_min_max + kw_max_max + kw_avg_max + kw_min_avg + kw_max_avg +
## kw_avg_avg + self_reference_min_shares + self_reference_max_shares +
## self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday +
## weekday_is_wednesday + weekday_is_thursday + weekday_is_friday +
## weekday_is_saturday + LDA_00 + LDA_01 + LDA_02 + LDA_03 +
## global_subjectivity + global_sentiment_polarity + global_rate_positive_words +
## rate_positive_words + rate_negative_words + min_positive_polarity +
## max_positive_polarity + avg_negative_polarity + min_negative_polarity +
## max_negative_polarity + title_subjectivity + title_sentiment_polarity +
## abs_title_subjectivity + abs_title_sentiment_polarity
##
## Df Sum of Sq RSS AIC
## - rate_negative_words 1 0.00 22440 -8259.4
## - weekday_is_saturday 1 0.01 22440 -8259.4
## - abs_title_sentiment_polarity 1 0.02 22441 -8259.4
## - kw_max_max 1 0.02 22441 -8259.3
## - min_negative_polarity 1 0.02 22441 -8259.3
## - self_reference_max_shares 1 0.03 22441 -8259.3
## - n_non_stop_words 1 0.04 22441 -8259.3
## - max_positive_polarity 1 0.06 22441 -8259.3
## - rate_positive_words 1 0.08 22441 -8259.3
## - max_negative_polarity 1 0.17 22441 -8259.1
## - data_channel_is_world 1 0.53 22441 -8258.7
## - n_unique_tokens 1 0.63 22441 -8258.5
## - avg_negative_polarity 1 1.35 22442 -8257.6
## - global_sentiment_polarity 1 1.35 22442 -8257.6
## <none> 22440 -8257.4
## - self_reference_min_shares 1 1.56 22442 -8257.3
## - self_reference_avg_sharess 1 1.64 22442 -8257.2
## - n_non_stop_unique_tokens 1 1.96 22442 -8256.8
## - num_videos 1 2.06 22443 -8256.6
## - n_tokens_content 1 2.64 22443 -8255.9
## - global_rate_positive_words 1 2.68 22443 -8255.8
## - n_tokens_title 1 3.63 22444 -8254.6
## - title_subjectivity 1 4.52 22445 -8253.4
## - LDA_03 1 5.55 22446 -8252.0
## - data_channel_is_lifestyle 1 5.79 22446 -8251.7
## - LDA_01 1 7.05 22448 -8250.0
## - min_positive_polarity 1 7.11 22448 -8250.0
## - title_sentiment_polarity 1 7.73 22448 -8249.1
## - num_imgs 1 7.86 22448 -8249.0
## - num_keywords 1 9.92 22450 -8246.2
## - data_channel_is_tech 1 10.28 22451 -8245.8
## - kw_avg_max 1 10.36 22451 -8245.6
## - average_token_length 1 11.44 22452 -8244.2
## - kw_max_min 1 12.80 22453 -8242.4
## - kw_min_max 1 13.88 22454 -8241.0
## - abs_title_subjectivity 1 14.08 22455 -8240.7
## - num_self_hrefs 1 15.19 22456 -8239.3
## - data_channel_is_bus 1 17.66 22458 -8236.0
## - data_channel_is_socmed 1 19.33 22460 -8233.8
## - kw_avg_min 1 20.07 22461 -8232.8
## - global_subjectivity 1 24.27 22465 -8227.2
## - LDA_02 1 27.08 22468 -8223.5
## - num_hrefs 1 28.05 22469 -8222.2
## - LDA_00 1 28.19 22469 -8222.0
## - kw_min_min 1 35.28 22476 -8212.7
## - kw_min_avg 1 37.45 22478 -8209.8
## - data_channel_is_entertainment 1 50.32 22491 -8192.8
## - weekday_is_friday 1 71.43 22512 -8164.9
## - weekday_is_monday 1 78.03 22519 -8156.2
## - weekday_is_tuesday 1 131.15 22572 -8086.1
## - weekday_is_wednesday 1 131.15 22572 -8086.1
## - weekday_is_thursday 1 134.77 22575 -8081.3
## - kw_max_avg 1 265.60 22706 -7909.5
## - kw_avg_avg 1 537.78 22978 -7555.3
##
## Step: AIC=-8259.37
## shares ~ n_tokens_title + n_tokens_content + n_unique_tokens +
## n_non_stop_words + n_non_stop_unique_tokens + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min +
## kw_min_max + kw_max_max + kw_avg_max + kw_min_avg + kw_max_avg +
## kw_avg_avg + self_reference_min_shares + self_reference_max_shares +
## self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday +
## weekday_is_wednesday + weekday_is_thursday + weekday_is_friday +
## weekday_is_saturday + LDA_00 + LDA_01 + LDA_02 + LDA_03 +
## global_subjectivity + global_sentiment_polarity + global_rate_positive_words +
## rate_positive_words + min_positive_polarity + max_positive_polarity +
## avg_negative_polarity + min_negative_polarity + max_negative_polarity +
## title_subjectivity + title_sentiment_polarity + abs_title_subjectivity +
## abs_title_sentiment_polarity
##
## Df Sum of Sq RSS AIC
## - weekday_is_saturday 1 0.01 22440 -8261.4
## - abs_title_sentiment_polarity 1 0.02 22441 -8261.3
## - kw_max_max 1 0.02 22441 -8261.3
## - min_negative_polarity 1 0.02 22441 -8261.3
## - self_reference_max_shares 1 0.03 22441 -8261.3
## - max_positive_polarity 1 0.06 22441 -8261.3
## - max_negative_polarity 1 0.17 22441 -8261.1
## - data_channel_is_world 1 0.53 22441 -8260.7
## - n_unique_tokens 1 0.63 22441 -8260.5
## - n_non_stop_words 1 0.94 22441 -8260.1
## - avg_negative_polarity 1 1.35 22442 -8259.6
## - global_sentiment_polarity 1 1.37 22442 -8259.6
## <none> 22440 -8259.4
## - self_reference_min_shares 1 1.56 22442 -8259.3
## - self_reference_avg_sharess 1 1.64 22442 -8259.2
## - n_non_stop_unique_tokens 1 1.95 22442 -8258.8
## - num_videos 1 2.06 22443 -8258.6
## - n_tokens_content 1 2.64 22443 -8257.9
## - global_rate_positive_words 1 2.67 22443 -8257.8
## - rate_positive_words 1 3.63 22444 -8256.6
## - n_tokens_title 1 3.63 22444 -8256.6
## - title_subjectivity 1 4.52 22445 -8255.4
## - LDA_03 1 5.55 22446 -8254.0
## - data_channel_is_lifestyle 1 5.79 22446 -8253.7
## - LDA_01 1 7.06 22448 -8252.0
## - min_positive_polarity 1 7.11 22448 -8252.0
## - title_sentiment_polarity 1 7.73 22448 -8251.1
## - num_imgs 1 7.87 22448 -8251.0
## - num_keywords 1 9.92 22450 -8248.2
## - data_channel_is_tech 1 10.28 22451 -8247.8
## - kw_avg_max 1 10.36 22451 -8247.6
## - average_token_length 1 11.43 22452 -8246.2
## - kw_max_min 1 12.80 22453 -8244.4
## - kw_min_max 1 13.88 22454 -8243.0
## - abs_title_subjectivity 1 14.08 22455 -8242.7
## - num_self_hrefs 1 15.19 22456 -8241.3
## - data_channel_is_bus 1 17.67 22458 -8238.0
## - data_channel_is_socmed 1 19.33 22460 -8235.8
## - kw_avg_min 1 20.07 22461 -8234.8
## - global_subjectivity 1 24.33 22465 -8229.2
## - LDA_02 1 27.08 22468 -8225.5
## - num_hrefs 1 28.05 22469 -8224.2
## - LDA_00 1 28.19 22469 -8224.0
## - kw_min_min 1 35.29 22476 -8214.7
## - kw_min_avg 1 37.46 22478 -8211.8
## - data_channel_is_entertainment 1 50.32 22491 -8194.8
## - weekday_is_friday 1 71.44 22512 -8166.9
## - weekday_is_monday 1 78.03 22519 -8158.2
## - weekday_is_wednesday 1 131.15 22572 -8088.1
## - weekday_is_tuesday 1 131.16 22572 -8088.1
## - weekday_is_thursday 1 134.77 22575 -8083.3
## - kw_max_avg 1 265.60 22706 -7911.5
## - kw_avg_avg 1 537.79 22978 -7557.2
##
## Step: AIC=-8261.36
## shares ~ n_tokens_title + n_tokens_content + n_unique_tokens +
## n_non_stop_words + n_non_stop_unique_tokens + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min +
## kw_min_max + kw_max_max + kw_avg_max + kw_min_avg + kw_max_avg +
## kw_avg_avg + self_reference_min_shares + self_reference_max_shares +
## self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday +
## weekday_is_wednesday + weekday_is_thursday + weekday_is_friday +
## LDA_00 + LDA_01 + LDA_02 + LDA_03 + global_subjectivity +
## global_sentiment_polarity + global_rate_positive_words +
## rate_positive_words + min_positive_polarity + max_positive_polarity +
## avg_negative_polarity + min_negative_polarity + max_negative_polarity +
## title_subjectivity + title_sentiment_polarity + abs_title_subjectivity +
## abs_title_sentiment_polarity
##
## Df Sum of Sq RSS AIC
## - abs_title_sentiment_polarity 1 0.02 22441 -8263.3
## - kw_max_max 1 0.02 22441 -8263.3
## - min_negative_polarity 1 0.02 22441 -8263.3
## - self_reference_max_shares 1 0.03 22441 -8263.3
## - max_positive_polarity 1 0.06 22441 -8263.3
## - max_negative_polarity 1 0.17 22441 -8263.1
## - data_channel_is_world 1 0.52 22441 -8262.7
## - n_unique_tokens 1 0.63 22441 -8262.5
## - n_non_stop_words 1 0.94 22441 -8262.1
## - avg_negative_polarity 1 1.35 22442 -8261.6
## - global_sentiment_polarity 1 1.37 22442 -8261.6
## <none> 22440 -8261.4
## - self_reference_min_shares 1 1.56 22442 -8261.3
## - self_reference_avg_sharess 1 1.64 22442 -8261.2
## - n_non_stop_unique_tokens 1 1.96 22442 -8260.8
## - num_videos 1 2.06 22443 -8260.6
## - n_tokens_content 1 2.65 22443 -8259.9
## - global_rate_positive_words 1 2.67 22443 -8259.8
## - rate_positive_words 1 3.63 22444 -8258.6
## - n_tokens_title 1 3.64 22444 -8258.5
## - title_subjectivity 1 4.51 22445 -8257.4
## - LDA_03 1 5.55 22446 -8256.0
## - data_channel_is_lifestyle 1 5.79 22446 -8255.7
## - LDA_01 1 7.05 22448 -8254.0
## - min_positive_polarity 1 7.11 22448 -8253.9
## - title_sentiment_polarity 1 7.72 22448 -8253.1
## - num_imgs 1 7.87 22448 -8252.9
## - num_keywords 1 9.92 22450 -8250.2
## - data_channel_is_tech 1 10.27 22451 -8249.8
## - kw_avg_max 1 10.37 22451 -8249.6
## - average_token_length 1 11.43 22452 -8248.2
## - kw_max_min 1 12.80 22453 -8246.4
## - kw_min_max 1 13.88 22454 -8245.0
## - abs_title_subjectivity 1 14.08 22455 -8244.7
## - num_self_hrefs 1 15.20 22456 -8243.2
## - data_channel_is_bus 1 17.66 22458 -8240.0
## - data_channel_is_socmed 1 19.32 22460 -8237.8
## - kw_avg_min 1 20.07 22461 -8236.8
## - global_subjectivity 1 24.33 22465 -8231.1
## - LDA_02 1 27.09 22468 -8227.5
## - num_hrefs 1 28.04 22469 -8226.2
## - LDA_00 1 28.19 22469 -8226.0
## - kw_min_min 1 35.30 22476 -8216.6
## - kw_min_avg 1 37.46 22478 -8213.8
## - data_channel_is_entertainment 1 50.31 22491 -8196.8
## - weekday_is_friday 1 103.25 22544 -8126.9
## - weekday_is_monday 1 115.47 22556 -8110.8
## - weekday_is_wednesday 1 197.35 22638 -8003.0
## - weekday_is_tuesday 1 197.36 22638 -8003.0
## - weekday_is_thursday 1 202.50 22643 -7996.3
## - kw_max_avg 1 265.62 22706 -7913.5
## - kw_avg_avg 1 537.78 22978 -7559.2
##
## Step: AIC=-8263.34
## shares ~ n_tokens_title + n_tokens_content + n_unique_tokens +
## n_non_stop_words + n_non_stop_unique_tokens + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min +
## kw_min_max + kw_max_max + kw_avg_max + kw_min_avg + kw_max_avg +
## kw_avg_avg + self_reference_min_shares + self_reference_max_shares +
## self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday +
## weekday_is_wednesday + weekday_is_thursday + weekday_is_friday +
## LDA_00 + LDA_01 + LDA_02 + LDA_03 + global_subjectivity +
## global_sentiment_polarity + global_rate_positive_words +
## rate_positive_words + min_positive_polarity + max_positive_polarity +
## avg_negative_polarity + min_negative_polarity + max_negative_polarity +
## title_subjectivity + title_sentiment_polarity + abs_title_subjectivity
##
## Df Sum of Sq RSS AIC
## - kw_max_max 1 0.02 22441 -8265.3
## - min_negative_polarity 1 0.02 22441 -8265.3
## - self_reference_max_shares 1 0.03 22441 -8265.3
## - max_positive_polarity 1 0.06 22441 -8265.3
## - max_negative_polarity 1 0.17 22441 -8265.1
## - data_channel_is_world 1 0.53 22441 -8264.6
## - n_unique_tokens 1 0.63 22441 -8264.5
## - n_non_stop_words 1 0.93 22441 -8264.1
## - avg_negative_polarity 1 1.33 22442 -8263.6
## - global_sentiment_polarity 1 1.38 22442 -8263.5
## <none> 22441 -8263.3
## - self_reference_min_shares 1 1.56 22442 -8263.3
## - self_reference_avg_sharess 1 1.64 22442 -8263.2
## - n_non_stop_unique_tokens 1 1.95 22442 -8262.8
## - num_videos 1 2.06 22443 -8262.6
## - n_tokens_content 1 2.65 22443 -8261.8
## - global_rate_positive_words 1 2.67 22443 -8261.8
## - n_tokens_title 1 3.65 22444 -8260.5
## - rate_positive_words 1 3.66 22444 -8260.5
## - LDA_03 1 5.56 22446 -8258.0
## - data_channel_is_lifestyle 1 5.80 22446 -8257.7
## - title_subjectivity 1 6.95 22447 -8256.1
## - LDA_01 1 7.05 22448 -8256.0
## - min_positive_polarity 1 7.11 22448 -8255.9
## - num_imgs 1 7.87 22448 -8254.9
## - title_sentiment_polarity 1 8.51 22449 -8254.1
## - num_keywords 1 9.92 22450 -8252.2
## - data_channel_is_tech 1 10.27 22451 -8251.7
## - kw_avg_max 1 10.37 22451 -8251.6
## - average_token_length 1 11.42 22452 -8250.2
## - kw_max_min 1 12.79 22453 -8248.4
## - kw_min_max 1 13.88 22454 -8247.0
## - abs_title_subjectivity 1 14.16 22455 -8246.6
## - num_self_hrefs 1 15.19 22456 -8245.2
## - data_channel_is_bus 1 17.66 22458 -8241.9
## - data_channel_is_socmed 1 19.32 22460 -8239.8
## - kw_avg_min 1 20.05 22461 -8238.8
## - global_subjectivity 1 24.40 22465 -8233.0
## - LDA_02 1 27.08 22468 -8229.5
## - num_hrefs 1 28.04 22469 -8228.2
## - LDA_00 1 28.18 22469 -8228.0
## - kw_min_min 1 35.29 22476 -8218.6
## - kw_min_avg 1 37.48 22478 -8215.7
## - data_channel_is_entertainment 1 50.31 22491 -8198.8
## - weekday_is_friday 1 103.24 22544 -8128.9
## - weekday_is_monday 1 115.46 22556 -8112.8
## - weekday_is_wednesday 1 197.35 22638 -8005.0
## - weekday_is_tuesday 1 197.35 22638 -8005.0
## - weekday_is_thursday 1 202.48 22643 -7998.3
## - kw_max_avg 1 265.60 22706 -7915.5
## - kw_avg_avg 1 537.77 22978 -7561.2
##
## Step: AIC=-8265.31
## shares ~ n_tokens_title + n_tokens_content + n_unique_tokens +
## n_non_stop_words + n_non_stop_unique_tokens + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min +
## kw_min_max + kw_avg_max + kw_min_avg + kw_max_avg + kw_avg_avg +
## self_reference_min_shares + self_reference_max_shares + self_reference_avg_sharess +
## weekday_is_monday + weekday_is_tuesday + weekday_is_wednesday +
## weekday_is_thursday + weekday_is_friday + LDA_00 + LDA_01 +
## LDA_02 + LDA_03 + global_subjectivity + global_sentiment_polarity +
## global_rate_positive_words + rate_positive_words + min_positive_polarity +
## max_positive_polarity + avg_negative_polarity + min_negative_polarity +
## max_negative_polarity + title_subjectivity + title_sentiment_polarity +
## abs_title_subjectivity
##
## Df Sum of Sq RSS AIC
## - min_negative_polarity 1 0.02 22441 -8267.3
## - self_reference_max_shares 1 0.03 22441 -8267.3
## - max_positive_polarity 1 0.06 22441 -8267.2
## - max_negative_polarity 1 0.17 22441 -8267.1
## - data_channel_is_world 1 0.51 22441 -8266.6
## - n_unique_tokens 1 0.63 22441 -8266.5
## - n_non_stop_words 1 0.94 22441 -8266.1
## - avg_negative_polarity 1 1.33 22442 -8265.6
## - global_sentiment_polarity 1 1.39 22442 -8265.5
## <none> 22441 -8265.3
## - self_reference_min_shares 1 1.57 22442 -8265.2
## - self_reference_avg_sharess 1 1.64 22442 -8265.1
## - n_non_stop_unique_tokens 1 1.95 22442 -8264.7
## - num_videos 1 2.05 22443 -8264.6
## - n_tokens_content 1 2.64 22443 -8263.8
## - global_rate_positive_words 1 2.67 22443 -8263.8
## - n_tokens_title 1 3.66 22444 -8262.5
## - rate_positive_words 1 3.66 22444 -8262.5
## - LDA_03 1 5.65 22446 -8259.8
## - data_channel_is_lifestyle 1 5.78 22446 -8259.7
## - title_subjectivity 1 6.95 22447 -8258.1
## - LDA_01 1 7.06 22448 -8258.0
## - min_positive_polarity 1 7.10 22448 -8257.9
## - num_imgs 1 7.90 22448 -8256.9
## - title_sentiment_polarity 1 8.50 22449 -8256.0
## - num_keywords 1 10.28 22451 -8253.7
## - data_channel_is_tech 1 10.35 22451 -8253.6
## - kw_avg_max 1 11.39 22452 -8252.2
## - average_token_length 1 11.42 22452 -8252.2
## - kw_max_min 1 12.85 22453 -8250.3
## - abs_title_subjectivity 1 14.15 22455 -8248.6
## - kw_min_max 1 14.31 22455 -8248.4
## - num_self_hrefs 1 15.24 22456 -8247.1
## - data_channel_is_bus 1 17.65 22458 -8243.9
## - data_channel_is_socmed 1 19.54 22460 -8241.4
## - kw_avg_min 1 20.15 22461 -8240.6
## - global_subjectivity 1 24.40 22465 -8235.0
## - LDA_02 1 27.11 22468 -8231.4
## - num_hrefs 1 28.11 22469 -8230.1
## - LDA_00 1 28.16 22469 -8230.0
## - kw_min_avg 1 37.68 22478 -8217.4
## - data_channel_is_entertainment 1 50.89 22491 -8200.0
## - kw_min_min 1 75.83 22516 -8167.0
## - weekday_is_friday 1 103.29 22544 -8130.8
## - weekday_is_monday 1 115.67 22556 -8114.5
## - weekday_is_tuesday 1 197.53 22638 -8006.7
## - weekday_is_wednesday 1 197.60 22638 -8006.6
## - weekday_is_thursday 1 202.64 22643 -8000.0
## - kw_max_avg 1 267.54 22708 -7914.9
## - kw_avg_avg 1 543.77 22984 -7555.4
##
## Step: AIC=-8267.28
## shares ~ n_tokens_title + n_tokens_content + n_unique_tokens +
## n_non_stop_words + n_non_stop_unique_tokens + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min +
## kw_min_max + kw_avg_max + kw_min_avg + kw_max_avg + kw_avg_avg +
## self_reference_min_shares + self_reference_max_shares + self_reference_avg_sharess +
## weekday_is_monday + weekday_is_tuesday + weekday_is_wednesday +
## weekday_is_thursday + weekday_is_friday + LDA_00 + LDA_01 +
## LDA_02 + LDA_03 + global_subjectivity + global_sentiment_polarity +
## global_rate_positive_words + rate_positive_words + min_positive_polarity +
## max_positive_polarity + avg_negative_polarity + max_negative_polarity +
## title_subjectivity + title_sentiment_polarity + abs_title_subjectivity
##
## Df Sum of Sq RSS AIC
## - self_reference_max_shares 1 0.03 22441 -8269.2
## - max_positive_polarity 1 0.06 22441 -8269.2
## - max_negative_polarity 1 0.30 22441 -8268.9
## - data_channel_is_world 1 0.51 22441 -8268.6
## - n_unique_tokens 1 0.61 22441 -8268.5
## - n_non_stop_words 1 0.96 22442 -8268.0
## - global_sentiment_polarity 1 1.39 22442 -8267.4
## <none> 22441 -8267.3
## - self_reference_min_shares 1 1.57 22442 -8267.2
## - self_reference_avg_sharess 1 1.64 22442 -8267.1
## - n_non_stop_unique_tokens 1 1.92 22442 -8266.7
## - num_videos 1 2.05 22443 -8266.6
## - global_rate_positive_words 1 2.65 22443 -8265.8
## - n_tokens_content 1 2.89 22443 -8265.4
## - avg_negative_polarity 1 3.63 22444 -8264.5
## - n_tokens_title 1 3.66 22444 -8264.4
## - rate_positive_words 1 3.68 22444 -8264.4
## - LDA_03 1 5.65 22446 -8261.8
## - data_channel_is_lifestyle 1 5.78 22446 -8261.6
## - title_subjectivity 1 6.93 22447 -8260.1
## - LDA_01 1 7.06 22448 -8259.9
## - min_positive_polarity 1 7.13 22448 -8259.8
## - num_imgs 1 7.88 22448 -8258.8
## - title_sentiment_polarity 1 8.55 22449 -8258.0
## - num_keywords 1 10.28 22451 -8255.7
## - data_channel_is_tech 1 10.34 22451 -8255.6
## - kw_avg_max 1 11.40 22452 -8254.2
## - average_token_length 1 11.41 22452 -8254.2
## - kw_max_min 1 12.85 22453 -8252.3
## - abs_title_subjectivity 1 14.15 22455 -8250.5
## - kw_min_max 1 14.30 22455 -8250.3
## - num_self_hrefs 1 15.28 22456 -8249.0
## - data_channel_is_bus 1 17.66 22458 -8245.9
## - data_channel_is_socmed 1 19.54 22460 -8243.4
## - kw_avg_min 1 20.15 22461 -8242.6
## - global_subjectivity 1 24.39 22465 -8237.0
## - LDA_02 1 27.09 22468 -8233.4
## - LDA_00 1 28.17 22469 -8232.0
## - num_hrefs 1 28.23 22469 -8231.9
## - kw_min_avg 1 37.68 22478 -8219.4
## - data_channel_is_entertainment 1 50.94 22491 -8201.9
## - kw_min_min 1 75.81 22516 -8169.0
## - weekday_is_friday 1 103.29 22544 -8132.7
## - weekday_is_monday 1 115.69 22556 -8116.4
## - weekday_is_tuesday 1 197.53 22638 -8008.7
## - weekday_is_wednesday 1 197.61 22638 -8008.6
## - weekday_is_thursday 1 202.63 22643 -8002.0
## - kw_max_avg 1 267.58 22708 -7916.9
## - kw_avg_avg 1 543.85 22984 -7557.3
##
## Step: AIC=-8269.24
## shares ~ n_tokens_title + n_tokens_content + n_unique_tokens +
## n_non_stop_words + n_non_stop_unique_tokens + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min +
## kw_min_max + kw_avg_max + kw_min_avg + kw_max_avg + kw_avg_avg +
## self_reference_min_shares + self_reference_avg_sharess +
## weekday_is_monday + weekday_is_tuesday + weekday_is_wednesday +
## weekday_is_thursday + weekday_is_friday + LDA_00 + LDA_01 +
## LDA_02 + LDA_03 + global_subjectivity + global_sentiment_polarity +
## global_rate_positive_words + rate_positive_words + min_positive_polarity +
## max_positive_polarity + avg_negative_polarity + max_negative_polarity +
## title_subjectivity + title_sentiment_polarity + abs_title_subjectivity
##
## Df Sum of Sq RSS AIC
## - max_positive_polarity 1 0.06 22441 -8271.2
## - max_negative_polarity 1 0.30 22441 -8270.8
## - data_channel_is_world 1 0.51 22441 -8270.6
## - n_unique_tokens 1 0.61 22441 -8270.4
## - n_non_stop_words 1 0.96 22442 -8270.0
## - global_sentiment_polarity 1 1.39 22442 -8269.4
## <none> 22441 -8269.2
## - n_non_stop_unique_tokens 1 1.94 22443 -8268.7
## - num_videos 1 2.03 22443 -8268.5
## - global_rate_positive_words 1 2.66 22443 -8267.7
## - n_tokens_content 1 2.90 22443 -8267.4
## - avg_negative_polarity 1 3.62 22444 -8266.4
## - n_tokens_title 1 3.67 22444 -8266.4
## - rate_positive_words 1 3.68 22444 -8266.4
## - self_reference_min_shares 1 4.58 22445 -8265.2
## - LDA_03 1 5.65 22446 -8263.8
## - data_channel_is_lifestyle 1 5.79 22446 -8263.6
## - title_subjectivity 1 6.92 22448 -8262.1
## - LDA_01 1 7.06 22448 -8261.9
## - min_positive_polarity 1 7.15 22448 -8261.8
## - num_imgs 1 7.89 22448 -8260.8
## - title_sentiment_polarity 1 8.54 22449 -8259.9
## - self_reference_avg_sharess 1 8.71 22449 -8259.7
## - num_keywords 1 10.29 22451 -8257.6
## - data_channel_is_tech 1 10.34 22451 -8257.5
## - kw_avg_max 1 11.40 22452 -8256.1
## - average_token_length 1 11.42 22452 -8256.1
## - kw_max_min 1 12.85 22453 -8254.2
## - abs_title_subjectivity 1 14.15 22455 -8252.5
## - kw_min_max 1 14.31 22455 -8252.3
## - num_self_hrefs 1 16.13 22457 -8249.9
## - data_channel_is_bus 1 17.69 22458 -8247.8
## - data_channel_is_socmed 1 19.52 22460 -8245.4
## - kw_avg_min 1 20.14 22461 -8244.6
## - global_subjectivity 1 24.40 22465 -8238.9
## - LDA_02 1 27.10 22468 -8235.4
## - LDA_00 1 28.17 22469 -8233.9
## - num_hrefs 1 28.20 22469 -8233.9
## - kw_min_avg 1 37.66 22478 -8221.4
## - data_channel_is_entertainment 1 50.98 22492 -8203.8
## - kw_min_min 1 75.81 22516 -8171.0
## - weekday_is_friday 1 103.27 22544 -8134.7
## - weekday_is_monday 1 115.66 22556 -8118.4
## - weekday_is_tuesday 1 197.50 22638 -8010.7
## - weekday_is_wednesday 1 197.58 22638 -8010.6
## - weekday_is_thursday 1 202.60 22643 -8004.0
## - kw_max_avg 1 267.59 22708 -7918.8
## - kw_avg_avg 1 543.82 22984 -7559.3
##
## Step: AIC=-8271.16
## shares ~ n_tokens_title + n_tokens_content + n_unique_tokens +
## n_non_stop_words + n_non_stop_unique_tokens + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min +
## kw_min_max + kw_avg_max + kw_min_avg + kw_max_avg + kw_avg_avg +
## self_reference_min_shares + self_reference_avg_sharess +
## weekday_is_monday + weekday_is_tuesday + weekday_is_wednesday +
## weekday_is_thursday + weekday_is_friday + LDA_00 + LDA_01 +
## LDA_02 + LDA_03 + global_subjectivity + global_sentiment_polarity +
## global_rate_positive_words + rate_positive_words + min_positive_polarity +
## avg_negative_polarity + max_negative_polarity + title_subjectivity +
## title_sentiment_polarity + abs_title_subjectivity
##
## Df Sum of Sq RSS AIC
## - max_negative_polarity 1 0.27 22441 -8272.8
## - data_channel_is_world 1 0.50 22441 -8272.5
## - n_unique_tokens 1 0.70 22441 -8272.2
## - n_non_stop_words 1 0.91 22442 -8272.0
## <none> 22441 -8271.2
## - global_sentiment_polarity 1 1.96 22443 -8270.6
## - num_videos 1 1.99 22443 -8270.5
## - n_non_stop_unique_tokens 1 2.05 22443 -8270.4
## - global_rate_positive_words 1 2.76 22443 -8269.5
## - n_tokens_content 1 2.85 22443 -8269.4
## - avg_negative_polarity 1 3.59 22444 -8268.4
## - n_tokens_title 1 3.66 22444 -8268.3
## - rate_positive_words 1 4.06 22445 -8267.8
## - self_reference_min_shares 1 4.58 22445 -8267.1
## - LDA_03 1 5.65 22446 -8265.7
## - data_channel_is_lifestyle 1 5.76 22446 -8265.5
## - title_subjectivity 1 6.90 22448 -8264.0
## - LDA_01 1 7.05 22448 -8263.8
## - min_positive_polarity 1 7.11 22448 -8263.7
## - num_imgs 1 7.84 22448 -8262.8
## - title_sentiment_polarity 1 8.57 22449 -8261.8
## - self_reference_avg_sharess 1 8.71 22449 -8261.6
## - num_keywords 1 10.27 22451 -8259.6
## - data_channel_is_tech 1 10.40 22451 -8259.4
## - kw_avg_max 1 11.39 22452 -8258.1
## - average_token_length 1 11.43 22452 -8258.0
## - kw_max_min 1 12.84 22453 -8256.1
## - abs_title_subjectivity 1 14.16 22455 -8254.4
## - kw_min_max 1 14.33 22455 -8254.2
## - num_self_hrefs 1 16.11 22457 -8251.8
## - data_channel_is_bus 1 17.65 22458 -8249.8
## - data_channel_is_socmed 1 19.67 22460 -8247.1
## - kw_avg_min 1 20.15 22461 -8246.5
## - global_subjectivity 1 24.41 22465 -8240.8
## - LDA_02 1 27.07 22468 -8237.3
## - num_hrefs 1 28.14 22469 -8235.9
## - LDA_00 1 28.14 22469 -8235.9
## - kw_min_avg 1 37.66 22478 -8223.3
## - data_channel_is_entertainment 1 50.97 22492 -8205.7
## - kw_min_min 1 75.85 22516 -8172.8
## - weekday_is_friday 1 103.27 22544 -8136.7
## - weekday_is_monday 1 115.70 22556 -8120.3
## - weekday_is_tuesday 1 197.50 22638 -8012.6
## - weekday_is_wednesday 1 197.56 22638 -8012.6
## - weekday_is_thursday 1 202.56 22643 -8006.0
## - kw_max_avg 1 267.54 22708 -7920.8
## - kw_avg_avg 1 543.78 22984 -7561.3
##
## Step: AIC=-8272.81
## shares ~ n_tokens_title + n_tokens_content + n_unique_tokens +
## n_non_stop_words + n_non_stop_unique_tokens + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min +
## kw_min_max + kw_avg_max + kw_min_avg + kw_max_avg + kw_avg_avg +
## self_reference_min_shares + self_reference_avg_sharess +
## weekday_is_monday + weekday_is_tuesday + weekday_is_wednesday +
## weekday_is_thursday + weekday_is_friday + LDA_00 + LDA_01 +
## LDA_02 + LDA_03 + global_subjectivity + global_sentiment_polarity +
## global_rate_positive_words + rate_positive_words + min_positive_polarity +
## avg_negative_polarity + title_subjectivity + title_sentiment_polarity +
## abs_title_subjectivity
##
## Df Sum of Sq RSS AIC
## - data_channel_is_world 1 0.51 22441 -8274.1
## - n_unique_tokens 1 0.55 22441 -8274.1
## - n_non_stop_words 1 0.97 22442 -8273.5
## <none> 22441 -8272.8
## - n_non_stop_unique_tokens 1 1.86 22443 -8272.3
## - num_videos 1 2.10 22443 -8272.0
## - global_sentiment_polarity 1 2.17 22443 -8271.9
## - global_rate_positive_words 1 2.56 22443 -8271.4
## - n_tokens_content 1 2.99 22444 -8270.9
## - n_tokens_title 1 3.67 22445 -8269.9
## - rate_positive_words 1 3.80 22445 -8269.8
## - avg_negative_polarity 1 4.39 22445 -8269.0
## - self_reference_min_shares 1 4.55 22445 -8268.8
## - LDA_03 1 5.62 22447 -8267.4
## - data_channel_is_lifestyle 1 5.78 22447 -8267.2
## - title_subjectivity 1 6.97 22448 -8265.6
## - LDA_01 1 7.02 22448 -8265.5
## - min_positive_polarity 1 7.26 22448 -8265.2
## - num_imgs 1 7.90 22449 -8264.3
## - title_sentiment_polarity 1 8.50 22449 -8263.6
## - self_reference_avg_sharess 1 8.71 22450 -8263.3
## - num_keywords 1 10.26 22451 -8261.2
## - data_channel_is_tech 1 10.34 22451 -8261.1
## - average_token_length 1 11.37 22452 -8259.7
## - kw_avg_max 1 11.38 22452 -8259.7
## - kw_max_min 1 12.85 22454 -8257.8
## - abs_title_subjectivity 1 14.24 22455 -8256.0
## - kw_min_max 1 14.33 22455 -8255.8
## - num_self_hrefs 1 16.14 22457 -8253.4
## - data_channel_is_bus 1 17.74 22459 -8251.3
## - data_channel_is_socmed 1 19.59 22461 -8248.9
## - kw_avg_min 1 20.15 22461 -8248.1
## - global_subjectivity 1 25.69 22467 -8240.8
## - LDA_02 1 27.08 22468 -8238.9
## - LDA_00 1 28.15 22469 -8237.5
## - num_hrefs 1 28.74 22470 -8236.8
## - kw_min_avg 1 37.62 22479 -8225.0
## - data_channel_is_entertainment 1 50.90 22492 -8207.4
## - kw_min_min 1 75.83 22517 -8174.5
## - weekday_is_friday 1 103.33 22544 -8138.2
## - weekday_is_monday 1 115.63 22557 -8122.0
## - weekday_is_wednesday 1 197.50 22638 -8014.3
## - weekday_is_tuesday 1 197.56 22638 -8014.2
## - weekday_is_thursday 1 202.55 22643 -8007.7
## - kw_max_avg 1 267.42 22708 -7922.6
## - kw_avg_avg 1 543.65 22985 -7563.1
##
## Step: AIC=-8274.14
## shares ~ n_tokens_title + n_tokens_content + n_unique_tokens +
## n_non_stop_words + n_non_stop_unique_tokens + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## kw_min_min + kw_max_min + kw_avg_min + kw_min_max + kw_avg_max +
## kw_min_avg + kw_max_avg + kw_avg_avg + self_reference_min_shares +
## self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday +
## weekday_is_wednesday + weekday_is_thursday + weekday_is_friday +
## LDA_00 + LDA_01 + LDA_02 + LDA_03 + global_subjectivity +
## global_sentiment_polarity + global_rate_positive_words +
## rate_positive_words + min_positive_polarity + avg_negative_polarity +
## title_subjectivity + title_sentiment_polarity + abs_title_subjectivity
##
## Df Sum of Sq RSS AIC
## - n_unique_tokens 1 0.66 22442 -8275.3
## - n_non_stop_words 1 0.98 22442 -8274.8
## <none> 22441 -8274.1
## - n_non_stop_unique_tokens 1 2.04 22443 -8273.4
## - num_videos 1 2.11 22444 -8273.3
## - global_sentiment_polarity 1 2.17 22444 -8273.3
## - global_rate_positive_words 1 2.43 22444 -8272.9
## - n_tokens_content 1 3.00 22444 -8272.2
## - n_tokens_title 1 3.61 22445 -8271.4
## - rate_positive_words 1 3.76 22445 -8271.2
## - avg_negative_polarity 1 4.40 22446 -8270.3
## - self_reference_min_shares 1 4.56 22446 -8270.1
## - LDA_03 1 5.18 22447 -8269.3
## - data_channel_is_lifestyle 1 5.71 22447 -8268.6
## - LDA_01 1 6.53 22448 -8267.5
## - title_subjectivity 1 6.95 22448 -8266.9
## - min_positive_polarity 1 7.15 22449 -8266.7
## - num_imgs 1 8.11 22450 -8265.4
## - title_sentiment_polarity 1 8.49 22450 -8264.9
## - self_reference_avg_sharess 1 8.71 22450 -8264.6
## - num_keywords 1 10.36 22452 -8262.4
## - kw_avg_max 1 11.06 22452 -8261.5
## - average_token_length 1 11.63 22453 -8260.7
## - kw_max_min 1 12.88 22454 -8259.1
## - abs_title_subjectivity 1 14.19 22456 -8257.3
## - kw_min_max 1 14.61 22456 -8256.8
## - num_self_hrefs 1 15.99 22457 -8255.0
## - kw_avg_min 1 20.22 22462 -8249.4
## - data_channel_is_bus 1 21.21 22463 -8248.1
## - data_channel_is_tech 1 22.91 22464 -8245.8
## - global_subjectivity 1 26.04 22467 -8241.7
## - LDA_00 1 28.64 22470 -8238.2
## - num_hrefs 1 29.03 22470 -8237.7
## - LDA_02 1 32.72 22474 -8232.8
## - data_channel_is_socmed 1 34.45 22476 -8230.5
## - kw_min_avg 1 38.41 22480 -8225.3
## - data_channel_is_entertainment 1 58.20 22500 -8199.1
## - kw_min_min 1 78.39 22520 -8172.5
## - weekday_is_friday 1 103.29 22545 -8139.6
## - weekday_is_monday 1 115.73 22557 -8123.2
## - weekday_is_wednesday 1 197.42 22639 -8015.7
## - weekday_is_tuesday 1 197.55 22639 -8015.6
## - weekday_is_thursday 1 202.59 22644 -8008.9
## - kw_max_avg 1 274.77 22716 -7914.3
## - kw_avg_avg 1 561.26 23003 -7541.7
##
## Step: AIC=-8275.26
## shares ~ n_tokens_title + n_tokens_content + n_non_stop_words +
## n_non_stop_unique_tokens + num_hrefs + num_self_hrefs + num_imgs +
## num_videos + average_token_length + num_keywords + data_channel_is_lifestyle +
## data_channel_is_entertainment + data_channel_is_bus + data_channel_is_socmed +
## data_channel_is_tech + kw_min_min + kw_max_min + kw_avg_min +
## kw_min_max + kw_avg_max + kw_min_avg + kw_max_avg + kw_avg_avg +
## self_reference_min_shares + self_reference_avg_sharess +
## weekday_is_monday + weekday_is_tuesday + weekday_is_wednesday +
## weekday_is_thursday + weekday_is_friday + LDA_00 + LDA_01 +
## LDA_02 + LDA_03 + global_subjectivity + global_sentiment_polarity +
## global_rate_positive_words + rate_positive_words + min_positive_polarity +
## avg_negative_polarity + title_subjectivity + title_sentiment_polarity +
## abs_title_subjectivity
##
## Df Sum of Sq RSS AIC
## - n_non_stop_words 1 0.76 22443 -8276.3
## <none> 22442 -8275.3
## - n_non_stop_unique_tokens 1 2.04 22444 -8274.6
## - global_rate_positive_words 1 2.28 22444 -8274.2
## - num_videos 1 2.34 22444 -8274.2
## - global_sentiment_polarity 1 2.34 22444 -8274.2
## - n_tokens_content 1 2.37 22444 -8274.1
## - n_tokens_title 1 3.59 22446 -8272.5
## - rate_positive_words 1 4.04 22446 -8271.9
## - avg_negative_polarity 1 4.22 22446 -8271.7
## - self_reference_min_shares 1 4.59 22447 -8271.2
## - LDA_03 1 4.82 22447 -8270.9
## - data_channel_is_lifestyle 1 5.92 22448 -8269.4
## - LDA_01 1 6.28 22448 -8268.9
## - min_positive_polarity 1 6.53 22449 -8268.6
## - title_subjectivity 1 7.00 22449 -8268.0
## - title_sentiment_polarity 1 8.45 22451 -8266.1
## - self_reference_avg_sharess 1 8.68 22451 -8265.8
## - num_imgs 1 9.50 22452 -8264.7
## - num_keywords 1 10.44 22453 -8263.4
## - kw_avg_max 1 10.92 22453 -8262.8
## - average_token_length 1 10.99 22453 -8262.7
## - kw_max_min 1 12.90 22455 -8260.2
## - abs_title_subjectivity 1 14.03 22456 -8258.7
## - kw_min_max 1 14.64 22457 -8257.9
## - num_self_hrefs 1 15.93 22458 -8256.2
## - kw_avg_min 1 20.25 22462 -8250.4
## - data_channel_is_bus 1 21.28 22463 -8249.1
## - data_channel_is_tech 1 22.85 22465 -8247.0
## - global_subjectivity 1 26.02 22468 -8242.8
## - LDA_00 1 28.49 22471 -8239.5
## - num_hrefs 1 29.82 22472 -8237.8
## - LDA_02 1 33.11 22475 -8233.4
## - data_channel_is_socmed 1 34.64 22477 -8231.4
## - kw_min_avg 1 38.27 22480 -8226.6
## - data_channel_is_entertainment 1 58.65 22501 -8199.7
## - kw_min_min 1 79.27 22521 -8172.4
## - weekday_is_friday 1 103.36 22545 -8140.6
## - weekday_is_monday 1 115.72 22558 -8124.3
## - weekday_is_wednesday 1 197.47 22640 -8016.8
## - weekday_is_tuesday 1 197.54 22640 -8016.7
## - weekday_is_thursday 1 202.57 22645 -8010.1
## - kw_max_avg 1 274.84 22717 -7915.4
## - kw_avg_avg 1 561.43 23004 -7542.6
##
## Step: AIC=-8276.25
## shares ~ n_tokens_title + n_tokens_content + n_non_stop_unique_tokens +
## num_hrefs + num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## kw_min_min + kw_max_min + kw_avg_min + kw_min_max + kw_avg_max +
## kw_min_avg + kw_max_avg + kw_avg_avg + self_reference_min_shares +
## self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday +
## weekday_is_wednesday + weekday_is_thursday + weekday_is_friday +
## LDA_00 + LDA_01 + LDA_02 + LDA_03 + global_subjectivity +
## global_sentiment_polarity + global_rate_positive_words +
## rate_positive_words + min_positive_polarity + avg_negative_polarity +
## title_subjectivity + title_sentiment_polarity + abs_title_subjectivity
##
## Df Sum of Sq RSS AIC
## - n_non_stop_unique_tokens 1 1.34 22444 -8276.5
## <none> 22443 -8276.3
## - global_rate_positive_words 1 2.22 22445 -8275.3
## - num_videos 1 2.38 22445 -8275.1
## - global_sentiment_polarity 1 2.92 22446 -8274.4
## - n_tokens_title 1 3.88 22447 -8273.1
## - n_tokens_content 1 4.08 22447 -8272.8
## - avg_negative_polarity 1 4.36 22447 -8272.5
## - self_reference_min_shares 1 4.61 22447 -8272.1
## - LDA_03 1 4.96 22448 -8271.7
## - rate_positive_words 1 5.31 22448 -8271.2
## - data_channel_is_lifestyle 1 5.77 22449 -8270.6
## - min_positive_polarity 1 6.14 22449 -8270.1
## - LDA_01 1 6.51 22449 -8269.6
## - title_subjectivity 1 6.90 22450 -8269.1
## - title_sentiment_polarity 1 8.51 22451 -8267.0
## - self_reference_avg_sharess 1 8.71 22452 -8266.7
## - num_imgs 1 9.94 22453 -8265.1
## - num_keywords 1 10.15 22453 -8264.8
## - kw_avg_max 1 11.15 22454 -8263.5
## - kw_max_min 1 12.81 22456 -8261.3
## - abs_title_subjectivity 1 14.06 22457 -8259.6
## - kw_min_max 1 14.65 22457 -8258.9
## - num_self_hrefs 1 15.61 22458 -8257.6
## - average_token_length 1 18.68 22462 -8253.5
## - kw_avg_min 1 20.16 22463 -8251.6
## - data_channel_is_bus 1 21.07 22464 -8250.4
## - data_channel_is_tech 1 23.40 22466 -8247.3
## - LDA_00 1 28.12 22471 -8241.0
## - num_hrefs 1 29.11 22472 -8239.7
## - global_subjectivity 1 30.48 22473 -8237.9
## - LDA_02 1 33.89 22477 -8233.4
## - data_channel_is_socmed 1 35.27 22478 -8231.6
## - kw_min_avg 1 38.07 22481 -8227.9
## - data_channel_is_entertainment 1 57.96 22501 -8201.6
## - kw_min_min 1 79.00 22522 -8173.8
## - weekday_is_friday 1 103.13 22546 -8141.9
## - weekday_is_monday 1 115.43 22558 -8125.7
## - weekday_is_tuesday 1 197.20 22640 -8018.1
## - weekday_is_wednesday 1 197.21 22640 -8018.1
## - weekday_is_thursday 1 202.28 22645 -8011.5
## - kw_max_avg 1 274.23 22717 -7917.2
## - kw_avg_avg 1 560.72 23004 -7544.6
##
## Step: AIC=-8276.48
## shares ~ n_tokens_title + n_tokens_content + num_hrefs + num_self_hrefs +
## num_imgs + num_videos + average_token_length + num_keywords +
## data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## kw_min_min + kw_max_min + kw_avg_min + kw_min_max + kw_avg_max +
## kw_min_avg + kw_max_avg + kw_avg_avg + self_reference_min_shares +
## self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday +
## weekday_is_wednesday + weekday_is_thursday + weekday_is_friday +
## LDA_00 + LDA_01 + LDA_02 + LDA_03 + global_subjectivity +
## global_sentiment_polarity + global_rate_positive_words +
## rate_positive_words + min_positive_polarity + avg_negative_polarity +
## title_subjectivity + title_sentiment_polarity + abs_title_subjectivity
##
## Df Sum of Sq RSS AIC
## <none> 22444 -8276.5
## - num_videos 1 2.31 22446 -8275.4
## - global_sentiment_polarity 1 2.49 22447 -8275.2
## - global_rate_positive_words 1 2.88 22447 -8274.7
## - n_tokens_title 1 3.79 22448 -8273.5
## - avg_negative_polarity 1 4.32 22449 -8272.8
## - self_reference_min_shares 1 4.61 22449 -8272.4
## - rate_positive_words 1 4.80 22449 -8272.1
## - LDA_03 1 5.01 22449 -8271.8
## - data_channel_is_lifestyle 1 6.04 22450 -8270.5
## - n_tokens_content 1 6.48 22451 -8269.9
## - LDA_01 1 6.77 22451 -8269.5
## - title_subjectivity 1 7.01 22451 -8269.2
## - min_positive_polarity 1 7.34 22452 -8268.7
## - self_reference_avg_sharess 1 8.65 22453 -8267.0
## - title_sentiment_polarity 1 8.65 22453 -8267.0
## - num_keywords 1 10.49 22455 -8264.6
## - kw_avg_max 1 11.28 22455 -8263.5
## - kw_max_min 1 12.89 22457 -8261.4
## - num_imgs 1 13.47 22458 -8260.6
## - abs_title_subjectivity 1 13.97 22458 -8260.0
## - kw_min_max 1 14.60 22459 -8259.1
## - num_self_hrefs 1 16.20 22460 -8257.0
## - kw_avg_min 1 20.25 22464 -8251.7
## - data_channel_is_bus 1 21.19 22465 -8250.4
## - data_channel_is_tech 1 23.29 22467 -8247.6
## - LDA_00 1 28.21 22472 -8241.1
## - global_subjectivity 1 29.24 22473 -8239.8
## - num_hrefs 1 31.66 22476 -8236.6
## - LDA_02 1 33.75 22478 -8233.8
## - average_token_length 1 34.09 22478 -8233.3
## - data_channel_is_socmed 1 35.11 22479 -8232.0
## - kw_min_avg 1 38.06 22482 -8228.1
## - data_channel_is_entertainment 1 58.45 22503 -8201.1
## - kw_min_min 1 78.48 22523 -8174.7
## - weekday_is_friday 1 103.71 22548 -8141.4
## - weekday_is_monday 1 115.78 22560 -8125.5
## - weekday_is_wednesday 1 197.50 22642 -8018.0
## - weekday_is_tuesday 1 197.57 22642 -8017.9
## - weekday_is_thursday 1 202.53 22647 -8011.4
## - kw_max_avg 1 275.11 22719 -7916.3
## - kw_avg_avg 1 562.10 23006 -7543.0
# Print the call, residual quantiles, coefficient table and overall fit
# statistics (residual standard error, R-squared, F-statistic) for fit_mlm2.
# NOTE(review): fit_mlm2 is presumably the model returned by the stepwise AIC
# search whose trace is printed above -- confirm against the step() call.
summary(fit_mlm2)
##
## Call:
## lm(formula = shares ~ n_tokens_title + n_tokens_content + num_hrefs +
## num_self_hrefs + num_imgs + num_videos + average_token_length +
## num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment +
## data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech +
## kw_min_min + kw_max_min + kw_avg_min + kw_min_max + kw_avg_max +
## kw_min_avg + kw_max_avg + kw_avg_avg + self_reference_min_shares +
## self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday +
## weekday_is_wednesday + weekday_is_thursday + weekday_is_friday +
## LDA_00 + LDA_01 + LDA_02 + LDA_03 + global_subjectivity +
## global_sentiment_polarity + global_rate_positive_words +
## rate_positive_words + min_positive_polarity + avg_negative_polarity +
## title_subjectivity + title_sentiment_polarity + abs_title_subjectivity,
## data = train.news)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.1150 -0.5461 -0.1649 0.3857 5.5981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.886e+00 6.650e-02 103.553 < 2e-16 ***
## n_tokens_title 5.593e-03 2.497e-03 2.240 0.025119 *
## n_tokens_content 3.988e-05 1.362e-05 2.929 0.003408 **
## num_hrefs 3.733e-03 5.768e-04 6.472 9.85e-11 ***
## num_self_hrefs -7.158e-03 1.547e-03 -4.629 3.70e-06 ***
## num_imgs 3.061e-03 7.251e-04 4.221 2.44e-05 ***
## num_videos 2.354e-03 1.348e-03 1.747 0.080635 .
## average_token_length -6.956e-02 1.036e-02 -6.716 1.91e-11 ***
## num_keywords 1.189e-02 3.191e-03 3.726 0.000195 ***
## data_channel_is_lifestyle1 -8.216e-02 2.907e-02 -2.826 0.004713 **
## data_channel_is_entertainment1 -1.723e-01 1.960e-02 -8.793 < 2e-16 ***
## data_channel_is_bus1 -1.462e-01 2.762e-02 -5.294 1.20e-07 ***
## data_channel_is_socmed1 1.825e-01 2.678e-02 6.815 9.60e-12 ***
## data_channel_is_tech1 1.375e-01 2.477e-02 5.550 2.87e-08 ***
## kw_min_min 9.509e-04 9.332e-05 10.189 < 2e-16 ***
## kw_max_min 1.942e-05 4.703e-06 4.129 3.65e-05 ***
## kw_avg_min -1.415e-04 2.734e-05 -5.176 2.29e-07 ***
## kw_min_max -4.463e-07 1.016e-07 -4.394 1.12e-05 ***
## kw_avg_max -2.627e-07 6.801e-08 -3.863 0.000112 ***
## kw_min_avg -4.651e-05 6.554e-06 -7.096 1.32e-12 ***
## kw_max_avg -4.114e-05 2.156e-06 -19.077 < 2e-16 ***
## kw_avg_avg 3.351e-04 1.229e-05 27.268 < 2e-16 ***
## self_reference_min_shares 1.124e-06 4.549e-07 2.471 0.013495 *
## self_reference_avg_sharess 1.275e-06 3.771e-07 3.382 0.000721 ***
## weekday_is_monday1 -2.310e-01 1.867e-02 -12.376 < 2e-16 ***
## weekday_is_tuesday1 -2.959e-01 1.830e-02 -16.166 < 2e-16 ***
## weekday_is_wednesday1 -2.956e-01 1.829e-02 -16.164 < 2e-16 ***
## weekday_is_thursday1 -3.001e-01 1.833e-02 -16.368 < 2e-16 ***
## weekday_is_friday1 -2.268e-01 1.936e-02 -11.713 < 2e-16 ***
## LDA_00 2.425e-01 3.970e-02 6.109 1.01e-09 ***
## LDA_01 -1.250e-01 4.176e-02 -2.993 0.002766 **
## LDA_02 -2.517e-01 3.767e-02 -6.682 2.40e-11 ***
## LDA_03 -9.811e-02 3.813e-02 -2.573 0.010082 *
## global_subjectivity 4.276e-01 6.876e-02 6.219 5.06e-10 ***
## global_sentiment_polarity -1.815e-01 1.001e-01 -1.814 0.069745 .
## global_rate_positive_words -8.480e-01 4.343e-01 -1.953 0.050885 .
## rate_positive_words 1.431e-01 5.678e-02 2.520 0.011742 *
## min_positive_polarity -2.575e-01 8.261e-02 -3.117 0.001829 **
## avg_negative_polarity -1.226e-01 5.129e-02 -2.390 0.016859 *
## title_subjectivity 5.628e-02 1.848e-02 3.046 0.002321 **
## title_sentiment_polarity 6.904e-02 2.041e-02 3.382 0.000720 ***
## abs_title_subjectivity 1.364e-01 3.173e-02 4.299 1.72e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8695 on 29690 degrees of freedom
## Multiple R-squared: 0.129, Adjusted R-squared: 0.1278
## F-statistic: 107.3 on 41 and 29690 DF, p-value: < 2.2e-16
# Score test.news with fit_mlm2, then print the root mean square error of the
# predictions against the observed `shares` column of test.news.
pred_mlm2 <- predict(fit_mlm2, newdata = test.news)
sqrt(mean((pred_mlm2 - test.news$shares)^2))
## [1] 0.8725425
# We thus get an optimized model with an R-squared value of approximately 0.13 and a root mean square error of approximately 0.87. This model includes only the statistical variables.