# Read the Online News Popularity CSV from GitHub into `news`, then show a
# compact column-by-column preview (type plus first few values).
news <- read.csv(
  file = "https://raw.githubusercontent.com/johnpannyc/online-news-data-621/master/OnlineNewsPopularity.csv"
)
glimpse(news)
## Observations: 39,644
## Variables: 61
## $ url <fct> http://mashable.com/2013/01/07/a...
## $ timedelta <dbl> 731, 731, 731, 731, 731, 731, 73...
## $ n_tokens_title <dbl> 12, 9, 9, 9, 13, 10, 8, 12, 11, ...
## $ n_tokens_content <dbl> 219, 255, 211, 531, 1072, 370, 9...
## $ n_unique_tokens <dbl> 0.6635945, 0.6047431, 0.5751295,...
## $ n_non_stop_words <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ n_non_stop_unique_tokens <dbl> 0.8153846, 0.7919463, 0.6638655,...
## $ num_hrefs <dbl> 4, 3, 3, 9, 19, 2, 21, 20, 2, 4,...
## $ num_self_hrefs <dbl> 2, 1, 1, 0, 19, 2, 20, 20, 0, 1,...
## $ num_imgs <dbl> 1, 1, 1, 1, 20, 0, 20, 20, 0, 1,...
## $ num_videos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
## $ average_token_length <dbl> 4.680365, 4.913725, 4.393365, 4....
## $ num_keywords <dbl> 5, 4, 6, 7, 7, 9, 10, 9, 7, 5, 8...
## $ data_channel_is_lifestyle <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ data_channel_is_entertainment <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ data_channel_is_bus <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ data_channel_is_socmed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ data_channel_is_tech <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,...
## $ data_channel_is_world <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ kw_min_min <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_max_min <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_avg_min <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_min_max <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_max_max <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_avg_max <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_min_avg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_max_avg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_avg_avg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ self_reference_min_shares <dbl> 496, 0, 918, 0, 545, 8500, 545, ...
## $ self_reference_max_shares <dbl> 496, 0, 918, 0, 16000, 8500, 160...
## $ self_reference_avg_sharess <dbl> 496.000, 0.000, 918.000, 0.000, ...
## $ weekday_is_monday <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ weekday_is_tuesday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ weekday_is_wednesday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ weekday_is_thursday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ weekday_is_friday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ weekday_is_saturday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ weekday_is_sunday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ is_weekend <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ LDA_00 <dbl> 0.50033120, 0.79975569, 0.217792...
## $ LDA_01 <dbl> 0.37827893, 0.05004668, 0.033334...
## $ LDA_02 <dbl> 0.04000468, 0.05009625, 0.033351...
## $ LDA_03 <dbl> 0.04126265, 0.05010067, 0.033333...
## $ LDA_04 <dbl> 0.04012254, 0.05000071, 0.682188...
## $ global_subjectivity <dbl> 0.5216171, 0.3412458, 0.7022222,...
## $ global_sentiment_polarity <dbl> 0.09256198, 0.14894781, 0.323333...
## $ global_rate_positive_words <dbl> 0.04566210, 0.04313725, 0.056872...
## $ global_rate_negative_words <dbl> 0.013698630, 0.015686275, 0.0094...
## $ rate_positive_words <dbl> 0.7692308, 0.7333333, 0.8571429,...
## $ rate_negative_words <dbl> 0.2307692, 0.2666667, 0.1428571,...
## $ avg_positive_polarity <dbl> 0.3786364, 0.2869146, 0.4958333,...
## $ min_positive_polarity <dbl> 0.10000000, 0.03333333, 0.100000...
## $ max_positive_polarity <dbl> 0.7000000, 0.7000000, 1.0000000,...
## $ avg_negative_polarity <dbl> -0.3500000, -0.1187500, -0.46666...
## $ min_negative_polarity <dbl> -0.6000000, -0.1250000, -0.80000...
## $ max_negative_polarity <dbl> -0.2000000, -0.1000000, -0.13333...
## $ title_subjectivity <dbl> 0.5000000, 0.0000000, 0.0000000,...
## $ title_sentiment_polarity <dbl> -0.1875000, 0.0000000, 0.0000000...
## $ abs_title_subjectivity <dbl> 0.00000000, 0.50000000, 0.500000...
## $ abs_title_sentiment_polarity <dbl> 0.1875000, 0.0000000, 0.0000000,...
## $ shares <int> 593, 711, 1500, 1200, 505, 855, ...
The data set has 61 variables and 39,644 observations.
# Per-column summary of `news`: level counts for the factor `url`, and
# min / quartiles / mean / max for every numeric column.
summary(news)
## url
## http://mashable.com/2013/01/07/amazon-instant-video-browser/ : 1
## http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/ : 1
## http://mashable.com/2013/01/07/apple-40-billion-app-downloads/: 1
## http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/ : 1
## http://mashable.com/2013/01/07/att-u-verse-apps/ : 1
## http://mashable.com/2013/01/07/beewi-smart-toys/ : 1
## (Other) :39638
## timedelta n_tokens_title n_tokens_content n_unique_tokens
## Min. : 8.0 Min. : 2.0 Min. : 0.0 Min. : 0.0000
## 1st Qu.:164.0 1st Qu.: 9.0 1st Qu.: 246.0 1st Qu.: 0.4709
## Median :339.0 Median :10.0 Median : 409.0 Median : 0.5392
## Mean :354.5 Mean :10.4 Mean : 546.5 Mean : 0.5482
## 3rd Qu.:542.0 3rd Qu.:12.0 3rd Qu.: 716.0 3rd Qu.: 0.6087
## Max. :731.0 Max. :23.0 Max. :8474.0 Max. :701.0000
##
## n_non_stop_words n_non_stop_unique_tokens num_hrefs
## Min. : 0.0000 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 1.0000 1st Qu.: 0.6257 1st Qu.: 4.00
## Median : 1.0000 Median : 0.6905 Median : 8.00
## Mean : 0.9965 Mean : 0.6892 Mean : 10.88
## 3rd Qu.: 1.0000 3rd Qu.: 0.7546 3rd Qu.: 14.00
## Max. :1042.0000 Max. :650.0000 Max. :304.00
##
## num_self_hrefs num_imgs num_videos average_token_length
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. :0.000
## 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 0.00 1st Qu.:4.478
## Median : 3.000 Median : 1.000 Median : 0.00 Median :4.664
## Mean : 3.294 Mean : 4.544 Mean : 1.25 Mean :4.548
## 3rd Qu.: 4.000 3rd Qu.: 4.000 3rd Qu.: 1.00 3rd Qu.:4.855
## Max. :116.000 Max. :128.000 Max. :91.00 Max. :8.042
##
## num_keywords data_channel_is_lifestyle data_channel_is_entertainment
## Min. : 1.000 Min. :0.00000 Min. :0.000
## 1st Qu.: 6.000 1st Qu.:0.00000 1st Qu.:0.000
## Median : 7.000 Median :0.00000 Median :0.000
## Mean : 7.224 Mean :0.05295 Mean :0.178
## 3rd Qu.: 9.000 3rd Qu.:0.00000 3rd Qu.:0.000
## Max. :10.000 Max. :1.00000 Max. :1.000
##
## data_channel_is_bus data_channel_is_socmed data_channel_is_tech
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1579 Mean :0.0586 Mean :0.1853
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## data_channel_is_world kw_min_min kw_max_min kw_avg_min
## Min. :0.0000 Min. : -1.00 Min. : 0 Min. : -1.0
## 1st Qu.:0.0000 1st Qu.: -1.00 1st Qu.: 445 1st Qu.: 141.8
## Median :0.0000 Median : -1.00 Median : 660 Median : 235.5
## Mean :0.2126 Mean : 26.11 Mean : 1154 Mean : 312.4
## 3rd Qu.:0.0000 3rd Qu.: 4.00 3rd Qu.: 1000 3rd Qu.: 357.0
## Max. :1.0000 Max. :377.00 Max. :298400 Max. :42827.9
##
## kw_min_max kw_max_max kw_avg_max kw_min_avg
## Min. : 0 Min. : 0 Min. : 0 Min. : -1
## 1st Qu.: 0 1st Qu.:843300 1st Qu.:172847 1st Qu.: 0
## Median : 1400 Median :843300 Median :244572 Median :1024
## Mean : 13612 Mean :752324 Mean :259282 Mean :1117
## 3rd Qu.: 7900 3rd Qu.:843300 3rd Qu.:330980 3rd Qu.:2057
## Max. :843300 Max. :843300 Max. :843300 Max. :3613
##
## kw_max_avg kw_avg_avg self_reference_min_shares
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 3562 1st Qu.: 2382 1st Qu.: 639
## Median : 4356 Median : 2870 Median : 1200
## Mean : 5657 Mean : 3136 Mean : 3999
## 3rd Qu.: 6020 3rd Qu.: 3600 3rd Qu.: 2600
## Max. :298400 Max. :43568 Max. :843300
##
## self_reference_max_shares self_reference_avg_sharess weekday_is_monday
## Min. : 0 Min. : 0.0 Min. :0.000
## 1st Qu.: 1100 1st Qu.: 981.2 1st Qu.:0.000
## Median : 2800 Median : 2200.0 Median :0.000
## Mean : 10329 Mean : 6401.7 Mean :0.168
## 3rd Qu.: 8000 3rd Qu.: 5200.0 3rd Qu.:0.000
## Max. :843300 Max. :843300.0 Max. :1.000
##
## weekday_is_tuesday weekday_is_wednesday weekday_is_thursday
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1864 Mean :0.1875 Mean :0.1833
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## weekday_is_friday weekday_is_saturday weekday_is_sunday is_weekend
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.1438 Mean :0.06188 Mean :0.06904 Mean :0.1309
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
##
## LDA_00 LDA_01 LDA_02 LDA_03
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.02505 1st Qu.:0.02501 1st Qu.:0.02857 1st Qu.:0.02857
## Median :0.03339 Median :0.03334 Median :0.04000 Median :0.04000
## Mean :0.18460 Mean :0.14126 Mean :0.21632 Mean :0.22377
## 3rd Qu.:0.24096 3rd Qu.:0.15083 3rd Qu.:0.33422 3rd Qu.:0.37576
## Max. :0.92699 Max. :0.92595 Max. :0.92000 Max. :0.92653
##
## LDA_04 global_subjectivity global_sentiment_polarity
## Min. :0.00000 Min. :0.0000 Min. :-0.39375
## 1st Qu.:0.02857 1st Qu.:0.3962 1st Qu.: 0.05776
## Median :0.04073 Median :0.4535 Median : 0.11912
## Mean :0.23403 Mean :0.4434 Mean : 0.11931
## 3rd Qu.:0.39999 3rd Qu.:0.5083 3rd Qu.: 0.17783
## Max. :0.92719 Max. :1.0000 Max. : 0.72784
##
## global_rate_positive_words global_rate_negative_words rate_positive_words
## Min. :0.00000 Min. :0.000000 Min. :0.0000
## 1st Qu.:0.02838 1st Qu.:0.009615 1st Qu.:0.6000
## Median :0.03902 Median :0.015337 Median :0.7105
## Mean :0.03962 Mean :0.016612 Mean :0.6822
## 3rd Qu.:0.05028 3rd Qu.:0.021739 3rd Qu.:0.8000
## Max. :0.15549 Max. :0.184932 Max. :1.0000
##
## rate_negative_words avg_positive_polarity min_positive_polarity
## Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1852 1st Qu.:0.3062 1st Qu.:0.05000
## Median :0.2800 Median :0.3588 Median :0.10000
## Mean :0.2879 Mean :0.3538 Mean :0.09545
## 3rd Qu.:0.3846 3rd Qu.:0.4114 3rd Qu.:0.10000
## Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## max_positive_polarity avg_negative_polarity min_negative_polarity
## Min. :0.0000 Min. :-1.0000 Min. :-1.0000
## 1st Qu.:0.6000 1st Qu.:-0.3284 1st Qu.:-0.7000
## Median :0.8000 Median :-0.2533 Median :-0.5000
## Mean :0.7567 Mean :-0.2595 Mean :-0.5219
## 3rd Qu.:1.0000 3rd Qu.:-0.1869 3rd Qu.:-0.3000
## Max. :1.0000 Max. : 0.0000 Max. : 0.0000
##
## max_negative_polarity title_subjectivity title_sentiment_polarity
## Min. :-1.0000 Min. :0.0000 Min. :-1.00000
## 1st Qu.:-0.1250 1st Qu.:0.0000 1st Qu.: 0.00000
## Median :-0.1000 Median :0.1500 Median : 0.00000
## Mean :-0.1075 Mean :0.2824 Mean : 0.07143
## 3rd Qu.:-0.0500 3rd Qu.:0.5000 3rd Qu.: 0.15000
## Max. : 0.0000 Max. :1.0000 Max. : 1.00000
##
## abs_title_subjectivity abs_title_sentiment_polarity shares
## Min. :0.0000 Min. :0.0000 Min. : 1
## 1st Qu.:0.1667 1st Qu.:0.0000 1st Qu.: 946
## Median :0.5000 Median :0.0000 Median : 1400
## Mean :0.3418 Mean :0.1561 Mean : 3395
## 3rd Qu.:0.5000 3rd Qu.:0.2500 3rd Qu.: 2800
## Max. :0.5000 Max. :1.0000 Max. :843300
##
# List the column names of `news` in positional order.
names(news)
## [1] "url" "timedelta"
## [3] "n_tokens_title" "n_tokens_content"
## [5] "n_unique_tokens" "n_non_stop_words"
## [7] "n_non_stop_unique_tokens" "num_hrefs"
## [9] "num_self_hrefs" "num_imgs"
## [11] "num_videos" "average_token_length"
## [13] "num_keywords" "data_channel_is_lifestyle"
## [15] "data_channel_is_entertainment" "data_channel_is_bus"
## [17] "data_channel_is_socmed" "data_channel_is_tech"
## [19] "data_channel_is_world" "kw_min_min"
## [21] "kw_max_min" "kw_avg_min"
## [23] "kw_min_max" "kw_max_max"
## [25] "kw_avg_max" "kw_min_avg"
## [27] "kw_max_avg" "kw_avg_avg"
## [29] "self_reference_min_shares" "self_reference_max_shares"
## [31] "self_reference_avg_sharess" "weekday_is_monday"
## [33] "weekday_is_tuesday" "weekday_is_wednesday"
## [35] "weekday_is_thursday" "weekday_is_friday"
## [37] "weekday_is_saturday" "weekday_is_sunday"
## [39] "is_weekend" "LDA_00"
## [41] "LDA_01" "LDA_02"
## [43] "LDA_03" "LDA_04"
## [45] "global_subjectivity" "global_sentiment_polarity"
## [47] "global_rate_positive_words" "global_rate_negative_words"
## [49] "rate_positive_words" "rate_negative_words"
## [51] "avg_positive_polarity" "min_positive_polarity"
## [53] "max_positive_polarity" "avg_negative_polarity"
## [55] "min_negative_polarity" "max_negative_polarity"
## [57] "title_subjectivity" "title_sentiment_polarity"
## [59] "abs_title_subjectivity" "abs_title_sentiment_polarity"
## [61] "shares"
Data distributions
# Read the variable dictionary (one row per column of `news`, giving the
# variable name and a short explanation) and display it.
var_names <- read.csv(
  file = "https://raw.githubusercontent.com/johnpannyc/online-news-data-621/master/variable%20dictionary.csv"
)
var_names
## Name.of.Variable
## 1 url
## 2 timedelta
## 3 n_tokens_title
## 4 n_tokens_content
## 5 n_unique_tokens
## 6 n_non_stop_words
## 7 n_non_stop_unique_tokens
## 8 num_hrefs
## 9 num_self_hrefs
## 10 num_imgs
## 11 num_videos
## 12 average_token_length
## 13 num_keywords
## 14 data_channel_is_lifestyle
## 15 data_channel_is_entertainment
## 16 data_channel_is_bus
## 17 data_channel_is_socmed
## 18 data_channel_is_tech
## 19 data_channel_is_world
## 20 kw_min_min
## 21 kw_max_min
## 22 kw_avg_min
## 23 kw_min_max
## 24 kw_max_max
## 25 kw_avg_max
## 26 kw_min_avg
## 27 kw_max_avg
## 28 kw_avg_avg
## 29 self_reference_min_shares
## 30 self_reference_max_shares
## 31 self_reference_avg_sharess
## 32 weekday_is_monday
## 33 weekday_is_tuesday
## 34 weekday_is_Wednesday
## 35 weekday_is_thursday
## 36 weekday_is_friday
## 37 weekday_is_saturday
## 38 weekday_is_sunday
## 39 is_weekend
## 40 LDA_00
## 41 LDA_01
## 42 LDA_02
## 43 LDA_03
## 44 LDA_04
## 45 global_subjectivity
## 46 global_sentiment_polarity
## 47 global_rate_positive_words
## 48 global_rate_negative_words
## 49 rate_positive_words
## 50 rate_negative_words
## 51 avg_positive_polarity
## 52 min_positive_polarity
## 53 max_positive_polarity
## 54 avg_negative_polarity
## 55 min_negative_polarity
## 56 max_negative_polarity
## 57 title_subjectivity
## 58 title_sentiment_polarity
## 59 abs_title_subjectivity
## 60 abs_title_sentiment_polarity
## 61 shares
## Explanations
## 1 URL of the article
## 2 Days between the article publication and the dataset acquisition
## 3 Number of words in the title
## 4 Number of words in the title
## 5 Rate of unique words in the content
## 6 Rate of non-stop words in the content
## 7 Rate of unique non-stop words in the content
## 8 Number of links
## 9 Number of links to other articles published by Mashable
## 10 Number of images
## 11 Number of videos
## 12 Average length of the words in the
## 13 Number of keywords in the metadata
## 14 Is data channel ¡®Lifestyle¡¯?
## 15 Is data channel ¡®Entertainment¡¯?
## 16 Is data channel ¡®Business¡¯?
## 17 Is data channel ¡®Social Media¡¯?
## 18 Is data channel ¡®Tech¡¯?
## 19 Is data channel ¡®World¡¯?
## 20 Worst keyword (min. shares)
## 21 Worst keyword (max. shares)
## 22 Worst keyword (avg. shares)
## 23 Best keyword (min. shares)
## 24 Best keyword (max. shares)
## 25 Best keyword (avg. shares)
## 26 Avg. keyword (min. shares)
## 27 Avg. keyword (max. shares)
## 28 Avg. keyword (avg. shares)
## 29 Min. shares of referenced articles in Mashable
## 30 Max. shares of referenced articles in Mashable
## 31 Avg. shares of referenced articles in Mashable
## 32 Was the article published on a Monday?
## 33 Was the article published on a Tuesday?
## 34 Was the article published on a Wednesday?
## 35 Was the article published on a Thursday?
## 36 Was the article published on a Friday?
## 37 Was the article published on a Saturday?
## 38 Was the article published on a Sunday?
## 39 Was the article published on the weekend?
## 40 Closeness to LDA topic 0
## 41 Closeness to LDA topic 1
## 42 Closeness to LDA topic 2
## 43 Closeness to LDA topic 3
## 44 Closeness to LDA topic 4
## 45 Text subjectivity
## 46 Text sentiment polarity
## 47 Rate of positive words in the content
## 48 Rate of negative words in the content
## 49 Rate of positive words among non-neutral tokens
## 50 Rate of negative words among non-neutral tokens
## 51 Avg. polarity of positive words
## 52 Min. polarity of positive words
## 53 Max. polarity of positive words
## 54 Avg. polarity of negative words
## 55 Min. polarity of negative words
## 56 Max. polarity of negative words
## 57 Title subjectivity
## 58 Title polarity
## 59 Absolute subjectivity level
## 60 Absolute polarity level
## 61 Number of shares (target)
Because the data set is large, we split it into a training set (10% of the rows) and a test set (the remaining 90%).
# Build a binary label, drop one keyword column, and split the rows of `news`
# into a training set and a test set.

# Fix the RNG seed so the random split below is reproducible.
set.seed(123)

# Binary label: 1 when avg_positive_polarity exceeds 0.5, otherwise 0.
# NOTE(review): the variable dictionary marks `shares` as the target, yet this
# "popular" flag is derived from sentiment polarity -- confirm this is intended.
news$popular <- ifelse(news$avg_positive_polarity > 0.5, 1, 0)

# Drop kw_min_max by name rather than by position (it is column 23 of the
# raw file). A positional drop removes the wrong column if the layout changes
# or if this chunk is run twice; assigning NULL by name is a no-op once the
# column is already gone.
news[["kw_min_max"]] <- NULL

# sample.split() yields a logical vector flagging the rows selected at the
# given SplitRatio: the flagged 10% become the training set, the remaining
# rows become the test set.
split <- sample.split(news$popular, SplitRatio = 0.1)
training_set <- news[split, ]
test_set <- news[!split, ]

# Inspect the structure of the training set.
str(training_set)
## 'data.frame': 3965 obs. of 61 variables:
## $ url : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 6 20 38 55 66 79 104 119 134 150 ...
## $ timedelta : num 731 731 731 731 731 731 729 729 729 729 ...
## $ n_tokens_title : num 10 8 8 9 8 10 6 9 9 10 ...
## $ n_tokens_content : num 370 1207 257 1115 403 ...
## $ n_unique_tokens : num 0.56 0.411 0.568 0.424 0.516 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.698 0.549 0.671 0.565 0.618 ...
## $ num_hrefs : num 2 24 9 21 3 15 36 10 1 11 ...
## $ num_self_hrefs : num 2 24 7 21 3 11 30 5 1 2 ...
## $ num_imgs : num 0 42 0 20 1 1 19 1 0 1 ...
## $ num_videos : num 0 0 1 0 0 0 0 0 0 0 ...
## $ average_token_length : num 4.36 4.72 4.64 4.63 4.08 ...
## $ num_keywords : num 9 8 9 8 8 6 10 9 10 4 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 0 1 0 0 ...
## $ data_channel_is_entertainment: num 0 0 0 0 0 0 0 0 1 0 ...
## $ data_channel_is_bus : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 1 0 0 1 0 0 0 0 ...
## $ data_channel_is_tech : num 1 1 0 1 1 0 1 0 0 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 217 217 217 217 ...
## $ kw_max_min : num 0 0 0 0 0 0 1200 2600 819 598 ...
## $ kw_avg_min : num 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 17100 17100 17100 17100 ...
## $ kw_avg_max : num 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 ...
## $ self_reference_min_shares : num 8500 545 1300 545 545 757 545 2000 0 2800 ...
## $ self_reference_max_shares : num 8500 16000 2500 16000 545 5400 16000 4900 0 2800 ...
## $ self_reference_avg_sharess : num 8500 2830 1775 3429 545 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 0 0 0 0 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 1 1 1 1 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.0222 0.025 0.4392 0.025 0.025 ...
## $ LDA_01 : num 0.3067 0.0252 0.0225 0.327 0.333 ...
## $ LDA_02 : num 0.0222 0.025 0.0224 0.025 0.025 ...
## $ LDA_03 : num 0.0222 0.025 0.0233 0.025 0.025 ...
## $ LDA_04 : num 0.627 0.9 0.493 0.598 0.592 ...
## $ global_subjectivity : num 0.437 0.539 0.4 0.507 0.518 ...
## $ global_sentiment_polarity : num 0.07118 0.28826 0.00741 0.27977 0.0694 ...
## $ global_rate_positive_words : num 0.0297 0.0696 0.0311 0.0717 0.0496 ...
## $ global_rate_negative_words : num 0.027 0.0116 0.0272 0.0135 0.0397 ...
## $ rate_positive_words : num 0.524 0.857 0.533 0.842 0.556 ...
## $ rate_negative_words : num 0.476 0.143 0.467 0.158 0.444 ...
## $ avg_positive_polarity : num 0.351 0.427 0.36 0.417 0.28 ...
## $ min_positive_polarity : num 0.1364 0.1 0.0333 0.1 0.05 ...
## $ max_positive_polarity : num 0.6 1 0.6 1 0.6 1 1 0.8 0.5 1 ...
## $ avg_negative_polarity : num -0.195 -0.227 -0.393 -0.212 -0.172 ...
## $ min_negative_polarity : num -0.4 -0.5 -0.5 -0.5 -0.4 -0.8 -0.5 -0.5 -0.4 -0.5 ...
## $ max_negative_polarity : num -0.1 -0.05 -0.125 -0.05 -0.1 ...
## $ title_subjectivity : num 0.643 0.5 0.667 0.333 0.75 ...
## $ title_sentiment_polarity : num 0.214 0 -0.5 0.25 -0.125 ...
## $ abs_title_subjectivity : num 0.143 0 0.167 0.167 0.25 ...
## $ abs_title_sentiment_polarity : num 0.214 0 0.5 0.25 0.125 ...
## $ shares : int 855 17100 2600 2400 3200 851 302 2300 454 373 ...
## $ popular : num 0 0 0 0 0 0 0 0 0 0 ...