# Load the Online News Popularity data directly from GitHub, then preview
# every column's type and first few values.
news <- read.csv(
  file = "https://raw.githubusercontent.com/johnpannyc/online-news-data-621/master/OnlineNewsPopularity.csv"
)
glimpse(news)
## Observations: 39,644
## Variables: 61
## $ url                           <fct> http://mashable.com/2013/01/07/a...
## $ timedelta                     <dbl> 731, 731, 731, 731, 731, 731, 73...
## $ n_tokens_title                <dbl> 12, 9, 9, 9, 13, 10, 8, 12, 11, ...
## $ n_tokens_content              <dbl> 219, 255, 211, 531, 1072, 370, 9...
## $ n_unique_tokens               <dbl> 0.6635945, 0.6047431, 0.5751295,...
## $ n_non_stop_words              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ n_non_stop_unique_tokens      <dbl> 0.8153846, 0.7919463, 0.6638655,...
## $ num_hrefs                     <dbl> 4, 3, 3, 9, 19, 2, 21, 20, 2, 4,...
## $ num_self_hrefs                <dbl> 2, 1, 1, 0, 19, 2, 20, 20, 0, 1,...
## $ num_imgs                      <dbl> 1, 1, 1, 1, 20, 0, 20, 20, 0, 1,...
## $ num_videos                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
## $ average_token_length          <dbl> 4.680365, 4.913725, 4.393365, 4....
## $ num_keywords                  <dbl> 5, 4, 6, 7, 7, 9, 10, 9, 7, 5, 8...
## $ data_channel_is_lifestyle     <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ data_channel_is_entertainment <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ data_channel_is_bus           <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ data_channel_is_socmed        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ data_channel_is_tech          <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,...
## $ data_channel_is_world         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ kw_min_min                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_max_min                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_avg_min                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_min_max                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_max_max                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_avg_max                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_min_avg                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_max_avg                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ kw_avg_avg                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ self_reference_min_shares     <dbl> 496, 0, 918, 0, 545, 8500, 545, ...
## $ self_reference_max_shares     <dbl> 496, 0, 918, 0, 16000, 8500, 160...
## $ self_reference_avg_sharess    <dbl> 496.000, 0.000, 918.000, 0.000, ...
## $ weekday_is_monday             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ weekday_is_tuesday            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ weekday_is_wednesday          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ weekday_is_thursday           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ weekday_is_friday             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ weekday_is_saturday           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ weekday_is_sunday             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ is_weekend                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ LDA_00                        <dbl> 0.50033120, 0.79975569, 0.217792...
## $ LDA_01                        <dbl> 0.37827893, 0.05004668, 0.033334...
## $ LDA_02                        <dbl> 0.04000468, 0.05009625, 0.033351...
## $ LDA_03                        <dbl> 0.04126265, 0.05010067, 0.033333...
## $ LDA_04                        <dbl> 0.04012254, 0.05000071, 0.682188...
## $ global_subjectivity           <dbl> 0.5216171, 0.3412458, 0.7022222,...
## $ global_sentiment_polarity     <dbl> 0.09256198, 0.14894781, 0.323333...
## $ global_rate_positive_words    <dbl> 0.04566210, 0.04313725, 0.056872...
## $ global_rate_negative_words    <dbl> 0.013698630, 0.015686275, 0.0094...
## $ rate_positive_words           <dbl> 0.7692308, 0.7333333, 0.8571429,...
## $ rate_negative_words           <dbl> 0.2307692, 0.2666667, 0.1428571,...
## $ avg_positive_polarity         <dbl> 0.3786364, 0.2869146, 0.4958333,...
## $ min_positive_polarity         <dbl> 0.10000000, 0.03333333, 0.100000...
## $ max_positive_polarity         <dbl> 0.7000000, 0.7000000, 1.0000000,...
## $ avg_negative_polarity         <dbl> -0.3500000, -0.1187500, -0.46666...
## $ min_negative_polarity         <dbl> -0.6000000, -0.1250000, -0.80000...
## $ max_negative_polarity         <dbl> -0.2000000, -0.1000000, -0.13333...
## $ title_subjectivity            <dbl> 0.5000000, 0.0000000, 0.0000000,...
## $ title_sentiment_polarity      <dbl> -0.1875000, 0.0000000, 0.0000000...
## $ abs_title_subjectivity        <dbl> 0.00000000, 0.50000000, 0.500000...
## $ abs_title_sentiment_polarity  <dbl> 0.1875000, 0.0000000, 0.0000000,...
## $ shares                        <int> 593, 711, 1500, 1200, 505, 855, ...

The data set has 61 variables and 39,644 observations.

# Per-column summary: quartiles and mean for numeric columns, level counts
# for the factor column.
summary(object = news)
##                                                              url       
##  http://mashable.com/2013/01/07/amazon-instant-video-browser/  :    1  
##  http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/   :    1  
##  http://mashable.com/2013/01/07/apple-40-billion-app-downloads/:    1  
##  http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/      :    1  
##  http://mashable.com/2013/01/07/att-u-verse-apps/              :    1  
##  http://mashable.com/2013/01/07/beewi-smart-toys/              :    1  
##  (Other)                                                       :39638  
##    timedelta     n_tokens_title n_tokens_content n_unique_tokens   
##  Min.   :  8.0   Min.   : 2.0   Min.   :   0.0   Min.   :  0.0000  
##  1st Qu.:164.0   1st Qu.: 9.0   1st Qu.: 246.0   1st Qu.:  0.4709  
##  Median :339.0   Median :10.0   Median : 409.0   Median :  0.5392  
##  Mean   :354.5   Mean   :10.4   Mean   : 546.5   Mean   :  0.5482  
##  3rd Qu.:542.0   3rd Qu.:12.0   3rd Qu.: 716.0   3rd Qu.:  0.6087  
##  Max.   :731.0   Max.   :23.0   Max.   :8474.0   Max.   :701.0000  
##                                                                    
##  n_non_stop_words    n_non_stop_unique_tokens   num_hrefs     
##  Min.   :   0.0000   Min.   :  0.0000         Min.   :  0.00  
##  1st Qu.:   1.0000   1st Qu.:  0.6257         1st Qu.:  4.00  
##  Median :   1.0000   Median :  0.6905         Median :  8.00  
##  Mean   :   0.9965   Mean   :  0.6892         Mean   : 10.88  
##  3rd Qu.:   1.0000   3rd Qu.:  0.7546         3rd Qu.: 14.00  
##  Max.   :1042.0000   Max.   :650.0000         Max.   :304.00  
##                                                               
##  num_self_hrefs       num_imgs         num_videos    average_token_length
##  Min.   :  0.000   Min.   :  0.000   Min.   : 0.00   Min.   :0.000       
##  1st Qu.:  1.000   1st Qu.:  1.000   1st Qu.: 0.00   1st Qu.:4.478       
##  Median :  3.000   Median :  1.000   Median : 0.00   Median :4.664       
##  Mean   :  3.294   Mean   :  4.544   Mean   : 1.25   Mean   :4.548       
##  3rd Qu.:  4.000   3rd Qu.:  4.000   3rd Qu.: 1.00   3rd Qu.:4.855       
##  Max.   :116.000   Max.   :128.000   Max.   :91.00   Max.   :8.042       
##                                                                          
##   num_keywords    data_channel_is_lifestyle data_channel_is_entertainment
##  Min.   : 1.000   Min.   :0.00000           Min.   :0.000                
##  1st Qu.: 6.000   1st Qu.:0.00000           1st Qu.:0.000                
##  Median : 7.000   Median :0.00000           Median :0.000                
##  Mean   : 7.224   Mean   :0.05295           Mean   :0.178                
##  3rd Qu.: 9.000   3rd Qu.:0.00000           3rd Qu.:0.000                
##  Max.   :10.000   Max.   :1.00000           Max.   :1.000                
##                                                                          
##  data_channel_is_bus data_channel_is_socmed data_channel_is_tech
##  Min.   :0.0000      Min.   :0.0000         Min.   :0.0000      
##  1st Qu.:0.0000      1st Qu.:0.0000         1st Qu.:0.0000      
##  Median :0.0000      Median :0.0000         Median :0.0000      
##  Mean   :0.1579      Mean   :0.0586         Mean   :0.1853      
##  3rd Qu.:0.0000      3rd Qu.:0.0000         3rd Qu.:0.0000      
##  Max.   :1.0000      Max.   :1.0000         Max.   :1.0000      
##                                                                 
##  data_channel_is_world   kw_min_min       kw_max_min       kw_avg_min     
##  Min.   :0.0000        Min.   : -1.00   Min.   :     0   Min.   :   -1.0  
##  1st Qu.:0.0000        1st Qu.: -1.00   1st Qu.:   445   1st Qu.:  141.8  
##  Median :0.0000        Median : -1.00   Median :   660   Median :  235.5  
##  Mean   :0.2126        Mean   : 26.11   Mean   :  1154   Mean   :  312.4  
##  3rd Qu.:0.0000        3rd Qu.:  4.00   3rd Qu.:  1000   3rd Qu.:  357.0  
##  Max.   :1.0000        Max.   :377.00   Max.   :298400   Max.   :42827.9  
##                                                                           
##    kw_min_max       kw_max_max       kw_avg_max       kw_min_avg  
##  Min.   :     0   Min.   :     0   Min.   :     0   Min.   :  -1  
##  1st Qu.:     0   1st Qu.:843300   1st Qu.:172847   1st Qu.:   0  
##  Median :  1400   Median :843300   Median :244572   Median :1024  
##  Mean   : 13612   Mean   :752324   Mean   :259282   Mean   :1117  
##  3rd Qu.:  7900   3rd Qu.:843300   3rd Qu.:330980   3rd Qu.:2057  
##  Max.   :843300   Max.   :843300   Max.   :843300   Max.   :3613  
##                                                                   
##    kw_max_avg       kw_avg_avg    self_reference_min_shares
##  Min.   :     0   Min.   :    0   Min.   :     0           
##  1st Qu.:  3562   1st Qu.: 2382   1st Qu.:   639           
##  Median :  4356   Median : 2870   Median :  1200           
##  Mean   :  5657   Mean   : 3136   Mean   :  3999           
##  3rd Qu.:  6020   3rd Qu.: 3600   3rd Qu.:  2600           
##  Max.   :298400   Max.   :43568   Max.   :843300           
##                                                            
##  self_reference_max_shares self_reference_avg_sharess weekday_is_monday
##  Min.   :     0            Min.   :     0.0           Min.   :0.000    
##  1st Qu.:  1100            1st Qu.:   981.2           1st Qu.:0.000    
##  Median :  2800            Median :  2200.0           Median :0.000    
##  Mean   : 10329            Mean   :  6401.7           Mean   :0.168    
##  3rd Qu.:  8000            3rd Qu.:  5200.0           3rd Qu.:0.000    
##  Max.   :843300            Max.   :843300.0           Max.   :1.000    
##                                                                        
##  weekday_is_tuesday weekday_is_wednesday weekday_is_thursday
##  Min.   :0.0000     Min.   :0.0000       Min.   :0.0000     
##  1st Qu.:0.0000     1st Qu.:0.0000       1st Qu.:0.0000     
##  Median :0.0000     Median :0.0000       Median :0.0000     
##  Mean   :0.1864     Mean   :0.1875       Mean   :0.1833     
##  3rd Qu.:0.0000     3rd Qu.:0.0000       3rd Qu.:0.0000     
##  Max.   :1.0000     Max.   :1.0000       Max.   :1.0000     
##                                                             
##  weekday_is_friday weekday_is_saturday weekday_is_sunday   is_weekend    
##  Min.   :0.0000    Min.   :0.00000     Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000    1st Qu.:0.00000     1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.0000    Median :0.00000     Median :0.00000   Median :0.0000  
##  Mean   :0.1438    Mean   :0.06188     Mean   :0.06904   Mean   :0.1309  
##  3rd Qu.:0.0000    3rd Qu.:0.00000     3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :1.0000    Max.   :1.00000     Max.   :1.00000   Max.   :1.0000  
##                                                                          
##      LDA_00            LDA_01            LDA_02            LDA_03       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.02505   1st Qu.:0.02501   1st Qu.:0.02857   1st Qu.:0.02857  
##  Median :0.03339   Median :0.03334   Median :0.04000   Median :0.04000  
##  Mean   :0.18460   Mean   :0.14126   Mean   :0.21632   Mean   :0.22377  
##  3rd Qu.:0.24096   3rd Qu.:0.15083   3rd Qu.:0.33422   3rd Qu.:0.37576  
##  Max.   :0.92699   Max.   :0.92595   Max.   :0.92000   Max.   :0.92653  
##                                                                         
##      LDA_04        global_subjectivity global_sentiment_polarity
##  Min.   :0.00000   Min.   :0.0000      Min.   :-0.39375         
##  1st Qu.:0.02857   1st Qu.:0.3962      1st Qu.: 0.05776         
##  Median :0.04073   Median :0.4535      Median : 0.11912         
##  Mean   :0.23403   Mean   :0.4434      Mean   : 0.11931         
##  3rd Qu.:0.39999   3rd Qu.:0.5083      3rd Qu.: 0.17783         
##  Max.   :0.92719   Max.   :1.0000      Max.   : 0.72784         
##                                                                 
##  global_rate_positive_words global_rate_negative_words rate_positive_words
##  Min.   :0.00000            Min.   :0.000000           Min.   :0.0000     
##  1st Qu.:0.02838            1st Qu.:0.009615           1st Qu.:0.6000     
##  Median :0.03902            Median :0.015337           Median :0.7105     
##  Mean   :0.03962            Mean   :0.016612           Mean   :0.6822     
##  3rd Qu.:0.05028            3rd Qu.:0.021739           3rd Qu.:0.8000     
##  Max.   :0.15549            Max.   :0.184932           Max.   :1.0000     
##                                                                           
##  rate_negative_words avg_positive_polarity min_positive_polarity
##  Min.   :0.0000      Min.   :0.0000        Min.   :0.00000      
##  1st Qu.:0.1852      1st Qu.:0.3062        1st Qu.:0.05000      
##  Median :0.2800      Median :0.3588        Median :0.10000      
##  Mean   :0.2879      Mean   :0.3538        Mean   :0.09545      
##  3rd Qu.:0.3846      3rd Qu.:0.4114        3rd Qu.:0.10000      
##  Max.   :1.0000      Max.   :1.0000        Max.   :1.00000      
##                                                                 
##  max_positive_polarity avg_negative_polarity min_negative_polarity
##  Min.   :0.0000        Min.   :-1.0000       Min.   :-1.0000      
##  1st Qu.:0.6000        1st Qu.:-0.3284       1st Qu.:-0.7000      
##  Median :0.8000        Median :-0.2533       Median :-0.5000      
##  Mean   :0.7567        Mean   :-0.2595       Mean   :-0.5219      
##  3rd Qu.:1.0000        3rd Qu.:-0.1869       3rd Qu.:-0.3000      
##  Max.   :1.0000        Max.   : 0.0000       Max.   : 0.0000      
##                                                                   
##  max_negative_polarity title_subjectivity title_sentiment_polarity
##  Min.   :-1.0000       Min.   :0.0000     Min.   :-1.00000        
##  1st Qu.:-0.1250       1st Qu.:0.0000     1st Qu.: 0.00000        
##  Median :-0.1000       Median :0.1500     Median : 0.00000        
##  Mean   :-0.1075       Mean   :0.2824     Mean   : 0.07143        
##  3rd Qu.:-0.0500       3rd Qu.:0.5000     3rd Qu.: 0.15000        
##  Max.   : 0.0000       Max.   :1.0000     Max.   : 1.00000        
##                                                                   
##  abs_title_subjectivity abs_title_sentiment_polarity     shares      
##  Min.   :0.0000         Min.   :0.0000               Min.   :     1  
##  1st Qu.:0.1667         1st Qu.:0.0000               1st Qu.:   946  
##  Median :0.5000         Median :0.0000               Median :  1400  
##  Mean   :0.3418         Mean   :0.1561               Mean   :  3395  
##  3rd Qu.:0.5000         3rd Qu.:0.2500               3rd Qu.:  2800  
##  Max.   :0.5000         Max.   :1.0000               Max.   :843300  
## 
# List the column names of the data frame in positional order.
colnames(news)
##  [1] "url"                           "timedelta"                    
##  [3] "n_tokens_title"                "n_tokens_content"             
##  [5] "n_unique_tokens"               "n_non_stop_words"             
##  [7] "n_non_stop_unique_tokens"      "num_hrefs"                    
##  [9] "num_self_hrefs"                "num_imgs"                     
## [11] "num_videos"                    "average_token_length"         
## [13] "num_keywords"                  "data_channel_is_lifestyle"    
## [15] "data_channel_is_entertainment" "data_channel_is_bus"          
## [17] "data_channel_is_socmed"        "data_channel_is_tech"         
## [19] "data_channel_is_world"         "kw_min_min"                   
## [21] "kw_max_min"                    "kw_avg_min"                   
## [23] "kw_min_max"                    "kw_max_max"                   
## [25] "kw_avg_max"                    "kw_min_avg"                   
## [27] "kw_max_avg"                    "kw_avg_avg"                   
## [29] "self_reference_min_shares"     "self_reference_max_shares"    
## [31] "self_reference_avg_sharess"    "weekday_is_monday"            
## [33] "weekday_is_tuesday"            "weekday_is_wednesday"         
## [35] "weekday_is_thursday"           "weekday_is_friday"            
## [37] "weekday_is_saturday"           "weekday_is_sunday"            
## [39] "is_weekend"                    "LDA_00"                       
## [41] "LDA_01"                        "LDA_02"                       
## [43] "LDA_03"                        "LDA_04"                       
## [45] "global_subjectivity"           "global_sentiment_polarity"    
## [47] "global_rate_positive_words"    "global_rate_negative_words"   
## [49] "rate_positive_words"           "rate_negative_words"          
## [51] "avg_positive_polarity"         "min_positive_polarity"        
## [53] "max_positive_polarity"         "avg_negative_polarity"        
## [55] "min_negative_polarity"         "max_negative_polarity"        
## [57] "title_subjectivity"            "title_sentiment_polarity"     
## [59] "abs_title_subjectivity"        "abs_title_sentiment_polarity" 
## [61] "shares"

Data distributions

# Fetch the variable dictionary (one row per column: its name and a short
# explanation) from GitHub and print it in full.
var_names <- read.csv(
  file = "https://raw.githubusercontent.com/johnpannyc/online-news-data-621/master/variable%20dictionary.csv"
)
var_names
##                 Name.of.Variable
## 1                            url
## 2                      timedelta
## 3                 n_tokens_title
## 4               n_tokens_content
## 5                n_unique_tokens
## 6               n_non_stop_words
## 7       n_non_stop_unique_tokens
## 8                      num_hrefs
## 9                 num_self_hrefs
## 10                      num_imgs
## 11                    num_videos
## 12          average_token_length
## 13                  num_keywords
## 14     data_channel_is_lifestyle
## 15 data_channel_is_entertainment
## 16           data_channel_is_bus
## 17        data_channel_is_socmed
## 18          data_channel_is_tech
## 19         data_channel_is_world
## 20                    kw_min_min
## 21                    kw_max_min
## 22                    kw_avg_min
## 23                    kw_min_max
## 24                    kw_max_max
## 25                    kw_avg_max
## 26                    kw_min_avg
## 27                    kw_max_avg
## 28                    kw_avg_avg
## 29     self_reference_min_shares
## 30     self_reference_max_shares
## 31    self_reference_avg_sharess
## 32             weekday_is_monday
## 33            weekday_is_tuesday
## 34          weekday_is_Wednesday
## 35           weekday_is_thursday
## 36             weekday_is_friday
## 37           weekday_is_saturday
## 38             weekday_is_sunday
## 39                    is_weekend
## 40                        LDA_00
## 41                        LDA_01
## 42                        LDA_02
## 43                        LDA_03
## 44                        LDA_04
## 45           global_subjectivity
## 46     global_sentiment_polarity
## 47    global_rate_positive_words
## 48    global_rate_negative_words
## 49           rate_positive_words
## 50           rate_negative_words
## 51         avg_positive_polarity
## 52         min_positive_polarity
## 53         max_positive_polarity
## 54         avg_negative_polarity
## 55         min_negative_polarity
## 56         max_negative_polarity
## 57            title_subjectivity
## 58      title_sentiment_polarity
## 59        abs_title_subjectivity
## 60  abs_title_sentiment_polarity
## 61                        shares
##                                                        Explanations
## 1                                                URL of the article
## 2  Days between the article publication and the dataset acquisition
## 3                                      Number of words in the title
## 4                                      Number of words in the title
## 5                               Rate of unique words in the content
## 6                             Rate of non-stop words in the content
## 7                      Rate of unique non-stop words in the content
## 8                                                   Number of links
## 9           Number of links to other articles published by Mashable
## 10                                                 Number of images
## 11                                                 Number of videos
## 12                               Average length of the words in the
## 13                               Number of keywords in the metadata
## 14                                   Is data channel ¡®Lifestyle¡¯?
## 15                               Is data channel ¡®Entertainment¡¯?
## 16                                    Is data channel ¡®Business¡¯?
## 17                                Is data channel ¡®Social Media¡¯?
## 18                                        Is data channel ¡®Tech¡¯?
## 19                                       Is data channel ¡®World¡¯?
## 20                                      Worst keyword (min. shares)
## 21                                      Worst keyword (max. shares)
## 22                                      Worst keyword (avg. shares)
## 23                                       Best keyword (min. shares)
## 24                                       Best keyword (max. shares)
## 25                                       Best keyword (avg. shares)
## 26                                       Avg. keyword (min. shares)
## 27                                       Avg. keyword (max. shares)
## 28                                       Avg. keyword (avg. shares)
## 29                   Min. shares of referenced articles in Mashable
## 30                   Max. shares of referenced articles in Mashable
## 31                   Avg. shares of referenced articles in Mashable
## 32                           Was the article published on a Monday?
## 33                          Was the article published on a Tuesday?
## 34                        Was the article published on a Wednesday?
## 35                         Was the article published on a Thursday?
## 36                           Was the article published on a Friday?
## 37                         Was the article published on a Saturday?
## 38                           Was the article published on a Sunday?
## 39                        Was the article published on the weekend?
## 40                                         Closeness to LDA topic 0
## 41                                         Closeness to LDA topic 1
## 42                                         Closeness to LDA topic 2
## 43                                         Closeness to LDA topic 3
## 44                                         Closeness to LDA topic 4
## 45                                                Text subjectivity
## 46                                          Text sentiment polarity
## 47                            Rate of positive words in the content
## 48                            Rate of negative words in the content
## 49                  Rate of positive words among non-neutral tokens
## 50                  Rate of negative words among non-neutral tokens
## 51                                  Avg. polarity of positive words
## 52                                  Min. polarity of positive words
## 53                                  Max. polarity of positive words
## 54                                  Avg. polarity of negative words
## 55                                  Min. polarity of negative words
## 56                                  Max. polarity of negative words
## 57                                               Title subjectivity
## 58                                                   Title polarity
## 59                                      Absolute subjectivity level
## 60                                          Absolute polarity level
## 61                                        Number of shares (target)

Because the data set is large, we split it into a training set and a test set.

# Prepare the modelling data: add a binary label, drop one keyword column,
# and split the rows into a training set and a test set.

# Fixed seed so the random split below is reproducible.
set.seed(123)

# Binary label: 1 when avg_positive_polarity is above 0.5, otherwise 0.
# NOTE(review): the label is derived from avg_positive_polarity rather than
# from `shares`, which the variable dictionary marks as the target -- confirm
# that this is intended.
news$popular <- ifelse(news$avg_positive_polarity > 0.5, 1, 0)

# Remove kw_min_max by name. It was previously removed by position
# (column 23), which breaks if the column order changes and drops a
# different column every time this chunk is re-run.
news$kw_min_max <- NULL

# sample.split() returns one logical flag per row; rows flagged TRUE form the
# training set. With SplitRatio = 0.1 that is about 10% of the rows (3,965 of
# 39,644 in the recorded output); the remaining rows form the test set.
split <- sample.split(news$popular, SplitRatio = 0.1)
training_set <- subset(news, split)
test_set <- subset(news, !split)

# Inspect the structure of the training set.
str(training_set)
## 'data.frame':    3965 obs. of  61 variables:
##  $ url                          : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 6 20 38 55 66 79 104 119 134 150 ...
##  $ timedelta                    : num  731 731 731 731 731 731 729 729 729 729 ...
##  $ n_tokens_title               : num  10 8 8 9 8 10 6 9 9 10 ...
##  $ n_tokens_content             : num  370 1207 257 1115 403 ...
##  $ n_unique_tokens              : num  0.56 0.411 0.568 0.424 0.516 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.698 0.549 0.671 0.565 0.618 ...
##  $ num_hrefs                    : num  2 24 9 21 3 15 36 10 1 11 ...
##  $ num_self_hrefs               : num  2 24 7 21 3 11 30 5 1 2 ...
##  $ num_imgs                     : num  0 42 0 20 1 1 19 1 0 1 ...
##  $ num_videos                   : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ average_token_length         : num  4.36 4.72 4.64 4.63 4.08 ...
##  $ num_keywords                 : num  9 8 9 8 8 6 10 9 10 4 ...
##  $ data_channel_is_lifestyle    : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ data_channel_is_entertainment: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ data_channel_is_bus          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num  0 0 1 0 0 1 0 0 0 0 ...
##  $ data_channel_is_tech         : num  1 1 0 1 1 0 1 0 0 0 ...
##  $ data_channel_is_world        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num  0 0 0 0 0 0 217 217 217 217 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 1200 2600 819 598 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 ...
##  $ kw_max_max                   : num  0 0 0 0 0 0 17100 17100 17100 17100 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  8500 545 1300 545 545 757 545 2000 0 2800 ...
##  $ self_reference_max_shares    : num  8500 16000 2500 16000 545 5400 16000 4900 0 2800 ...
##  $ self_reference_avg_sharess   : num  8500 2830 1775 3429 545 ...
##  $ weekday_is_monday            : num  1 1 1 1 1 1 0 0 0 0 ...
##  $ weekday_is_tuesday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num  0 0 0 0 0 0 1 1 1 1 ...
##  $ weekday_is_thursday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.0222 0.025 0.4392 0.025 0.025 ...
##  $ LDA_01                       : num  0.3067 0.0252 0.0225 0.327 0.333 ...
##  $ LDA_02                       : num  0.0222 0.025 0.0224 0.025 0.025 ...
##  $ LDA_03                       : num  0.0222 0.025 0.0233 0.025 0.025 ...
##  $ LDA_04                       : num  0.627 0.9 0.493 0.598 0.592 ...
##  $ global_subjectivity          : num  0.437 0.539 0.4 0.507 0.518 ...
##  $ global_sentiment_polarity    : num  0.07118 0.28826 0.00741 0.27977 0.0694 ...
##  $ global_rate_positive_words   : num  0.0297 0.0696 0.0311 0.0717 0.0496 ...
##  $ global_rate_negative_words   : num  0.027 0.0116 0.0272 0.0135 0.0397 ...
##  $ rate_positive_words          : num  0.524 0.857 0.533 0.842 0.556 ...
##  $ rate_negative_words          : num  0.476 0.143 0.467 0.158 0.444 ...
##  $ avg_positive_polarity        : num  0.351 0.427 0.36 0.417 0.28 ...
##  $ min_positive_polarity        : num  0.1364 0.1 0.0333 0.1 0.05 ...
##  $ max_positive_polarity        : num  0.6 1 0.6 1 0.6 1 1 0.8 0.5 1 ...
##  $ avg_negative_polarity        : num  -0.195 -0.227 -0.393 -0.212 -0.172 ...
##  $ min_negative_polarity        : num  -0.4 -0.5 -0.5 -0.5 -0.4 -0.8 -0.5 -0.5 -0.4 -0.5 ...
##  $ max_negative_polarity        : num  -0.1 -0.05 -0.125 -0.05 -0.1 ...
##  $ title_subjectivity           : num  0.643 0.5 0.667 0.333 0.75 ...
##  $ title_sentiment_polarity     : num  0.214 0 -0.5 0.25 -0.125 ...
##  $ abs_title_subjectivity       : num  0.143 0 0.167 0.167 0.25 ...
##  $ abs_title_sentiment_polarity : num  0.214 0 0.5 0.25 0.125 ...
##  $ shares                       : int  855 17100 2600 2400 3200 851 302 2300 454 373 ...
##  $ popular                      : num  0 0 0 0 0 0 0 0 0 0 ...