ETM 58A-Final

#Finding Datasets

Here are my datasets, along with explanations of their attributes and aims.

#Preparing Datasets

##Online News Popularity

#install.packages('openxlsx')
#install.packages('data.table')
#install.packages('skimr')
#install.packages(file.choose(),repos=NULL)
#install.packages("readxl")

 
library(openxlsx)
library(readxl)
library(skimr)
library(data.table)

# Choose the Online News Popularity workbook through the file dialog,
# read its first sheet, and show the structure of the loaded table.
data_path_onlinenews <- file.choose()
onlinenews <- read_excel(path = data_path_onlinenews, sheet = 1)
str(onlinenews)

## tibble [39,644 × 61] (S3: tbl_df/tbl/data.frame)
##  $ url                          : chr [1:39644] "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
##  $ timedelta                    : num [1:39644] 731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num [1:39644] 12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num [1:39644] 219 255 211 531 1072 ...
##  $ n_unique_tokens              : num [1:39644] 0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num [1:39644] 1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num [1:39644] 0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num [1:39644] 4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num [1:39644] 2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num [1:39644] 1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num [1:39644] 0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num [1:39644] 4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num [1:39644] 5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num [1:39644] 0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num [1:39644] 1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num [1:39644] 0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num [1:39644] 0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num [1:39644] 0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num [1:39644] 496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num [1:39644] 496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num [1:39644] 496 0 918 0 3151 ...
##  $ weekday_is_monday            : num [1:39644] 1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num [1:39644] 0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num [1:39644] 0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num [1:39644] 0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num [1:39644] 0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num [1:39644] 0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num [1:39644] 0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num [1:39644] 0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num [1:39644] 0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num [1:39644] 0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num [1:39644] 0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num [1:39644] 0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num [1:39644] 0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num [1:39644] 0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num [1:39644] 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num [1:39644] -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num [1:39644] -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num [1:39644] -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num [1:39644] 0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num [1:39644] -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num [1:39644] 0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num [1:39644] 0.188 0 0 0 0.136 ...
##  $ shares                       : num [1:39644] 593 711 1500 1200 505 855 556 891 3600 710 ...

# Keep the skimr summary in a variable so it can be reused, then print it.
summary_data_onlinenews <- skim(data = onlinenews)
print(summary_data_onlinenews)

## ── Data Summary ────────────────────────
##                            Values    
## Name                       onlinenews
## Number of rows             39644     
## Number of columns          61        
## _______________________              
## Column type frequency:               
##   character                1         
##   numeric                  60        
## ________________________             
## Group variables            None      
## 
## ── Variable type: character ────────────────────────────────────────────────────
##   skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 url                   0             1  34 192     0    39644          0
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##    skim_variable                 n_missing complete_rate        mean          sd
##  1 timedelta                             0             1    355.        214.    
##  2 n_tokens_title                        0             1     10.4         2.11  
##  3 n_tokens_content                      0             1    547.        471.    
##  4 n_unique_tokens                       0             1      0.548       3.52  
##  5 n_non_stop_words                      0             1      0.996       5.23  
##  6 n_non_stop_unique_tokens              0             1      0.689       3.26  
##  7 num_hrefs                             0             1     10.9        11.3   
##  8 num_self_hrefs                        0             1      3.29        3.86  
##  9 num_imgs                              0             1      4.54        8.31  
## 10 num_videos                            0             1      1.25        4.11  
## 11 average_token_length                  0             1      4.55        0.844 
## 12 num_keywords                          0             1      7.22        1.91  
## 13 data_channel_is_lifestyle             0             1      0.0529      0.224 
## 14 data_channel_is_entertainment         0             1      0.178       0.383 
## 15 data_channel_is_bus                   0             1      0.158       0.365 
## 16 data_channel_is_socmed                0             1      0.0586      0.235 
## 17 data_channel_is_tech                  0             1      0.185       0.389 
## 18 data_channel_is_world                 0             1      0.213       0.409 
## 19 kw_min_min                            0             1     26.1        69.6   
## 20 kw_max_min                            0             1   1154.       3858.    
## 21 kw_avg_min                            0             1    312.        621.    
## 22 kw_min_max                            0             1  13612.      57986.    
## 23 kw_max_max                            0             1 752324.     214502.    
## 24 kw_avg_max                            0             1 259282.     135102.    
## 25 kw_min_avg                            0             1   1117.       1137.    
## 26 kw_max_avg                            0             1   5657.       6099.    
## 27 kw_avg_avg                            0             1   3136.       1318.    
## 28 self_reference_min_shares             0             1   3999.      19739.    
## 29 self_reference_max_shares             0             1  10329.      41028.    
## 30 self_reference_avg_sharess            0             1   6402.      24211.    
## 31 weekday_is_monday                     0             1      0.168       0.374 
## 32 weekday_is_tuesday                    0             1      0.186       0.389 
## 33 weekday_is_wednesday                  0             1      0.188       0.390 
## 34 weekday_is_thursday                   0             1      0.183       0.387 
## 35 weekday_is_friday                     0             1      0.144       0.351 
## 36 weekday_is_saturday                   0             1      0.0619      0.241 
## 37 weekday_is_sunday                     0             1      0.0690      0.254 
## 38 is_weekend                            0             1      0.131       0.337 
## 39 LDA_00                                0             1      0.185       0.263 
## 40 LDA_01                                0             1      0.141       0.220 
## 41 LDA_02                                0             1      0.216       0.282 
## 42 LDA_03                                0             1      0.224       0.295 
## 43 LDA_04                                0             1      0.234       0.289 
## 44 global_subjectivity                   0             1      0.443       0.117 
## 45 global_sentiment_polarity             0             1      0.119       0.0969
## 46 global_rate_positive_words            0             1      0.0396      0.0174
## 47 global_rate_negative_words            0             1      0.0166      0.0108
## 48 rate_positive_words                   0             1      0.682       0.190 
## 49 rate_negative_words                   0             1      0.288       0.156 
## 50 avg_positive_polarity                 0             1      0.354       0.105 
## 51 min_positive_polarity                 0             1      0.0954      0.0713
## 52 max_positive_polarity                 0             1      0.757       0.248 
## 53 avg_negative_polarity                 0             1     -0.260       0.128 
## 54 min_negative_polarity                 0             1     -0.522       0.290 
## 55 max_negative_polarity                 0             1     -0.108       0.0954
## 56 title_subjectivity                    0             1      0.282       0.324 
## 57 title_sentiment_polarity              0             1      0.0714      0.265 
## 58 abs_title_subjectivity                0             1      0.342       0.189 
## 59 abs_title_sentiment_polarity          0             1      0.156       0.226 
## 60 shares                                0             1   3395.      11627.    
##        p0          p25         p50         p75       p100 hist 
##  1  8        164          339         542         731     ▇▇▆▆▇
##  2  2          9           10          12          23     ▁▇▇▁▁
##  3  0        246          409         716        8474     ▇▁▁▁▁
##  4  0          0.471        0.539       0.609     701     ▇▁▁▁▁
##  5  0          1.00         1.00        1.00     1042     ▇▁▁▁▁
##  6  0          0.626        0.690       0.755     650     ▇▁▁▁▁
##  7  0          4            8          14         304     ▇▁▁▁▁
##  8  0          1            3           4         116     ▇▁▁▁▁
##  9  0          1            1           4         128     ▇▁▁▁▁
## 10  0          0            0           1          91     ▇▁▁▁▁
## 11  0          4.48         4.66        4.85        8.04  ▁▁▇▃▁
## 12  1          6            7           9          10     ▁▂▇▇▇
## 13  0          0            0           0           1     ▇▁▁▁▁
## 14  0          0            0           0           1     ▇▁▁▁▂
## 15  0          0            0           0           1     ▇▁▁▁▂
## 16  0          0            0           0           1     ▇▁▁▁▁
## 17  0          0            0           0           1     ▇▁▁▁▂
## 18  0          0            0           0           1     ▇▁▁▁▂
## 19 -1         -1           -1           4         377     ▇▁▁▁▁
## 20  0        445          660        1000      298400     ▇▁▁▁▁
## 21 -1        142.         236.        357       42828.    ▇▁▁▁▁
## 22  0          0         1400        7900      843300     ▇▁▁▁▁
## 23  0     843300       843300      843300      843300     ▁▁▁▁▇
## 24  0     172847.      244572.     330980      843300     ▃▇▃▁▁
## 25 -1          0         1024.       2057.       3613.    ▇▃▃▂▂
## 26  0       3562.        4356.       6020.     298400     ▇▁▁▁▁
## 27  0       2382.        2870.       3600.      43568.    ▇▁▁▁▁
## 28  0        639         1200        2600      843300     ▇▁▁▁▁
## 29  0       1100         2800        8000      843300     ▇▁▁▁▁
## 30  0        981.        2200        5200      843300     ▇▁▁▁▁
## 31  0          0            0           0           1     ▇▁▁▁▂
## 32  0          0            0           0           1     ▇▁▁▁▂
## 33  0          0            0           0           1     ▇▁▁▁▂
## 34  0          0            0           0           1     ▇▁▁▁▂
## 35  0          0            0           0           1     ▇▁▁▁▂
## 36  0          0            0           0           1     ▇▁▁▁▁
## 37  0          0            0           0           1     ▇▁▁▁▁
## 38  0          0            0           0           1     ▇▁▁▁▁
## 39  0          0.0251       0.0334      0.241       0.927 ▇▁▁▁▁
## 40  0          0.0250       0.0333      0.151       0.926 ▇▁▁▁▁
## 41  0          0.0286       0.0400      0.334       0.920 ▇▁▁▁▁
## 42  0          0.0286       0.0400      0.376       0.927 ▇▁▁▁▂
## 43  0          0.0286       0.0407      0.400       0.927 ▇▂▁▁▂
## 44  0          0.396        0.453       0.508       1     ▁▃▇▁▁
## 45 -0.394      0.0578       0.119       0.178       0.728 ▁▂▇▁▁
## 46  0          0.0284       0.0390      0.0503      0.155 ▅▇▁▁▁
## 47  0          0.00962      0.0153      0.0217      0.185 ▇▁▁▁▁
## 48  0          0.6          0.711       0.8         1     ▁▁▃▇▃
## 49  0          0.185        0.28        0.385       1     ▅▇▃▁▁
## 50  0          0.306        0.359       0.411       1     ▁▇▃▁▁
## 51  0          0.05         0.1         0.1         1     ▇▁▁▁▁
## 52  0          0.6          0.8         1           1     ▁▁▅▅▇
## 53 -1         -0.328       -0.253      -0.187       0     ▁▁▂▇▃
## 54 -1         -0.7         -0.5        -0.3         0     ▆▆▇▅▅
## 55 -1         -0.125       -0.1        -0.05        0     ▁▁▁▁▇
## 56  0          0            0.15        0.5         1     ▇▂▂▁▂
## 57 -1          0            0           0.15        1     ▁▁▇▂▁
## 58  0          0.167        0.5         0.5         0.5   ▃▂▁▁▇
## 59  0          0            0           0.25        1     ▇▂▁▁▁
## 60  1        946         1400        2800      843300     ▇▁▁▁▁

## $character
## 
## ── Variable type: character ────────────────────────────────────────────────────
##   skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 url                   0             1  34 192     0    39644          0
## 
## $numeric
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##    skim_variable       n_missing complete_rate    mean     sd p0     p25     p50
##  1 timedelta                   0             1 355.    214.    8 164     339    
##  2 n_tokens_title              0             1  10.4     2.11  2   9      10    
##  3 n_tokens_content            0             1 547.    471.    0 246     409    
##  4 n_unique_tokens             0             1   0.548   3.52  0   0.471   0.539
##  5 n_non_stop_words            0             1   0.996   5.23  0   1.00    1.00 
##  6 n_non_stop_unique_…         0             1   0.689   3.26  0   0.626   0.690
##  7 num_hrefs                   0             1  10.9    11.3   0   4       8    
##  8 num_self_hrefs              0             1   3.29    3.86  0   1       3    
##  9 num_imgs                    0             1   4.54    8.31  0   1       1    
## 10 num_videos                  0             1   1.25    4.11  0   0       0    
## # … with 50 more rows, and 3 more variables: p75 <dbl>, p100 <dbl>, hist <chr>

# Rebind the loaded table as a data.table and show its structure again.
onlinenews <- data.table(onlinenews)
str(onlinenews)

## Classes 'data.table' and 'data.frame':   39644 obs. of  61 variables:
##  $ url                          : chr  "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
##  $ timedelta                    : num  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num  219 255 211 531 1072 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : num  593 711 1500 1200 505 855 556 891 3600 710 ...
##  - attr(*, ".internal.selfref")=<externalptr>

##Absenteeism at Work

# Choose the Absenteeism at Work workbook through the file dialog,
# read its first sheet, and show the structure of the loaded table.
data_path_absenteeism <- file.choose()
absenteeism <- read_excel(path = data_path_absenteeism, sheet = 1)
str(absenteeism)

## tibble [740 × 21] (S3: tbl_df/tbl/data.frame)
##  $ ID                             : num [1:740] 11 36 3 7 11 3 10 20 14 1 ...
##  $ Reason for absence             : num [1:740] 26 0 23 7 23 23 22 23 19 22 ...
##  $ Month of absence               : num [1:740] 7 7 7 7 7 7 7 7 7 7 ...
##  $ Day of the week                : num [1:740] 3 3 4 5 5 6 6 6 2 2 ...
##  $ Seasons                        : num [1:740] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Transportation expense         : num [1:740] 289 118 179 279 289 179 361 260 155 235 ...
##  $ Distance from Residence to Work: num [1:740] 36 13 51 5 36 51 52 50 12 11 ...
##  $ Service time                   : num [1:740] 13 18 18 14 13 18 3 11 14 14 ...
##  $ Age                            : num [1:740] 33 50 38 39 33 38 28 36 34 37 ...
##  $ Work load Average/day          : num [1:740] 239554 239554 239554 239554 239554 ...
##  $ Hit target                     : num [1:740] 97 97 97 97 97 97 97 97 97 97 ...
##  $ Disciplinary failure           : num [1:740] 0 1 0 0 0 0 0 0 0 0 ...
##  $ Education                      : num [1:740] 1 1 1 1 1 1 1 1 1 3 ...
##  $ Son                            : num [1:740] 2 1 0 2 2 0 1 4 2 1 ...
##  $ Social drinker                 : num [1:740] 1 1 1 1 1 1 1 1 1 0 ...
##  $ Social smoker                  : num [1:740] 0 0 0 1 0 0 0 0 0 0 ...
##  $ Pet                            : num [1:740] 1 0 0 0 1 0 4 0 0 1 ...
##  $ Weight                         : num [1:740] 90 98 89 68 90 89 80 65 95 88 ...
##  $ Height                         : num [1:740] 172 178 170 168 172 170 172 168 196 172 ...
##  $ Body mass index                : num [1:740] 30 31 31 24 30 31 27 23 25 29 ...
##  $ Absenteeism time in hours      : num [1:740] 4 0 2 4 2 2 8 4 40 8 ...

# Keep the skimr summary in a variable so it can be reused, then print it.
summary_data_absenteeism <- skim(data = absenteeism)
print(summary_data_absenteeism)

## ── Data Summary ────────────────────────
##                            Values     
## Name                       absenteeism
## Number of rows             740        
## Number of columns          21         
## _______________________               
## Column type frequency:                
##   numeric                  21         
## ________________________              
## Group variables            None       
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##    skim_variable                   n_missing complete_rate        mean        sd
##  1 ID                                      0             1     18.0       11.0  
##  2 Reason for absence                      0             1     19.2        8.43 
##  3 Month of absence                        0             1      6.32       3.44 
##  4 Day of the week                         0             1      3.91       1.42 
##  5 Seasons                                 0             1      2.54       1.11 
##  6 Transportation expense                  0             1    221.        67.0  
##  7 Distance from Residence to Work         0             1     29.6       14.8  
##  8 Service time                            0             1     12.6        4.38 
##  9 Age                                     0             1     36.4        6.48 
## 10 Work load Average/day                   0             1 271490.     39058.   
## 11 Hit target                              0             1     94.6        3.78 
## 12 Disciplinary failure                    0             1      0.0541     0.226
## 13 Education                               0             1      1.29       0.673
## 14 Son                                     0             1      1.02       1.10 
## 15 Social drinker                          0             1      0.568      0.496
## 16 Social smoker                           0             1      0.0730     0.260
## 17 Pet                                     0             1      0.746      1.32 
## 18 Weight                                  0             1     79.0       12.9  
## 19 Height                                  0             1    172.         6.03 
## 20 Body mass index                         0             1     26.7        4.29 
## 21 Absenteeism time in hours               0             1      6.92      13.3  
##        p0    p25    p50    p75   p100 hist 
##  1      1      9     18     28     36 ▇▇▆▆▆
##  2      0     13     23     26     28 ▁▂▂▂▇
##  3      0      3      6      9     12 ▆▆▇▅▇
##  4      2      3      4      5      6 ▇▇▇▆▇
##  5      1      2      3      4      4 ▇▇▁▇▇
##  6    118    179    225    260    388 ▃▇▅▃▂
##  7      5     16     26     50     52 ▅▃▇▂▇
##  8      1      9     13     16     29 ▂▇▆▃▁
##  9     27     31     37     40     58 ▇▇▃▂▁
## 10 205917 244387 264249 294217 378884 ▅▇▃▂▁
## 11     81     93     95     97    100 ▁▁▃▇▇
## 12      0      0      0      0      1 ▇▁▁▁▁
## 13      1      1      1      1      4 ▇▁▁▁▁
## 14      0      0      1      2      4 ▇▆▅▁▁
## 15      0      0      1      1      1 ▆▁▁▁▇
## 16      0      0      0      0      1 ▇▁▁▁▁
## 17      0      0      0      1      8 ▇▁▁▁▁
## 18     56     69     83     89    108 ▅▇▅▇▂
## 19    163    169    170    172    196 ▃▇▂▁▁
## 20     19     24     25     31     38 ▃▇▅▅▁
## 21      0      2      3      8    120 ▇▁▁▁▁

## $numeric
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##    skim_variable      n_missing complete_rate   mean     sd     p0    p25    p50
##  1 ID                         0             1 1.80e1 1.10e1      1      9     18
##  2 Reason for absence         0             1 1.92e1 8.43e0      0     13     23
##  3 Month of absence           0             1 6.32e0 3.44e0      0      3      6
##  4 Day of the week            0             1 3.91e0 1.42e0      2      3      4
##  5 Seasons                    0             1 2.54e0 1.11e0      1      2      3
##  6 Transportation ex…         0             1 2.21e2 6.70e1    118    179    225
##  7 Distance from Res…         0             1 2.96e1 1.48e1      5     16     26
##  8 Service time               0             1 1.26e1 4.38e0      1      9     13
##  9 Age                        0             1 3.65e1 6.48e0     27     31     37
## 10 Work load Average…         0             1 2.71e5 3.91e4 205917 244387 264249
## # … with 11 more rows, and 3 more variables: p75 <dbl>, p100 <dbl>, hist <chr>

# Rebind the loaded table as a data.table and show its structure again.
absenteeism <- data.table(absenteeism)
str(absenteeism)

## Classes 'data.table' and 'data.frame':   740 obs. of  21 variables:
##  $ ID                             : num  11 36 3 7 11 3 10 20 14 1 ...
##  $ Reason for absence             : num  26 0 23 7 23 23 22 23 19 22 ...
##  $ Month of absence               : num  7 7 7 7 7 7 7 7 7 7 ...
##  $ Day of the week                : num  3 3 4 5 5 6 6 6 2 2 ...
##  $ Seasons                        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Transportation expense         : num  289 118 179 279 289 179 361 260 155 235 ...
##  $ Distance from Residence to Work: num  36 13 51 5 36 51 52 50 12 11 ...
##  $ Service time                   : num  13 18 18 14 13 18 3 11 14 14 ...
##  $ Age                            : num  33 50 38 39 33 38 28 36 34 37 ...
##  $ Work load Average/day          : num  239554 239554 239554 239554 239554 ...
##  $ Hit target                     : num  97 97 97 97 97 97 97 97 97 97 ...
##  $ Disciplinary failure           : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ Education                      : num  1 1 1 1 1 1 1 1 1 3 ...
##  $ Son                            : num  2 1 0 2 2 0 1 4 2 1 ...
##  $ Social drinker                 : num  1 1 1 1 1 1 1 1 1 0 ...
##  $ Social smoker                  : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ Pet                            : num  1 0 0 0 1 0 4 0 0 1 ...
##  $ Weight                         : num  90 98 89 68 90 89 80 65 95 88 ...
##  $ Height                         : num  172 178 170 168 172 170 172 168 196 172 ...
##  $ Body mass index                : num  30 31 31 24 30 31 27 23 25 29 ...
##  $ Absenteeism time in hours      : num  4 0 2 4 2 2 8 4 40 8 ...
##  - attr(*, ".internal.selfref")=<externalptr>

##Dry Beans

# Choose the Dry Beans workbook through the file dialog,
# read its first sheet, and show the structure of the loaded table.
data_path_beans <- file.choose()
drybeans <- read_excel(path = data_path_beans, sheet = 1)
str(drybeans)

## tibble [13,611 × 17] (S3: tbl_df/tbl/data.frame)
##  $ Area           : num [1:13611] 28395 28734 29380 30008 30140 ...
##  $ Perimeter      : num [1:13611] 610 638 624 646 620 ...
##  $ MajorAxisLength: num [1:13611] 208 201 213 211 202 ...
##  $ MinorAxisLength: num [1:13611] 174 183 176 183 190 ...
##  $ AspectRation   : num [1:13611] 1.2 1.1 1.21 1.15 1.06 ...
##  $ Eccentricity   : num [1:13611] 0.55 0.412 0.563 0.499 0.334 ...
##  $ ConvexArea     : num [1:13611] 28715 29172 29690 30724 30417 ...
##  $ EquivDiameter  : num [1:13611] 190 191 193 195 196 ...
##  $ Extent         : num [1:13611] 0.764 0.784 0.778 0.783 0.773 ...
##  $ Solidity       : num [1:13611] 0.989 0.985 0.99 0.977 0.991 ...
##  $ roundness      : num [1:13611] 0.958 0.887 0.948 0.904 0.985 ...
##  $ Compactness    : num [1:13611] 0.913 0.954 0.909 0.928 0.971 ...
##  $ ShapeFactor1   : num [1:13611] 0.00733 0.00698 0.00724 0.00702 0.0067 ...
##  $ ShapeFactor2   : num [1:13611] 0.00315 0.00356 0.00305 0.00321 0.00366 ...
##  $ ShapeFactor3   : num [1:13611] 0.834 0.91 0.826 0.862 0.942 ...
##  $ ShapeFactor4   : num [1:13611] 0.999 0.998 0.999 0.994 0.999 ...
##  $ Class          : chr [1:13611] "SEKER" "SEKER" "SEKER" "SEKER" ...

# Keep the skimr summary in a variable so it can be reused, then print it.
summary_data_drybeans <- skim(data = drybeans)
print(summary_data_drybeans)

## ── Data Summary ────────────────────────
##                            Values  
## Name                       drybeans
## Number of rows             13611   
## Number of columns          17      
## _______________________            
## Column type frequency:             
##   character                1       
##   numeric                  16      
## ________________________           
## Group variables            None    
## 
## ── Variable type: character ────────────────────────────────────────────────────
##   skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 Class                 0             1   4   8     0        7          0
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##    skim_variable   n_missing complete_rate        mean           sd           p0
##  1 Area                    0             1 53048.      29324.       20420       
##  2 Perimeter               0             1   855.        214.         525.      
##  3 MajorAxisLength         0             1   320.         85.7        184.      
##  4 MinorAxisLength         0             1   202.         45.0        123.      
##  5 AspectRation            0             1     1.58        0.247        1.02    
##  6 Eccentricity            0             1     0.751       0.0920       0.219   
##  7 ConvexArea              0             1 53768.      29775.       20684       
##  8 EquivDiameter           0             1   253.         59.2        161.      
##  9 Extent                  0             1     0.750       0.0491       0.555   
## 10 Solidity                0             1     0.987       0.00466      0.919   
## 11 roundness               0             1     0.873       0.0595       0.490   
## 12 Compactness             0             1     0.800       0.0617       0.641   
## 13 ShapeFactor1            0             1     0.00656     0.00113      0.00278 
## 14 ShapeFactor2            0             1     0.00172     0.000596     0.000564
## 15 ShapeFactor3            0             1     0.644       0.0990       0.410   
## 16 ShapeFactor4            0             1     0.995       0.00437      0.948   
##            p25         p50         p75         p100 hist 
##  1 36328       44652       61332       254616       ▇▂▁▁▁
##  2   704.        795.        977.        1985.      ▇▆▁▁▁
##  3   253.        297.        376.         739.      ▇▆▂▁▁
##  4   176.        192.        217.         460.      ▇▇▁▁▁
##  5     1.43        1.55        1.71         2.43    ▂▇▅▂▁
##  6     0.716       0.764       0.810        0.911   ▁▁▂▇▇
##  7 36714.      45178       62294       263261       ▇▂▁▁▁
##  8   215.        238.        279.         569.      ▇▆▁▁▁
##  9     0.719       0.760       0.787        0.866   ▁▁▅▇▂
## 10     0.986       0.988       0.990        0.995   ▁▁▁▁▇
## 11     0.832       0.883       0.917        0.991   ▁▁▂▇▇
## 12     0.762       0.801       0.834        0.987   ▂▅▇▂▁
## 13     0.00590     0.00665     0.00727      0.0105  ▁▃▇▃▁
## 14     0.00115     0.00169     0.00217      0.00366 ▇▇▇▃▁
## 15     0.581       0.642       0.696        0.975   ▂▇▇▃▁
## 16     0.994       0.996       0.998        1.00    ▁▁▁▁▇

## $character
## 
## ── Variable type: character ────────────────────────────────────────────────────
##   skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 Class                 0             1   4   8     0        7          0
## 
## $numeric
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##    skim_variable n_missing complete_rate    mean      sd      p0     p25     p50
##  1 Area                  0             1 5.30e+4 2.93e+4 2.04e+4 3.63e+4 4.47e+4
##  2 Perimeter             0             1 8.55e+2 2.14e+2 5.25e+2 7.04e+2 7.95e+2
##  3 MajorAxisLen…         0             1 3.20e+2 8.57e+1 1.84e+2 2.53e+2 2.97e+2
##  4 MinorAxisLen…         0             1 2.02e+2 4.50e+1 1.23e+2 1.76e+2 1.92e+2
##  5 AspectRation          0             1 1.58e+0 2.47e-1 1.02e+0 1.43e+0 1.55e+0
##  6 Eccentricity          0             1 7.51e-1 9.20e-2 2.19e-1 7.16e-1 7.64e-1
##  7 ConvexArea            0             1 5.38e+4 2.98e+4 2.07e+4 3.67e+4 4.52e+4
##  8 EquivDiameter         0             1 2.53e+2 5.92e+1 1.61e+2 2.15e+2 2.38e+2
##  9 Extent                0             1 7.50e-1 4.91e-2 5.55e-1 7.19e-1 7.60e-1
## 10 Solidity              0             1 9.87e-1 4.66e-3 9.19e-1 9.86e-1 9.88e-1
## 11 roundness             0             1 8.73e-1 5.95e-2 4.90e-1 8.32e-1 8.83e-1
## 12 Compactness           0             1 8.00e-1 6.17e-2 6.41e-1 7.62e-1 8.01e-1
## 13 ShapeFactor1          0             1 6.56e-3 1.13e-3 2.78e-3 5.90e-3 6.65e-3
## 14 ShapeFactor2          0             1 1.72e-3 5.96e-4 5.64e-4 1.15e-3 1.69e-3
## 15 ShapeFactor3          0             1 6.44e-1 9.90e-2 4.10e-1 5.81e-1 6.42e-1
## 16 ShapeFactor4          0             1 9.95e-1 4.37e-3 9.48e-1 9.94e-1 9.96e-1
## # … with 3 more variables: p75 <dbl>, p100 <dbl>, hist <chr>

# Re-wrap the loaded data as a data.table and check the resulting structure.
drybeans <- data.table(drybeans)
str(drybeans)

## Classes 'data.table' and 'data.frame':   13611 obs. of  17 variables:
##  $ Area           : num  28395 28734 29380 30008 30140 ...
##  $ Perimeter      : num  610 638 624 646 620 ...
##  $ MajorAxisLength: num  208 201 213 211 202 ...
##  $ MinorAxisLength: num  174 183 176 183 190 ...
##  $ AspectRation   : num  1.2 1.1 1.21 1.15 1.06 ...
##  $ Eccentricity   : num  0.55 0.412 0.563 0.499 0.334 ...
##  $ ConvexArea     : num  28715 29172 29690 30724 30417 ...
##  $ EquivDiameter  : num  190 191 193 195 196 ...
##  $ Extent         : num  0.764 0.784 0.778 0.783 0.773 ...
##  $ Solidity       : num  0.989 0.985 0.99 0.977 0.991 ...
##  $ roundness      : num  0.958 0.887 0.948 0.904 0.985 ...
##  $ Compactness    : num  0.913 0.954 0.909 0.928 0.971 ...
##  $ ShapeFactor1   : num  0.00733 0.00698 0.00724 0.00702 0.0067 ...
##  $ ShapeFactor2   : num  0.00315 0.00356 0.00305 0.00321 0.00366 ...
##  $ ShapeFactor3   : num  0.834 0.91 0.826 0.862 0.942 ...
##  $ ShapeFactor4   : num  0.999 0.998 0.999 0.994 0.999 ...
##  $ Class          : chr  "SEKER" "SEKER" "SEKER" "SEKER" ...
##  - attr(*, ".internal.selfref")=<externalptr>

#Analyses

##Online News Popularity

#install.packages("caTools")


#install.packages("ISLR")
#install.packages("rpart")
#install.packages("rpart.plot")

library(ISLR) 
library(rpart) 
library(rpart.plot) 
library(caTools)

# Seed the RNG so the 70/30 partition below can be reproduced.
set.seed(1234)
# sample.split() yields one logical flag per row; TRUE rows form the training
# set and the remaining rows form the test set.
sample_onlinenews <- sample.split(onlinenews$shares, SplitRatio = 0.7)
train_onlinenews <- subset(onlinenews, sample_onlinenews)
test_onlinenews <- subset(onlinenews, !sample_onlinenews)

dim(train_onlinenews)

## [1] 27831    61

dim(test_onlinenews)

## [1] 11813    61

# Regression tree for `shares`, grown on the training rows only.
# The formula uses bare column names that are looked up in `data =`. Writing
# the terms as `train_onlinenews$col` would hard-wire the training vectors
# into the model, so predict(..., newdata = test_onlinenews) could not score
# unseen rows.
# cp = 0.0001 grows a deliberately large tree; printcp() then prints its
# complexity-parameter table.
tree_onlinenews <- rpart(
  shares ~ global_sentiment_polarity + avg_positive_polarity,
  data = train_onlinenews,
  control = rpart.control(cp = 0.0001)
)
printcp(tree_onlinenews)

## 
## Regression tree:
## rpart(formula = train_onlinenews$shares ~ train_onlinenews$global_sentiment_polarity + 
##     train_onlinenews$avg_positive_polarity, data = train_onlinenews, 
##     control = rpart.control(cp = 1e-04))
## 
## Variables actually used in tree construction:
## [1] train_onlinenews$avg_positive_polarity    
## [2] train_onlinenews$global_sentiment_polarity
## 
## Root node error: 5.0688e+12/27831 = 182127751
## 
## n= 27831 
## 
##            CP nsplit rel error xerror    xstd
## 1  0.00453842      0   1.00000 1.0000 0.22724
## 2  0.00225979      3   0.98638 1.0382 0.22761
## 3  0.00215863     17   0.93172 1.0522 0.22762
## 4  0.00172573     18   0.92956 1.0580 0.22759
## 5  0.00064706     21   0.92438 1.0640 0.22757
## 6  0.00061514     23   0.92309 1.0708 0.22763
## 7  0.00060658     25   0.92186 1.0707 0.22763
## 8  0.00052853     31   0.91822 1.0702 0.22762
## 9  0.00042264     43   0.91125 1.0746 0.22767
## 10 0.00040661     50   0.90829 1.0796 0.22767
## 11 0.00039138     56   0.90524 1.0814 0.22768
## 12 0.00038725     60   0.90368 1.0815 0.22768
## 13 0.00037646     63   0.90251 1.0820 0.22768
## 14 0.00034240     67   0.90101 1.0831 0.22768
## 15 0.00032118     69   0.90032 1.0874 0.22784
## 16 0.00028246     70   0.90000 1.0898 0.22785
## 17 0.00027555     73   0.89915 1.0912 0.22785
## 18 0.00027416     82   0.89667 1.0916 0.22785
## 19 0.00026612     86   0.89558 1.0916 0.22785
## 20 0.00024649     90   0.89428 1.0924 0.22786
## 21 0.00023619     92   0.89378 1.0947 0.22788
## 22 0.00023601     94   0.89331 1.0951 0.22788
## 23 0.00023547     96   0.89284 1.0951 0.22788
## 24 0.00020771     97   0.89260 1.0961 0.22788
## 25 0.00020583     99   0.89219 1.0967 0.22791
## 26 0.00020273    100   0.89198 1.0968 0.22791
## 27 0.00020194    105   0.89097 1.0969 0.22791
## 28 0.00019861    107   0.89057 1.0977 0.22791
## 29 0.00019449    108   0.89037 1.0981 0.22791
## 30 0.00019225    109   0.89017 1.0981 0.22791
## 31 0.00019210    111   0.88979 1.0981 0.22791
## 32 0.00019178    115   0.88902 1.0984 0.22791
## 33 0.00018348    116   0.88883 1.0985 0.22791
## 34 0.00017771    118   0.88846 1.0989 0.22791
## 35 0.00017353    120   0.88811 1.0997 0.22793
## 36 0.00017187    123   0.88758 1.1001 0.22793
## 37 0.00017120    125   0.88724 1.1002 0.22793
## 38 0.00015912    133   0.88553 1.1008 0.22793
## 39 0.00015122    138   0.88474 1.1030 0.22793
## 40 0.00014765    140   0.88444 1.1033 0.22793
## 41 0.00014030    143   0.88399 1.1042 0.22793
## 42 0.00013620    145   0.88371 1.1048 0.22792
## 43 0.00013565    146   0.88358 1.1053 0.22792
## 44 0.00013546    147   0.88344 1.1054 0.22792
## 45 0.00013537    149   0.88317 1.1054 0.22792
## 46 0.00013347    151   0.88290 1.1058 0.22792
## 47 0.00013105    154   0.88250 1.1061 0.22792
## 48 0.00012866    155   0.88237 1.1063 0.22793
## 49 0.00012651    156   0.88224 1.1064 0.22793
## 50 0.00012479    158   0.88199 1.1067 0.22793
## 51 0.00012420    159   0.88186 1.1067 0.22793
## 52 0.00012107    164   0.88124 1.1070 0.22793
## 53 0.00011849    165   0.88112 1.1073 0.22793
## 54 0.00011803    167   0.88088 1.1075 0.22793
## 55 0.00011317    169   0.88065 1.1078 0.22793
## 56 0.00010710    170   0.88053 1.1086 0.22793
## 57 0.00010444    174   0.88011 1.1091 0.22790
## 58 0.00010380    176   0.87990 1.1093 0.22790
## 59 0.00010189    177   0.87979 1.1094 0.22790
## 60 0.00010115    183   0.87911 1.1098 0.22790
## 61 0.00010000    188   0.87855 1.1099 0.22790

# Take the CP value from the cptable row with the smallest cross-validated
# error (xerror) and prune the tree back to that complexity.
# NOTE(review): in the CP table printed for this fit the smallest xerror is on
# the nsplit = 0 row, so pruning likely leaves only the root node; confirm
# against the plot.
best_onlinenews <- tree_onlinenews$cptable[
  which.min(tree_onlinenews$cptable[, "xerror"]),
  "CP"
]

pruned_tree_onlinenews <- prune(tree_onlinenews, cp = best_onlinenews)

# Plot the pruned tree. `roundint` is spelled FALSE rather than F because F is
# an ordinary variable that can be reassigned.
prp(
  pruned_tree_onlinenews,
  faclen = 0,
  extra = 1,
  roundint = FALSE,
  digits = 5
)

##Absenteeism at Work

library(caTools)
library(ISLR) 
library(rpart) 
library(rpart.plot) 

# Seed the RNG so the 70/30 partition below can be reproduced.
set.seed(1234)
# One logical flag per row: TRUE rows go to training, the rest to test.
sample_absenteeism <- sample.split(absenteeism$`Hit target`, SplitRatio = 0.7)
train_absenteeism <- subset(absenteeism, sample_absenteeism)
test_absenteeism <- subset(absenteeism, !sample_absenteeism)

dim(train_absenteeism)

## [1] 516  21

dim(test_absenteeism)

## [1] 224  21

# Regression tree for `Hit target`, grown on the training rows only.
# Column names containing spaces are back-quoted and looked up in `data =`.
# Writing the terms as `train_absenteeism$col` would hard-wire the training
# vectors into the model, so predict(..., newdata = test_absenteeism) could
# not score unseen rows.
# cp = 0.0001 grows a deliberately large tree; printcp() then prints its
# complexity-parameter table.
tree_absenteeism <- rpart(
  `Hit target` ~ `Month of absence` + Education,
  data = train_absenteeism,
  control = rpart.control(cp = 0.0001)
)
printcp(tree_absenteeism)

## 
## Regression tree:
## rpart(formula = train_absenteeism$"Hit target" ~ train_absenteeism$"Month of absence" + 
##     train_absenteeism$Education, data = train_absenteeism, control = rpart.control(cp = 1e-04))
## 
## Variables actually used in tree construction:
## [1] train_absenteeism$"Month of absence" train_absenteeism$Education         
## 
## Root node error: 7316.3/516 = 14.179
## 
## n= 516 
## 
##            CP nsplit rel error  xerror     xstd
## 1  0.30918340      0   1.00000 1.00426 0.092469
## 2  0.15844446      1   0.69082 0.71461 0.062475
## 3  0.05830782      2   0.53237 0.55118 0.042246
## 4  0.04675656      3   0.47406 0.49130 0.033956
## 5  0.03222421      4   0.42731 0.46005 0.027757
## 6  0.00863184      5   0.39508 0.41245 0.025216
## 7  0.00716863      7   0.37782 0.41602 0.025886
## 8  0.00346618      8   0.37065 0.40016 0.024142
## 9  0.00284933      9   0.36719 0.40409 0.024671
## 10 0.00229245     10   0.36434 0.39891 0.024295
## 11 0.00207349     11   0.36204 0.39733 0.024404
## 12 0.00079125     12   0.35997 0.38981 0.024480
## 13 0.00020908     13   0.35918 0.38926 0.024483
## 14 0.00018737     16   0.35855 0.39126 0.024525
## 15 0.00012301     17   0.35836 0.39170 0.024537
## 16 0.00010000     18   0.35824 0.39150 0.024543

# Take the CP value from the cptable row with the smallest cross-validated
# error (xerror) and prune the tree back to that complexity.
best_absenteeism <- tree_absenteeism$cptable[
  which.min(tree_absenteeism$cptable[, "xerror"]),
  "CP"
]

pruned_tree_absenteeism <- prune(tree_absenteeism, cp = best_absenteeism)

# Plot the pruned tree. `roundint` is spelled FALSE rather than F because F is
# an ordinary variable that can be reassigned.
prp(
  pruned_tree_absenteeism,
  faclen = 0,
  extra = 1,
  roundint = FALSE,
  digits = 5
)

##Dry Beans

library(caTools)
library(ISLR) 
library(rpart) 
library(rpart.plot) 

# Seed the RNG so the 70/30 partition below can be reproduced.
set.seed(1234)
# One logical flag per row: TRUE rows go to training, the rest to test.
# NOTE(review): the split vector is built from Area, while the tree fitted on
# these rows models roundness; confirm that this is intended.
sample_drybeans <- sample.split(drybeans$Area, SplitRatio = 0.7)
train_drybeans <- subset(drybeans, sample_drybeans)
test_drybeans <- subset(drybeans, !sample_drybeans)

dim(train_drybeans)

## [1] 9527   17

dim(test_drybeans)

## [1] 4084   17

# Regression tree for `roundness` from bean class and area, grown on the
# training rows only.
# The formula uses bare column names that are looked up in `data =`. Writing
# the terms as `train_drybeans$col` would hard-wire the training vectors into
# the model, so predict(..., newdata = test_drybeans) could not score unseen
# rows.
# cp = 0.0001 grows a deliberately large tree; printcp() then prints its
# complexity-parameter table.
tree_drybeans <- rpart(
  roundness ~ Class + Area,
  data = train_drybeans,
  control = rpart.control(cp = 0.0001)
)
printcp(tree_drybeans)

## 
## Regression tree:
## rpart(formula = train_drybeans$roundness ~ train_drybeans$Class + 
##     train_drybeans$Area, data = train_drybeans, control = rpart.control(cp = 1e-04))
## 
## Variables actually used in tree construction:
## [1] train_drybeans$Area  train_drybeans$Class
## 
## Root node error: 33.979/9527 = 0.0035666
## 
## n= 9527 
## 
##            CP nsplit rel error  xerror     xstd
## 1  0.56730987      0   1.00000 1.00038 0.016049
## 2  0.07850734      1   0.43269 0.43291 0.011108
## 3  0.05313633      2   0.35418 0.35446 0.011078
## 4  0.02516983      3   0.30105 0.30132 0.010489
## 5  0.00463457      4   0.27588 0.27617 0.010814
## 6  0.00196237      5   0.27124 0.27166 0.010812
## 7  0.00180239      6   0.26928 0.27180 0.010856
## 8  0.00111562      9   0.26387 0.26858 0.010781
## 9  0.00077146     10   0.26276 0.26783 0.010785
## 10 0.00074953     12   0.26121 0.26786 0.010821
## 11 0.00074830     13   0.26046 0.26815 0.010822
## 12 0.00061847     14   0.25972 0.26764 0.010807
## 13 0.00060038     15   0.25910 0.26719 0.010792
## 14 0.00054637     16   0.25850 0.26686 0.010786
## 15 0.00052477     17   0.25795 0.26656 0.010770
## 16 0.00041559     18   0.25743 0.26577 0.010766
## 17 0.00037320     19   0.25701 0.26738 0.010782
## 18 0.00035290     21   0.25626 0.26923 0.010837
## 19 0.00033952     25   0.25485 0.26908 0.010840
## 20 0.00030047     27   0.25417 0.27001 0.010857
## 21 0.00029488     31   0.25297 0.27277 0.010923
## 22 0.00028246     33   0.25238 0.27332 0.010933
## 23 0.00026361     34   0.25210 0.27376 0.010894
## 24 0.00025764     35   0.25184 0.27535 0.010883
## 25 0.00023795     36   0.25158 0.27628 0.010894
## 26 0.00023081     49   0.24848 0.27673 0.010893
## 27 0.00022414     58   0.24629 0.27698 0.010892
## 28 0.00021897     59   0.24606 0.27709 0.010883
## 29 0.00021892     61   0.24562 0.27686 0.010882
## 30 0.00021610     63   0.24519 0.27676 0.010881
## 31 0.00020439     64   0.24497 0.27735 0.010898
## 32 0.00019177     66   0.24456 0.27783 0.010902
## 33 0.00019007     69   0.24399 0.27857 0.010913
## 34 0.00017332     70   0.24380 0.27917 0.010930
## 35 0.00017250     72   0.24345 0.28146 0.010961
## 36 0.00017173     76   0.24276 0.28146 0.010961
## 37 0.00016651     77   0.24259 0.28180 0.010965
## 38 0.00016507     78   0.24242 0.28291 0.010991
## 39 0.00016144     79   0.24226 0.28291 0.010991
## 40 0.00015919     81   0.24193 0.28308 0.011000
## 41 0.00015682     82   0.24177 0.28352 0.011002
## 42 0.00015666     85   0.24130 0.28349 0.011006
## 43 0.00015244     89   0.24068 0.28421 0.011027
## 44 0.00015096     90   0.24052 0.28458 0.011026
## 45 0.00015006    101   0.23852 0.28485 0.011047
## 46 0.00014579    102   0.23837 0.28536 0.011055
## 47 0.00014375    105   0.23793 0.28563 0.011059
## 48 0.00014347    108   0.23750 0.28576 0.011062
## 49 0.00014174    110   0.23721 0.28601 0.011062
## 50 0.00013954    112   0.23693 0.28615 0.011062
## 51 0.00013834    114   0.23665 0.28634 0.011063
## 52 0.00013569    117   0.23624 0.28677 0.011067
## 53 0.00012894    118   0.23610 0.28695 0.011068
## 54 0.00012675    119   0.23597 0.28734 0.011063
## 55 0.00012420    120   0.23584 0.28757 0.011080
## 56 0.00012131    121   0.23572 0.28722 0.011073
## 57 0.00011929    122   0.23560 0.28785 0.011040
## 58 0.00011828    123   0.23548 0.28809 0.011069
## 59 0.00011694    126   0.23512 0.28749 0.010707
## 60 0.00011618    128   0.23489 0.28774 0.010728
## 61 0.00011496    129   0.23477 0.28778 0.010730
## 62 0.00011490    132   0.23443 0.28780 0.010730
## 63 0.00011335    133   0.23431 0.28790 0.010727
## 64 0.00011214    134   0.23420 0.28824 0.010750
## 65 0.00011206    136   0.23398 0.28847 0.010753
## 66 0.00011064    139   0.23364 0.28844 0.010753
## 67 0.00010880    142   0.23328 0.28849 0.010739
## 68 0.00010854    147   0.23274 0.28850 0.010739
## 69 0.00010578    150   0.23241 0.28866 0.010740
## 70 0.00010573    151   0.23230 0.28857 0.010740
## 71 0.00010425    152   0.23220 0.28864 0.010741
## 72 0.00010421    153   0.23209 0.28879 0.010741
## 73 0.00010000    154   0.23199 0.28894 0.010737

# Take the CP value from the cptable row with the smallest cross-validated
# error (xerror) and prune the tree back to that complexity.
best_drybeans <- tree_drybeans$cptable[
  which.min(tree_drybeans$cptable[, "xerror"]),
  "CP"
]

pruned_tree_drybeans <- prune(tree_drybeans, cp = best_drybeans)

# Plot the pruned tree. `roundint` is spelled FALSE rather than F because F is
# an ordinary variable that can be reassigned.
prp(
  pruned_tree_drybeans,
  faclen = 0,
  extra = 1,
  roundint = FALSE,
  digits = 5
)

ETM 58A-Final

Sena Torlak

`05.06.2022`