05.06.2022
#Finding Datasets
Here are my data sets, with explanations of their attributes and aims.
#Preparing Datasets
##Online News Popularity
#install.packages('openxlsx')
#install.packages('data.table')
#install.packages('skimr')
#install.packages(file.choose(),repos=NULL)
#install.packages("readxl")
library(openxlsx)
library(readxl)
library(skimr)
library(data.table)
# Choose the Online News Popularity workbook interactively, then read its
# first sheet and inspect the column structure.
data_path_onlinenews <- file.choose()
onlinenews <- read_excel(data_path_onlinenews, sheet = 1)
str(onlinenews)
## tibble [39,644 × 61] (S3: tbl_df/tbl/data.frame)
## $ url : chr [1:39644] "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
## $ timedelta : num [1:39644] 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num [1:39644] 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num [1:39644] 219 255 211 531 1072 ...
## $ n_unique_tokens : num [1:39644] 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num [1:39644] 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num [1:39644] 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num [1:39644] 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num [1:39644] 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num [1:39644] 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num [1:39644] 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num [1:39644] 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num [1:39644] 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num [1:39644] 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num [1:39644] 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num [1:39644] 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num [1:39644] 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num [1:39644] 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num [1:39644] 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num [1:39644] 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num [1:39644] 496 0 918 0 3151 ...
## $ weekday_is_monday : num [1:39644] 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num [1:39644] 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num [1:39644] 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num [1:39644] 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num [1:39644] 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num [1:39644] 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num [1:39644] 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num [1:39644] 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num [1:39644] 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num [1:39644] 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num [1:39644] 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num [1:39644] 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num [1:39644] 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num [1:39644] 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num [1:39644] 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num [1:39644] -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num [1:39644] -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num [1:39644] -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num [1:39644] 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num [1:39644] -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num [1:39644] 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num [1:39644] 0.188 0 0 0 0.136 ...
## $ shares : num [1:39644] 593 711 1500 1200 505 855 556 891 3600 710 ...
# Build the skimr summary of every column and print it.
summary_data_onlinenews <- skim(onlinenews)
print(summary_data_onlinenews)
## ── Data Summary ────────────────────────
## Values
## Name onlinenews
## Number of rows 39644
## Number of columns 61
## _______________________
## Column type frequency:
## character 1
## numeric 60
## ________________________
## Group variables None
##
## ── Variable type: character ────────────────────────────────────────────────────
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 url 0 1 34 192 0 39644 0
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd
## 1 timedelta 0 1 355. 214.
## 2 n_tokens_title 0 1 10.4 2.11
## 3 n_tokens_content 0 1 547. 471.
## 4 n_unique_tokens 0 1 0.548 3.52
## 5 n_non_stop_words 0 1 0.996 5.23
## 6 n_non_stop_unique_tokens 0 1 0.689 3.26
## 7 num_hrefs 0 1 10.9 11.3
## 8 num_self_hrefs 0 1 3.29 3.86
## 9 num_imgs 0 1 4.54 8.31
## 10 num_videos 0 1 1.25 4.11
## 11 average_token_length 0 1 4.55 0.844
## 12 num_keywords 0 1 7.22 1.91
## 13 data_channel_is_lifestyle 0 1 0.0529 0.224
## 14 data_channel_is_entertainment 0 1 0.178 0.383
## 15 data_channel_is_bus 0 1 0.158 0.365
## 16 data_channel_is_socmed 0 1 0.0586 0.235
## 17 data_channel_is_tech 0 1 0.185 0.389
## 18 data_channel_is_world 0 1 0.213 0.409
## 19 kw_min_min 0 1 26.1 69.6
## 20 kw_max_min 0 1 1154. 3858.
## 21 kw_avg_min 0 1 312. 621.
## 22 kw_min_max 0 1 13612. 57986.
## 23 kw_max_max 0 1 752324. 214502.
## 24 kw_avg_max 0 1 259282. 135102.
## 25 kw_min_avg 0 1 1117. 1137.
## 26 kw_max_avg 0 1 5657. 6099.
## 27 kw_avg_avg 0 1 3136. 1318.
## 28 self_reference_min_shares 0 1 3999. 19739.
## 29 self_reference_max_shares 0 1 10329. 41028.
## 30 self_reference_avg_sharess 0 1 6402. 24211.
## 31 weekday_is_monday 0 1 0.168 0.374
## 32 weekday_is_tuesday 0 1 0.186 0.389
## 33 weekday_is_wednesday 0 1 0.188 0.390
## 34 weekday_is_thursday 0 1 0.183 0.387
## 35 weekday_is_friday 0 1 0.144 0.351
## 36 weekday_is_saturday 0 1 0.0619 0.241
## 37 weekday_is_sunday 0 1 0.0690 0.254
## 38 is_weekend 0 1 0.131 0.337
## 39 LDA_00 0 1 0.185 0.263
## 40 LDA_01 0 1 0.141 0.220
## 41 LDA_02 0 1 0.216 0.282
## 42 LDA_03 0 1 0.224 0.295
## 43 LDA_04 0 1 0.234 0.289
## 44 global_subjectivity 0 1 0.443 0.117
## 45 global_sentiment_polarity 0 1 0.119 0.0969
## 46 global_rate_positive_words 0 1 0.0396 0.0174
## 47 global_rate_negative_words 0 1 0.0166 0.0108
## 48 rate_positive_words 0 1 0.682 0.190
## 49 rate_negative_words 0 1 0.288 0.156
## 50 avg_positive_polarity 0 1 0.354 0.105
## 51 min_positive_polarity 0 1 0.0954 0.0713
## 52 max_positive_polarity 0 1 0.757 0.248
## 53 avg_negative_polarity 0 1 -0.260 0.128
## 54 min_negative_polarity 0 1 -0.522 0.290
## 55 max_negative_polarity 0 1 -0.108 0.0954
## 56 title_subjectivity 0 1 0.282 0.324
## 57 title_sentiment_polarity 0 1 0.0714 0.265
## 58 abs_title_subjectivity 0 1 0.342 0.189
## 59 abs_title_sentiment_polarity 0 1 0.156 0.226
## 60 shares 0 1 3395. 11627.
## p0 p25 p50 p75 p100 hist
## 1 8 164 339 542 731 ▇▇▆▆▇
## 2 2 9 10 12 23 ▁▇▇▁▁
## 3 0 246 409 716 8474 ▇▁▁▁▁
## 4 0 0.471 0.539 0.609 701 ▇▁▁▁▁
## 5 0 1.00 1.00 1.00 1042 ▇▁▁▁▁
## 6 0 0.626 0.690 0.755 650 ▇▁▁▁▁
## 7 0 4 8 14 304 ▇▁▁▁▁
## 8 0 1 3 4 116 ▇▁▁▁▁
## 9 0 1 1 4 128 ▇▁▁▁▁
## 10 0 0 0 1 91 ▇▁▁▁▁
## 11 0 4.48 4.66 4.85 8.04 ▁▁▇▃▁
## 12 1 6 7 9 10 ▁▂▇▇▇
## 13 0 0 0 0 1 ▇▁▁▁▁
## 14 0 0 0 0 1 ▇▁▁▁▂
## 15 0 0 0 0 1 ▇▁▁▁▂
## 16 0 0 0 0 1 ▇▁▁▁▁
## 17 0 0 0 0 1 ▇▁▁▁▂
## 18 0 0 0 0 1 ▇▁▁▁▂
## 19 -1 -1 -1 4 377 ▇▁▁▁▁
## 20 0 445 660 1000 298400 ▇▁▁▁▁
## 21 -1 142. 236. 357 42828. ▇▁▁▁▁
## 22 0 0 1400 7900 843300 ▇▁▁▁▁
## 23 0 843300 843300 843300 843300 ▁▁▁▁▇
## 24 0 172847. 244572. 330980 843300 ▃▇▃▁▁
## 25 -1 0 1024. 2057. 3613. ▇▃▃▂▂
## 26 0 3562. 4356. 6020. 298400 ▇▁▁▁▁
## 27 0 2382. 2870. 3600. 43568. ▇▁▁▁▁
## 28 0 639 1200 2600 843300 ▇▁▁▁▁
## 29 0 1100 2800 8000 843300 ▇▁▁▁▁
## 30 0 981. 2200 5200 843300 ▇▁▁▁▁
## 31 0 0 0 0 1 ▇▁▁▁▂
## 32 0 0 0 0 1 ▇▁▁▁▂
## 33 0 0 0 0 1 ▇▁▁▁▂
## 34 0 0 0 0 1 ▇▁▁▁▂
## 35 0 0 0 0 1 ▇▁▁▁▂
## 36 0 0 0 0 1 ▇▁▁▁▁
## 37 0 0 0 0 1 ▇▁▁▁▁
## 38 0 0 0 0 1 ▇▁▁▁▁
## 39 0 0.0251 0.0334 0.241 0.927 ▇▁▁▁▁
## 40 0 0.0250 0.0333 0.151 0.926 ▇▁▁▁▁
## 41 0 0.0286 0.0400 0.334 0.920 ▇▁▁▁▁
## 42 0 0.0286 0.0400 0.376 0.927 ▇▁▁▁▂
## 43 0 0.0286 0.0407 0.400 0.927 ▇▂▁▁▂
## 44 0 0.396 0.453 0.508 1 ▁▃▇▁▁
## 45 -0.394 0.0578 0.119 0.178 0.728 ▁▂▇▁▁
## 46 0 0.0284 0.0390 0.0503 0.155 ▅▇▁▁▁
## 47 0 0.00962 0.0153 0.0217 0.185 ▇▁▁▁▁
## 48 0 0.6 0.711 0.8 1 ▁▁▃▇▃
## 49 0 0.185 0.28 0.385 1 ▅▇▃▁▁
## 50 0 0.306 0.359 0.411 1 ▁▇▃▁▁
## 51 0 0.05 0.1 0.1 1 ▇▁▁▁▁
## 52 0 0.6 0.8 1 1 ▁▁▅▅▇
## 53 -1 -0.328 -0.253 -0.187 0 ▁▁▂▇▃
## 54 -1 -0.7 -0.5 -0.3 0 ▆▆▇▅▅
## 55 -1 -0.125 -0.1 -0.05 0 ▁▁▁▁▇
## 56 0 0 0.15 0.5 1 ▇▂▂▁▂
## 57 -1 0 0 0.15 1 ▁▁▇▂▁
## 58 0 0.167 0.5 0.5 0.5 ▃▂▁▁▇
## 59 0 0 0 0.25 1 ▇▂▁▁▁
## 60 1 946 1400 2800 843300 ▇▁▁▁▁
## $character
##
## ── Variable type: character ────────────────────────────────────────────────────
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 url 0 1 34 192 0 39644 0
##
## $numeric
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0 p25 p50
## 1 timedelta 0 1 355. 214. 8 164 339
## 2 n_tokens_title 0 1 10.4 2.11 2 9 10
## 3 n_tokens_content 0 1 547. 471. 0 246 409
## 4 n_unique_tokens 0 1 0.548 3.52 0 0.471 0.539
## 5 n_non_stop_words 0 1 0.996 5.23 0 1.00 1.00
## 6 n_non_stop_unique_… 0 1 0.689 3.26 0 0.626 0.690
## 7 num_hrefs 0 1 10.9 11.3 0 4 8
## 8 num_self_hrefs 0 1 3.29 3.86 0 1 3
## 9 num_imgs 0 1 4.54 8.31 0 1 1
## 10 num_videos 0 1 1.25 4.11 0 0 0
## # … with 50 more rows, and 3 more variables: p75 <dbl>, p100 <dbl>, hist <chr>
# Convert the imported tibble to a data.table and re-check its structure.
onlinenews <- data.table(onlinenews)
str(onlinenews)
## Classes 'data.table' and 'data.frame': 39644 obs. of 61 variables:
## $ url : chr "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : num 593 711 1500 1200 505 855 556 891 3600 710 ...
## - attr(*, ".internal.selfref")=<externalptr>
##Absenteeism at Work
# Choose the Absenteeism at Work workbook interactively, then read its
# first sheet and inspect the column structure.
data_path_absenteeism <- file.choose()
absenteeism <- read_excel(data_path_absenteeism, sheet = 1)
str(absenteeism)
## tibble [740 × 21] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:740] 11 36 3 7 11 3 10 20 14 1 ...
## $ Reason for absence : num [1:740] 26 0 23 7 23 23 22 23 19 22 ...
## $ Month of absence : num [1:740] 7 7 7 7 7 7 7 7 7 7 ...
## $ Day of the week : num [1:740] 3 3 4 5 5 6 6 6 2 2 ...
## $ Seasons : num [1:740] 1 1 1 1 1 1 1 1 1 1 ...
## $ Transportation expense : num [1:740] 289 118 179 279 289 179 361 260 155 235 ...
## $ Distance from Residence to Work: num [1:740] 36 13 51 5 36 51 52 50 12 11 ...
## $ Service time : num [1:740] 13 18 18 14 13 18 3 11 14 14 ...
## $ Age : num [1:740] 33 50 38 39 33 38 28 36 34 37 ...
## $ Work load Average/day : num [1:740] 239554 239554 239554 239554 239554 ...
## $ Hit target : num [1:740] 97 97 97 97 97 97 97 97 97 97 ...
## $ Disciplinary failure : num [1:740] 0 1 0 0 0 0 0 0 0 0 ...
## $ Education : num [1:740] 1 1 1 1 1 1 1 1 1 3 ...
## $ Son : num [1:740] 2 1 0 2 2 0 1 4 2 1 ...
## $ Social drinker : num [1:740] 1 1 1 1 1 1 1 1 1 0 ...
## $ Social smoker : num [1:740] 0 0 0 1 0 0 0 0 0 0 ...
## $ Pet : num [1:740] 1 0 0 0 1 0 4 0 0 1 ...
## $ Weight : num [1:740] 90 98 89 68 90 89 80 65 95 88 ...
## $ Height : num [1:740] 172 178 170 168 172 170 172 168 196 172 ...
## $ Body mass index : num [1:740] 30 31 31 24 30 31 27 23 25 29 ...
## $ Absenteeism time in hours : num [1:740] 4 0 2 4 2 2 8 4 40 8 ...
# Build the skimr summary of every column and print it.
summary_data_absenteeism <- skim(absenteeism)
print(summary_data_absenteeism)
## ── Data Summary ────────────────────────
## Values
## Name absenteeism
## Number of rows 740
## Number of columns 21
## _______________________
## Column type frequency:
## numeric 21
## ________________________
## Group variables None
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd
## 1 ID 0 1 18.0 11.0
## 2 Reason for absence 0 1 19.2 8.43
## 3 Month of absence 0 1 6.32 3.44
## 4 Day of the week 0 1 3.91 1.42
## 5 Seasons 0 1 2.54 1.11
## 6 Transportation expense 0 1 221. 67.0
## 7 Distance from Residence to Work 0 1 29.6 14.8
## 8 Service time 0 1 12.6 4.38
## 9 Age 0 1 36.4 6.48
## 10 Work load Average/day 0 1 271490. 39058.
## 11 Hit target 0 1 94.6 3.78
## 12 Disciplinary failure 0 1 0.0541 0.226
## 13 Education 0 1 1.29 0.673
## 14 Son 0 1 1.02 1.10
## 15 Social drinker 0 1 0.568 0.496
## 16 Social smoker 0 1 0.0730 0.260
## 17 Pet 0 1 0.746 1.32
## 18 Weight 0 1 79.0 12.9
## 19 Height 0 1 172. 6.03
## 20 Body mass index 0 1 26.7 4.29
## 21 Absenteeism time in hours 0 1 6.92 13.3
## p0 p25 p50 p75 p100 hist
## 1 1 9 18 28 36 ▇▇▆▆▆
## 2 0 13 23 26 28 ▁▂▂▂▇
## 3 0 3 6 9 12 ▆▆▇▅▇
## 4 2 3 4 5 6 ▇▇▇▆▇
## 5 1 2 3 4 4 ▇▇▁▇▇
## 6 118 179 225 260 388 ▃▇▅▃▂
## 7 5 16 26 50 52 ▅▃▇▂▇
## 8 1 9 13 16 29 ▂▇▆▃▁
## 9 27 31 37 40 58 ▇▇▃▂▁
## 10 205917 244387 264249 294217 378884 ▅▇▃▂▁
## 11 81 93 95 97 100 ▁▁▃▇▇
## 12 0 0 0 0 1 ▇▁▁▁▁
## 13 1 1 1 1 4 ▇▁▁▁▁
## 14 0 0 1 2 4 ▇▆▅▁▁
## 15 0 0 1 1 1 ▆▁▁▁▇
## 16 0 0 0 0 1 ▇▁▁▁▁
## 17 0 0 0 1 8 ▇▁▁▁▁
## 18 56 69 83 89 108 ▅▇▅▇▂
## 19 163 169 170 172 196 ▃▇▂▁▁
## 20 19 24 25 31 38 ▃▇▅▅▁
## 21 0 2 3 8 120 ▇▁▁▁▁
## $numeric
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0 p25 p50
## 1 ID 0 1 1.80e1 1.10e1 1 9 18
## 2 Reason for absence 0 1 1.92e1 8.43e0 0 13 23
## 3 Month of absence 0 1 6.32e0 3.44e0 0 3 6
## 4 Day of the week 0 1 3.91e0 1.42e0 2 3 4
## 5 Seasons 0 1 2.54e0 1.11e0 1 2 3
## 6 Transportation ex… 0 1 2.21e2 6.70e1 118 179 225
## 7 Distance from Res… 0 1 2.96e1 1.48e1 5 16 26
## 8 Service time 0 1 1.26e1 4.38e0 1 9 13
## 9 Age 0 1 3.65e1 6.48e0 27 31 37
## 10 Work load Average… 0 1 2.71e5 3.91e4 205917 244387 264249
## # … with 11 more rows, and 3 more variables: p75 <dbl>, p100 <dbl>, hist <chr>
# Convert the imported tibble to a data.table and re-check its structure.
absenteeism <- data.table(absenteeism)
str(absenteeism)
## Classes 'data.table' and 'data.frame': 740 obs. of 21 variables:
## $ ID : num 11 36 3 7 11 3 10 20 14 1 ...
## $ Reason for absence : num 26 0 23 7 23 23 22 23 19 22 ...
## $ Month of absence : num 7 7 7 7 7 7 7 7 7 7 ...
## $ Day of the week : num 3 3 4 5 5 6 6 6 2 2 ...
## $ Seasons : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Transportation expense : num 289 118 179 279 289 179 361 260 155 235 ...
## $ Distance from Residence to Work: num 36 13 51 5 36 51 52 50 12 11 ...
## $ Service time : num 13 18 18 14 13 18 3 11 14 14 ...
## $ Age : num 33 50 38 39 33 38 28 36 34 37 ...
## $ Work load Average/day : num 239554 239554 239554 239554 239554 ...
## $ Hit target : num 97 97 97 97 97 97 97 97 97 97 ...
## $ Disciplinary failure : num 0 1 0 0 0 0 0 0 0 0 ...
## $ Education : num 1 1 1 1 1 1 1 1 1 3 ...
## $ Son : num 2 1 0 2 2 0 1 4 2 1 ...
## $ Social drinker : num 1 1 1 1 1 1 1 1 1 0 ...
## $ Social smoker : num 0 0 0 1 0 0 0 0 0 0 ...
## $ Pet : num 1 0 0 0 1 0 4 0 0 1 ...
## $ Weight : num 90 98 89 68 90 89 80 65 95 88 ...
## $ Height : num 172 178 170 168 172 170 172 168 196 172 ...
## $ Body mass index : num 30 31 31 24 30 31 27 23 25 29 ...
## $ Absenteeism time in hours : num 4 0 2 4 2 2 8 4 40 8 ...
## - attr(*, ".internal.selfref")=<externalptr>
##Dry Beans
# Choose the Dry Beans workbook interactively, then read its first sheet
# and inspect the column structure.
data_path_beans <- file.choose()
drybeans <- read_excel(data_path_beans, sheet = 1)
str(drybeans)
## tibble [13,611 × 17] (S3: tbl_df/tbl/data.frame)
## $ Area : num [1:13611] 28395 28734 29380 30008 30140 ...
## $ Perimeter : num [1:13611] 610 638 624 646 620 ...
## $ MajorAxisLength: num [1:13611] 208 201 213 211 202 ...
## $ MinorAxisLength: num [1:13611] 174 183 176 183 190 ...
## $ AspectRation : num [1:13611] 1.2 1.1 1.21 1.15 1.06 ...
## $ Eccentricity : num [1:13611] 0.55 0.412 0.563 0.499 0.334 ...
## $ ConvexArea : num [1:13611] 28715 29172 29690 30724 30417 ...
## $ EquivDiameter : num [1:13611] 190 191 193 195 196 ...
## $ Extent : num [1:13611] 0.764 0.784 0.778 0.783 0.773 ...
## $ Solidity : num [1:13611] 0.989 0.985 0.99 0.977 0.991 ...
## $ roundness : num [1:13611] 0.958 0.887 0.948 0.904 0.985 ...
## $ Compactness : num [1:13611] 0.913 0.954 0.909 0.928 0.971 ...
## $ ShapeFactor1 : num [1:13611] 0.00733 0.00698 0.00724 0.00702 0.0067 ...
## $ ShapeFactor2 : num [1:13611] 0.00315 0.00356 0.00305 0.00321 0.00366 ...
## $ ShapeFactor3 : num [1:13611] 0.834 0.91 0.826 0.862 0.942 ...
## $ ShapeFactor4 : num [1:13611] 0.999 0.998 0.999 0.994 0.999 ...
## $ Class : chr [1:13611] "SEKER" "SEKER" "SEKER" "SEKER" ...
# Build the skimr summary of every column and print it.
summary_data_drybeans <- skim(drybeans)
print(summary_data_drybeans)
## ── Data Summary ────────────────────────
## Values
## Name drybeans
## Number of rows 13611
## Number of columns 17
## _______________________
## Column type frequency:
## character 1
## numeric 16
## ________________________
## Group variables None
##
## ── Variable type: character ────────────────────────────────────────────────────
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 Class 0 1 4 8 0 7 0
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0
## 1 Area 0 1 53048. 29324. 20420
## 2 Perimeter 0 1 855. 214. 525.
## 3 MajorAxisLength 0 1 320. 85.7 184.
## 4 MinorAxisLength 0 1 202. 45.0 123.
## 5 AspectRation 0 1 1.58 0.247 1.02
## 6 Eccentricity 0 1 0.751 0.0920 0.219
## 7 ConvexArea 0 1 53768. 29775. 20684
## 8 EquivDiameter 0 1 253. 59.2 161.
## 9 Extent 0 1 0.750 0.0491 0.555
## 10 Solidity 0 1 0.987 0.00466 0.919
## 11 roundness 0 1 0.873 0.0595 0.490
## 12 Compactness 0 1 0.800 0.0617 0.641
## 13 ShapeFactor1 0 1 0.00656 0.00113 0.00278
## 14 ShapeFactor2 0 1 0.00172 0.000596 0.000564
## 15 ShapeFactor3 0 1 0.644 0.0990 0.410
## 16 ShapeFactor4 0 1 0.995 0.00437 0.948
## p25 p50 p75 p100 hist
## 1 36328 44652 61332 254616 ▇▂▁▁▁
## 2 704. 795. 977. 1985. ▇▆▁▁▁
## 3 253. 297. 376. 739. ▇▆▂▁▁
## 4 176. 192. 217. 460. ▇▇▁▁▁
## 5 1.43 1.55 1.71 2.43 ▂▇▅▂▁
## 6 0.716 0.764 0.810 0.911 ▁▁▂▇▇
## 7 36714. 45178 62294 263261 ▇▂▁▁▁
## 8 215. 238. 279. 569. ▇▆▁▁▁
## 9 0.719 0.760 0.787 0.866 ▁▁▅▇▂
## 10 0.986 0.988 0.990 0.995 ▁▁▁▁▇
## 11 0.832 0.883 0.917 0.991 ▁▁▂▇▇
## 12 0.762 0.801 0.834 0.987 ▂▅▇▂▁
## 13 0.00590 0.00665 0.00727 0.0105 ▁▃▇▃▁
## 14 0.00115 0.00169 0.00217 0.00366 ▇▇▇▃▁
## 15 0.581 0.642 0.696 0.975 ▂▇▇▃▁
## 16 0.994 0.996 0.998 1.00 ▁▁▁▁▇
## $character
##
## ── Variable type: character ────────────────────────────────────────────────────
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 Class 0 1 4 8 0 7 0
##
## $numeric
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0 p25 p50
## 1 Area 0 1 5.30e+4 2.93e+4 2.04e+4 3.63e+4 4.47e+4
## 2 Perimeter 0 1 8.55e+2 2.14e+2 5.25e+2 7.04e+2 7.95e+2
## 3 MajorAxisLen… 0 1 3.20e+2 8.57e+1 1.84e+2 2.53e+2 2.97e+2
## 4 MinorAxisLen… 0 1 2.02e+2 4.50e+1 1.23e+2 1.76e+2 1.92e+2
## 5 AspectRation 0 1 1.58e+0 2.47e-1 1.02e+0 1.43e+0 1.55e+0
## 6 Eccentricity 0 1 7.51e-1 9.20e-2 2.19e-1 7.16e-1 7.64e-1
## 7 ConvexArea 0 1 5.38e+4 2.98e+4 2.07e+4 3.67e+4 4.52e+4
## 8 EquivDiameter 0 1 2.53e+2 5.92e+1 1.61e+2 2.15e+2 2.38e+2
## 9 Extent 0 1 7.50e-1 4.91e-2 5.55e-1 7.19e-1 7.60e-1
## 10 Solidity 0 1 9.87e-1 4.66e-3 9.19e-1 9.86e-1 9.88e-1
## 11 roundness 0 1 8.73e-1 5.95e-2 4.90e-1 8.32e-1 8.83e-1
## 12 Compactness 0 1 8.00e-1 6.17e-2 6.41e-1 7.62e-1 8.01e-1
## 13 ShapeFactor1 0 1 6.56e-3 1.13e-3 2.78e-3 5.90e-3 6.65e-3
## 14 ShapeFactor2 0 1 1.72e-3 5.96e-4 5.64e-4 1.15e-3 1.69e-3
## 15 ShapeFactor3 0 1 6.44e-1 9.90e-2 4.10e-1 5.81e-1 6.42e-1
## 16 ShapeFactor4 0 1 9.95e-1 4.37e-3 9.48e-1 9.94e-1 9.96e-1
## # … with 3 more variables: p75 <dbl>, p100 <dbl>, hist <chr>
# Convert the imported tibble to a data.table and re-check its structure.
drybeans <- data.table(drybeans)
str(drybeans)
## Classes 'data.table' and 'data.frame': 13611 obs. of 17 variables:
## $ Area : num 28395 28734 29380 30008 30140 ...
## $ Perimeter : num 610 638 624 646 620 ...
## $ MajorAxisLength: num 208 201 213 211 202 ...
## $ MinorAxisLength: num 174 183 176 183 190 ...
## $ AspectRation : num 1.2 1.1 1.21 1.15 1.06 ...
## $ Eccentricity : num 0.55 0.412 0.563 0.499 0.334 ...
## $ ConvexArea : num 28715 29172 29690 30724 30417 ...
## $ EquivDiameter : num 190 191 193 195 196 ...
## $ Extent : num 0.764 0.784 0.778 0.783 0.773 ...
## $ Solidity : num 0.989 0.985 0.99 0.977 0.991 ...
## $ roundness : num 0.958 0.887 0.948 0.904 0.985 ...
## $ Compactness : num 0.913 0.954 0.909 0.928 0.971 ...
## $ ShapeFactor1 : num 0.00733 0.00698 0.00724 0.00702 0.0067 ...
## $ ShapeFactor2 : num 0.00315 0.00356 0.00305 0.00321 0.00366 ...
## $ ShapeFactor3 : num 0.834 0.91 0.826 0.862 0.942 ...
## $ ShapeFactor4 : num 0.999 0.998 0.999 0.994 0.999 ...
## $ Class : chr "SEKER" "SEKER" "SEKER" "SEKER" ...
## - attr(*, ".internal.selfref")=<externalptr>
#Analyses
##Online News Popularity
#install.packages("caTools")
#install.packages("ISLR")
#install.packages("rpart")
#install.packages("rpart.plot")
library(ISLR)
library(rpart)
library(rpart.plot)
library(caTools)
# Seeded 70/30 train/test split of the Online News data; sample.split()
# returns a logical mask (TRUE = training row).
set.seed(1234)
sample_onlinenews <- sample.split(onlinenews$shares, SplitRatio = 0.7)
train_onlinenews <- subset(onlinenews, sample_onlinenews)
test_onlinenews <- subset(onlinenews, !sample_onlinenews)
dim(train_onlinenews)
## [1] 27831 61
dim(test_onlinenews)
## [1] 11813 61
# Regression tree for `shares` on two sentiment features. The formula uses
# bare column names resolved through `data =`; writing
# `train_onlinenews$col` inside the formula would pin the model to the
# training vectors and make predict(..., newdata = test_onlinenews)
# unusable.
# NOTE(review): the printcp() output recorded below was captured with the
# `train_onlinenews$`-prefixed formula; a re-run should change only the
# variable labels, not the numbers -- confirm when re-knitting.
tree_onlinenews <- rpart(
  shares ~ global_sentiment_polarity + avg_positive_polarity,
  data = train_onlinenews,
  control = rpart.control(cp = 0.0001)
)
printcp(tree_onlinenews)
##
## Regression tree:
## rpart(formula = train_onlinenews$shares ~ train_onlinenews$global_sentiment_polarity +
## train_onlinenews$avg_positive_polarity, data = train_onlinenews,
## control = rpart.control(cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] train_onlinenews$avg_positive_polarity
## [2] train_onlinenews$global_sentiment_polarity
##
## Root node error: 5.0688e+12/27831 = 182127751
##
## n= 27831
##
## CP nsplit rel error xerror xstd
## 1 0.00453842 0 1.00000 1.0000 0.22724
## 2 0.00225979 3 0.98638 1.0382 0.22761
## 3 0.00215863 17 0.93172 1.0522 0.22762
## 4 0.00172573 18 0.92956 1.0580 0.22759
## 5 0.00064706 21 0.92438 1.0640 0.22757
## 6 0.00061514 23 0.92309 1.0708 0.22763
## 7 0.00060658 25 0.92186 1.0707 0.22763
## 8 0.00052853 31 0.91822 1.0702 0.22762
## 9 0.00042264 43 0.91125 1.0746 0.22767
## 10 0.00040661 50 0.90829 1.0796 0.22767
## 11 0.00039138 56 0.90524 1.0814 0.22768
## 12 0.00038725 60 0.90368 1.0815 0.22768
## 13 0.00037646 63 0.90251 1.0820 0.22768
## 14 0.00034240 67 0.90101 1.0831 0.22768
## 15 0.00032118 69 0.90032 1.0874 0.22784
## 16 0.00028246 70 0.90000 1.0898 0.22785
## 17 0.00027555 73 0.89915 1.0912 0.22785
## 18 0.00027416 82 0.89667 1.0916 0.22785
## 19 0.00026612 86 0.89558 1.0916 0.22785
## 20 0.00024649 90 0.89428 1.0924 0.22786
## 21 0.00023619 92 0.89378 1.0947 0.22788
## 22 0.00023601 94 0.89331 1.0951 0.22788
## 23 0.00023547 96 0.89284 1.0951 0.22788
## 24 0.00020771 97 0.89260 1.0961 0.22788
## 25 0.00020583 99 0.89219 1.0967 0.22791
## 26 0.00020273 100 0.89198 1.0968 0.22791
## 27 0.00020194 105 0.89097 1.0969 0.22791
## 28 0.00019861 107 0.89057 1.0977 0.22791
## 29 0.00019449 108 0.89037 1.0981 0.22791
## 30 0.00019225 109 0.89017 1.0981 0.22791
## 31 0.00019210 111 0.88979 1.0981 0.22791
## 32 0.00019178 115 0.88902 1.0984 0.22791
## 33 0.00018348 116 0.88883 1.0985 0.22791
## 34 0.00017771 118 0.88846 1.0989 0.22791
## 35 0.00017353 120 0.88811 1.0997 0.22793
## 36 0.00017187 123 0.88758 1.1001 0.22793
## 37 0.00017120 125 0.88724 1.1002 0.22793
## 38 0.00015912 133 0.88553 1.1008 0.22793
## 39 0.00015122 138 0.88474 1.1030 0.22793
## 40 0.00014765 140 0.88444 1.1033 0.22793
## 41 0.00014030 143 0.88399 1.1042 0.22793
## 42 0.00013620 145 0.88371 1.1048 0.22792
## 43 0.00013565 146 0.88358 1.1053 0.22792
## 44 0.00013546 147 0.88344 1.1054 0.22792
## 45 0.00013537 149 0.88317 1.1054 0.22792
## 46 0.00013347 151 0.88290 1.1058 0.22792
## 47 0.00013105 154 0.88250 1.1061 0.22792
## 48 0.00012866 155 0.88237 1.1063 0.22793
## 49 0.00012651 156 0.88224 1.1064 0.22793
## 50 0.00012479 158 0.88199 1.1067 0.22793
## 51 0.00012420 159 0.88186 1.1067 0.22793
## 52 0.00012107 164 0.88124 1.1070 0.22793
## 53 0.00011849 165 0.88112 1.1073 0.22793
## 54 0.00011803 167 0.88088 1.1075 0.22793
## 55 0.00011317 169 0.88065 1.1078 0.22793
## 56 0.00010710 170 0.88053 1.1086 0.22793
## 57 0.00010444 174 0.88011 1.1091 0.22790
## 58 0.00010380 176 0.87990 1.1093 0.22790
## 59 0.00010189 177 0.87979 1.1094 0.22790
## 60 0.00010115 183 0.87911 1.1098 0.22790
## 61 0.00010000 188 0.87855 1.1099 0.22790
# Prune at the complexity parameter with the smallest cross-validated error
# (column "xerror" of the cptable), then plot the pruned tree.
# NOTE(review): in the cptable recorded above the minimum xerror is the
# first row (nsplit = 0), so this pruning presumably collapses the tree to
# its root node -- confirm on re-run.
cptable_onlinenews <- tree_onlinenews$cptable
best_onlinenews <- cptable_onlinenews[
  which.min(cptable_onlinenews[, "xerror"]),
  "CP"
]
pruned_tree_onlinenews <- prune(tree_onlinenews, cp = best_onlinenews)
# `FALSE` instead of `F`: `F` is an ordinary variable that can be reassigned.
prp(
  pruned_tree_onlinenews,
  faclen = 0,
  extra = 1,
  roundint = FALSE,
  digits = 5
)
##Absenteeism at Work
library(caTools)
library(ISLR)
library(rpart)
library(rpart.plot)
# Seeded 70/30 train/test split of the Absenteeism data; sample.split()
# returns a logical mask (TRUE = training row).
set.seed(1234)
sample_absenteeism <- sample.split(absenteeism[["Hit target"]], SplitRatio = 0.7)
train_absenteeism <- subset(absenteeism, sample_absenteeism)
test_absenteeism <- subset(absenteeism, !sample_absenteeism)
dim(train_absenteeism)
## [1] 516 21
dim(test_absenteeism)
## [1] 224 21
# Regression tree for `Hit target` on month of absence and education.
# Column names containing spaces are back-quoted and resolved through
# `data =`; writing `train_absenteeism$col` inside the formula would pin the
# model to the training vectors and make
# predict(..., newdata = test_absenteeism) unusable.
# NOTE(review): the printcp() output recorded below was captured with the
# `train_absenteeism$`-prefixed formula; a re-run should change only the
# variable labels, not the numbers -- confirm when re-knitting.
tree_absenteeism <- rpart(
  `Hit target` ~ `Month of absence` + Education,
  data = train_absenteeism,
  control = rpart.control(cp = 0.0001)
)
printcp(tree_absenteeism)
##
## Regression tree:
## rpart(formula = train_absenteeism$"Hit target" ~ train_absenteeism$"Month of absence" +
## train_absenteeism$Education, data = train_absenteeism, control = rpart.control(cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] train_absenteeism$"Month of absence" train_absenteeism$Education
##
## Root node error: 7316.3/516 = 14.179
##
## n= 516
##
## CP nsplit rel error xerror xstd
## 1 0.30918340 0 1.00000 1.00426 0.092469
## 2 0.15844446 1 0.69082 0.71461 0.062475
## 3 0.05830782 2 0.53237 0.55118 0.042246
## 4 0.04675656 3 0.47406 0.49130 0.033956
## 5 0.03222421 4 0.42731 0.46005 0.027757
## 6 0.00863184 5 0.39508 0.41245 0.025216
## 7 0.00716863 7 0.37782 0.41602 0.025886
## 8 0.00346618 8 0.37065 0.40016 0.024142
## 9 0.00284933 9 0.36719 0.40409 0.024671
## 10 0.00229245 10 0.36434 0.39891 0.024295
## 11 0.00207349 11 0.36204 0.39733 0.024404
## 12 0.00079125 12 0.35997 0.38981 0.024480
## 13 0.00020908 13 0.35918 0.38926 0.024483
## 14 0.00018737 16 0.35855 0.39126 0.024525
## 15 0.00012301 17 0.35836 0.39170 0.024537
## 16 0.00010000 18 0.35824 0.39150 0.024543
# Prune at the complexity parameter with the smallest cross-validated error
# (column "xerror" of the cptable), then plot the pruned tree.
cptable_absenteeism <- tree_absenteeism$cptable
best_absenteeism <- cptable_absenteeism[
  which.min(cptable_absenteeism[, "xerror"]),
  "CP"
]
pruned_tree_absenteeism <- prune(tree_absenteeism, cp = best_absenteeism)
# `FALSE` instead of `F`: `F` is an ordinary variable that can be reassigned.
prp(
  pruned_tree_absenteeism,
  faclen = 0,
  extra = 1,
  roundint = FALSE,
  digits = 5
)
##Dry Beans
library(caTools)
library(ISLR)
library(rpart)
library(rpart.plot)
# Seeded 70/30 train/test split of the Dry Beans data; sample.split()
# returns a logical mask (TRUE = training row).
set.seed(1234)
sample_drybeans <- sample.split(drybeans$Area, SplitRatio = 0.7)
train_drybeans <- subset(drybeans, sample_drybeans)
test_drybeans <- subset(drybeans, !sample_drybeans)
dim(train_drybeans)
## [1] 9527 17
dim(test_drybeans)
## [1] 4084 17
# Regression tree for `roundness` on bean class and area. The formula uses
# bare column names resolved through `data =`; writing `train_drybeans$col`
# inside the formula would pin the model to the training vectors and make
# predict(..., newdata = test_drybeans) unusable.
# NOTE(review): the printcp() output recorded below was captured with the
# `train_drybeans$`-prefixed formula; a re-run should change only the
# variable labels, not the numbers -- confirm when re-knitting.
tree_drybeans <- rpart(
  roundness ~ Class + Area,
  data = train_drybeans,
  control = rpart.control(cp = 0.0001)
)
printcp(tree_drybeans)
##
## Regression tree:
## rpart(formula = train_drybeans$roundness ~ train_drybeans$Class +
## train_drybeans$Area, data = train_drybeans, control = rpart.control(cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] train_drybeans$Area train_drybeans$Class
##
## Root node error: 33.979/9527 = 0.0035666
##
## n= 9527
##
## CP nsplit rel error xerror xstd
## 1 0.56730987 0 1.00000 1.00038 0.016049
## 2 0.07850734 1 0.43269 0.43291 0.011108
## 3 0.05313633 2 0.35418 0.35446 0.011078
## 4 0.02516983 3 0.30105 0.30132 0.010489
## 5 0.00463457 4 0.27588 0.27617 0.010814
## 6 0.00196237 5 0.27124 0.27166 0.010812
## 7 0.00180239 6 0.26928 0.27180 0.010856
## 8 0.00111562 9 0.26387 0.26858 0.010781
## 9 0.00077146 10 0.26276 0.26783 0.010785
## 10 0.00074953 12 0.26121 0.26786 0.010821
## 11 0.00074830 13 0.26046 0.26815 0.010822
## 12 0.00061847 14 0.25972 0.26764 0.010807
## 13 0.00060038 15 0.25910 0.26719 0.010792
## 14 0.00054637 16 0.25850 0.26686 0.010786
## 15 0.00052477 17 0.25795 0.26656 0.010770
## 16 0.00041559 18 0.25743 0.26577 0.010766
## 17 0.00037320 19 0.25701 0.26738 0.010782
## 18 0.00035290 21 0.25626 0.26923 0.010837
## 19 0.00033952 25 0.25485 0.26908 0.010840
## 20 0.00030047 27 0.25417 0.27001 0.010857
## 21 0.00029488 31 0.25297 0.27277 0.010923
## 22 0.00028246 33 0.25238 0.27332 0.010933
## 23 0.00026361 34 0.25210 0.27376 0.010894
## 24 0.00025764 35 0.25184 0.27535 0.010883
## 25 0.00023795 36 0.25158 0.27628 0.010894
## 26 0.00023081 49 0.24848 0.27673 0.010893
## 27 0.00022414 58 0.24629 0.27698 0.010892
## 28 0.00021897 59 0.24606 0.27709 0.010883
## 29 0.00021892 61 0.24562 0.27686 0.010882
## 30 0.00021610 63 0.24519 0.27676 0.010881
## 31 0.00020439 64 0.24497 0.27735 0.010898
## 32 0.00019177 66 0.24456 0.27783 0.010902
## 33 0.00019007 69 0.24399 0.27857 0.010913
## 34 0.00017332 70 0.24380 0.27917 0.010930
## 35 0.00017250 72 0.24345 0.28146 0.010961
## 36 0.00017173 76 0.24276 0.28146 0.010961
## 37 0.00016651 77 0.24259 0.28180 0.010965
## 38 0.00016507 78 0.24242 0.28291 0.010991
## 39 0.00016144 79 0.24226 0.28291 0.010991
## 40 0.00015919 81 0.24193 0.28308 0.011000
## 41 0.00015682 82 0.24177 0.28352 0.011002
## 42 0.00015666 85 0.24130 0.28349 0.011006
## 43 0.00015244 89 0.24068 0.28421 0.011027
## 44 0.00015096 90 0.24052 0.28458 0.011026
## 45 0.00015006 101 0.23852 0.28485 0.011047
## 46 0.00014579 102 0.23837 0.28536 0.011055
## 47 0.00014375 105 0.23793 0.28563 0.011059
## 48 0.00014347 108 0.23750 0.28576 0.011062
## 49 0.00014174 110 0.23721 0.28601 0.011062
## 50 0.00013954 112 0.23693 0.28615 0.011062
## 51 0.00013834 114 0.23665 0.28634 0.011063
## 52 0.00013569 117 0.23624 0.28677 0.011067
## 53 0.00012894 118 0.23610 0.28695 0.011068
## 54 0.00012675 119 0.23597 0.28734 0.011063
## 55 0.00012420 120 0.23584 0.28757 0.011080
## 56 0.00012131 121 0.23572 0.28722 0.011073
## 57 0.00011929 122 0.23560 0.28785 0.011040
## 58 0.00011828 123 0.23548 0.28809 0.011069
## 59 0.00011694 126 0.23512 0.28749 0.010707
## 60 0.00011618 128 0.23489 0.28774 0.010728
## 61 0.00011496 129 0.23477 0.28778 0.010730
## 62 0.00011490 132 0.23443 0.28780 0.010730
## 63 0.00011335 133 0.23431 0.28790 0.010727
## 64 0.00011214 134 0.23420 0.28824 0.010750
## 65 0.00011206 136 0.23398 0.28847 0.010753
## 66 0.00011064 139 0.23364 0.28844 0.010753
## 67 0.00010880 142 0.23328 0.28849 0.010739
## 68 0.00010854 147 0.23274 0.28850 0.010739
## 69 0.00010578 150 0.23241 0.28866 0.010740
## 70 0.00010573 151 0.23230 0.28857 0.010740
## 71 0.00010425 152 0.23220 0.28864 0.010741
## 72 0.00010421 153 0.23209 0.28879 0.010741
## 73 0.00010000 154 0.23199 0.28894 0.010737
# Prune at the complexity parameter with the smallest cross-validated error
# (column "xerror" of the cptable), then plot the pruned tree.
cptable_drybeans <- tree_drybeans$cptable
best_drybeans <- cptable_drybeans[
  which.min(cptable_drybeans[, "xerror"]),
  "CP"
]
pruned_tree_drybeans <- prune(tree_drybeans, cp = best_drybeans)
# `FALSE` instead of `F`: `F` is an ordinary variable that can be reassigned.
prp(
  pruned_tree_drybeans,
  faclen = 0,
  extra = 1,
  roundint = FALSE,
  digits = 5
)