# Import data ----
# The workbook location defaults to the original hard-coded path but can be
# overridden with the LIFE_EXPECTANCY_XLSX environment variable, so the script
# can run on machines where that Downloads folder does not exist.
default_data_path <- "C:/Users/USER/Downloads/Life Expectancy Data Clean.xlsx"
data_path <- Sys.getenv("LIFE_EXPECTANCY_XLSX", unset = default_data_path)
if (!nzchar(data_path)) {
  # An empty override is treated like an unset one.
  data_path <- default_data_path
}
df <- readxl::read_excel(data_path)
# Print the structure of the imported data. The listing recorded below
# ("tibble [2,938 x 22] ...", "$ Country : chr ...") appears to be str()
# output, but no call producing it was present in the script.
str(df)
## tibble [2,938 × 22] (S3: tbl_df/tbl/data.frame)
## $ Country : chr [1:2938] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Year : num [1:2938] 2015 2014 2013 2012 2011 ...
## $ Status : chr [1:2938] "Developing" "Developing" "Developing" "Developing" ...
## $ Life expectancy : num [1:2938] 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult Mortality : num [1:2938] 263 271 268 272 275 279 281 287 295 295 ...
## $ infant deaths : num [1:2938] 62 64 66 69 71 74 77 80 82 84 ...
## $ Alcohol : num [1:2938] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage expenditure : num [1:2938] 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis B : num [1:2938] 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : num [1:2938] 1154 492 430 2787 3013 ...
## $ BMI : num [1:2938] 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under-five deaths : num [1:2938] 83 86 89 93 97 102 106 110 113 116 ...
## $ Polio : num [1:2938] 6 58 62 67 68 66 63 64 63 58 ...
## $ Total expenditure : num [1:2938] 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : num [1:2938] 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV/AIDS : num [1:2938] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num [1:2938] 584.3 612.7 631.7 670 63.5 ...
## $ Population : num [1:2938] 33736494 327582 31731688 3696958 2978599 ...
## $ thinness 1-19 years : num [1:2938] 17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
## $ thinness 5-9 years : num [1:2938] 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
## $ Income composition of resources: num [1:2938] 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num [1:2938] 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
# Descriptive statistics and characteristics ----
# General descriptive statistics: per-column summary of the full data set.
df_summary <- summary(df)
df_summary
## Country Year Status Life expectancy
## Length:2938 Min. :2000 Length:2938 Min. :36.30
## Class :character 1st Qu.:2004 Class :character 1st Qu.:63.10
## Mode :character Median :2008 Mode :character Median :72.10
## Mean :2008 Mean :69.22
## 3rd Qu.:2012 3rd Qu.:75.70
## Max. :2015 Max. :89.00
## NA's :10
## Adult Mortality infant deaths Alcohol percentage expenditure
## Min. : 1.0 Min. : 0.0 Min. : 0.0100 Min. : 0.000
## 1st Qu.: 74.0 1st Qu.: 0.0 1st Qu.: 0.8775 1st Qu.: 4.685
## Median :144.0 Median : 3.0 Median : 3.7550 Median : 64.913
## Mean :164.8 Mean : 30.3 Mean : 4.6029 Mean : 738.251
## 3rd Qu.:228.0 3rd Qu.: 22.0 3rd Qu.: 7.7025 3rd Qu.: 441.534
## Max. :723.0 Max. :1800.0 Max. :17.8700 Max. :19479.912
## NA's :10 NA's :194
## Hepatitis B Measles BMI under-five deaths
## Min. : 1.00 Min. : 0.0 Min. : 1.00 Min. : 0.00
## 1st Qu.:77.00 1st Qu.: 0.0 1st Qu.:19.30 1st Qu.: 0.00
## Median :92.00 Median : 17.0 Median :43.50 Median : 4.00
## Mean :80.94 Mean : 2419.6 Mean :38.32 Mean : 42.04
## 3rd Qu.:97.00 3rd Qu.: 360.2 3rd Qu.:56.20 3rd Qu.: 28.00
## Max. :99.00 Max. :212183.0 Max. :87.30 Max. :2500.00
## NA's :553 NA's :34
## Polio Total expenditure Diphtheria HIV/AIDS
## Min. : 3.00 Min. : 0.370 Min. : 2.00 Min. : 0.100
## 1st Qu.:78.00 1st Qu.: 4.260 1st Qu.:78.00 1st Qu.: 0.100
## Median :93.00 Median : 5.755 Median :93.00 Median : 0.100
## Mean :82.55 Mean : 5.938 Mean :82.32 Mean : 1.742
## 3rd Qu.:97.00 3rd Qu.: 7.492 3rd Qu.:97.00 3rd Qu.: 0.800
## Max. :99.00 Max. :17.600 Max. :99.00 Max. :50.600
## NA's :19 NA's :226 NA's :19
## GDP Population thinness 1-19 years
## Min. : 1.68 Min. :3.400e+01 Min. : 0.10
## 1st Qu.: 463.94 1st Qu.:1.958e+05 1st Qu.: 1.60
## Median : 1766.95 Median :1.387e+06 Median : 3.30
## Mean : 7483.16 Mean :1.275e+07 Mean : 4.84
## 3rd Qu.: 5910.81 3rd Qu.:7.420e+06 3rd Qu.: 7.20
## Max. :119172.74 Max. :1.294e+09 Max. :27.70
## NA's :448 NA's :652 NA's :34
## thinness 5-9 years Income composition of resources Schooling
## Min. : 0.10 Min. :0.0000 Min. : 0.00
## 1st Qu.: 1.50 1st Qu.:0.4930 1st Qu.:10.10
## Median : 3.30 Median :0.6770 Median :12.30
## Mean : 4.87 Mean :0.6276 Mean :11.99
## 3rd Qu.: 7.20 3rd Qu.:0.7790 3rd Qu.:14.30
## Max. :28.60 Max. :0.9480 Max. :20.70
## NA's :34 NA's :167 NA's :163
# Descriptive statistics for the numeric variables ----
# Flag numeric columns with vapply() so the mask is always a logical vector
# (sapply() can change its return type, e.g. to a list for zero-column input).
is_numeric_col <- vapply(df, is.numeric, logical(1))
# Single-index subsetting keeps a data frame even if only one column matches.
numeric_vars <- df[is_numeric_col]
# describe() is not defined in this script; its output columns (trimmed, mad,
# skew, kurtosis, se) suggest psych::describe() -- TODO confirm it is attached.
describe(numeric_vars)
## vars n mean sd median
## Year 1 2938 2007.52 4.61 2008.00
## Life expectancy 2 2928 69.22 9.52 72.10
## Adult Mortality 3 2928 164.80 124.29 144.00
## infant deaths 4 2938 30.30 117.93 3.00
## Alcohol 5 2744 4.60 4.05 3.76
## percentage expenditure 6 2938 738.25 1987.91 64.91
## Hepatitis B 7 2385 80.94 25.07 92.00
## Measles 8 2938 2419.59 11467.27 17.00
## BMI 9 2904 38.32 20.04 43.50
## under-five deaths 10 2938 42.04 160.45 4.00
## Polio 11 2919 82.55 23.43 93.00
## Total expenditure 12 2712 5.94 2.50 5.76
## Diphtheria 13 2919 82.32 23.72 93.00
## HIV/AIDS 14 2938 1.74 5.08 0.10
## GDP 15 2490 7483.16 14270.17 1766.95
## Population 16 2286 12753375.12 61012096.51 1386542.00
## thinness 1-19 years 17 2904 4.84 4.42 3.30
## thinness 5-9 years 18 2904 4.87 4.51 3.30
## Income composition of resources 19 2771 0.63 0.21 0.68
## Schooling 20 2775 11.99 3.36 12.30
## trimmed mad min max
## Year 2007.52 5.93 2000.00 2.015000e+03
## Life expectancy 69.91 8.60 36.30 8.900000e+01
## Adult Mortality 150.51 112.68 1.00 7.230000e+02
## infant deaths 10.20 4.45 0.00 1.800000e+03
## Alcohol 4.23 4.81 0.01 1.787000e+01
## percentage expenditure 230.74 96.24 0.00 1.947991e+04
## Hepatitis B 86.89 8.90 1.00 9.900000e+01
## Measles 286.08 25.20 0.00 2.121830e+05
## BMI 39.05 24.17 1.00 8.730000e+01
## under-five deaths 14.15 5.93 0.00 2.500000e+03
## Polio 88.05 8.90 3.00 9.900000e+01
## Total expenditure 5.85 2.36 0.37 1.760000e+01
## Diphtheria 87.99 8.90 2.00 9.900000e+01
## HIV/AIDS 0.54 0.00 0.10 5.060000e+01
## GDP 3751.73 2360.98 1.68 1.191727e+05
## Population 3953693.58 2012347.06 34.00 1.293859e+09
## thinness 1-19 years 4.14 3.41 0.10 2.770000e+01
## thinness 5-9 years 4.15 3.41 0.10 2.860000e+01
## Income composition of resources 0.65 0.19 0.00 9.500000e-01
## Schooling 12.17 3.11 0.00 2.070000e+01
## range skew kurtosis se
## Year 1.500000e+01 -0.01 -1.21 0.09
## Life expectancy 5.270000e+01 -0.64 -0.24 0.18
## Adult Mortality 7.220000e+02 1.17 1.74 2.30
## infant deaths 1.800000e+03 9.78 115.76 2.18
## Alcohol 1.786000e+01 0.59 -0.81 0.08
## percentage expenditure 1.947991e+04 4.65 26.51 36.68
## Hepatitis B 9.800000e+01 -1.93 2.76 0.51
## Measles 2.121830e+05 9.43 114.58 211.56
## BMI 8.630000e+01 -0.22 -1.29 0.37
## under-five deaths 2.500000e+03 9.49 109.49 2.96
## Polio 9.600000e+01 -2.10 3.76 0.43
## Total expenditure 1.723000e+01 0.62 1.15 0.05
## Diphtheria 9.700000e+01 -2.07 3.55 0.44
## HIV/AIDS 5.050000e+01 5.39 34.80 0.09
## GDP 1.191711e+05 3.20 12.29 285.98
## Population 1.293859e+09 15.90 297.09 1276079.80
## thinness 1-19 years 2.760000e+01 1.71 3.96 0.08
## thinness 5-9 years 2.850000e+01 1.78 4.34 0.08
## Income composition of resources 9.500000e-01 -1.14 1.38 0.00
## Schooling 2.070000e+01 -0.60 0.88 0.06
# Missing values: number of NA entries in each column of df.
missing_flags <- is.na(df)
colSums(missing_flags)
## Country Year
## 0 0
## Status Life expectancy
## 0 10
## Adult Mortality infant deaths
## 10 0
## Alcohol percentage expenditure
## 194 0
## Hepatitis B Measles
## 553 0
## BMI under-five deaths
## 34 0
## Polio Total expenditure
## 19 226
## Diphtheria HIV/AIDS
## 19 0
## GDP Population
## 448 652
## thinness 1-19 years thinness 5-9 years
## 34 34
## Income composition of resources Schooling
## 167 163
# Skewness / kurtosis check for the numeric variables.
shape_columns <- c("skew", "kurtosis")
numeric_description <- describe(numeric_vars)
numeric_description[, shape_columns]
## skew kurtosis
## Year -0.01 -1.21
## Life expectancy -0.64 -0.24
## Adult Mortality 1.17 1.74
## infant deaths 9.78 115.76
## Alcohol 0.59 -0.81
## percentage expenditure 4.65 26.51
## Hepatitis B -1.93 2.76
## Measles 9.43 114.58
## BMI -0.22 -1.29
## under-five deaths 9.49 109.49
## Polio -2.10 3.76
## Total expenditure 0.62 1.15
## Diphtheria -2.07 3.55
## HIV/AIDS 5.39 34.80
## GDP 3.20 12.29
## Population 15.90 297.09
## thinness 1-19 years 1.71 3.96
## thinness 5-9 years 1.78 4.34
## Income composition of resources -1.14 1.38
## Schooling -0.60 0.88
# Correlation between the numeric variables.
# With "pairwise.complete.obs" each coefficient uses the rows in which both
# variables of that pair are observed.
cor_matrix <- cor(x = numeric_vars, use = "pairwise.complete.obs")
shown <- seq_len(10)
cor_matrix[shown, shown] # display only the first 10 x 10 corner
## Year Life expectancy Adult Mortality
## Year 1.00000000 0.1700330 -0.07905159
## Life expectancy 0.17003302 1.0000000 -0.69635931
## Adult Mortality -0.07905159 -0.6963593 1.00000000
## infant deaths -0.03741493 -0.1965572 0.07875601
## Alcohol -0.05298978 0.4048768 -0.19584820
## percentage expenditure 0.03139998 0.3818635 -0.24285953
## Hepatitis B 0.10433334 0.2567619 -0.16247633
## Measles -0.08249298 -0.1575858 0.03117641
## BMI 0.10897365 0.5676935 -0.38701678
## under-five deaths -0.04293699 -0.2225291 0.09414613
## infant deaths Alcohol percentage expenditure
## Year -0.03741493 -0.05298978 0.03139998
## Life expectancy -0.19655718 0.40487676 0.38186350
## Adult Mortality 0.07875601 -0.19584820 -0.24285953
## infant deaths 1.00000000 -0.11563768 -0.08561222
## Alcohol -0.11563768 1.00000000 0.34128531
## percentage expenditure -0.08561222 0.34128531 1.00000000
## Hepatitis B -0.22356628 0.08754871 0.01627369
## Measles 0.50112834 -0.05182667 -0.05659568
## BMI -0.22727889 0.33040846 0.22869975
## under-five deaths 0.99662888 -0.11237040 -0.08785231
## Hepatitis B Measles BMI under-five deaths
## Year 0.10433334 -0.08249298 0.1089736 -0.04293699
## Life expectancy 0.25676195 -0.15758580 0.5676935 -0.22252912
## Adult Mortality -0.16247633 0.03117641 -0.3870168 0.09414613
## infant deaths -0.22356628 0.50112834 -0.2272789 0.99662888
## Alcohol 0.08754871 -0.05182667 0.3304085 -0.11237040
## percentage expenditure 0.01627369 -0.05659568 0.2286998 -0.08785231
## Hepatitis B 1.00000000 -0.12052937 0.1503795 -0.23312625
## Measles -0.12052937 1.00000000 -0.1759771 0.50780871
## BMI 0.15037953 -0.17597706 1.0000000 -0.23766852
## under-five deaths -0.23312625 0.50780871 -0.2376685 1.00000000
# Check the predictor variables ----
# Response variable
Y <- "Life expectancy"
# Predictor variables: every column of df except the response
column_names <- names(df)
X <- column_names[column_names != Y]
X
## [1] "Country" "Year"
## [3] "Status" "Adult Mortality"
## [5] "infant deaths" "Alcohol"
## [7] "percentage expenditure" "Hepatitis B"
## [9] "Measles" "BMI"
## [11] "under-five deaths" "Polio"
## [13] "Total expenditure" "Diphtheria"
## [15] "HIV/AIDS" "GDP"
## [17] "Population" "thinness 1-19 years"
## [19] "thinness 5-9 years" "Income composition of resources"
## [21] "Schooling"
## Multicollinearity check (VIF)
# Keep only the numeric columns of df, then list their names.
numeric_vars <- select(df, where(is.numeric))
colnames(numeric_vars)
## [1] "Year" "Life expectancy"
## [3] "Adult Mortality" "infant deaths"
## [5] "Alcohol" "percentage expenditure"
## [7] "Hepatitis B" "Measles"
## [9] "BMI" "under-five deaths"
## [11] "Polio" "Total expenditure"
## [13] "Diphtheria" "HIV/AIDS"
## [15] "GDP" "Population"
## [17] "thinness 1-19 years" "thinness 5-9 years"
## [19] "Income composition of resources" "Schooling"
# Regress the response on all other numeric columns; the fitted model is then
# passed to vif() to obtain a variance inflation factor per predictor.
# NOTE(review): several columns contain NA; lm() presumably drops incomplete
# rows under the default na.action -- confirm the effective sample size.
model_vif <- lm(formula = `Life expectancy` ~ ., data = numeric_vars)
vif(model_vif)
## Year `Adult Mortality`
## 1.157920 1.809171
## `infant deaths` Alcohol
## 213.609554 2.067310
## `percentage expenditure` `Hepatitis B`
## 12.904426 1.680406
## Measles BMI
## 1.516630 1.802986
## `under-five deaths` Polio
## 203.591034 1.722414
## `Total expenditure` Diphtheria
## 1.124370 2.094307
## `HIV/AIDS` GDP
## 1.500870 13.649710
## Population `thinness 1-19 years`
## 1.943421 7.606109
## `thinness 5-9 years` `Income composition of resources`
## 7.584832 3.028945
## Schooling
## 3.538093