This is an R-markdown document for PCL1491 - Introduction to machine learning (and related fields).
Let’s start by loading all the packages we will need for this session (or install first, as necessary).
library(stats) # for basic regression and stats
library(glmnet) # for the regressions
## Loading required package: Matrix
## Loaded glmnet 4.1-1
library(broom) # to clean/tidy regression results
Now let’s load our data, which we get from https://healthdata.gov/Hospital/COVID-19-Reported-Patient-Impact-and-Hospital-Capa/anag-cw7u and https://databank.worldbank.org/source/health-nutrition-and-population-statistics# .
model_data <- read.csv("/external/rprshnas01/kcni/ychen/git/TeachingScraps/L1_covid_facility_filtered.csv", row.names=1)
pca_data <- read.csv("/external/rprshnas01/kcni/ychen/git/TeachingScraps/L2_WB_health_filtered.csv", row.names=1)
Now that we have our data, we can try some basic demonstrations of the methods mentioned in the lecture portion. Let’s start with ridge and lasso regressions.
### "normal" regression
our_lr_model <- lm(total_beds_7_day_avg ~ is_metro_micro + hospital_subtype + state,
data = model_data) # a simple model for us to test; give us the "m" or slope for "is_metro_micro", "hospital_subtype", "state", with "total_beds_7_day_avg"
summary(our_lr_model)
##
## Call:
## lm(formula = total_beds_7_day_avg ~ is_metro_micro + hospital_subtype +
## state, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -214.12 -57.35 -8.65 21.63 1630.63
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.4066 26.4226 3.762 0.000169
## is_metro_microtrue 27.7185 3.1409 8.825 < 2e-16
## hospital_subtypeCritical Access Hospitals -69.4483 6.2621 -11.090 < 2e-16
## hospital_subtypeLong Term -71.2646 6.0001 -11.877 < 2e-16
## hospital_subtypeShort Term 20.7200 5.8947 3.515 0.000441
## stateAL -35.4552 26.8623 -1.320 0.186893
## stateAR -33.7800 26.3950 -1.280 0.200639
## stateAZ -23.8960 27.0733 -0.883 0.377446
## stateCA 5.4005 26.1582 0.206 0.836436
## stateCO -3.4038 26.7232 -0.127 0.898648
## stateCT 52.7943 33.8885 1.558 0.119283
## stateDC 96.5742 30.6337 3.153 0.001622
## stateDE 20.5382 35.1380 0.585 0.558892
## stateFL 70.3810 26.2778 2.678 0.007407
## stateGA 15.6371 26.3833 0.593 0.553399
## stateGU 9.8734 128.3877 0.077 0.938701
## stateHI -32.1498 29.0886 -1.105 0.269076
## stateIA -8.6677 27.4814 -0.315 0.752460
## stateID -41.2636 27.9056 -1.479 0.139246
## stateIL 5.3934 26.3929 0.204 0.838082
## stateIN -20.0398 26.7443 -0.749 0.453682
## stateKS -34.6161 26.0764 -1.327 0.184368
## stateKY 23.0772 26.4129 0.874 0.382291
## stateLA -51.3752 26.0661 -1.971 0.048747
## stateMA 26.7704 27.3075 0.980 0.326940
## stateMD 88.2743 28.3552 3.113 0.001855
## stateME -21.2126 29.1916 -0.727 0.467440
## stateMI -15.7411 26.4493 -0.595 0.551758
## stateMN -8.6010 27.3232 -0.315 0.752926
## stateMO -19.0445 26.7107 -0.713 0.475863
## stateMP -45.1266 43.4207 -1.039 0.298689
## stateMS -36.7780 26.2891 -1.399 0.161840
## stateMT -18.3265 26.5172 -0.691 0.489504
## stateNC 1.6895 28.0008 0.060 0.951886
## stateND -17.4279 26.8991 -0.648 0.517061
## stateNE -11.4790 26.2851 -0.437 0.662326
## stateNH -5.0747 29.1948 -0.174 0.862007
## stateNJ 19.6339 29.3081 0.670 0.502922
## stateNM -19.8370 32.0832 -0.618 0.536388
## stateNV -20.5451 128.3891 -0.160 0.872866
## stateNY 63.2496 26.5770 2.380 0.017332
## stateOH 0.2513 26.3270 0.010 0.992384
## stateOK -54.2185 26.1847 -2.071 0.038412
## stateOR -3.4639 27.5550 -0.126 0.899965
## statePA 47.5020 26.3583 1.802 0.071540
## statePR 117.4049 51.4395 2.282 0.022481
## stateRI 9.5327 49.2582 0.194 0.846550
## stateSC 1.4804 27.9356 0.053 0.957738
## stateSD -49.8855 27.2319 -1.832 0.066991
## stateTN -8.1266 26.4741 -0.307 0.758875
## stateTX 39.5221 28.7995 1.372 0.169985
## stateUT -29.6968 27.8748 -1.065 0.286730
## stateVA 22.3171 27.7753 0.803 0.421705
## stateVI -26.1266 67.9889 -0.384 0.700779
## stateVT -3.8556 31.7909 -0.121 0.903472
## stateWA -4.0253 27.1575 -0.148 0.882172
## stateWI 6.2761 26.4476 0.237 0.812424
## stateWV -0.5587 27.9368 -0.020 0.984045
## stateWY -18.5634 28.0730 -0.661 0.508460
##
## (Intercept) ***
## is_metro_microtrue ***
## hospital_subtypeCritical Access Hospitals ***
## hospital_subtypeLong Term ***
## hospital_subtypeShort Term ***
## stateAL
## stateAR
## stateAZ
## stateCA
## stateCO
## stateCT
## stateDC **
## stateDE
## stateFL **
## stateGA
## stateGU
## stateHI
## stateIA
## stateID
## stateIL
## stateIN
## stateKS
## stateKY
## stateLA *
## stateMA
## stateMD **
## stateME
## stateMI
## stateMN
## stateMO
## stateMP
## stateMS
## stateMT
## stateNC
## stateND
## stateNE
## stateNH
## stateNJ
## stateNM
## stateNV
## stateNY *
## stateOH
## stateOK *
## stateOR
## statePA .
## statePR *
## stateRI
## stateSC
## stateSD .
## stateTN
## stateTX
## stateUT
## stateVA
## stateVI
## stateVT
## stateWA
## stateWI
## stateWV
## stateWY
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 125.8 on 14709 degrees of freedom
## Multiple R-squared: 0.2055, Adjusted R-squared: 0.2024
## F-statistic: 65.61 on 58 and 14709 DF, p-value: < 2.2e-16
tidy_lr_results <- tidy(our_lr_model) # a function to make the model object into a more intepretable table
our_lr_model <- lm(total_beds_7_day_avg ~ all_adult_hospital_beds_7_day_avg +
inpatient_beds_used_7_day_avg,
data = model_data) # harder-to-interpret model that we can compare with ridge/lasso
tidy_lr_results <- tidy(our_lr_model)
head(tidy_lr_results) # peek at our results
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 8.89 0.308 28.8 6.22e-178
## 2 all_adult_hospital_beds_7_day_avg 0.408 0.00532 76.8 0.
## 3 inpatient_beds_used_7_day_avg 0.881 0.00690 128. 0.
### ridge/lasso/elastic net regression
set.seed(777) # to control the "randomization" that's part of ML
# for ridge and lasso regression, we need to separate our dataset
independent_variables_matrix <- as.matrix(model_data[, c("all_adult_hospital_beds_7_day_avg",
"inpatient_beds_used_7_day_avg")]) # independent variables (predictors)
dependent_variable <- model_data$total_beds_7_day_avg # dependent variable (what we're trying to predict)
# ridge regression
fit_for_lambda <- cv.glmnet(independent_variables_matrix, # get lambda for ridge regression
dependent_variable,
alpha=0) # alpha = 0 is how we specify ridge regression
our_ridge_model <- glmnet(independent_variables_matrix, # get ridge regression model
dependent_variable,
lambda = fit_for_lambda$lambda.1se) # this is one version of the "best" lambda stored in "fit_for_lambda"
tidy_ridge_results <- tidy(our_ridge_model) # our ridge results
head(tidy_ridge_results) # peek at our results
## # A tibble: 3 x 5
## term step estimate lambda dev.ratio
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1 19.2 19.6 0.926
## 2 all_adult_hospital_beds_7_day_avg 1 0.330 19.6 0.926
## 3 inpatient_beds_used_7_day_avg 1 0.776 19.6 0.926
# lasoo regression
fit_for_lambda <- cv.glmnet(independent_variables_matrix, # get lambda for lasso regression
dependent_variable,
alpha=1) # alpha = 1 is how we specify lasso regression
our_lasso_model <- glmnet(independent_variables_matrix, # get lasso regression model
dependent_variable,
lambda = fit_for_lambda$lambda.1se) # this is one version of the "best" lambda stored in "fit_for_lambda"
tidy_lasso_results <- tidy(our_lasso_model) # our lasso results
head(tidy_lasso_results) # peek at our results
## # A tibble: 3 x 5
## term step estimate lambda dev.ratio
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1 13.2 8.31 0.942
## 2 all_adult_hospital_beds_7_day_avg 1 0.376 8.31 0.942
## 3 inpatient_beds_used_7_day_avg 1 0.835 8.31 0.942
# elastic net regression
fit_for_lambda <- cv.glmnet(independent_variables_matrix, # get lambda for EN regression
dependent_variable,
alpha=0.5) # alpha between 0 and 1 is how we specify EN regression
our_EN_model <- glmnet(independent_variables_matrix, # get EN regression model
dependent_variable,
lambda = fit_for_lambda$lambda.1se) # this is one version of the "best" lambda stored in "fit_for_lambda"
tidy_EN_results <- tidy(our_EN_model) # our lasso results
head(tidy_EN_results) # peek at our results
## # A tibble: 3 x 5
## term step estimate lambda dev.ratio
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1 14.9 11.4 0.939
## 2 all_adult_hospital_beds_7_day_avg 1 0.363 11.4 0.939
## 3 inpatient_beds_used_7_day_avg 1 0.819 11.4 0.939
We can see that these regression methods give similar results, but ridge and lasso give more moderate estimates (towards zero).
The other method we will look at with code is PCA.
# before PCA
str(pca_data) # look at the structure of our data, we want to enter only numeric (or int) columns for PCA
## 'data.frame': 148 obs. of 189 variables:
## $ Year : int 2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
## $ Year.Code : chr "YR2018" "YR2018" "YR2018" "YR2018" ...
## $ Country.Name : chr "Afghanistan" "Albania" "Algeria" "Angola" ...
## $ Country.Code : chr "AFG" "ALB" "DZA" "AGO" ...
## $ Adolescent.fertility.rate..births.per.1.000.women.ages.15.19...SP.ADO.TFRT. : num 65.14 19.57 9.83 147.96 62.57 ...
## $ Age.dependency.ratio....of.working.age.population...SP.POP.DPND. : num 84.1 45.8 57.5 96.2 56 ...
## $ Age.dependency.ratio..old..SP.POP.DPND.OL. : num 4.76 20.04 10.02 4.35 17.34 ...
## $ Age.dependency.ratio..young..SP.POP.DPND.YG. : num 79.3 25.8 47.5 91.8 38.6 ...
## $ Birth.rate..crude..per.1.000.people...SP.DYN.CBRT.IN. : num 32.5 11.8 24.3 40.7 17 ...
## $ Current.health.expenditure....of.GDP...SH.XPD.CHEX.GD.ZS. : num 9.4 5.26 6.22 2.55 9.62 ...
## $ Current.health.expenditure.per.capita..current.US....SH.XPD.CHEX.PC.CD. : num 49.8 274.9 255.9 87.6 1127.9 ...
## $ Current.health.expenditure.per.capita..PPP..current.international.....SH.XPD.CHEX.PP.CD. : num 186 697 963 165 1990 ...
## $ Death.rate..crude..per.1.000.people...SP.DYN.CDRT.IN. : num 6.42 7.9 4.72 8.19 7.61 ...
## $ Domestic.general.government.health.expenditure....of.current.health.expenditure...SH.XPD.GHED.CH.ZS. : num 5.17 53.99 65.83 41.93 61.41 ...
## $ Domestic.general.government.health.expenditure....of.GDP...SH.XPD.GHED.GD.ZS. : num 0.486 2.842 4.094 1.069 5.91 ...
## $ Domestic.general.government.health.expenditure.per.capita..current.US....SH.XPD.GHED.PC.CD. : num 2.58 148.44 168.45 36.74 692.62 ...
## $ Domestic.general.government.health.expenditure.per.capita..PPP..current.international.....SH.XPD.GHED.PP.CD. : num 9.64 376.5 633.8 69.06 1221.78 ...
## $ Domestic.private.health.expenditure....of.current.health.expenditure...SH.XPD.PVTD.CH.ZS. : num 78.4 44.7 34.1 54.7 38.3 ...
## $ Domestic.private.health.expenditure.per.capita..current.US....SH.XPD.PVTD.PC.CD. : num 39.1 122.8 87.4 48 431.9 ...
## $ Domestic.private.health.expenditure.per.capita..PPP...current.international.....SH.XPD.PVTD.PP.CD. : num 146.1 311.4 328.7 90.2 761.8 ...
## $ External.health.expenditure....of.current.health.expenditure...SH.XPD.EHEX.CH.ZS. : num 16.445 1.348 0.027 3.323 0.303 ...
## $ External.health.expenditure.per.capita..current.US....SH.XPD.EHEX.PC.CD. : num 8.197 3.707 0.069 2.911 3.413 ...
## $ External.health.expenditure.per.capita..PPP..current.international.....SH.XPD.EHEX.PP.CD. : num 30.655 9.403 0.259 5.473 6.02 ...
## $ Fertility.rate..total..births.per.woman...SP.DYN.TFRT.IN. : num 4.47 1.62 3.02 5.52 2.26 ...
## $ GNI.per.capita..Atlas.method..current.US....NY.GNP.PCAP.CD. : num 520 4860 3980 3210 12370 ...
## $ Immunization..DPT....of.children.ages.12.23.months...SH.IMM.IDPT. : num 66 99 91 63 86 92 95 95 90 99 ...
## $ Immunization..HepB3....of.one.year.old.children...SH.IMM.HEPB. : num 66 99 91 59 86 92 95 95 90 99 ...
## $ Immunization..Hib3....of.children.ages.12.23.months...SH.IMM.HIB3. : num 66 99 91 59 86 92 94 95 90 99 ...
## $ Immunization..measles....of.children.ages.12.23.months...SH.IMM.MEAS. : num 64 94 80 50 94 95 95 96 89 99 ...
## $ Immunization..Pol3....of.one.year.old.children...SH.IMM.POL3. : num 73 99 91 56 84 92 95 96 90 99 ...
## $ Incidence.of.tuberculosis..per.100.000.people...SH.TBS.INCD. : num 189 18 69 355 27 31 6.6 63 14 11 ...
## $ Labor.force..female....of.total.labor.force...SL.TLF.TOTL.FE.ZS. : num 21.2 42.2 20.1 50.2 43 ...
## $ Labor.force..total..SL.TLF.TOTL.IN. : num 10319282 1400072 12589369 12705650 20551682 ...
## $ Life.expectancy.at.birth..female..years...SP.DYN.LE00.FE.IN. : num 66 80.2 77.9 63.7 79.9 ...
## $ Life.expectancy.at.birth..male..years...SP.DYN.LE00.MA.IN. : num 63 76.8 75.5 58.1 73.1 ...
## $ Life.expectancy.at.birth..total..years...SP.DYN.LE00.IN. : num 64.5 78.5 76.7 60.8 76.5 ...
## $ Mortality.caused.by.road.traffic.injury..per.100.000.people...SH.STA.TRAF.P5. : num 14.2 11 21 26 13.6 20.4 4.8 8.1 8.2 5.7 ...
## $ Mortality.caused.by.road.traffic.injury..female..per.100.000.female.population...SH.STA.TRAF.FE.P5. : num 5.3 5.1 13.6 17.8 5.5 8.6 2.5 3.2 3.5 1.8 ...
## $ Mortality.caused.by.road.traffic.injury..male..per.100.000.male.population...SH.STA.TRAF.MA.P5. : num 22.7 16.7 28.1 34.4 22 33.6 7.1 13 13.2 7.9 ...
## $ Mortality.from.CVD..cancer..diabetes.or.CRD.between.exact.ages.30.and.70......SH.DYN.NCOM.ZS. : num 35.4 11.8 13.9 22.1 16.1 20 8.7 27.4 19.8 16.1 ...
## $ Mortality.from.CVD..cancer..diabetes.or.CRD.between.exact.ages.30.and.70..female......SH.DYN.NCOM.FE.ZS. : num 36 6.7 12.9 19.3 12.2 13.4 6.9 20.6 16.6 15.6 ...
## $ Mortality.from.CVD..cancer..diabetes.or.CRD.between.exact.ages.30.and.70..male......SH.DYN.NCOM.MA.ZS. : num 34.7 16.8 14.9 24.9 20.3 28.1 10.6 34.7 23.4 16.3 ...
## $ Mortality.rate.attributed.to.unintentional.poisoning..per.100.000.population...SH.STA.POIS.P5. : num 0.9 0.3 0.7 2 0.5 0.7 0.1 0.9 0.2 0.2 ...
## $ Mortality.rate.attributed.to.unintentional.poisoning..female..per.100.000.female.population...SH.STA.POIS.P5.FE. : num 1.6 0.2 0.6 1.4 0.4 0.3 0.1 0.6 0.1 0.1 ...
## $ Mortality.rate.attributed.to.unintentional.poisoning..male..per.100.000.male.population...SH.STA.POIS.P5.MA. : num 0.3 0.4 0.8 2.7 0.7 1.1 0.2 1.3 0.3 0.3 ...
## $ Mortality.rate..adult..female..per.1.000.female.adults...SP.DYN.AMRT.FE. : num 192.5 49.5 80.7 220.3 79.5 ...
## $ Mortality.rate..adult..male..per.1.000.male.adults...SP.DYN.AMRT.MA. : num 237.6 93.3 102.9 327 147.1 ...
## $ Mortality.rate..infant..per.1.000.live.births...SP.DYN.IMRT.IN. : num 48 8.5 20.4 51.9 8.7 11 3.1 19.3 11.1 6.1 ...
## $ Mortality.rate..infant..female..per.1.000.live.births...SP.DYN.IMRT.FE.IN. : num 44.6 7.6 19 46.3 7.6 9.8 2.8 17.2 10.3 5.9 ...
## $ Mortality.rate..infant..male..per.1.000.live.births...SP.DYN.IMRT.MA.IN. : num 51.3 9.3 21.8 57.1 9.7 12.2 3.4 21.2 11.9 6.2 ...
## $ Mortality.rate..neonatal..per.1.000.live.births...SH.DYN.NMRT. : num 36.9 7.2 16 28.3 6.2 6.7 2.3 11.5 6.9 3 ...
## $ Mortality.rate..under.5..per.1.000...SH.DYN.MORT. : num 62.5 9.5 23.8 77.7 9.8 12.4 3.7 21.6 12.9 7.1 ...
## $ Mortality.rate..under.5..female..per.1.000...SH.DYN.MORT.FE. : num 58.8 8.7 22.3 71.4 8.6 11 3.3 19.4 12 6.8 ...
## $ Mortality.rate..under.5..male..per.1.000...SH.DYN.MORT.MA. : num 66.1 10.3 25.2 83.5 10.9 13.6 4 23.7 13.8 7.3 ...
## $ Number.of.deaths.ages.10.14.years..SH.DTH.1014. : int 2030 38 1153 5157 780 36 133 192 9 15 ...
## $ Number.of.deaths.ages.15.19.years..SH.DTH.1519. : int 12310 77 1520 8620 2449 78 444 344 32 33 ...
## $ Number.of.deaths.ages.20.24.years..SH.DTH.2024. : int 12990 94 2148 10858 3421 107 646 517 86 63 ...
## $ Number.of.deaths.ages.5.9.years..SH.DTH.0509. : int 2744 32 1506 10494 623 43 123 262 7 20 ...
## $ Number.of.infant.deaths..SH.DTH.IMRT. : int 57394 289 20875 64019 6560 458 984 3247 60 132 ...
## $ Number.of.infant.deaths..female..SH.DTH.IMRT.FE. : int 25852 124 9489 28240 2828 193 435 1362 27 63 ...
## $ Number.of.infant.deaths..male..SH.DTH.IMRT.MA. : int 31542 165 11386 35779 3732 265 549 1885 33 69 ...
## $ Number.of.neonatal.deaths..SH.DTH.NMRT. : int 44503 243 16407 35489 4650 277 732 1925 37 66 ...
## $ Number.of.stillbirths..SH.DTH.STLB. : int 35934 137 9768 25745 4042 549 713 1557 64 132 ...
## $ Number.of.under.five.deaths..SH.DTH.MORT. : int 74595 326 24279 94443 7373 515 1162 3660 69 154 ...
## $ Number.of.under.five.deaths..female..SH.DTH.MORT.FE. : int 34070 143 11112 42823 3197 217 512 1537 31 73 ...
## $ Number.of.under.five.deaths..male..SH.DTH.MORT.MA. : int 40525 183 13167 51620 4176 298 650 2123 38 81 ...
## $ Out.of.pocket.expenditure....of.current.health.expenditure...SH.XPD.OOPC.CH.ZS. : num 78.4 44.6 32.6 36.8 27.7 ...
## $ Out.of.pocket.expenditure.per.capita..current.US....SH.XPD.OOPC.PC.CD. : num 39.1 122.6 83.5 32.2 312.8 ...
## $ Out.of.pocket.expenditure.per.capita..PPP..current.international.....SH.XPD.OOPC.PP.CD. : num 146.1 310.9 314.3 60.6 551.7 ...
## $ Population.ages.00.04..female..SP.POP.0004.FE. : int 2727073 82505 2422461 2749390 1844394 99327 794602 408391 12565 52909 ...
## $ Population.ages.00.04..female....of.female.population...SP.POP.0004.FE.5Y. : num 15.08 5.87 11.59 17.66 8.09 ...
## $ Population.ages.00.04..male..SP.POP.0004.MA. : int 2874440 89622 2528375 2803152 1914611 111514 838004 463199 13399 55227 ...
## $ Population.ages.00.04..male....of.male.population...SP.POP.0004.MA.5Y. : num 15.05 6.14 11.85 18.39 8.82 ...
## $ Population.ages.00.14..total..SP.POP.0014.TO. : int 16017646 506571 12731313 14421718 11017254 609052 4794332 2322882 86711 302185 ...
## $ Population.ages.0.14....of.total.population...SP.POP.0014.TO.ZS. : num 43.1 17.7 30.1 46.8 24.8 ...
## $ Population.ages.0.14..female..SP.POP.0014.FE.IN. : int 7814407 238481 6232890 7189744 5407790 285125 2334187 1081862 42787 147405 ...
## $ Population.ages.0.14..female....of.female.population...SP.POP.0014.FE.ZS. : num 43.2 17 29.8 46.2 23.7 ...
## $ Population.ages.0.14..male..SP.POP.0014.MA.IN. : int 8203224 268103 6498425 7231968 5609438 323941 2460104 1241004 43927 154722 ...
## $ Population.ages.0.14..male....of.male.population...SP.POP.0014.MA.ZS. : num 43 18.4 30.5 47.4 25.9 ...
## $ Population.ages.05.09..female..SP.POP.0509.FE. : int 2632208 75338 2141363 2414154 1803649 98837 788084 369379 14159 50135 ...
## $ Population.ages.05.09..female....of.female.population...SP.POP.0509.FE.5Y. : num 14.56 5.36 10.25 15.51 7.91 ...
## $ Population.ages.05.09..male..SP.POP.0509.MA. : int 2756089 84841 2232391 2425919 1870993 112450 830426 425679 14563 52956 ...
## $ Population.ages.05.09..male....of.male.population...SP.POP.0509.MA.5Y. : num 14.43 5.81 10.46 15.92 8.62 ...
## $ Population.ages.10.14..female..SP.POP.1014.FE. : int 2455126 80638 1669067 2026200 1759748 86961 751502 304092 16062 44361 ...
## $ Population.ages.10.14..female....of.female.population...SP.POP.1014.FE.5Y. : num 13.58 5.73 7.99 13.01 7.72 ...
## $ Population.ages.10.14..male..SP.POP.1014.MA. : int 2572695 93640 1737660 2002897 1823833 99976 791675 352126 15965 46539 ...
## $ Population.ages.10.14..male....of.male.population...SP.POP.1014.MA.5Y. : num 13.47 6.41 8.15 13.14 8.41 ...
## $ Population.ages.15.19..female..SP.POP.1519.FE. : int 2148072 102209 1387891 1644830 1732000 77952 731126 304713 16402 37517 ...
## $ Population.ages.15.19..female....of.female.population...SP.POP.1519.FE.5Y. : num 11.88 7.27 6.64 10.57 7.6 ...
## $ Population.ages.15.19..male..SP.POP.1519.MA. : int 2266701 112218 1447710 1616143 1790240 90656 764512 348524 16286 43070 ...
## $ Population.ages.15.19..male....of.male.population...SP.POP.1519.MA.5Y. : num 11.87 7.69 6.79 10.6 8.25 ...
## $ Population.ages.15.64....of.total.population...SP.POP.1564.TO.ZS. : num 54.3 68.6 63.5 51 64.1 ...
## $ Population.ages.15.64..female..SP.POP.1564.FE.IN. : num 9749139 963389 13290323 8000112 14464449 ...
## $ Population.ages.15.64..female....of.female.population...SP.POP.1564.FE.ZS. : num 53.9 68.5 63.6 51.4 63.4 ...
## $ Population.ages.15.64..male..SP.POP.1564.MA.IN. : num 10444774 1002443 13520007 7705073 14065973 ...
## $ Population.ages.15.64..male....of.male.population...SP.POP.1564.MA.ZS. : num 54.7 68.7 63.4 50.6 64.8 ...
## $ Population.ages.15.64..total..SP.POP.1564.TO. : num 20193861 1965829 26810333 15705184 28530443 ...
## $ Population.ages.20.24..female..SP.POP.2024.FE. : int 1774121 116433 1586720 1364692 1734969 97623 797931 379511 16708 44464 ...
## $ Population.ages.20.24..female....of.female.population...SP.POP.2024.FE.5Y. : num 9.81 8.28 7.59 8.77 7.61 ...
## [list output truncated]
colnames(pca_data[,5:189]) # we see columns 5+ (12-49, if we want to use model_data) work for our purposes, let's see what columns they are
## [1] "Adolescent.fertility.rate..births.per.1.000.women.ages.15.19...SP.ADO.TFRT."
## [2] "Age.dependency.ratio....of.working.age.population...SP.POP.DPND."
## [3] "Age.dependency.ratio..old..SP.POP.DPND.OL."
## [4] "Age.dependency.ratio..young..SP.POP.DPND.YG."
## [5] "Birth.rate..crude..per.1.000.people...SP.DYN.CBRT.IN."
## [6] "Current.health.expenditure....of.GDP...SH.XPD.CHEX.GD.ZS."
## [7] "Current.health.expenditure.per.capita..current.US....SH.XPD.CHEX.PC.CD."
## [8] "Current.health.expenditure.per.capita..PPP..current.international.....SH.XPD.CHEX.PP.CD."
## [9] "Death.rate..crude..per.1.000.people...SP.DYN.CDRT.IN."
## [10] "Domestic.general.government.health.expenditure....of.current.health.expenditure...SH.XPD.GHED.CH.ZS."
## [11] "Domestic.general.government.health.expenditure....of.GDP...SH.XPD.GHED.GD.ZS."
## [12] "Domestic.general.government.health.expenditure.per.capita..current.US....SH.XPD.GHED.PC.CD."
## [13] "Domestic.general.government.health.expenditure.per.capita..PPP..current.international.....SH.XPD.GHED.PP.CD."
## [14] "Domestic.private.health.expenditure....of.current.health.expenditure...SH.XPD.PVTD.CH.ZS."
## [15] "Domestic.private.health.expenditure.per.capita..current.US....SH.XPD.PVTD.PC.CD."
## [16] "Domestic.private.health.expenditure.per.capita..PPP...current.international.....SH.XPD.PVTD.PP.CD."
## [17] "External.health.expenditure....of.current.health.expenditure...SH.XPD.EHEX.CH.ZS."
## [18] "External.health.expenditure.per.capita..current.US....SH.XPD.EHEX.PC.CD."
## [19] "External.health.expenditure.per.capita..PPP..current.international.....SH.XPD.EHEX.PP.CD."
## [20] "Fertility.rate..total..births.per.woman...SP.DYN.TFRT.IN."
## [21] "GNI.per.capita..Atlas.method..current.US....NY.GNP.PCAP.CD."
## [22] "Immunization..DPT....of.children.ages.12.23.months...SH.IMM.IDPT."
## [23] "Immunization..HepB3....of.one.year.old.children...SH.IMM.HEPB."
## [24] "Immunization..Hib3....of.children.ages.12.23.months...SH.IMM.HIB3."
## [25] "Immunization..measles....of.children.ages.12.23.months...SH.IMM.MEAS."
## [26] "Immunization..Pol3....of.one.year.old.children...SH.IMM.POL3."
## [27] "Incidence.of.tuberculosis..per.100.000.people...SH.TBS.INCD."
## [28] "Labor.force..female....of.total.labor.force...SL.TLF.TOTL.FE.ZS."
## [29] "Labor.force..total..SL.TLF.TOTL.IN."
## [30] "Life.expectancy.at.birth..female..years...SP.DYN.LE00.FE.IN."
## [31] "Life.expectancy.at.birth..male..years...SP.DYN.LE00.MA.IN."
## [32] "Life.expectancy.at.birth..total..years...SP.DYN.LE00.IN."
## [33] "Mortality.caused.by.road.traffic.injury..per.100.000.people...SH.STA.TRAF.P5."
## [34] "Mortality.caused.by.road.traffic.injury..female..per.100.000.female.population...SH.STA.TRAF.FE.P5."
## [35] "Mortality.caused.by.road.traffic.injury..male..per.100.000.male.population...SH.STA.TRAF.MA.P5."
## [36] "Mortality.from.CVD..cancer..diabetes.or.CRD.between.exact.ages.30.and.70......SH.DYN.NCOM.ZS."
## [37] "Mortality.from.CVD..cancer..diabetes.or.CRD.between.exact.ages.30.and.70..female......SH.DYN.NCOM.FE.ZS."
## [38] "Mortality.from.CVD..cancer..diabetes.or.CRD.between.exact.ages.30.and.70..male......SH.DYN.NCOM.MA.ZS."
## [39] "Mortality.rate.attributed.to.unintentional.poisoning..per.100.000.population...SH.STA.POIS.P5."
## [40] "Mortality.rate.attributed.to.unintentional.poisoning..female..per.100.000.female.population...SH.STA.POIS.P5.FE."
## [41] "Mortality.rate.attributed.to.unintentional.poisoning..male..per.100.000.male.population...SH.STA.POIS.P5.MA."
## [42] "Mortality.rate..adult..female..per.1.000.female.adults...SP.DYN.AMRT.FE."
## [43] "Mortality.rate..adult..male..per.1.000.male.adults...SP.DYN.AMRT.MA."
## [44] "Mortality.rate..infant..per.1.000.live.births...SP.DYN.IMRT.IN."
## [45] "Mortality.rate..infant..female..per.1.000.live.births...SP.DYN.IMRT.FE.IN."
## [46] "Mortality.rate..infant..male..per.1.000.live.births...SP.DYN.IMRT.MA.IN."
## [47] "Mortality.rate..neonatal..per.1.000.live.births...SH.DYN.NMRT."
## [48] "Mortality.rate..under.5..per.1.000...SH.DYN.MORT."
## [49] "Mortality.rate..under.5..female..per.1.000...SH.DYN.MORT.FE."
## [50] "Mortality.rate..under.5..male..per.1.000...SH.DYN.MORT.MA."
## [51] "Number.of.deaths.ages.10.14.years..SH.DTH.1014."
## [52] "Number.of.deaths.ages.15.19.years..SH.DTH.1519."
## [53] "Number.of.deaths.ages.20.24.years..SH.DTH.2024."
## [54] "Number.of.deaths.ages.5.9.years..SH.DTH.0509."
## [55] "Number.of.infant.deaths..SH.DTH.IMRT."
## [56] "Number.of.infant.deaths..female..SH.DTH.IMRT.FE."
## [57] "Number.of.infant.deaths..male..SH.DTH.IMRT.MA."
## [58] "Number.of.neonatal.deaths..SH.DTH.NMRT."
## [59] "Number.of.stillbirths..SH.DTH.STLB."
## [60] "Number.of.under.five.deaths..SH.DTH.MORT."
## [61] "Number.of.under.five.deaths..female..SH.DTH.MORT.FE."
## [62] "Number.of.under.five.deaths..male..SH.DTH.MORT.MA."
## [63] "Out.of.pocket.expenditure....of.current.health.expenditure...SH.XPD.OOPC.CH.ZS."
## [64] "Out.of.pocket.expenditure.per.capita..current.US....SH.XPD.OOPC.PC.CD."
## [65] "Out.of.pocket.expenditure.per.capita..PPP..current.international.....SH.XPD.OOPC.PP.CD."
## [66] "Population.ages.00.04..female..SP.POP.0004.FE."
## [67] "Population.ages.00.04..female....of.female.population...SP.POP.0004.FE.5Y."
## [68] "Population.ages.00.04..male..SP.POP.0004.MA."
## [69] "Population.ages.00.04..male....of.male.population...SP.POP.0004.MA.5Y."
## [70] "Population.ages.00.14..total..SP.POP.0014.TO."
## [71] "Population.ages.0.14....of.total.population...SP.POP.0014.TO.ZS."
## [72] "Population.ages.0.14..female..SP.POP.0014.FE.IN."
## [73] "Population.ages.0.14..female....of.female.population...SP.POP.0014.FE.ZS."
## [74] "Population.ages.0.14..male..SP.POP.0014.MA.IN."
## [75] "Population.ages.0.14..male....of.male.population...SP.POP.0014.MA.ZS."
## [76] "Population.ages.05.09..female..SP.POP.0509.FE."
## [77] "Population.ages.05.09..female....of.female.population...SP.POP.0509.FE.5Y."
## [78] "Population.ages.05.09..male..SP.POP.0509.MA."
## [79] "Population.ages.05.09..male....of.male.population...SP.POP.0509.MA.5Y."
## [80] "Population.ages.10.14..female..SP.POP.1014.FE."
## [81] "Population.ages.10.14..female....of.female.population...SP.POP.1014.FE.5Y."
## [82] "Population.ages.10.14..male..SP.POP.1014.MA."
## [83] "Population.ages.10.14..male....of.male.population...SP.POP.1014.MA.5Y."
## [84] "Population.ages.15.19..female..SP.POP.1519.FE."
## [85] "Population.ages.15.19..female....of.female.population...SP.POP.1519.FE.5Y."
## [86] "Population.ages.15.19..male..SP.POP.1519.MA."
## [87] "Population.ages.15.19..male....of.male.population...SP.POP.1519.MA.5Y."
## [88] "Population.ages.15.64....of.total.population...SP.POP.1564.TO.ZS."
## [89] "Population.ages.15.64..female..SP.POP.1564.FE.IN."
## [90] "Population.ages.15.64..female....of.female.population...SP.POP.1564.FE.ZS."
## [91] "Population.ages.15.64..male..SP.POP.1564.MA.IN."
## [92] "Population.ages.15.64..male....of.male.population...SP.POP.1564.MA.ZS."
## [93] "Population.ages.15.64..total..SP.POP.1564.TO."
## [94] "Population.ages.20.24..female..SP.POP.2024.FE."
## [95] "Population.ages.20.24..female....of.female.population...SP.POP.2024.FE.5Y."
## [96] "Population.ages.20.24..male..SP.POP.2024.MA."
## [97] "Population.ages.20.24..male....of.male.population...SP.POP.2024.MA.5Y."
## [98] "Population.ages.25.29..female..SP.POP.2529.FE."
## [99] "Population.ages.25.29..female....of.female.population...SP.POP.2529.FE.5Y."
## [100] "Population.ages.25.29..male..SP.POP.2529.MA."
## [101] "Population.ages.25.29..male....of.male.population...SP.POP.2529.MA.5Y."
## [102] "Population.ages.30.34..female..SP.POP.3034.FE."
## [103] "Population.ages.30.34..female....of.female.population...SP.POP.3034.FE.5Y."
## [104] "Population.ages.30.34..male..SP.POP.3034.MA."
## [105] "Population.ages.30.34..male....of.male.population...SP.POP.3034.MA.5Y."
## [106] "Population.ages.35.39..female..SP.POP.3539.FE."
## [107] "Population.ages.35.39..female....of.female.population...SP.POP.3539.FE.5Y."
## [108] "Population.ages.35.39..male..SP.POP.3539.MA."
## [109] "Population.ages.35.39..male....of.male.population...SP.POP.3539.MA.5Y."
## [110] "Population.ages.40.44..female..SP.POP.4044.FE."
## [111] "Population.ages.40.44..female....of.female.population...SP.POP.4044.FE.5Y."
## [112] "Population.ages.40.44..male..SP.POP.4044.MA."
## [113] "Population.ages.40.44..male....of.male.population...SP.POP.4044.MA.5Y."
## [114] "Population.ages.45.49..female..SP.POP.4549.FE."
## [115] "Population.ages.45.49..female....of.female.population...SP.POP.4549.FE.5Y."
## [116] "Population.ages.45.49..male..SP.POP.4549.MA."
## [117] "Population.ages.45.49..male....of.male.population...SP.POP.4549.MA.5Y."
## [118] "Population.ages.50.54..female..SP.POP.5054.FE."
## [119] "Population.ages.50.54..female....of.female.population...SP.POP.5054.FE.5Y."
## [120] "Population.ages.50.54..male..SP.POP.5054.MA."
## [121] "Population.ages.50.54..male....of.male.population...SP.POP.5054.MA.5Y."
## [122] "Population.ages.55.59..female..SP.POP.5559.FE."
## [123] "Population.ages.55.59..female....of.female.population...SP.POP.5559.FE.5Y."
## [124] "Population.ages.55.59..male..SP.POP.5559.MA."
## [125] "Population.ages.55.59..male....of.male.population...SP.POP.5559.MA.5Y."
## [126] "Population.ages.60.64..female..SP.POP.6064.FE."
## [127] "Population.ages.60.64..female....of.female.population...SP.POP.6064.FE.5Y."
## [128] "Population.ages.60.64..male..SP.POP.6064.MA."
## [129] "Population.ages.60.64..male....of.male.population...SP.POP.6064.MA.5Y."
## [130] "Population.ages.65.and.above....of.total.population...SP.POP.65UP.TO.ZS."
## [131] "Population.ages.65.and.above..female..SP.POP.65UP.FE.IN."
## [132] "Population.ages.65.and.above..female....of.female.population...SP.POP.65UP.FE.ZS."
## [133] "Population.ages.65.and.above..male..SP.POP.65UP.MA.IN."
## [134] "Population.ages.65.and.above..male....of.male.population...SP.POP.65UP.MA.ZS."
## [135] "Population.ages.65.and.above..total..SP.POP.65UP.TO."
## [136] "Population.ages.65.69..female..SP.POP.6569.FE."
## [137] "Population.ages.65.69..female....of.female.population...SP.POP.6569.FE.5Y."
## [138] "Population.ages.65.69..male..SP.POP.6569.MA."
## [139] "Population.ages.65.69..male....of.male.population...SP.POP.6569.MA.5Y."
## [140] "Population.ages.70.74..female..SP.POP.7074.FE."
## [141] "Population.ages.70.74..female....of.female.population...SP.POP.7074.FE.5Y."
## [142] "Population.ages.70.74..male..SP.POP.7074.MA."
## [143] "Population.ages.70.74..male....of.male.population...SP.POP.7074.MA.5Y."
## [144] "Population.ages.75.79..female..SP.POP.7579.FE."
## [145] "Population.ages.75.79..female....of.female.population...SP.POP.7579.FE.5Y."
## [146] "Population.ages.75.79..male..SP.POP.7579.MA."
## [147] "Population.ages.75.79..male....of.male.population...SP.POP.7579.MA.5Y."
## [148] "Population.ages.80.and.above..female..SP.POP.80UP.FE."
## [149] "Population.ages.80.and.above..male..SP.POP.80UP.MA."
## [150] "Population.ages.80.and.above..male....of.male.population...SP.POP.80UP.MA.5Y."
## [151] "Population.ages.80.and.older..female....of.female.population...SP.POP.80UP.FE.5Y."
## [152] "Population.growth..annual.....SP.POP.GROW."
## [153] "Population..female..SP.POP.TOTL.FE.IN."
## [154] "Population..female....of.total.population...SP.POP.TOTL.FE.ZS."
## [155] "Population..male..SP.POP.TOTL.MA.IN."
## [156] "Population..male....of.total.population...SP.POP.TOTL.MA.ZS."
## [157] "Population..total..SP.POP.TOTL."
## [158] "Prevalence.of.anemia.among.non.pregnant.women....of.women.ages.15.49...SH.ANM.NPRG.ZS."
## [159] "Prevalence.of.anemia.among.pregnant.women......SH.PRG.ANEM."
## [160] "Prevalence.of.anemia.among.women.of.reproductive.age....of.women.ages.15.49...SH.ANM.ALLW.ZS."
## [161] "Probability.of.dying.among.adolescents.ages.10.14.years..per.1.000...SH.DYN.1014."
## [162] "Probability.of.dying.among.adolescents.ages.15.19.years..per.1.000...SH.DYN.1519."
## [163] "Probability.of.dying.among.children.ages.5.9.years..per.1.000...SH.DYN.0509."
## [164] "Probability.of.dying.among.youth.ages.20.24.years..per.1.000...SH.DYN.2024."
## [165] "Rural.population..SP.RUR.TOTL."
## [166] "Rural.population....of.total.population...SP.RUR.TOTL.ZS."
## [167] "Rural.population.growth..annual.....SP.RUR.TOTL.ZG."
## [168] "Sex.ratio.at.birth..male.births.per.female.births...SP.POP.BRTH.MF."
## [169] "Stillbirth.rate..per.1.000.total.births...SH.DYN.STLB."
## [170] "Suicide.mortality.rate..per.100.000.population...SH.STA.SUIC.P5."
## [171] "Suicide.mortality.rate..female..per.100.000.female.population...SH.STA.SUIC.FE.P5."
## [172] "Suicide.mortality.rate..male..per.100.000.male.population...SH.STA.SUIC.MA.P5."
## [173] "Survival.to.age.65..female....of.cohort...SP.DYN.TO65.FE.ZS."
## [174] "Survival.to.age.65..male....of.cohort...SP.DYN.TO65.MA.ZS."
## [175] "Total.alcohol.consumption.per.capita..liters.of.pure.alcohol..projected.estimates..15..years.of.age...SH.ALC.PCAP.LI."
## [176] "Total.alcohol.consumption.per.capita..female..liters.of.pure.alcohol..projected.estimates..female.15..years.of.age...SH.ALC.PCAP.FE.LI."
## [177] "Total.alcohol.consumption.per.capita..male..liters.of.pure.alcohol..projected.estimates..male.15..years.of.age...SH.ALC.PCAP.MA.LI."
## [178] "Tuberculosis.case.detection.rate.....all.forms...SH.TBS.DTEC.ZS."
## [179] "Tuberculosis.death.rate..per.100.000.people...SH.TBS.MORT."
## [180] "Unemployment..female....of.female.labor.force...SL.UEM.TOTL.FE.ZS."
## [181] "Unemployment..male....of.male.labor.force...SL.UEM.TOTL.MA.ZS."
## [182] "Unemployment..total....of.total.labor.force...SL.UEM.TOTL.ZS."
## [183] "Urban.population..SP.URB.TOTL."
## [184] "Urban.population....of.total.population...SP.URB.TOTL.IN.ZS."
## [185] "Urban.population.growth..annual.....SP.URB.GROW."
# perform PCA
PCA_overall_results <- prcomp(pca_data[,5:189]) # the basic PCA run
# unpack our PCA results
var_res <- data.frame(princ_comp = colnames(PCA_overall_results$x),
stdev = PCA_overall_results$sdev) # start by looking at variance explained, take these variables from our result object
var_res$variance <- var_res$stdev^2 # get variance from sdev squared
var_res$var_explained_pct <- round((var_res$variance/sum(var_res$variance))*100,2) #convert each variance value into % of total var (then round)
head(var_res) # peek at our PC's and how they perform
## princ_comp stdev variance var_explained_pct
## 1 PC1 1112381702 1.237393e+18 99.72
## 2 PC2 52525320 2.758909e+15 0.22
## 3 PC3 23833633 5.680421e+14 0.05
## 4 PC4 8910208 7.939180e+13 0.01
## 5 PC5 2569784 6.603792e+12 0.00
## 6 PC6 1096299 1.201871e+12 0.00
PC_res <- PCA_overall_results$x # get actual PC values
PC_res <- PC_res[,1:4] # take the first 4 PCs since they explain all variance (we see from var_res)
row.names(PC_res) <- pca_data$Country.Name # set row names to countries
head(PC_res) # peek at our reduced matrix
## PC1 PC2 PC3 PC4
## Afghanistan -122097986 -13113334 2211329 -211729.9
## Albania -178305961 6141970 2860026 1423776.1
## Algeria -111597303 12284542 -4433777 -7069062.9
## Angola -131222935 4506801 -8407058 -189091.3
## Argentina -105732110 26058598 -8679147 -3233015.5
## Armenia -178196351 6150329 2695433 1238518.1
rotation_res <- PCA_overall_results$rotation # rotation results, essentially helps us see the correlation between components and variables
head(rotation_res[order(rotation_res[,1], decreasing = T),1:4]) # peek at the cor. between (1st 4) PCs and variables, sorted by PC1 results
## PC1 PC2 PC3
## Population..total..SP.POP.TOTL. 0.5786763 -0.06963585 -0.05254202
## Population.ages.15.64..total..SP.POP.1564.TO. 0.3760739 0.13457510 0.37777992
## Urban.population..SP.URB.TOTL. 0.3138990 0.55043186 -0.37816864
## Population..male..SP.POP.TOTL.MA.IN. 0.2918076 -0.04613614 0.04159357
## Population..female..SP.POP.TOTL.FE.IN. 0.2866681 -0.02447826 -0.09432308
## Rural.population..SP.RUR.TOTL. 0.2646467 -0.62070487 0.32550454
## PC4
## Population..total..SP.POP.TOTL. -0.08339044
## Population.ages.15.64..total..SP.POP.1564.TO. -0.20992133
## Urban.population..SP.URB.TOTL. -0.21783109
## Population..male..SP.POP.TOTL.MA.IN. -0.14197758
## Population..female..SP.POP.TOTL.FE.IN. 0.05607432
## Rural.population..SP.RUR.TOTL. 0.13280449
# plot
plot(PCA_overall_results$x[,1], PCA_overall_results$x[,2]) # see our "clusters"
plot(pca_data$Population..total..SP.POP.TOTL., pca_data$Population.ages.15.64..total..SP.POP.1564.TO.) # see the two highest-correlated variables of PC1