Project 3 Outputs

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
# Import the global health data set from a CSV file given by a relative path
# (resolved against the current working directory).
global_health <- read_csv("global_health.csv")
Rows: 1880 Columns: 29
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): Country, Country_Code
dbl (27): Year, Fertility_Rate, Urban_Population_Percent, Total_Population, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# List the column names of the imported data.
names(global_health)
 [1] "Country"                        "Country_Code"                  
 [3] "Year"                           "Fertility_Rate"                
 [5] "Urban_Population_Percent"       "Total_Population"              
 [7] "Water_Access_Percent"           "Unemployment_Rate"             
 [9] "Sanitary_Expense_Per_GDP"       "Life_Expectancy"               
[11] "Life_Expectancy_Female"         "Life_Expectancy_Male"          
[13] "Infant_Deaths"                  "GDP_Per_Capita"                
[15] "Hospital_Beds_Per_1000"         "Female_Population"             
[17] "Male_Population"                "Alcohol_Consumption_Per_Capita"
[19] "Immunization_Rate"              "Sanitary_Expense_Per_Capita"   
[21] "CO2_Exposure_Percent"           "Air_Pollution"                 
[23] "Labour_Force_Total"             "Tuberculosis_Per_100000"       
[25] "Suicide_Rate_Percent"           "Obesity_Rate_Percent"          
[27] "Underweight_Rate_Percent"       "Overweight_Rate_Percent"       
[29] "Safe_Water_Access_Percent"     
# Print the tibble to preview its first rows and column types.
global_health
# A tibble: 1,880 × 29
   Country     Country_Code  Year Fertility_Rate Urban_Population_Percent
   <chr>       <chr>        <dbl>          <dbl>                    <dbl>
 1 Afghanistan AFG           2012           5.83                     24.2
 2 Afghanistan AFG           2013           5.70                     24.4
 3 Afghanistan AFG           2014           5.56                     24.6
 4 Afghanistan AFG           2015           5.40                     24.8
 5 Afghanistan AFG           2016           5.26                     25.0
 6 Afghanistan AFG           2017           5.13                     25.2
 7 Afghanistan AFG           2018           5.00                     25.5
 8 Afghanistan AFG           2019           4.87                     25.8
 9 Afghanistan AFG           2020           4.75                     26.0
10 Afghanistan AFG           2021           4.64                     26.3
# ℹ 1,870 more rows
# ℹ 24 more variables: Total_Population <dbl>, Water_Access_Percent <dbl>,
#   Unemployment_Rate <dbl>, Sanitary_Expense_Per_GDP <dbl>,
#   Life_Expectancy <dbl>, Life_Expectancy_Female <dbl>,
#   Life_Expectancy_Male <dbl>, Infant_Deaths <dbl>, GDP_Per_Capita <dbl>,
#   Hospital_Beds_Per_1000 <dbl>, Female_Population <dbl>,
#   Male_Population <dbl>, Alcohol_Consumption_Per_Capita <dbl>, …
# Histogram of male life expectancy over every row of the full data set.
hist(global_health$Life_Expectancy_Male)

library(dplyr)
# Countries kept for the regression sample. Each spelling must match the
# `Country` column exactly (the comparison is case-sensitive).
# NOTE(review): the regression output reports 128 observations used plus 62
# deleted for missingness, i.e. 190 rows. If each country contributes 10 yearly
# rows (as the printed previews suggest), only about 19 of these 25 names
# matched. "Philipines" and "vietnam" look misspelled / mis-cased, and some
# short names may differ from the data set's spelling -- confirm against
# unique(global_health$Country) before correcting them, because a correction
# changes the fitted models.
regression_countries <- c(
  "India", "China", "United States", "Indonesia", "Pakistan", "Nigeria",
  "Brazil", "Bangladesh", "Russia", "Ethiopia", "Mexico", "Japan", "Egypt",
  "Philipines", "Dominican Republic", "vietnam", "Iran", "Turkey", "Germany",
  "Thailand", "Tanzania", "United Kingdom", "France", "South Africa", "Italy"
)

# Report requested names that match no row instead of dropping them silently.
unmatched_countries <- setdiff(regression_countries, global_health$Country)
if (length(unmatched_countries) > 0) {
  warning(
    "No rows found for: ", paste(unmatched_countries, collapse = ", "),
    call. = FALSE
  )
}

# Same row selection as the original chain of `==` comparisons joined by `|`.
global_health_new <- global_health %>%
  filter(Country %in% regression_countries)
# Multiple linear regression of male life expectancy on six predictors,
# fitted to the filtered country sample. lm() drops rows with missing values
# in any model variable.
mod1 <- lm(
  Life_Expectancy_Male ~ GDP_Per_Capita + Tuberculosis_Per_100000 +
    Safe_Water_Access_Percent + Immunization_Rate + Air_Pollution +
    Suicide_Rate_Percent,
  data = global_health_new
)
library(psych)
Warning: package 'psych' was built under R version 4.3.3

Attaching package: 'psych'
The following objects are masked from 'package:ggplot2':

    %+%, alpha
# Keep only the response and the six predictors used in the regression.
data_lm <- global_health_new %>%
  select(
    Life_Expectancy_Male,
    GDP_Per_Capita,
    Tuberculosis_Per_100000,
    Safe_Water_Access_Percent,
    Immunization_Rate,
    Air_Pollution,
    Suicide_Rate_Percent
  )

# Pairwise panel plot (psych) of the model variables, used to inspect their
# relationships before fitting.
pairs.panels(data_lm)

# Same formula as mod1, fitted to the column subset data_lm. Because data_lm
# is only a column selection of global_health_new, the printed summary matches
# mod1's summary.
mod2 <- lm(
  Life_Expectancy_Male ~ GDP_Per_Capita + Tuberculosis_Per_100000 +
    Safe_Water_Access_Percent + Immunization_Rate + Air_Pollution +
    Suicide_Rate_Percent,
  data = data_lm
)
summary(mod2)

Call:
lm(formula = Life_Expectancy_Male ~ GDP_Per_Capita + Tuberculosis_Per_100000 + 
    Safe_Water_Access_Percent + Immunization_Rate + Air_Pollution + 
    Suicide_Rate_Percent, data = data_lm)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.0878 -1.0732  0.1389  1.2499  4.5823 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)    
(Intercept)                4.239e+01  1.357e+00  31.251  < 2e-16 ***
GDP_Per_Capita             1.255e-04  1.939e-05   6.473 2.15e-09 ***
Tuberculosis_Per_100000   -1.097e-02  1.065e-03 -10.302  < 2e-16 ***
Safe_Water_Access_Percent  3.184e-02  1.661e-02   1.917 0.057605 .  
Immunization_Rate          2.920e-01  2.083e-02  14.017  < 2e-16 ***
Air_Pollution             -3.772e-02  1.071e-02  -3.523 0.000603 ***
Suicide_Rate_Percent       1.350e-01  4.010e-02   3.366 0.001022 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.895 on 121 degrees of freedom
  (62 observations deleted due to missingness)
Multiple R-squared:  0.9444,    Adjusted R-squared:  0.9416 
F-statistic: 342.3 on 6 and 121 DF,  p-value: < 2.2e-16
# Stepwise model selection by AIC, starting from the full model mod1.
step(mod1)
Start:  AIC=170.44
Life_Expectancy_Male ~ GDP_Per_Capita + Tuberculosis_Per_100000 + 
    Safe_Water_Access_Percent + Immunization_Rate + Air_Pollution + 
    Suicide_Rate_Percent

                            Df Sum of Sq     RSS    AIC
<none>                                    434.52 170.44
- Safe_Water_Access_Percent  1     13.20  447.72 172.27
- Suicide_Rate_Percent       1     40.69  475.22 179.90
- Air_Pollution              1     44.57  479.09 180.94
- GDP_Per_Capita             1    150.46  584.99 206.50
- Tuberculosis_Per_100000    1    381.16  815.68 249.06
- Immunization_Rate          1    705.52 1140.05 291.91

Call:
lm(formula = Life_Expectancy_Male ~ GDP_Per_Capita + Tuberculosis_Per_100000 + 
    Safe_Water_Access_Percent + Immunization_Rate + Air_Pollution + 
    Suicide_Rate_Percent, data = global_health_new)

Coefficients:
              (Intercept)             GDP_Per_Capita  
               42.3935850                  0.0001255  
  Tuberculosis_Per_100000  Safe_Water_Access_Percent  
               -0.0109688                  0.0318391  
        Immunization_Rate              Air_Pollution  
                0.2920089                 -0.0377223  
     Suicide_Rate_Percent  
                0.1349724  
# Coefficient table and fit statistics for mod1.
summary(mod1)

Call:
lm(formula = Life_Expectancy_Male ~ GDP_Per_Capita + Tuberculosis_Per_100000 + 
    Safe_Water_Access_Percent + Immunization_Rate + Air_Pollution + 
    Suicide_Rate_Percent, data = global_health_new)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.0878 -1.0732  0.1389  1.2499  4.5823 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)    
(Intercept)                4.239e+01  1.357e+00  31.251  < 2e-16 ***
GDP_Per_Capita             1.255e-04  1.939e-05   6.473 2.15e-09 ***
Tuberculosis_Per_100000   -1.097e-02  1.065e-03 -10.302  < 2e-16 ***
Safe_Water_Access_Percent  3.184e-02  1.661e-02   1.917 0.057605 .  
Immunization_Rate          2.920e-01  2.083e-02  14.017  < 2e-16 ***
Air_Pollution             -3.772e-02  1.071e-02  -3.523 0.000603 ***
Suicide_Rate_Percent       1.350e-01  4.010e-02   3.366 0.001022 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.895 on 121 degrees of freedom
  (62 observations deleted due to missingness)
Multiple R-squared:  0.9444,    Adjusted R-squared:  0.9416 
F-statistic: 342.3 on 6 and 121 DF,  p-value: < 2.2e-16
# Draw the standard diagnostic plots for the fitted linear model.
plot(mod1)

# Prints the mod1 summary again; mod1 has not been refitted, so this output is
# the same as the earlier summary(mod1) call.
summary(mod1)

Call:
lm(formula = Life_Expectancy_Male ~ GDP_Per_Capita + Tuberculosis_Per_100000 + 
    Safe_Water_Access_Percent + Immunization_Rate + Air_Pollution + 
    Suicide_Rate_Percent, data = global_health_new)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.0878 -1.0732  0.1389  1.2499  4.5823 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)    
(Intercept)                4.239e+01  1.357e+00  31.251  < 2e-16 ***
GDP_Per_Capita             1.255e-04  1.939e-05   6.473 2.15e-09 ***
Tuberculosis_Per_100000   -1.097e-02  1.065e-03 -10.302  < 2e-16 ***
Safe_Water_Access_Percent  3.184e-02  1.661e-02   1.917 0.057605 .  
Immunization_Rate          2.920e-01  2.083e-02  14.017  < 2e-16 ***
Air_Pollution             -3.772e-02  1.071e-02  -3.523 0.000603 ***
Suicide_Rate_Percent       1.350e-01  4.010e-02   3.366 0.001022 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.895 on 121 degrees of freedom
  (62 observations deleted due to missingness)
Multiple R-squared:  0.9444,    Adjusted R-squared:  0.9416 
F-statistic: 342.3 on 6 and 121 DF,  p-value: < 2.2e-16
# Countries kept for the PCA sample. Each spelling must match the `Country`
# column exactly (the comparison is case-sensitive).
# NOTE(review): seven names are requested, but the printed tibble has 60 rows
# and only six countries appear in the later per-country comparisons, so
# "vietnam" evidently matches no row. Confirm the data set's spelling before
# correcting it, because a correction changes the PCA and ANOVA results.
pca_countries <- c(
  "India", "China", "Pakistan", "Nigeria", "Brazil", "Bangladesh", "vietnam"
)

# Report requested names that match no row instead of dropping them silently.
unmatched_pca_countries <- setdiff(pca_countries, global_health$Country)
if (length(unmatched_pca_countries) > 0) {
  warning(
    "No rows found for: ", paste(unmatched_pca_countries, collapse = ", "),
    call. = FALSE
  )
}

# Same row selection as the original chain of `==` comparisons joined by `|`.
global_health_2 <- global_health %>%
  filter(Country %in% pca_countries)
global_health_2
# A tibble: 60 × 29
   Country    Country_Code  Year Fertility_Rate Urban_Population_Percent
   <chr>      <chr>        <dbl>          <dbl>                    <dbl>
 1 Bangladesh BGD           2012           2.21                     32.0
 2 Bangladesh BGD           2013           2.18                     32.8
 3 Bangladesh BGD           2014           2.15                     33.5
 4 Bangladesh BGD           2015           2.11                     34.3
 5 Bangladesh BGD           2016           2.10                     35.1
 6 Bangladesh BGD           2017           2.04                     35.9
 7 Bangladesh BGD           2018           2.04                     36.6
 8 Bangladesh BGD           2019           2.03                     37.4
 9 Bangladesh BGD           2020           2.00                     38.2
10 Bangladesh BGD           2021           1.98                     38.9
# ℹ 50 more rows
# ℹ 24 more variables: Total_Population <dbl>, Water_Access_Percent <dbl>,
#   Unemployment_Rate <dbl>, Sanitary_Expense_Per_GDP <dbl>,
#   Life_Expectancy <dbl>, Life_Expectancy_Female <dbl>,
#   Life_Expectancy_Male <dbl>, Infant_Deaths <dbl>, GDP_Per_Capita <dbl>,
#   Hospital_Beds_Per_1000 <dbl>, Female_Population <dbl>,
#   Male_Population <dbl>, Alcohol_Consumption_Per_Capita <dbl>, …
# List the column names of the filtered data (same columns as global_health,
# since filter() only removes rows).
names(global_health_2)
 [1] "Country"                        "Country_Code"                  
 [3] "Year"                           "Fertility_Rate"                
 [5] "Urban_Population_Percent"       "Total_Population"              
 [7] "Water_Access_Percent"           "Unemployment_Rate"             
 [9] "Sanitary_Expense_Per_GDP"       "Life_Expectancy"               
[11] "Life_Expectancy_Female"         "Life_Expectancy_Male"          
[13] "Infant_Deaths"                  "GDP_Per_Capita"                
[15] "Hospital_Beds_Per_1000"         "Female_Population"             
[17] "Male_Population"                "Alcohol_Consumption_Per_Capita"
[19] "Immunization_Rate"              "Sanitary_Expense_Per_Capita"   
[21] "CO2_Exposure_Percent"           "Air_Pollution"                 
[23] "Labour_Force_Total"             "Tuberculosis_Per_100000"       
[25] "Suicide_Rate_Percent"           "Obesity_Rate_Percent"          
[27] "Underweight_Rate_Percent"       "Overweight_Rate_Percent"       
[29] "Safe_Water_Access_Percent"     
# Select the seven numeric PCA inputs (columns 1-7) followed by the country
# label (column 8). The column order matters: the PCA later indexes columns
# by position.
data_pca <- global_health_2 %>%
  select(
    Life_Expectancy_Male,
    Safe_Water_Access_Percent,
    Immunization_Rate,
    GDP_Per_Capita,
    Tuberculosis_Per_100000,
    Air_Pollution,
    Suicide_Rate_Percent,
    Country
  )

# Drop every row that has a missing value in any of the selected columns.
data_omit <- na.omit(data_pca)
data_omit
# A tibble: 48 × 8
   Life_Expectancy_Male Safe_Water_Access_Per…¹ Immunization_Rate GDP_Per_Capita
                  <dbl>                   <dbl>             <dbl>          <dbl>
 1                 67.6                    96.2                94           877.
 2                 67.1                    96.5                96           974.
 3                 67.9                    96.7                97          1109.
 4                 68.5                    96.9                98          1236.
 5                 69.1                    97.2                98          1660.
 6                 69.8                    97.4                98          1815.
 7                 70.9                    97.6                98          1963.
 8                 70.7                    97.8                98          2122.
 9                 70.3                    85.7                95         12328.
10                 70.7                    86.7                97         12259.
# ℹ 38 more rows
# ℹ abbreviated name: ¹​Safe_Water_Access_Percent
# ℹ 4 more variables: Tuberculosis_Per_100000 <dbl>, Air_Pollution <dbl>,
#   Suicide_Rate_Percent <dbl>, Country <chr>
# Principal component analysis of the seven numeric columns (column 8,
# Country, is excluded). Variables are standardized to unit variance first
# because they are measured on very different scales.
# The argument is spelled `scale.` (with the trailing dot) as prcomp() defines
# it, rather than relying on partial argument matching of `scale`.
pca_1 <- prcomp(data_omit[, 1:7], scale. = TRUE)

# Standard deviation and proportion of variance explained per component.
summary(pca_1)
Importance of components:
                          PC1    PC2    PC3     PC4     PC5     PC6     PC7
Standard deviation     1.9481 1.4119 0.9038 0.53740 0.24003 0.18767 0.11414
Proportion of Variance 0.5422 0.2848 0.1167 0.04126 0.00823 0.00503 0.00186
Cumulative Proportion  0.5422 0.8269 0.9436 0.98488 0.99311 0.99814 1.00000
# Print the component standard deviations and the rotation (loadings) matrix.
pca_1
Standard deviations (1, .., p=7):
[1] 1.9480992 1.4118688 0.9038086 0.5374046 0.2400286 0.1876714 0.1141410

Rotation (n x k) = (7 x 7):
                                   PC1        PC2         PC3         PC4
Life_Expectancy_Male      -0.453424248  0.3128736 -0.04300466 -0.19840758
Safe_Water_Access_Percent -0.342059858  0.4698841 -0.11814104  0.56471316
Immunization_Rate         -0.402829437  0.3304494 -0.38056280 -0.27227819
GDP_Per_Capita            -0.423841334 -0.3481564  0.23002723 -0.20151939
Tuberculosis_Per_100000    0.441689067  0.3147034 -0.11907259  0.33452254
Air_Pollution              0.374695099  0.3960597 -0.17233316 -0.63921476
Suicide_Rate_Percent       0.001680272  0.4415969  0.86172833 -0.08054289
                                 PC5         PC6         PC7
Life_Expectancy_Male      -0.2519143  0.13021101 -0.75821228
Safe_Water_Access_Percent -0.4187967 -0.10066693  0.37923777
Immunization_Rate          0.6701656 -0.07209552  0.23504983
GDP_Per_Capita            -0.1391077 -0.76294736  0.06467951
Tuberculosis_Per_100000    0.2557205 -0.59394685 -0.40202373
Air_Pollution             -0.4264316 -0.17492632  0.22804150
Suicide_Rate_Percent       0.2131636  0.04746297  0.09074689
# Variance of each principal component (squared standard deviations).
pca_1$sdev^2
[1] 3.79509038 1.99337352 0.81686990 0.28880372 0.05761374 0.03522056 0.01302818

Eigenvalues of the principal components (the squared standard deviations, `pca_1$sdev^2`)

# Scree plot of the component variances, drawn as a line plot.
screeplot(pca_1, type = "lines")

# Combine the principal component scores with the country label of each row.
# The label column keeps its existing name `pca_1` so that any code relying on
# that column name continues to work.
pca_data_1 <- data.frame(pca_1$x, pca_1 = data_omit$Country)

# Scatter plot of the first two component scores, one shape/colour per country.
# The aesthetics map to the label column inside the plot data rather than to
# the external vector data_omit$Country, so the mapping stays aligned with the
# rows being plotted; labs() gives the shared legend a readable title.
ggplot(pca_data_1, aes(x = PC1, y = PC2, shape = pca_1, colour = pca_1)) +
  geom_point() +
  labs(shape = "Country", colour = "Country")

# Prints the PCA object again; pca_1 has not been refitted, so this output is
# the same as the earlier print.
pca_1
Standard deviations (1, .., p=7):
[1] 1.9480992 1.4118688 0.9038086 0.5374046 0.2400286 0.1876714 0.1141410

Rotation (n x k) = (7 x 7):
                                   PC1        PC2         PC3         PC4
Life_Expectancy_Male      -0.453424248  0.3128736 -0.04300466 -0.19840758
Safe_Water_Access_Percent -0.342059858  0.4698841 -0.11814104  0.56471316
Immunization_Rate         -0.402829437  0.3304494 -0.38056280 -0.27227819
GDP_Per_Capita            -0.423841334 -0.3481564  0.23002723 -0.20151939
Tuberculosis_Per_100000    0.441689067  0.3147034 -0.11907259  0.33452254
Air_Pollution              0.374695099  0.3960597 -0.17233316 -0.63921476
Suicide_Rate_Percent       0.001680272  0.4415969  0.86172833 -0.08054289
                                 PC5         PC6         PC7
Life_Expectancy_Male      -0.2519143  0.13021101 -0.75821228
Safe_Water_Access_Percent -0.4187967 -0.10066693  0.37923777
Immunization_Rate          0.6701656 -0.07209552  0.23504983
GDP_Per_Capita            -0.1391077 -0.76294736  0.06467951
Tuberculosis_Per_100000    0.2557205 -0.59394685 -0.40202373
Air_Pollution             -0.4264316 -0.17492632  0.22804150
Suicide_Rate_Percent       0.2131636  0.04746297  0.09074689
# One-way ANOVA of the PC1 scores by country. The scores in pca_data_1 were
# computed from data_omit, so both vectors share the same row order.
aov_N <- aov(pca_data_1$PC1 ~ data_omit$Country)

# F test for a difference in mean PC1 score between countries.
summary(aov_N)
                  Df Sum Sq Mean Sq F value Pr(>F)    
data_omit$Country  5  173.6   34.71   303.6 <2e-16 ***
Residuals         42    4.8    0.11                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey honest-significant-difference comparisons of mean PC1 score for every
# pair of countries, following the ANOVA fit.
TukeyHSD(aov_N)
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = pca_data_1$PC1 ~ data_omit$Country)

$`data_omit$Country`
                          diff        lwr       upr     p adj
Brazil-Bangladesh   -2.6153545 -3.1200345 -2.110674 0.0000000
China-Bangladesh    -1.9364817 -2.4411618 -1.431802 0.0000000
India-Bangladesh     0.7135286  0.2088486  1.218209 0.0016716
Nigeria-Bangladesh   2.9397224  2.4350424  3.444402 0.0000000
Pakistan-Bangladesh  1.4138915  0.9092115  1.918572 0.0000000
China-Brazil         0.6788727  0.1741927  1.183553 0.0030787
India-Brazil         3.3288831  2.8242031  3.833563 0.0000000
Nigeria-Brazil       5.5550768  5.0503968  6.059757 0.0000000
Pakistan-Brazil      4.0292460  3.5245659  4.533926 0.0000000
India-China          2.6500104  2.1453303  3.154690 0.0000000
Nigeria-China        4.8762041  4.3715241  5.380884 0.0000000
Pakistan-China       3.3503732  2.8456932  3.855053 0.0000000
Nigeria-India        2.2261938  1.7215137  2.730874 0.0000000
Pakistan-India       0.7003629  0.1956829  1.205043 0.0021116
Pakistan-Nigeria    -1.5258309 -2.0305109 -1.021151 0.0000000