Project 2

Load Libraries

library(corrplot)

corrplot 0.92 loaded

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(dplyr)
library(sf)

Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE

library(rnaturalearth)

Load Dataset

# Read the life-expectancy dataset by its full path instead of calling
# setwd(), so the session's working directory is left untouched.
data <- read_csv(
  file.path("C:/Users/rafiz/Downloads", "life_exp_kaggle_full.csv")
)

Rows: 3306 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (4): Country Name, Country Code, Region, IncomeGroup
dbl (12): Year, Life Expectancy World Bank, Prevelance of Undernourishment, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview the first six rows of the raw data
head(data, n = 6)

# A tibble: 6 × 16
  `Country Name`  `Country Code` Region IncomeGroup  Year Life Expectancy Worl…¹
  <chr>           <chr>          <chr>  <chr>       <dbl>                  <dbl>
1 Afghanistan     AFG            South… Low income   2001                   56.3
2 Angola          AGO            Sub-S… Lower midd…  2001                   47.1
3 Albania         ALB            Europ… Upper midd…  2001                   74.3
4 Andorra         AND            Europ… High income  2001                   NA  
5 United Arab Em… ARE            Middl… High income  2001                   74.5
6 Argentina       ARG            Latin… Upper midd…  2001                   73.8
# ℹ abbreviated name: ¹`Life Expectancy World Bank`
# ℹ 10 more variables: `Prevelance of Undernourishment` <dbl>, CO2 <dbl>,
#   `Health Expenditure %` <dbl>, `Education Expenditure %` <dbl>,
#   Unemployment <dbl>, Corruption <dbl>, Sanitation <dbl>, Injuries <dbl>,
#   Communicable <dbl>, NonCommunicable <dbl>

Clean the data

# Drop every row that has a missing value in any of the eight columns below
data_clean <- data |>
  drop_na(
    `Life Expectancy World Bank`,
    `Prevelance of Undernourishment`,
    CO2,
    `Health Expenditure %`,
    `Education Expenditure %`,
    Unemployment,
    Sanitation,
    Injuries
  )

# Drop the columns not needed for the analysis. They are selected by name
# rather than by position (previously columns 12, 15 and 16) so this step
# keeps removing the right columns even if the file's column order changes.
data_clean2 <- data_clean |>
  select(-c(Corruption, Communicable, NonCommunicable))

# Keep only the numeric columns for the correlation plot. This drops the
# character columns (country name, country code, region, income group)
# by type instead of by position.
cor_data_clean3 <- data_clean2 |>
  select(where(is.numeric))

# Give the variables short names that are easier to use in plots and formulas
cor_data_clean4 <- rename(
  cor_data_clean3,
  lifeExp        = `Life Expectancy World Bank`,
  undernourished = `Prevelance of Undernourishment`,
  healthExp      = `Health Expenditure %`,
  eduExp         = `Education Expenditure %`,
  unemployment   = Unemployment,
  sanitation     = Sanitation,
  injuries       = Injuries
)

Correlation Plot

# Correlation matrix of the numeric variables, drawn with the coefficients
# printed as numbers
cor <- cor_data_clean4 |> cor()
corrplot(cor, method = "number", tl.cex = 0.6, number.cex = 0.8)

Scatter Plot

# Scatter plot of life expectancy against undernourishment with a fitted
# linear trend line drawn on top of the points
ggplot(cor_data_clean4, aes(x = undernourished, y = lifeExp)) +
  geom_point(color = "pink") +
  geom_smooth(method = lm) +
  labs(
    title = "Undernourishment vs Life Expectancy",
    x = "Undernourishment",
    y = "Life Expectancy"
  ) +
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

Linear Regression Models

# Full model: life expectancy regressed on all seven candidate predictors
fit1 <- lm(
  lifeExp ~ undernourished + CO2 + healthExp + eduExp + unemployment +
    sanitation + injuries,
  data = cor_data_clean4
)
summary(fit1)


Call:
lm(formula = lifeExp ~ undernourished + CO2 + healthExp + eduExp + 
    unemployment + sanitation + injuries, data = cor_data_clean4)

Residuals:
     Min       1Q   Median       3Q      Max 
-19.8491  -2.0877   0.3807   2.9323  20.0466 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     7.171e+01  6.239e-01 114.929  < 2e-16 ***
undernourished -5.634e-01  1.796e-02 -31.375  < 2e-16 ***
CO2            -2.036e-07  1.803e-07  -1.129   0.2592    
healthExp       5.543e-01  5.960e-02   9.301  < 2e-16 ***
eduExp         -1.641e-01  8.107e-02  -2.025   0.0431 *  
unemployment   -1.986e-01  2.460e-02  -8.072 1.57e-15 ***
sanitation      7.684e-02  5.962e-03  12.889  < 2e-16 ***
injuries        5.695e-08  2.657e-08   2.144   0.0323 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.576 on 1288 degrees of freedom
Multiple R-squared:  0.7246,    Adjusted R-squared:  0.7231 
F-statistic: 484.2 on 7 and 1288 DF,  p-value: < 2.2e-16

Removing CO2 and eduExp

# Reduced model: the same regression without CO2 and eduExp
fit2 <- lm(
  lifeExp ~ undernourished + healthExp + unemployment + sanitation + injuries,
  data = cor_data_clean4
)
summary(fit2)


Call:
lm(formula = lifeExp ~ undernourished + healthExp + unemployment + 
    sanitation + injuries, data = cor_data_clean4)

Residuals:
     Min       1Q   Median       3Q      Max 
-20.9908  -2.0431   0.3706   2.9597  19.9203 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     7.113e+01  5.489e-01 129.582  < 2e-16 ***
undernourished -5.561e-01  1.768e-02 -31.451  < 2e-16 ***
healthExp       5.257e-01  5.830e-02   9.018  < 2e-16 ***
unemployment   -2.027e-01  2.444e-02  -8.293 2.74e-16 ***
sanitation      7.597e-02  5.956e-03  12.755  < 2e-16 ***
injuries        3.794e-08  1.871e-08   2.027   0.0428 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.582 on 1290 degrees of freedom
Multiple R-squared:  0.7235,    Adjusted R-squared:  0.7225 
F-statistic: 675.2 on 5 and 1290 DF,  p-value: < 2.2e-16

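Adjusted R-squared went down slightly (0.7231 for the first fit vs 0.7225 for the second), so we stick with the first fit. Adjusted R-squared is the fair comparison here, because plain R-squared can only decrease when predictors are removed.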

Diagnostic Plots

# Draw the diagnostic plots for fit1 in a 2 x 2 grid, then restore the
# previous graphics settings so later plots are not forced into the grid
old_par <- par(mfrow = c(2, 2))
plot(fit1)
par(old_par)

Equation

The equation for my model is: lifeExp = 71.71 + (-0.5634)undernourished + (-2.036e-07)CO2 + (0.5543)healthExp + (-0.1641)eduExp + (-0.1986)unemployment + (7.684e-02)sanitation + (5.695e-08)injuries