# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(broom)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
# Load dataset
# Build the file path explicitly instead of calling setwd(), so the session's
# working directory is left untouched and the data location is stated once.
data_dir <- path.expand("~/Downloads/25_Semesters/Fall/DATA101")
df <- read_csv(file.path(data_dir, "AllCountries.csv"))
## Rows: 217 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Country, Code
## dbl (24): LandArea, Population, Density, GDP, Rural, CO2, PumpPrice, Militar...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect structure: compact listing of each column's type and first values
df |> str()
## spc_tbl_ [217 × 26] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Country : chr [1:217] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
## $ Code : chr [1:217] "AFG" "ALB" "DZA" "ASM" ...
## $ LandArea : num [1:217] 652.86 27.4 2381.74 0.2 0.47 ...
## $ Population : num [1:217] 37.172 2.866 42.228 0.055 0.077 ...
## $ Density : num [1:217] 56.9 104.6 17.7 277.3 163.8 ...
## $ GDP : num [1:217] 521 5254 4279 NA 42030 ...
## $ Rural : num [1:217] 74.5 39.7 27.4 12.8 11.9 34.5 75.4 8.1 36.9 56.6 ...
## $ CO2 : num [1:217] 0.29 1.98 3.74 NA 5.83 1.29 5.74 4.78 1.9 8.41 ...
## $ PumpPrice : num [1:217] 0.7 1.36 0.28 NA NA 0.97 NA 1.1 0.77 NA ...
## $ Military : num [1:217] 3.72 4.08 13.81 NA NA ...
## $ Health : num [1:217] 2.01 9.51 10.73 NA 14.02 ...
## $ ArmedForces : num [1:217] 323 9 317 NA NA 117 0 105 49 NA ...
## $ Internet : num [1:217] 11.4 71.8 47.7 NA 98.9 14.3 76 75.8 69.7 97.2 ...
## $ Cell : num [1:217] 67.4 123.7 111 NA 104.4 ...
## $ HIV : num [1:217] NA 0.1 0.1 NA NA 1.9 NA 0.4 0.2 NA ...
## $ Hunger : num [1:217] 30.3 5.5 4.7 NA NA 23.9 NA 3.8 4.3 NA ...
## $ Diabetes : num [1:217] 9.6 10.1 6.7 NA 8 3.9 13.2 5.5 7.1 11.6 ...
## $ BirthRate : num [1:217] 32.5 11.7 22.3 NA NA 41.3 16.1 17 13.1 11 ...
## $ DeathRate : num [1:217] 6.6 7.5 4.8 NA NA 8.4 5.8 7.6 9.7 8.9 ...
## $ ElderlyPop : num [1:217] 2.6 13.6 6.4 NA NA 2.5 7.2 11.3 11.4 13.6 ...
## $ LifeExpectancy: num [1:217] 64 78.5 76.3 NA NA 61.8 76.5 76.7 74.8 76 ...
## $ FemaleLabor : num [1:217] 50.3 55.9 16.4 NA NA 76.4 NA 57.1 55.8 NA ...
## $ Unemployment : num [1:217] 1.5 13.9 12.1 NA NA 7.3 NA 9.5 17.7 NA ...
## $ Energy : num [1:217] NA 808 1328 NA NA ...
## $ Electricity : num [1:217] NA 2309 1363 NA NA ...
## $ Developed : num [1:217] NA 1 1 NA NA 1 NA 2 1 NA ...
## - attr(*, "spec")=
## .. cols(
## .. Country = col_character(),
## .. Code = col_character(),
## .. LandArea = col_double(),
## .. Population = col_double(),
## .. Density = col_double(),
## .. GDP = col_double(),
## .. Rural = col_double(),
## .. CO2 = col_double(),
## .. PumpPrice = col_double(),
## .. Military = col_double(),
## .. Health = col_double(),
## .. ArmedForces = col_double(),
## .. Internet = col_double(),
## .. Cell = col_double(),
## .. HIV = col_double(),
## .. Hunger = col_double(),
## .. Diabetes = col_double(),
## .. BirthRate = col_double(),
## .. DeathRate = col_double(),
## .. ElderlyPop = col_double(),
## .. LifeExpectancy = col_double(),
## .. FemaleLabor = col_double(),
## .. Unemployment = col_double(),
## .. Energy = col_double(),
## .. Electricity = col_double(),
## .. Developed = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Summary statistics: per-column quartiles, mean and NA counts
df |> summary()
## Country Code LandArea Population
## Length:217 Length:217 Min. : 0.01 Min. : 0.0120
## Class :character Class :character 1st Qu.: 10.83 1st Qu.: 0.7728
## Mode :character Mode :character Median : 94.28 Median : 6.5725
## Mean : 608.38 Mean : 35.0335
## 3rd Qu.: 446.30 3rd Qu.: 25.0113
## Max. :16376.87 Max. :1392.7300
## NA's :8 NA's :1
## Density GDP Rural CO2
## Min. : 0.1 Min. : 275 Min. : 0.00 Min. : 0.0400
## 1st Qu.: 37.5 1st Qu.: 2032 1st Qu.:19.62 1st Qu.: 0.8575
## Median : 92.1 Median : 5950 Median :38.15 Median : 2.7550
## Mean : 361.4 Mean : 14733 Mean :39.10 Mean : 4.9780
## 3rd Qu.: 219.8 3rd Qu.: 17298 3rd Qu.:57.83 3rd Qu.: 6.2525
## Max. :20777.5 Max. :114340 Max. :87.00 Max. :43.8600
## NA's :8 NA's :30 NA's :3 NA's :13
## PumpPrice Military Health ArmedForces
## Min. :0.1100 Min. : 0.000 Min. : 0.000 Min. : 0.0
## 1st Qu.:0.7450 1st Qu.: 3.015 1st Qu.: 6.157 1st Qu.: 12.0
## Median :0.9800 Median : 4.650 Median : 9.605 Median : 31.5
## Mean :0.9851 Mean : 6.178 Mean :10.597 Mean : 162.1
## 3rd Qu.:1.1800 3rd Qu.: 8.445 3rd Qu.:13.713 3rd Qu.: 146.5
## Max. :2.0000 Max. :31.900 Max. :39.460 Max. :3031.0
## NA's :50 NA's :67 NA's :29 NA's :49
## Internet Cell HIV Hunger
## Min. : 1.30 Min. : 13.70 Min. : 0.100 Min. : 2.50
## 1st Qu.:29.18 1st Qu.: 83.83 1st Qu.: 0.175 1st Qu.: 2.50
## Median :58.35 Median :110.00 Median : 0.400 Median : 6.50
## Mean :54.47 Mean :107.05 Mean : 1.941 Mean :11.25
## 3rd Qu.:78.92 3rd Qu.:127.50 3rd Qu.: 1.400 3rd Qu.:14.80
## Max. :98.90 Max. :328.80 Max. :27.400 Max. :61.80
## NA's :13 NA's :15 NA's :81 NA's :52
## Diabetes BirthRate DeathRate ElderlyPop
## Min. : 1.000 Min. : 7.00 Min. : 1.600 Min. : 1.200
## 1st Qu.: 5.350 1st Qu.:11.40 1st Qu.: 5.800 1st Qu.: 3.600
## Median : 7.200 Median :17.85 Median : 7.250 Median : 6.600
## Mean : 8.542 Mean :20.11 Mean : 7.683 Mean : 8.953
## 3rd Qu.:10.750 3rd Qu.:27.65 3rd Qu.: 9.350 3rd Qu.:14.500
## Max. :30.500 Max. :47.80 Max. :15.500 Max. :27.500
## NA's :10 NA's :15 NA's :15 NA's :24
## LifeExpectancy FemaleLabor Unemployment Energy
## Min. :52.20 Min. : 6.20 Min. : 0.100 Min. : 66
## 1st Qu.:66.90 1st Qu.:50.15 1st Qu.: 3.400 1st Qu.: 738
## Median :74.30 Median :60.60 Median : 5.600 Median : 1574
## Mean :72.46 Mean :57.95 Mean : 7.255 Mean : 2664
## 3rd Qu.:77.70 3rd Qu.:69.25 3rd Qu.: 9.400 3rd Qu.: 3060
## Max. :84.70 Max. :85.80 Max. :30.200 Max. :17923
## NA's :18 NA's :30 NA's :30 NA's :82
## Electricity Developed
## Min. : 39 Min. :1.00
## 1st Qu.: 904 1st Qu.:1.00
## Median : 2620 Median :2.00
## Mean : 4270 Mean :1.81
## 3rd Qu.: 5600 3rd Qu.:3.00
## Max. :53832 Max. :3.00
## NA's :76 NA's :75
# Preview the first six rows of the data
head(df, n = 6L)
## # A tibble: 6 × 26
## Country Code LandArea Population Density GDP Rural CO2 PumpPrice Military
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghan… AFG 653. 37.2 56.9 521 74.5 0.29 0.7 3.72
## 2 Albania ALB 27.4 2.87 105. 5254 39.7 1.98 1.36 4.08
## 3 Algeria DZA 2382. 42.2 17.7 4279 27.4 3.74 0.28 13.8
## 4 Americ… ASM 0.2 0.055 277. NA 12.8 NA NA NA
## 5 Andorra AND 0.47 0.077 164. 42030 11.9 5.83 NA NA
## 6 Angola AGO 1247. 30.8 24.7 3432 34.5 1.29 0.97 9.4
## # ℹ 16 more variables: Health <dbl>, ArmedForces <dbl>, Internet <dbl>,
## # Cell <dbl>, HIV <dbl>, Hunger <dbl>, Diabetes <dbl>, BirthRate <dbl>,
## # DeathRate <dbl>, ElderlyPop <dbl>, LifeExpectancy <dbl>, FemaleLabor <dbl>,
## # Unemployment <dbl>, Energy <dbl>, Electricity <dbl>, Developed <dbl>
# List the column names of the data
colnames(df)
## [1] "Country" "Code" "LandArea" "Population"
## [5] "Density" "GDP" "Rural" "CO2"
## [9] "PumpPrice" "Military" "Health" "ArmedForces"
## [13] "Internet" "Cell" "HIV" "Hunger"
## [17] "Diabetes" "BirthRate" "DeathRate" "ElderlyPop"
## [21] "LifeExpectancy" "FemaleLabor" "Unemployment" "Energy"
## [25] "Electricity" "Developed"
Handle missing values
# Count the missing values (NA) in each column
df |>
  is.na() |>
  colSums()
## Country Code LandArea Population Density
## 0 0 8 1 8
## GDP Rural CO2 PumpPrice Military
## 30 3 13 50 67
## Health ArmedForces Internet Cell HIV
## 29 49 13 15 81
## Hunger Diabetes BirthRate DeathRate ElderlyPop
## 52 10 15 15 24
## LifeExpectancy FemaleLabor Unemployment Energy Electricity
## 18 30 30 82 76
## Developed
## 75
# Clean & filter: keep only the four numeric modelling columns (the Country and
# Code identifiers are not carried over), then replace each column's missing
# values with that column's median.
# NOTE(review): this also imputes the response (LifeExpectancy), not just the
# predictors -- confirm that is intended rather than dropping those rows.
df_clean <- df |>
  select(LifeExpectancy, GDP, Health, Internet) |>
  mutate(across(
    where(is.numeric),
    # coalesce() keeps the observed value and falls back to the median on NA
    \(col) coalesce(col, median(col, na.rm = TRUE))
  ))
view(df_clean)
# Simple linear regression: life expectancy as the response, GDP as the single
# predictor, fitted on the median-imputed data.
simple_lm <- lm(formula = LifeExpectancy ~ GDP, data = df_clean)
# Residual summary, coefficient table, R-squared and overall F-test.
print(summary(simple_lm))
##
## Call:
## lm(formula = LifeExpectancy ~ GDP, data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.453 -3.473 1.478 4.082 11.509
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.953e+01 4.996e-01 139.19 <2e-16 ***
## GDP 2.281e-04 2.134e-05 10.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.007 on 215 degrees of freedom
## Multiple R-squared: 0.3469, Adjusted R-squared: 0.3439
## F-statistic: 114.2 on 1 and 215 DF, p-value: < 2.2e-16
The intercept (69.5) is the expected life expectancy, roughly 70 years, when GDP is zero, i.e., before the predictor contributes anything.
For each additional US dollar of GDP, the expected life expectancy rises slightly, by about 0.00023 years (the slope), which is roughly 2.3 years per $10,000 of GDP.
About 35% of the variation in life expectancy is explained by GDP, suggesting a weak but meaningful relationship.
# Multiple regression: life expectancy as the response with GDP, Health and
# Internet as predictors, fitted on the median-imputed data.
multiple_lm <- lm(
  formula = LifeExpectancy ~ GDP + Health + Internet,
  data = df_clean
)
# Residual summary, coefficient table, R-squared and overall F-test.
print(summary(multiple_lm))
##
## Call:
## lm(formula = LifeExpectancy ~ GDP + Health + Internet, data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.8409 -1.7949 0.4177 2.5649 9.1198
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.943e+01 7.417e-01 80.127 < 2e-16 ***
## GDP 2.637e-05 1.894e-05 1.392 0.165372
## Health 2.309e-01 5.888e-02 3.922 0.000119 ***
## Internet 1.905e-01 1.317e-02 14.466 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.017 on 213 degrees of freedom
## Multiple R-squared: 0.7106, Adjusted R-squared: 0.7065
## F-statistic: 174.3 on 3 and 213 DF, p-value: < 2.2e-16
# Store the multiple model's adjusted R-squared (exact-name extraction).
adj_r2_mult <- summary(multiple_lm)[["adj.r.squared"]]
Holding GDP and Internet constant, the expected life expectancy increases by about 0.23 years for each one-unit increase in Health (government spending on residents' health).
Government funding on subsidies focused on the primary health of the citizens has a statistically significant impact (p < 0.001) on the average life expectancy in a country.
About 71% of the variation in life expectancy is explained by GDP, Health, and Internet together, suggesting a strong and meaningful relationship.
Homoscedasticity is the assumption that the residuals have equal (constant) variance across the whole range of fitted values, in this case across countries; when it holds, the model's standard errors and p-values can be trusted rather than being biased.
Normality of residuals is the assumption that the model's residuals (not the raw data) are normally distributed, i.e., not skewed; it is taken to hold unless a diagnostic plot or test shows that the residuals are not normally distributed.
# Homoscedasticity (constant variance): Residuals vs Fitted panel
plot(x = simple_lm, which = 1L)
# Normality of residuals: Normal Q-Q panel
plot(x = simple_lm, which = 2L)
Homoscedasticity appears to be violated: the observations are concentrated on the left side of the Residuals vs Fitted plot instead of being spread evenly around zero.
The residuals are not perfectly normal, indicating that the relationship might not be linear.
# Core diagnostics (covers: linearity, homoscedasticity, normality, influence)
# Draw the default lm diagnostic panels in a 2 x 2 grid. par() returns the
# previous settings, so restore those afterwards instead of assuming a 1 x 1
# layout was active before.
old_par <- par(mfrow = c(2, 2))
plot(simple_lm)
par(old_par)
# Model fit: root-mean-square error of the multiple regression, i.e. the
# square root of the mean squared residual.
residuals_mult <- residuals(multiple_lm)
rmse_mult <- sqrt(mean(residuals_mult * residuals_mult))
print(rmse_mult)
## [1] 3.980289
# Hypothetical multicollinearity -> CO2
# Fit CO2 on the two energy-related predictors and print the coefficients.
# The fit is stored so it can be passed to the collinearity diagnostic below.
co2_lm <- lm(CO2 ~ Energy + Electricity, data = df)
co2_lm
##
## Call:
## lm(formula = CO2 ~ Energy + Electricity, data = df)
##
## Coefficients:
## (Intercept) Energy Electricity
## 0.7998384 0.0031220 -0.0007044
# Coefficients alone do not reveal multicollinearity; compute the variance
# inflation factor of each predictor (vif() comes from the car package).
vif(co2_lm)