library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

# Load the country-level indicators from an explicit path. This avoids
# setwd(), which changes the working directory for everything that runs
# afterwards and makes the script depend on hidden session state.
data_path <- file.path("~", "Desktop", "Data101 csv", "AllCountries.csv")
df <- read_csv(data_path)
## Rows: 217 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Country, Code
## dbl (24): LandArea, Population, Density, GDP, Rural, CO2, PumpPrice, Militar...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to confirm the file loaded as expected.
df |> head(n = 6L)
## # A tibble: 6 × 26
##   Country Code  LandArea Population Density   GDP Rural   CO2 PumpPrice Military
##   <chr>   <chr>    <dbl>      <dbl>   <dbl> <dbl> <dbl> <dbl>     <dbl>    <dbl>
## 1 Afghan… AFG     653.       37.2      56.9   521  74.5  0.29      0.7      3.72
## 2 Albania ALB      27.4       2.87    105.   5254  39.7  1.98      1.36     4.08
## 3 Algeria DZA    2382.       42.2      17.7  4279  27.4  3.74      0.28    13.8 
## 4 Americ… ASM       0.2       0.055   277.     NA  12.8 NA        NA       NA   
## 5 Andorra AND       0.47      0.077   164.  42030  11.9  5.83     NA       NA   
## 6 Angola  AGO    1247.       30.8      24.7  3432  34.5  1.29      0.97     9.4 
## # ℹ 16 more variables: Health <dbl>, ArmedForces <dbl>, Internet <dbl>,
## #   Cell <dbl>, HIV <dbl>, Hunger <dbl>, Diabetes <dbl>, BirthRate <dbl>,
## #   DeathRate <dbl>, ElderlyPop <dbl>, LifeExpectancy <dbl>, FemaleLabor <dbl>,
## #   Unemployment <dbl>, Energy <dbl>, Electricity <dbl>, Developed <dbl>

Handle missing values

# Count the missing values (NA) in every column.
df |>
  is.na() |>
  colSums()
##        Country           Code       LandArea     Population        Density 
##              0              0              8              1              8 
##            GDP          Rural            CO2      PumpPrice       Military 
##             30              3             13             50             67 
##         Health    ArmedForces       Internet           Cell            HIV 
##             29             49             13             15             81 
##         Hunger       Diabetes      BirthRate      DeathRate     ElderlyPop 
##             52             10             15             15             24 
## LifeExpectancy    FemaleLabor   Unemployment         Energy    Electricity 
##             18             30             30             82             76 
##      Developed 
##             75

Impute NA’s

# Median of each variable used in the models, ignoring missing entries.
median_life <- df |> pull(LifeExpectancy) |> median(na.rm = TRUE)
median_gdp <- df |> pull(GDP) |> median(na.rm = TRUE)
median_health <- df |> pull(Health) |> median(na.rm = TRUE)
median_internet <- df |> pull(Internet) |> median(na.rm = TRUE)

# Add one "_imputed" copy per variable in which every NA is replaced by that
# variable's median. The original columns are left untouched.
df <- df |>
  mutate(
    LifeExpectancy_imputed = coalesce(LifeExpectancy, median_life),
    GDP_imputed = coalesce(GDP, median_gdp),
    Health_imputed = coalesce(Health, median_health),
    Internet_imputed = coalesce(Internet, median_internet)
  )

# Re-count NAs: the four new columns should report zero.
df |>
  is.na() |>
  colSums()
##                Country                   Code               LandArea 
##                      0                      0                      8 
##             Population                Density                    GDP 
##                      1                      8                     30 
##                  Rural                    CO2              PumpPrice 
##                      3                     13                     50 
##               Military                 Health            ArmedForces 
##                     67                     29                     49 
##               Internet                   Cell                    HIV 
##                     13                     15                     81 
##                 Hunger               Diabetes              BirthRate 
##                     52                     10                     15 
##              DeathRate             ElderlyPop         LifeExpectancy 
##                     15                     24                     18 
##            FemaleLabor           Unemployment                 Energy 
##                     30                     30                     82 
##            Electricity              Developed LifeExpectancy_imputed 
##                     76                     75                      0 
##            GDP_imputed         Health_imputed       Internet_imputed 
##                      0                      0                      0

Simple Linear Regression (Fitting and Interpretation): Using the AllCountries dataset, fit a simple linear regression model to predict LifeExpectancy (average life expectancy in years) based on GDP (gross domestic product per capita in $US). Report the intercept and slope coefficients and interpret their meaning in the context of the dataset. What does the R² value tell you about how well GDP explains variation in life expectancy across countries?

# Simple linear regression of median-imputed life expectancy on
# median-imputed GDP per capita.
simple_model <- lm(
  formula = LifeExpectancy_imputed ~ GDP_imputed,
  data = df
)

# Coefficient table, residual standard error, R-squared and F-test.
simple_model |> summary()
## 
## Call:
## lm(formula = LifeExpectancy_imputed ~ GDP_imputed, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.453  -3.473   1.478   4.082  11.509 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.953e+01  4.996e-01  139.19   <2e-16 ***
## GDP_imputed 2.281e-04  2.134e-05   10.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.007 on 215 degrees of freedom
## Multiple R-squared:  0.3469, Adjusted R-squared:  0.3439 
## F-statistic: 114.2 on 1 and 215 DF,  p-value: < 2.2e-16

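Interpretation

Intercept (69.53): the predicted life expectancy for a country with a GDP per capita of $0 is about 69.5 years. No country has a GDP of zero, so this value mainly anchors the line rather than describing a real country.

Slope (0.0002281): each additional $1 of GDP per capita is associated with about 0.00023 more years of life expectancy, i.e. roughly 2.3 more years per additional $10,000. The slope is statistically significant (p < 2e-16).

R²: 0.3469. GDP alone explains about 35% of the variation in life expectancy across countries, so most of the variation is left unexplained by this model.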

Multiple Linear Regression (Fitting and Interpretation) Fit a multiple linear regression model to predict LifeExpectancy using GDP, Health (percentage of government expenditures on healthcare), and Internet (percentage of population with internet access) as predictors. Interpret the coefficient for Health, explaining what it means in terms of life expectancy while controlling for GDP and Internet. How does the adjusted R² compare to the simple regression model from Question 1, and what does this suggest about the additional predictors?

# Multiple linear regression of median-imputed life expectancy on the
# median-imputed GDP, Health and Internet variables.
multiple_model <- lm(
  formula = LifeExpectancy_imputed ~
    GDP_imputed + Health_imputed + Internet_imputed,
  data = df
)

# Coefficient table, residual standard error, R-squared and F-test.
multiple_model |> summary()
## 
## Call:
## lm(formula = LifeExpectancy_imputed ~ GDP_imputed + Health_imputed + 
##     Internet_imputed, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.8409  -1.7949   0.4177   2.5649   9.1198 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.943e+01  7.417e-01  80.127  < 2e-16 ***
## GDP_imputed      2.637e-05  1.894e-05   1.392 0.165372    
## Health_imputed   2.309e-01  5.888e-02   3.922 0.000119 ***
## Internet_imputed 1.905e-01  1.317e-02  14.466  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.017 on 213 degrees of freedom
## Multiple R-squared:  0.7106, Adjusted R-squared:  0.7065 
## F-statistic: 174.3 on 3 and 213 DF,  p-value: < 2.2e-16

Interpretation

Intercept: Predicted life expectancy when all predictors are 0 is 59.43.

Coefficients (slope):

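GDP (0.00002637) shows a positive relationship with life expectancy: holding Health and Internet constant, each additional $1 of GDP per capita is associated with about 0.000026 more years of life expectancy. Health (0.2309) shows a positive relationship with life expectancy: holding GDP and Internet constant, each additional percentage point of government expenditure spent on healthcare is associated with about 0.23 more years of life expectancy. Internet (0.1905) shows a positive relationship with life expectancy: holding GDP and Health constant, each additional percentage point of the population with internet access is associated with about 0.19 more years of life expectancy. These are associations across countries; on their own they do not show that raising a predictor would cause life expectancy to rise.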

P-values: Health (p-value = 0.000119) and Internet (p-value < 2e-16) are significant (<0.05). GDP (p-value = 0.165372) is not a significant predictor of life expectancy once Health and Internet are in the model.

Adjusted R²: about 0.71. This means about 71% of the variance in LifeExpectancy is explained by this model, after adjusting for the number of predictors. This is better than the roughly 34% (adjusted R² = 0.3439) from the simple model. The additional predictors have improved the model.

Checking Assumptions (Homoscedasticity and Normality) For the simple linear regression model from Question 1 (LifeExpectancy ~ GDP), describe how you would check the assumptions of homoscedasticity and normality of residuals. For each assumption, explain what an ideal outcome would look like and what a violation might indicate about the model’s reliability for predicting life expectancy. Afterwards, code your answer and reflect on whether it matched the ideal outcome.

Visual linearity check

# Scatter plot of imputed GDP against imputed life expectancy, with the
# fitted simple-regression line drawn on top.
with(
  df,
  plot(
    GDP_imputed, LifeExpectancy_imputed,
    xlab = "GDP",
    ylab = "Life Expectancy",
    main = "GDP vs Life Expectancy"
  )
)
abline(simple_model, col = 1, lwd = 2)

Core Diagnostics

# Draw the lm diagnostic plots in a 2 x 2 grid. par() returns the settings it
# replaced, so restore exactly those afterwards instead of assuming the
# layout was 1 x 1 before this chunk ran.
old_par <- par(mfrow = c(2, 2))
plot(simple_model)
par(old_par)

Residuals vs Fitted: Residuals are most tightly clustered around a fitted value of 70, with curvature at lower fitted values. The spread of the residuals starts compact and then thins out as the fitted values increase. Overall, the residuals become narrower towards higher fitted values; this slight pattern suggests mild heteroscedasticity (unequal variance). Thus, this is not the best model.

Countries with life expectancy of 70. The distance begins to spread around 75 to 80.

Scale–Location: Cone shaped spread between residuals and fitted values. There is some heteroscedasticity.

Q–Q plot: Both tails deviate, suggesting outliers. This means that it is not completely normally distributed.

Residuals vs Leverage: A few low leverage points; none look extreme but worth a glance (check Cook’s distances).

Residuals are crammed in one spot. Leverage: There are some countries affecting the residual.

High GDP and low GDP can be influential.

Diagnosing Model Fit (RMSE and Residuals) For the multiple regression model from Question 2 (LifeExpectancy ~ GDP + Health + Internet), calculate the RMSE and explain what it represents in the context of predicting life expectancy. How would large residuals for certain countries (e.g., those with unusually high or low life expectancy) affect your confidence in the model’s predictions, and what might you investigate further?

Diagnose Model Fit with Metrics

# RMSE of the simple model (GDP only): the typical size of a prediction
# error, in years of life expectancy. It comes out at about 5.98.
residuals_simple <- residuals(simple_model)
rmse_simple <- (residuals_simple^2) |>
  mean() |>
  sqrt()
rmse_simple
## [1] 5.979395
# RMSE of the multiple model (GDP + Health + Internet). It comes out at about
# 3.98, lower than the simple model's, so predictions sit closer to the
# actual life expectancy.
residuals_multiple <- residuals(multiple_model)
rmse_multiple <- (residuals_multiple^2) |>
  mean() |>
  sqrt()
rmse_multiple
## [1] 3.980289

Large residuals for certain countries indicate that the model predicts life expectancy poorly for those countries, which lessens confidence in the model’s predictions. Investigating the predictors for those countries individually may reveal missing (imputed) values or other factors that affect the RMSE.

Hypothetical Example (Multicollinearity in Multiple Regression) Suppose you are analyzing the AllCountries dataset and fit a multiple linear regression model to predict CO2 emissions (metric tons per capita) using Energy (kilotons of oil equivalent) and Electricity (kWh per capita) as predictors. You notice that Energy and Electricity are highly correlated. Explain how this multicollinearity might affect the interpretation of the regression coefficients and the reliability of the model.

# Correlation matrix of the two predictors, using only the rows in which
# both values are observed.
df |>
  select(Energy, Electricity) |>
  cor(use = "complete.obs")
##                Energy Electricity
## Energy      1.0000000   0.7970054
## Electricity 0.7970054   1.0000000

Correlations among predictors: Energy–Electricity = 0.80. This indicates a strong correlation between them.

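High multicollinearity makes it hard to tell which predictor is useful, because the model cannot cleanly separate the effect of Energy from the effect of Electricity. The standard errors of their coefficients are inflated, so the estimates are unstable and can change in size, or even sign, with small changes in the data. Individual p-values can look non-significant even though the model as a whole fits well.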