# Load the country-level dataset used throughout this analysis.
# The data directory is held in a variable and joined onto the file name
# instead of calling setwd(), so the script no longer changes the session's
# working directory as a side effect; only this one path needs editing to
# run the analysis on another machine.
data_dir <- "C:/Users/SwagD/Desktop/Data 101"

data <- read.csv(file.path(data_dir, "AllCountries.csv"))

# Preview the first six rows to confirm the columns were parsed as expected.
head(data)
##          Country Code LandArea Population Density   GDP Rural  CO2 PumpPrice
## 1    Afghanistan  AFG   652.86     37.172    56.9   521  74.5 0.29      0.70
## 2        Albania  ALB    27.40      2.866   104.6  5254  39.7 1.98      1.36
## 3        Algeria  DZA  2381.74     42.228    17.7  4279  27.4 3.74      0.28
## 4 American Samoa  ASM     0.20      0.055   277.3    NA  12.8   NA        NA
## 5        Andorra  AND     0.47      0.077   163.8 42030  11.9 5.83        NA
## 6         Angola  AGO  1246.70     30.810    24.7  3432  34.5 1.29      0.97
##   Military Health ArmedForces Internet  Cell HIV Hunger Diabetes BirthRate
## 1     3.72   2.01         323     11.4  67.4  NA   30.3      9.6      32.5
## 2     4.08   9.51           9     71.8 123.7 0.1    5.5     10.1      11.7
## 3    13.81  10.73         317     47.7 111.0 0.1    4.7      6.7      22.3
## 4       NA     NA          NA       NA    NA  NA     NA       NA        NA
## 5       NA  14.02          NA     98.9 104.4  NA     NA      8.0        NA
## 6     9.40   5.43         117     14.3  44.7 1.9   23.9      3.9      41.3
##   DeathRate ElderlyPop LifeExpectancy FemaleLabor Unemployment Energy
## 1       6.6        2.6           64.0        50.3          1.5     NA
## 2       7.5       13.6           78.5        55.9         13.9    808
## 3       4.8        6.4           76.3        16.4         12.1   1328
## 4        NA         NA             NA          NA           NA     NA
## 5        NA         NA             NA          NA           NA     NA
## 6       8.4        2.5           61.8        76.4          7.3    545
##   Electricity Developed
## 1          NA        NA
## 2        2309         1
## 3        1363         1
## 4          NA        NA
## 5          NA        NA
## 6         312         1

1: Simple Linear Regression

# Simple linear regression: LifeExpectancy modelled on GDP alone.
# Rows with a missing value in either variable are dropped by lm().
model1 <- lm(formula = LifeExpectancy ~ GDP, data = data)

# Coefficient table, residual quantiles, and fit statistics for model1.
print(summary(model1))
## 
## Call:
## lm(formula = LifeExpectancy ~ GDP, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.352  -3.882   1.550   4.458   9.330 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.842e+01  5.415e-01  126.36   <2e-16 ***
## GDP         2.476e-04  2.141e-05   11.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.901 on 177 degrees of freedom
##   (38 observations deleted due to missingness)
## Multiple R-squared:  0.4304, Adjusted R-squared:  0.4272 
## F-statistic: 133.7 on 1 and 177 DF,  p-value: < 2.2e-16

The intercept (about 68.4) is the predicted life expectancy when GDP = 0.

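The slope (about 0.00025) is the predicted change in life expectancy for a 1-unit increase in GDP; equivalently, a 1,000-unit increase in GDP is associated with a predicted life expectancy that is about 0.25 higher.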

R² is the proportion of the variation in life expectancy that is explained by GDP; here it is 0.43, so GDP alone explains about 43% of that variation.

2: Multiple Linear Regression

# Multiple linear regression: LifeExpectancy modelled on GDP, Health, and
# Internet together. Rows missing any of these variables are dropped by lm().
model2 <- lm(formula = LifeExpectancy ~ GDP + Health + Internet, data = data)

# Coefficient table, residual quantiles, and fit statistics for model2.
print(summary(model2))
## 
## Call:
## lm(formula = LifeExpectancy ~ GDP + Health + Internet, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5662  -1.8227   0.4108   2.5422   9.4161 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.908e+01  8.149e-01  72.499  < 2e-16 ***
## GDP         2.367e-05  2.287e-05   1.035 0.302025    
## Health      2.479e-01  6.619e-02   3.745 0.000247 ***
## Internet    1.903e-01  1.656e-02  11.490  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.104 on 169 degrees of freedom
##   (44 observations deleted due to missingness)
## Multiple R-squared:  0.7213, Adjusted R-squared:  0.7164 
## F-statistic: 145.8 on 3 and 169 DF,  p-value: < 2.2e-16

The Health coefficient (about 0.248) is the predicted change in life expectancy for a 1-unit increase in Health spending, holding GDP and Internet constant.

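Adjusted R² penalizes each additional predictor, so it only increases when a new variable improves the model by more than chance alone would. Here it rises from 0.4272 in the simple model to 0.7164, indicating that adding Health and Internet improves the model.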

3: Checking Assumptions

# Residuals-vs-fitted plot for model1, used to check linearity and constant
# variance. fitted() and resid() are the accessor functions for the values
# stored in the model object. Explicit axis labels and a title replace the
# defaults, which would otherwise be the raw R expressions passed to plot().
plot(
  fitted(model1),
  resid(model1),
  xlab = "Fitted life expectancy",
  ylab = "Residual",
  main = "Residuals vs. fitted values (model1)"
)

# Horizontal reference line at zero: residuals should scatter evenly around it.
abline(h = 0)

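Ideal: the residuals scatter randomly around zero with no visible pattern.

Violation: a curved pattern (suggesting non-linearity) or a funnel shape (suggesting non-constant variance).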

Normality of Residuals

# Normal Q-Q plot of the model1 residuals: sample quantiles are plotted
# against theoretical normal quantiles.
qqnorm(resid(model1))

# Reference line that the points should follow if the residuals are
# approximately normal.
qqline(resid(model1))

Ideal: the points follow the reference line closely.

Violation: the points curve or scatter away from the line, especially in the tails.

4: RMSE Calculation

# In-sample residuals of model2 (observed minus fitted life expectancy).
# Note: this variable shares its name with the stats::residuals() function.
residuals <- resid(model2)

# Root-mean-square error: square each residual, average, take the square root.
# mean() divides by the number of residuals, not the residual degrees of
# freedom, so this is slightly smaller than the residual standard error
# reported by summary(model2).
rmse <- sqrt(mean(residuals^2))
print(rmse)
## [1] 4.056417

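RMSE measures the typical size of the model's prediction error, expressed in the same units as life expectancy; here the predictions are off by about 4.06 on average in the root-mean-square sense.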

Lower RMSE means a better model fit

5: Residual Interpretation

Large residuals mean the model predicts poorly for certain countries. This could be due to outliers, missing variables, or unique country conditions.

6: Multicollinearity Explanation

Multicollinearity happens when two predictors are highly correlated, such as Energy and Electricity.

It can make coefficient estimates unstable, make it difficult to interpret the individual effect of each predictor, and reduce the reliability of the model.

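Even if the model predicts well overall, individual variables may appear insignificant because they overlap with other predictors. For example, GDP is highly significant in the simple model but not in the multiple regression (p = 0.30), which may reflect its overlap with Health and Internet.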