# load necessary libraries

library (tidyverse, dyplr)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (car) # for the crPlots function
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library (ggplot2)

1. Upload the data set AllCountries.csv and check its contents and structure as well as check NAs

data <- read_csv("AllCountries.csv")
## Rows: 217 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Country, Code
## dbl (24): LandArea, Population, Density, GDP, Rural, CO2, PumpPrice, Militar...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head (data)
## # A tibble: 6 × 26
##   Country Code  LandArea Population Density   GDP Rural   CO2 PumpPrice Military
##   <chr>   <chr>    <dbl>      <dbl>   <dbl> <dbl> <dbl> <dbl>     <dbl>    <dbl>
## 1 Afghan… AFG     653.       37.2      56.9   521  74.5  0.29      0.7      3.72
## 2 Albania ALB      27.4       2.87    105.   5254  39.7  1.98      1.36     4.08
## 3 Algeria DZA    2382.       42.2      17.7  4279  27.4  3.74      0.28    13.8 
## 4 Americ… ASM       0.2       0.055   277.     NA  12.8 NA        NA       NA   
## 5 Andorra AND       0.47      0.077   164.  42030  11.9  5.83     NA       NA   
## 6 Angola  AGO    1247.       30.8      24.7  3432  34.5  1.29      0.97     9.4 
## # ℹ 16 more variables: Health <dbl>, ArmedForces <dbl>, Internet <dbl>,
## #   Cell <dbl>, HIV <dbl>, Hunger <dbl>, Diabetes <dbl>, BirthRate <dbl>,
## #   DeathRate <dbl>, ElderlyPop <dbl>, LifeExpectancy <dbl>, FemaleLabor <dbl>,
## #   Unemployment <dbl>, Energy <dbl>, Electricity <dbl>, Developed <dbl>
str (data)
## spc_tbl_ [217 × 26] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Country       : chr [1:217] "Afghanistan" "Albania" "Algeria" "American Samoa" ...
##  $ Code          : chr [1:217] "AFG" "ALB" "DZA" "ASM" ...
##  $ LandArea      : num [1:217] 652.86 27.4 2381.74 0.2 0.47 ...
##  $ Population    : num [1:217] 37.172 2.866 42.228 0.055 0.077 ...
##  $ Density       : num [1:217] 56.9 104.6 17.7 277.3 163.8 ...
##  $ GDP           : num [1:217] 521 5254 4279 NA 42030 ...
##  $ Rural         : num [1:217] 74.5 39.7 27.4 12.8 11.9 34.5 75.4 8.1 36.9 56.6 ...
##  $ CO2           : num [1:217] 0.29 1.98 3.74 NA 5.83 1.29 5.74 4.78 1.9 8.41 ...
##  $ PumpPrice     : num [1:217] 0.7 1.36 0.28 NA NA 0.97 NA 1.1 0.77 NA ...
##  $ Military      : num [1:217] 3.72 4.08 13.81 NA NA ...
##  $ Health        : num [1:217] 2.01 9.51 10.73 NA 14.02 ...
##  $ ArmedForces   : num [1:217] 323 9 317 NA NA 117 0 105 49 NA ...
##  $ Internet      : num [1:217] 11.4 71.8 47.7 NA 98.9 14.3 76 75.8 69.7 97.2 ...
##  $ Cell          : num [1:217] 67.4 123.7 111 NA 104.4 ...
##  $ HIV           : num [1:217] NA 0.1 0.1 NA NA 1.9 NA 0.4 0.2 NA ...
##  $ Hunger        : num [1:217] 30.3 5.5 4.7 NA NA 23.9 NA 3.8 4.3 NA ...
##  $ Diabetes      : num [1:217] 9.6 10.1 6.7 NA 8 3.9 13.2 5.5 7.1 11.6 ...
##  $ BirthRate     : num [1:217] 32.5 11.7 22.3 NA NA 41.3 16.1 17 13.1 11 ...
##  $ DeathRate     : num [1:217] 6.6 7.5 4.8 NA NA 8.4 5.8 7.6 9.7 8.9 ...
##  $ ElderlyPop    : num [1:217] 2.6 13.6 6.4 NA NA 2.5 7.2 11.3 11.4 13.6 ...
##  $ LifeExpectancy: num [1:217] 64 78.5 76.3 NA NA 61.8 76.5 76.7 74.8 76 ...
##  $ FemaleLabor   : num [1:217] 50.3 55.9 16.4 NA NA 76.4 NA 57.1 55.8 NA ...
##  $ Unemployment  : num [1:217] 1.5 13.9 12.1 NA NA 7.3 NA 9.5 17.7 NA ...
##  $ Energy        : num [1:217] NA 808 1328 NA NA ...
##  $ Electricity   : num [1:217] NA 2309 1363 NA NA ...
##  $ Developed     : num [1:217] NA 1 1 NA NA 1 NA 2 1 NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Country = col_character(),
##   ..   Code = col_character(),
##   ..   LandArea = col_double(),
##   ..   Population = col_double(),
##   ..   Density = col_double(),
##   ..   GDP = col_double(),
##   ..   Rural = col_double(),
##   ..   CO2 = col_double(),
##   ..   PumpPrice = col_double(),
##   ..   Military = col_double(),
##   ..   Health = col_double(),
##   ..   ArmedForces = col_double(),
##   ..   Internet = col_double(),
##   ..   Cell = col_double(),
##   ..   HIV = col_double(),
##   ..   Hunger = col_double(),
##   ..   Diabetes = col_double(),
##   ..   BirthRate = col_double(),
##   ..   DeathRate = col_double(),
##   ..   ElderlyPop = col_double(),
##   ..   LifeExpectancy = col_double(),
##   ..   FemaleLabor = col_double(),
##   ..   Unemployment = col_double(),
##   ..   Energy = col_double(),
##   ..   Electricity = col_double(),
##   ..   Developed = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
colSums(is.na(data))
##        Country           Code       LandArea     Population        Density 
##              0              0              8              1              8 
##            GDP          Rural            CO2      PumpPrice       Military 
##             30              3             13             50             67 
##         Health    ArmedForces       Internet           Cell            HIV 
##             29             49             13             15             81 
##         Hunger       Diabetes      BirthRate      DeathRate     ElderlyPop 
##             52             10             15             15             24 
## LifeExpectancy    FemaleLabor   Unemployment         Energy    Electricity 
##             18             30             30             82             76 
##      Developed 
##             75

Since there are NAs in LifeExpectancy, Health and Internet, and DeathRate#, CO2#, Energy#, Electricity#, impute LE, Health and Internet with their mean values; impute GDP, CO2, Energy and Electricity with their median, since they are strongly right skewed. Note(#) these variables are used later in the homework.

data_2 <- data |> mutate(
    GDP = ifelse(is.na(GDP), median(GDP, na.rm = TRUE), GDP),
    LifeExpectancy = ifelse(is.na(LifeExpectancy), mean(LifeExpectancy, na.rm = TRUE), LifeExpectancy),
    Health = ifelse(is.na(Health), mean(Health, na.rm = TRUE), Health),
    Internet = ifelse(is.na(Internet), mean(Internet, na.rm = TRUE), Internet),
    DeathRate = ifelse(is.na(DeathRate), mean(DeathRate, na.rm = TRUE), DeathRate),
    CO2 = ifelse(is.na(CO2), median(CO2 , na.rm = TRUE), CO2),
    Energy = ifelse(is.na(Energy), median(Energy, na.rm = TRUE), Energy),
    Electricity = ifelse(is.na(Electricity), median(Electricity, na.rm = TRUE), Electricity)
  )
colSums(is.na(data_2))  # check it out
##        Country           Code       LandArea     Population        Density 
##              0              0              8              1              8 
##            GDP          Rural            CO2      PumpPrice       Military 
##              0              3              0             50             67 
##         Health    ArmedForces       Internet           Cell            HIV 
##              0             49              0             15             81 
##         Hunger       Diabetes      BirthRate      DeathRate     ElderlyPop 
##             52             10             15              0             24 
## LifeExpectancy    FemaleLabor   Unemployment         Energy    Electricity 
##              0             30             30              0              0 
##      Developed 
##             75
summary(data_2)
##    Country              Code              LandArea          Population       
##  Length:217         Length:217         Min.   :    0.01   Min.   :   0.0120  
##  Class :character   Class :character   1st Qu.:   10.83   1st Qu.:   0.7728  
##  Mode  :character   Mode  :character   Median :   94.28   Median :   6.5725  
##                                        Mean   :  608.38   Mean   :  35.0335  
##                                        3rd Qu.:  446.30   3rd Qu.:  25.0113  
##                                        Max.   :16376.87   Max.   :1392.7300  
##                                        NA's   :8          NA's   :1          
##     Density             GDP             Rural            CO2        
##  Min.   :    0.1   Min.   :   275   Min.   : 0.00   Min.   : 0.040  
##  1st Qu.:   37.5   1st Qu.:  2549   1st Qu.:19.62   1st Qu.: 0.890  
##  Median :   92.1   Median :  5950   Median :38.15   Median : 2.755  
##  Mean   :  361.4   Mean   : 13519   Mean   :39.10   Mean   : 4.845  
##  3rd Qu.:  219.8   3rd Qu.: 15575   3rd Qu.:57.83   3rd Qu.: 6.160  
##  Max.   :20777.5   Max.   :114340   Max.   :87.00   Max.   :43.860  
##  NA's   :8                          NA's   :3                       
##    PumpPrice         Military          Health       ArmedForces    
##  Min.   :0.1100   Min.   : 0.000   Min.   : 0.00   Min.   :   0.0  
##  1st Qu.:0.7450   1st Qu.: 3.015   1st Qu.: 7.10   1st Qu.:  12.0  
##  Median :0.9800   Median : 4.650   Median :10.60   Median :  31.5  
##  Mean   :0.9851   Mean   : 6.178   Mean   :10.60   Mean   : 162.1  
##  3rd Qu.:1.1800   3rd Qu.: 8.445   3rd Qu.:13.37   3rd Qu.: 146.5  
##  Max.   :2.0000   Max.   :31.900   Max.   :39.46   Max.   :3031.0  
##  NA's   :50       NA's   :67                       NA's   :49      
##     Internet          Cell             HIV             Hunger     
##  Min.   : 1.30   Min.   : 13.70   Min.   : 0.100   Min.   : 2.50  
##  1st Qu.:30.30   1st Qu.: 83.83   1st Qu.: 0.175   1st Qu.: 2.50  
##  Median :55.70   Median :110.00   Median : 0.400   Median : 6.50  
##  Mean   :54.47   Mean   :107.05   Mean   : 1.941   Mean   :11.25  
##  3rd Qu.:77.60   3rd Qu.:127.50   3rd Qu.: 1.400   3rd Qu.:14.80  
##  Max.   :98.90   Max.   :328.80   Max.   :27.400   Max.   :61.80  
##                  NA's   :15       NA's   :81       NA's   :52     
##     Diabetes        BirthRate       DeathRate        ElderlyPop    
##  Min.   : 1.000   Min.   : 7.00   Min.   : 1.600   Min.   : 1.200  
##  1st Qu.: 5.350   1st Qu.:11.40   1st Qu.: 5.900   1st Qu.: 3.600  
##  Median : 7.200   Median :17.85   Median : 7.500   Median : 6.600  
##  Mean   : 8.542   Mean   :20.11   Mean   : 7.683   Mean   : 8.953  
##  3rd Qu.:10.750   3rd Qu.:27.65   3rd Qu.: 9.000   3rd Qu.:14.500  
##  Max.   :30.500   Max.   :47.80   Max.   :15.500   Max.   :27.500  
##  NA's   :10       NA's   :15                       NA's   :24      
##  LifeExpectancy   FemaleLabor     Unemployment        Energy     
##  Min.   :52.20   Min.   : 6.20   Min.   : 0.100   Min.   :   66  
##  1st Qu.:67.60   1st Qu.:50.15   1st Qu.: 3.400   1st Qu.: 1180  
##  Median :73.30   Median :60.60   Median : 5.600   Median : 1574  
##  Mean   :72.46   Mean   :57.95   Mean   : 7.255   Mean   : 2252  
##  3rd Qu.:77.40   3rd Qu.:69.25   3rd Qu.: 9.400   3rd Qu.: 2177  
##  Max.   :84.70   Max.   :85.80   Max.   :30.200   Max.   :17923  
##                  NA's   :30      NA's   :30                      
##   Electricity      Developed   
##  Min.   :   39   Min.   :1.00  
##  1st Qu.: 1683   1st Qu.:1.00  
##  Median : 2620   Median :2.00  
##  Mean   : 3692   Mean   :1.81  
##  3rd Qu.: 3714   3rd Qu.:3.00  
##  Max.   :53832   Max.   :3.00  
##                  NA's   :75

2. Simple Linear Regression (Fitting and Interpretation)

Using the AllCountries dataset, fit a simple linear regression model to predict LifeExpectancy (average life expectancy in years) based on GDP (gross domestic product per capita in $US). Report the intercept and slope coefficients and interpret their meaning in the context of the dataset. What does the R² value tell you about how well GDP explains variation in life expectancy across countries?

# Fit simple linear regression:

LE_GDP_model <- lm(LifeExpectancy ~ GDP, data = data_2)     # all 217 observations are used since the NAs in GDP and LE were imputed.

# View the model summary
summary(LE_GDP_model)
## 
## Call:
## lm(formula = LifeExpectancy ~ GDP, data = data_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.284  -3.305   1.554   4.242  11.671 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.936e+01  4.963e-01  139.77   <2e-16 ***
## GDP         2.293e-04  2.120e-05   10.82   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.968 on 215 degrees of freedom
## Multiple R-squared:  0.3524, Adjusted R-squared:  0.3494 
## F-statistic:   117 on 1 and 215 DF,  p-value: < 2.2e-16

Interpretation

Both coefficients are statistically significant with p<<.05.

The intercept of about 69.4 years (at GDP 0) is only mathematically meaningful. The slope of about .00023 indicates that LifeExpectancy increases about 0.00023 years above 69.4 years for every dollar increment of GDP. E.g., for a median GDP of $5950, LE is 69.2+.00023*5950= 70.57 years. For a max GDP of $114340, LE is 95.5 years. People in the richest countries live an average of 25 years longer. The residual standard error of about 6 years is rather high meaning that the linear prediction is not accurate. The multiple R-squared of 0.3524 indicates that the linear model predicts about 35% of the variance of LE based on GDP per capita, which is a rather poor fit.

3. Multiple Linear Regression (Fitting and Interpretation)

Fit a multiple linear regression model to predict LifeExpectancy using GDP, Health (percentage of government expenditures on healthcare), and Internet (percentage of population with internet access) as predictors. Interpret the coefficient for Health, explaining what it means in terms of life expectancy while controlling for GDP and Internet. How does the adjusted R² compare to the simple regression model from Question 1, and what does this suggest about the additional predictors?

# Fit multiple linear regression:

LE_GDP_Health_Internet_model <- lm(LifeExpectancy ~ GDP + Health + Internet, data = data_2)    # all 217 observations are used

# View the model summary
summary(LE_GDP_Health_Internet_model)
## 
## Call:
## lm(formula = LifeExpectancy ~ GDP + Health + Internet, data = data_2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.6466  -1.9335   0.2351   2.4153   9.1803 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.943e+01  7.503e-01  79.207  < 2e-16 ***
## GDP         3.107e-05  1.916e-05   1.622    0.106    
## Health      2.418e-01  5.984e-02   4.041 7.43e-05 ***
## Internet    1.846e-01  1.341e-02  13.766  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.063 on 213 degrees of freedom
## Multiple R-squared:  0.7026, Adjusted R-squared:  0.6984 
## F-statistic: 167.7 on 3 and 213 DF,  p-value: < 2.2e-16

Interpretation

The intercept, Health and Internet are statistically significant predictors with p<<0.05. However, GDP, with p about 0.1 is not a statistically significant predictor. The intercept of 59.4 years (at GDP=0,Health=0,Internet=0) is only mathematically meaningful.

GDP coefficient indicates that $1 increment in GDP increases LE by 0.00003 years. Health coefficient indicates that each 1% increment in health expenditure increases LE by .24 years. Internet coefficient indicates that each 1% increment in population using internet increases LE by .18 years.

The residual standard error of 4 years is smaller than for the simple linear regression, thus the multiple linear model is better. The 213 degrees of freedom were calculated as: n-p-1 (217-3-1).

The Adjusted R-squared indicates that about 69.8% of the variance in LE is explained by this model. This is not a very good fit, but it is much higher than the 34% obtained with the simple model. Adding the 2 variables (Health and Internet) improved the predictability of the model.

4. Checking Assumptions (Homoscedasticity and Normality)

For the simple linear regression model from Question 1 (LifeExpectancy ~ GDP), describe how you would check the assumptions of homoscedasticity and normality of residuals. For each assumption, explain what an ideal outcome would look like and what a violation might indicate about the model’s reliability for predicting life expectancy. Afterwords, code your answer and reflect if it matched the ideal outcome.

Answer

To examine the assumptions of normality and homoscedasticity of residuals, I would first look at the output of the lm() residual statistics, then I would do a visual examination of the residuals using scatter plot and also the 4 diagnostic plots from the R package: residuals vs. fitted, Q-Q, residuals vs. leverage, and scale location. For a normal distribution of residuals the statistics should show a uniform distribution around the mean. The visual examination of residuals should show a homogeneous spread the entire range of indeces (order). Likewise, homoscedasticity should be obviated by uniform scatter around zero and no lumping at the ends or extreme deviation from a horizontal. If the residual metrics, particularly R-squared, is not above 70%, I would not trust the ability of the model to predict the dependent variable with high degree of confidence, and I would adjust the model. Here’s the process:

Linear examination with plots of scatter and of residuals

plot(data_2$GDP, data_2$LifeExpectancy,
     xlab="GDP ($)", ylab="Life Expectancy (y)", main="Life Expectancy vs GDP per capita")
abline(LE_GDP_model, col=1, lwd=2)

plot(resid(LE_GDP_model), type="b", main="Residuals vs Order", ylab="Residuals")
abline(h=0, lty=2)

Visually it can be seen that GDP per capita is a terrible single-variable predictor of LifeExpectancy. The predicted line shows large errors and nonuniform scatter for the observations at GDP<$30K, and large residuals (from -15 to 10 years) at various locations of the GDP range.

# set up the 2x2 grid of diagnostic plots; fill them with the 4 diagnostics plots; reset the plot to a single 1x1 plot.

par(mfrow=c(2,2)); plot(LE_GDP_model); par(mfrow=c(1,1))

Residuals vs Fitted: Both scatter plot (LE vs GDP) and residuals vs fitted plot show very strong deviation from linearity at the lowest range of GDP (up to $15,000) and lower range of life expectancy (up to 72 ). The residuals vs fitted values follow a straight line of negative slope, meaning that observed life expectancy is smaller than predicted values at the higher range of GDP. This shows unequal variance — larger GDP countries might have bigger prediction errors than smaller GDP ones. The residuals are not evenly scattered around zero, indicating that the homoscedasticity assumption is not met. Thus the simpler model is not adequate, as also shown in part 2 of this homework.

Scale–Location: Spread is largest at the low end of LifeExpectancy. Showing heteroscedasticity (variance grows for low GDP low LifeExpectancy predictions.)

Q–Q plot: The quantile residual deviations are largest for the left tail and mild for the right tail. The mean and first quantiles display normality. But overall, the residuals for the second and higher quantilies do not show normality.

Residuals vs Leverage: There is high leverage point with strong negative residual.

The Cook’s distance is significant but does not influence the model substantially.

5. Diagnosing Model Fit (RMSE and Residuals)

For the multiple regression model from Question 2 (LifeExpectancy ~ GDP + Health + Internet), calculate the RMSE and explain what it represents in the context of predicting life expectancy. How would large residuals for certain countries (e.g., those with unusually high or low life expectancy) affect your confidence in the model’s predictions, and what might you investigate further?

# Calculating the residuals for the 3 variables model:

resid_LE_GDP_Health_Internet <- resid(LE_GDP_Health_Internet_model)

# Calculate RMSE

rmse_LE_GDP_Health_Internet <- sqrt(mean(resid_LE_GDP_Health_Internet^2))
rmse_LE_GDP_Health_Internet
## [1] 4.025728
# For comparison, calculate residuals for 1 variable model

resid_LE_GDP <- resid(LE_GDP_model)

# Calculate RMSE 

rmse_LE_GDP <- sqrt(mean(resid_LE_GDP^2))
rmse_LE_GDP
## [1] 5.940333

The RMSE of about 4 shows that the 3-variable model misses predictions from the model by about 4 years of life expectancy, in the average. In comparison, the 1-variable model was worse, missing predictions by about 6 years in average.

The results from section 3 above for the 3-variable model giving an Adjusted R-squared of 0.6984 (about 70% prediction of the variance of LE) reaffirms that the 3-variable model is acceptable, although it should be improved.

I would not have confidence in this model for predicting LE in countries with very low or very high GDP per capita or with very high Health investment.

To improve the model one option is to include the more significant variables, removing the least significant ones. For example, one could remove GDP since it is not a good predictor, and include DeathRate, which by definition should be a good predictor of LifeExpectancy. I did that below.

# Fit multiple linear regression to a 3-variable model including DeathRate

LE_Health_Internet_DeathRate_model <- lm(LifeExpectancy ~ Health + Internet + DeathRate, data = data_2)    # all 217 observations are used

# View the model summary
summary(LE_Health_Internet_DeathRate_model)
## 
## Call:
## lm(formula = LifeExpectancy ~ Health + Internet + DeathRate, 
##     data = data_2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.3289  -2.1487   0.3243   2.3365  10.2902 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 63.27328    1.01844  62.127  < 2e-16 ***
## Health       0.28469    0.05489   5.186 4.98e-07 ***
## Internet     0.19225    0.01069  17.976  < 2e-16 ***
## DeathRate   -0.55916    0.10137  -5.516 1.00e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.824 on 213 degrees of freedom
## Multiple R-squared:  0.7365, Adjusted R-squared:  0.7328 
## F-statistic: 198.5 on 3 and 213 DF,  p-value: < 2.2e-16
# do component and residual plots for the 3-variable model:

crPlots(LE_Health_Internet_DeathRate_model)

This combination of variables (Health, Internet and DeathRate) shows excellent p<<.05 for all 3 variables, and an Adjusted R-Squared of 73%, an improvement over the previous models.

6. Hypothetical Example (Multicollinearity in Multiple Regression)

Suppose you are analyzing the AllCountries dataset and fit a multiple linear regression model to predict CO2 emissions (metric tons per capita) using Energy (kilotons of oil equivalent) and Electricity (kWh per capita) as predictors. You notice that Energy and Electricity are highly correlated. Explain how this multicollinearity might affect the interpretation of the regression coefficients and the reliability of the model.

Answer

If there is a strong correlation among the variables (Energy, Electricity) the standard error coefficients for CO2 emissions vs Energy/Electricity may be inflated. The p-values may seem not significant although an R-squared value may show good fit. One would need to test a model with only Energy or Electricity as variable, and adopt it if the R-squared value improves.

Let’s try it:

# check correlations among variables

cor(data_2[, c("CO2", "Energy", "Electricity")], use = "complete.obs")
##                   CO2    Energy Electricity
## CO2         1.0000000 0.8258091   0.4784142
## Energy      0.8258091 1.0000000   0.8037273
## Electricity 0.4784142 0.8037273   1.0000000

Electricy and Energy are strongly correlated (0.80).

Do a multilinear regression for CO2 (Energy, Electricity)

# Fit multiple linear regression

CO2_Energy_Electricity_model <- lm(CO2 ~ Energy + Electricity, data = data_2)    # all 217 observations are used

# View the model summary
summary(CO2_Energy_Electricity_model)
## 
## Call:
## lm(formula = CO2 ~ Energy + Electricity, data = data_2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5422  -1.4796  -0.3844   1.0156  16.0056 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.3170262  0.2730774   1.161    0.247    
## Energy       0.0031320  0.0001357  23.077   <2e-16 ***
## Electricity -0.0006841  0.0000706  -9.691   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.993 on 214 degrees of freedom
## Multiple R-squared:  0.779,  Adjusted R-squared:  0.7769 
## F-statistic: 377.1 on 2 and 214 DF,  p-value: < 2.2e-16
# Plot residual and and component for the 2 variable model
crPlots(CO2_Energy_Electricity_model)

R-squared- 0.777 is a good fit, but not excellent. I notice that the p for intercept does not have a value lower than p<.05. Also notice that both Energy and Electricity with p<<0.05 are significant (although correlated). Electricity has a small negative value (leading to a decrease of CO2 emissions with increase in electricity), which makes sense. The residual plots show linearity for Energy, but heteroscesdaticity for Electricity.

To test whether the model can be improved by dropping Electricity, try:

# Fit 1-variable linear regression

CO2_Energy_model <- lm(CO2 ~ Energy,  data = data_2)    # all 217 observations are used

# View the model summary
summary(CO2_Energy_model)
## 
## Call:
## lm(formula = CO2 ~ Energy, data = data_2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -31.2862  -1.2677  -0.2762   0.8956  16.0223 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.717e-01  3.263e-01   0.526    0.599    
## Energy      2.075e-03  9.664e-05  21.471   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.581 on 215 degrees of freedom
## Multiple R-squared:  0.682,  Adjusted R-squared:  0.6805 
## F-statistic:   461 on 1 and 215 DF,  p-value: < 2.2e-16

In this case, I observe that R-squared has decreased to 0.6805. So, the 2-parameter model seems better. Thus, although there is correlation among Energy and Electricity, multicollinearity does not affect the model. To visualize each variable independently, let’s use 2 simple 2D plots for expediency.

plot(y=data_2$CO2, x=data_2$Energy,
     xlab="Energy", ylab="CO2", main="CO2 vs Energy")
abline(lm(CO2 ~ Energy, data= data_2), col='red', lwd=2)

plot(y=data_2$CO2, x=data_2$Electricity,
     xlab="Electricity", ylab="CO2", main="CO2 vs Electricity")
abline(lm(CO2 ~ Electricity, data= data_2), col='blue', lwd=2)

Conclusion on part 6

The visual examination of the fit shows that neither Energy nor Electricity variables separately are perfect predictors of CO2. As stated above, Electricity heteroscesdaticity is obvious (broad scatter). The model including both Energy and Electricity displays a more significant R-square value and is preferred.

Publication

Published at https://rpubs.com/rmiranda/1364312