The Problem

Put some economics theory here

The data

Describe the data

rm(list = ls())
### Read all the data
##'NY.GDP.MKTP.KD',    = GDP constant US$ 2015
##'SP.POP.TOTL',       = Population total
##'EN.ATM.CO2E.PC',    = CO2 emission in tons per/capita
##"EG.ELC.RNWX.ZS",    =  Electricity from renovable sources % of total
##'NY.GDP.PCAP.CD',    = GDP per capita (current US$) 
##"NY.GDP.PCAP.PP.CD", = GDP per capita, PPP (current international $) 
##'EG.ELC.COAL.ZS',    = Electricity produced from coal % of total
##'AG.PRD.LVSK.XD',    = Livstock production index (2014--2016=100)
##"AG.CON.FERT.ZS",    = Fertilizer consumption (kg/hectare of arable land)
##"EG.USE.PCAP.KG.OE"  = Energy use (kg of oil equivalent per capita)
wb <- read.csv("https://raw.githubusercontent.com/hrpunio/Erasmus_2021EE/main/WBdata.csv", sep = ';',  header=T, na.string="NA")
### Clean the data
##unwated <- c('OSS', 'LTE', 'AFW', 'LCN', 'LCD')
wb.unwanted <- read.csv("https://raw.githubusercontent.com/hrpunio/Erasmus_2021EE/main/wb_groups_list.csv", 
    sep = ';',  header=T, na.string="NA")

More transformations

## Transform to _wider_ format
## change colum names to something less complicated
## Crossectional data: year == 2015
## Remove small countries: year >= 1990
wbl <- wb %>% filter (! code %in% wb.unwanted$code ) %>%
  pivot_wider( names_from = indicatorcode, values_from = value) %>%
  dplyr::select (countryname, code, year, 
          co2=EN.ATM.CO2E.PC, 
          gdp_per_cap=NY.GDP.PCAP.PP.CD, 
          e4coal=EG.ELC.COAL.ZS,
          ren_energy = EG.ELC.RNWX.ZS,
          fcons=AG.CON.FERT.ZS, 
          euse=EG.USE.PCAP.KG.OE, 
          livestock = AG.PRD.LVSK.XD,
          gdpt=NY.GDP.MKTP.KD,  
          pop=SP.POP.TOTL) %>%
  dplyr::filter (year == 2015) %>%
  dplyr::filter(pop > 1000000) %>% 
  mutate(log_gdp_per_cap = log(gdp_per_cap),
                   log_co2 = log(co2),
                   gdp_per_cap_squared = gdp_per_cap^2,
                   log_gdp_per_cap_squared = log(gdp_per_cap)^2)

## check sample size:
nrow(wbl)
## [1] 158

Scatter-plot matrix

## scatter-plot matrix for co2, gdp, fcons, euse
wbl %>% dplyr::select(co2, gdp_per_cap, fcons, euse, e4coal) %>% pairs()

Correlation matrix for

## NOTE: na.omit function omits rows with NA values
wb.corr <- wbl %>% dplyr::select (co2, gdp_per_cap, fcons, euse, e4coal) %>% 
  na.omit() %>% 
  cor()

wb.corr
##                     co2 gdp_per_cap       fcons        euse     e4coal
## co2          1.00000000   0.2953878 -0.04254805  0.75092882  0.3193346
## gdp_per_cap  0.29538777   1.0000000  0.21445252  0.49515965 -0.2790240
## fcons       -0.04254805   0.2144525  1.00000000 -0.01101101 -0.1402689
## euse         0.75092882   0.4951597 -0.01101101  1.00000000 -0.1585325
## e4coal       0.31933460  -0.2790240 -0.14026886 -0.15853248  1.0000000
##

Model 0

\(CO2 = a + b GDP\)

lin-lin model

lm0 <- lm(co2 ~ gdp_per_cap, data=wbl)
summary(lm0)
## 
## Call:
## lm(formula = co2 ~ gdp_per_cap, data = wbl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.2944  -1.1120  -0.5472   0.2284  12.1932 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.566e-01  3.873e-01   0.921    0.359    
## gdp_per_cap 2.203e-04  1.494e-05  14.743   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.352 on 147 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.5966, Adjusted R-squared:  0.5938 
## F-statistic: 217.4 on 1 and 147 DF,  p-value: < 2.2e-16

Model 1

\(\ln CO2 = a + b \ln GDP\)

log-log model

lm1 <- lm(log_co2 ~ log_gdp_per_cap, data=wbl)
summary(lm1)
## 
## Call:
## lm(formula = log_co2 ~ log_gdp_per_cap, data = wbl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.35159 -0.35330 -0.03269  0.33589  1.58092 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -10.65235    0.36179  -29.44   <2e-16 ***
## log_gdp_per_cap   1.22135    0.03887   31.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5633 on 147 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.8704, Adjusted R-squared:  0.8695 
## F-statistic: 987.3 on 1 and 147 DF,  p-value: < 2.2e-16

Model 2

$CO2 = a + b GDP + c ^2 GDP $

the environmental Kuznets curve with logs as the theory of the Kuznets curves suggests, the sign of the coefficient before log_gdp_per_cap is positive and log_gdp_per_cap_squared is negative, resulting in the U-shape The model is great in terms of R2adj and p-values of the coefficients

lm2 <- lm(log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared, data=wbl)
summary(lm2)
## 
## Call:
## lm(formula = log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared, 
##     data = wbl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.04757 -0.37222 -0.06513  0.31844  1.39431 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -22.97153    2.42851  -9.459  < 2e-16 ***
## log_gdp_per_cap           3.99139    0.54206   7.363 1.21e-11 ***
## log_gdp_per_cap_squared  -0.15298    0.02987  -5.121 9.43e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5205 on 146 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.8901, Adjusted R-squared:  0.8886 
## F-statistic: 591.5 on 2 and 146 DF,  p-value: < 2.2e-16

chart residuals vs fitted

plot(lm2, which = 1) 

Check for heteroscedasticity

Breusch-Pagan Test For Homoscedasticity

#high p = no heteroscedasticity
bptest(lm2)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm2
## BP = 0.33513, df = 2, p-value = 0.8457

Check for autocorrelation of error term

## Check autocorrelation of error term
## Durbin Watson Test for Autocorrelation
## high p = errors not autocorrelated
durbinWatsonTest(lm2)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.01873193      2.017379   0.934
##  Alternative hypothesis: rho != 0

Check for normality of residuals

## visually looking at qq-plot
plot(lm2, which = 2) 

qqPlot(lm2)

## [1]  80 138
shapiro.test(lm2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm2$residuals
## W = 0.97942, p-value = 0.02466

Check multicollinearity

for models with

omcdiag(lm2)
## 
## Call:
## omcdiag(mod = lm2)
## 
## 
## Overall Multicollinearity Diagnostics
## 
##                        MC Results detection
## Determinant |X'X|:         0.0044         1
## Farrar Chi-Square:       795.3085         1
## Red Indicator:             0.9978         1
## Sum of Lambda Inverse:   455.7179         1
## Theil's Method:            1.1011         1
## Condition Number:        250.4906         1
## 
## 1 --> COLLINEARITY is detected by the test 
## 0 --> COLLINEARITY is not detected by the test
vif(lm2)
##         log_gdp_per_cap log_gdp_per_cap_squared 
##                227.8589                227.8589

lm2 is highly collinear but it is OK

Perhaps some other models …

Model 3

$ CO2 = a + b GDP + c GDP^2 $

the classical environmental Kuznets curve the sings of the coefficients are concurring with the theory, but the values are very hard to work with - the previous model is clearly better (both by R2adj and p-values of the coefficients)

lm3 <- lm(co2 ~ gdp_per_cap + gdp_per_cap_squared, data=wbl)
summary(lm3)
## 
## Call:
## lm(formula = co2 ~ gdp_per_cap + gdp_per_cap_squared, data = wbl)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7463 -1.2155 -0.3414  0.2384 13.8834 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.987e-03  4.833e-01   0.004    0.997    
## gdp_per_cap          2.657e-04  4.001e-05   6.640 5.76e-10 ***
## gdp_per_cap_squared -7.075e-10  5.784e-10  -1.223    0.223    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.347 on 146 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.6007, Adjusted R-squared:  0.5952 
## F-statistic: 109.8 on 2 and 146 DF,  p-value: < 2.2e-16

Model 4

after Cederborg, J., & Snöbohm, S. (2016). Is there a relationship between economic growth and carbon dioxide emissions?.

lm4 <- lm(co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + e4coal + livestock , data=wbl)
summary(lm4)
## 
## Call:
## lm(formula = co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + 
##     e4coal + livestock, data = wbl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3740  -1.3602  -0.5832   0.7180  12.8173 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -4.128e+00  9.904e+00  -0.417   0.6776    
## gdp_per_cap          3.130e-04  5.055e-05   6.192 8.60e-09 ***
## gdp_per_cap_squared -1.194e-09  6.952e-10  -1.718   0.0884 .  
## ren_energy          -1.297e-01  2.891e-02  -4.487 1.67e-05 ***
## e4coal               1.318e-02  1.313e-02   1.004   0.3176    
## livestock            4.303e-02  9.828e-02   0.438   0.6623    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.39 on 120 degrees of freedom
##   (32 observations deleted due to missingness)
## Multiple R-squared:  0.6243, Adjusted R-squared:  0.6086 
## F-statistic: 39.87 on 5 and 120 DF,  p-value: < 2.2e-16

Model 5

Another, less theory-driven approach - let the step function find the best model using AIC

lm_no_log_full <- lm(co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + e4coal + livestock + fcons + euse + log(gdpt) + log(pop), data=wbl)
lm5 <- stats::step(lm_no_log_full, direction = "backward")
## Start:  AIC=48.75
## co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + e4coal + 
##     livestock + fcons + euse + log(gdpt) + log(pop)
## 
##                       Df Sum of Sq     RSS    AIC
## - log(gdpt)            1     0.031  78.607 46.759
## - log(pop)             1     0.080  78.656 46.779
## - ren_energy           1     0.240  78.817 46.844
## - fcons                1     0.250  78.826 46.848
## - livestock            1     0.256  78.832 46.851
## - gdp_per_cap          1     0.659  79.236 47.014
## - gdp_per_cap_squared  1     1.145  79.721 47.210
## <none>                              78.576 48.747
## - e4coal               1    57.311 135.887 64.275
## - euse                 1   115.994 194.570 75.762
## 
## Step:  AIC=46.76
## co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + e4coal + 
##     livestock + fcons + euse + log(pop)
## 
##                       Df Sum of Sq     RSS    AIC
## - fcons                1     0.231  78.839 44.853
## - ren_energy           1     0.240  78.847 44.857
## - livestock            1     0.267  78.874 44.868
## - log(pop)             1     0.732  79.339 45.056
## - gdp_per_cap          1     2.077  80.684 45.594
## - gdp_per_cap_squared  1     2.197  80.805 45.642
## <none>                              78.607 46.759
## - e4coal               1    59.840 138.447 62.872
## - euse                 1   115.988 194.595 73.766
## 
## Step:  AIC=44.85
## co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + e4coal + 
##     livestock + euse + log(pop)
## 
##                       Df Sum of Sq     RSS    AIC
## - livestock            1     0.211  79.050 42.939
## - ren_energy           1     0.309  79.148 42.979
## - log(pop)             1     0.606  79.445 43.099
## - gdp_per_cap          1     2.318  81.157 43.781
## - gdp_per_cap_squared  1     2.528  81.366 43.863
## <none>                              78.839 44.853
## - e4coal               1    59.708 138.547 60.895
## - euse                 1   116.300 195.138 71.855
## 
## Step:  AIC=42.94
## co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + e4coal + 
##     euse + log(pop)
## 
##                       Df Sum of Sq     RSS    AIC
## - ren_energy           1     0.250  79.300 41.040
## - log(pop)             1     0.646  79.695 41.199
## - gdp_per_cap          1     2.373  81.422 41.885
## - gdp_per_cap_squared  1     2.607  81.657 41.977
## <none>                              79.050 42.939
## - e4coal               1    59.629 138.678 58.925
## - euse                 1   116.561 195.611 69.933
## 
## Step:  AIC=41.04
## co2 ~ gdp_per_cap + gdp_per_cap_squared + e4coal + euse + log(pop)
## 
##                       Df Sum of Sq     RSS    AIC
## - log(pop)             1     0.524  79.824 39.251
## - gdp_per_cap          1     2.177  81.477 39.907
## - gdp_per_cap_squared  1     2.403  81.702 39.995
## <none>                              79.300 41.040
## - e4coal               1    59.390 138.690 56.928
## - euse                 1   145.088 224.388 72.325
## 
## Step:  AIC=39.25
## co2 ~ gdp_per_cap + gdp_per_cap_squared + e4coal + euse
## 
##                       Df Sum of Sq     RSS    AIC
## - gdp_per_cap          1     2.099  81.923 38.081
## - gdp_per_cap_squared  1     2.311  82.135 38.164
## <none>                              79.824 39.251
## - e4coal               1    67.225 147.049 56.801
## - euse                 1   145.775 225.600 70.497
## 
## Step:  AIC=38.08
## co2 ~ gdp_per_cap_squared + e4coal + euse
## 
##                       Df Sum of Sq     RSS    AIC
## - gdp_per_cap_squared  1     0.303  82.226 36.200
## <none>                              81.923 38.081
## - e4coal               1    65.836 147.759 54.955
## - euse                 1   186.576 268.499 74.068
## 
## Step:  AIC=36.2
## co2 ~ e4coal + euse
## 
##          Df Sum of Sq     RSS    AIC
## <none>                 82.226 36.200
## - e4coal  1    67.829 150.056 53.449
## - euse    1   226.767 308.993 76.563
summary(lm5)
## 
## Call:
## lm(formula = co2 ~ e4coal + euse, data = wbl)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4217 -0.5804  0.0393  0.6691  4.5850 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.8210550  0.9363703  -0.877    0.388    
## e4coal       0.0755322  0.0154429   4.891 3.43e-05 ***
## euse         0.0018015  0.0002014   8.943 7.81e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.684 on 29 degrees of freedom
##   (126 observations deleted due to missingness)
## Multiple R-squared:  0.761,  Adjusted R-squared:  0.7445 
## F-statistic: 46.18 on 2 and 29 DF,  p-value: 9.684e-10

Model 6

The same as the previous model, but with logs

lm_log_full <- lm(log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + ren_energy + e4coal + livestock, data=wbl)

lm6 <- stats::step(lm_log_full, direction = "backward")
## Start:  AIC=-181.89
## log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + ren_energy + 
##     e4coal + livestock
## 
##                           Df Sum of Sq    RSS     AIC
## - livestock                1    0.0118 27.057 -183.83
## <none>                                 27.045 -181.89
## - e4coal                   1    2.3113 29.356 -173.56
## - log_gdp_per_cap_squared  1    2.8907 29.936 -171.09
## - ren_energy               1    4.5999 31.645 -164.10
## - log_gdp_per_cap          1    6.5376 33.582 -156.61
## 
## Step:  AIC=-183.83
## log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + ren_energy + 
##     e4coal
## 
##                           Df Sum of Sq    RSS     AIC
## <none>                                 27.057 -183.83
## - e4coal                   1    2.4994 29.556 -174.70
## - log_gdp_per_cap_squared  1    2.9081 29.965 -172.97
## - ren_energy               1    4.5896 31.646 -166.09
## - log_gdp_per_cap          1    6.6302 33.687 -158.22
summary(lm6)
## 
## Call:
## lm(formula = log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + 
##     ren_energy + e4coal, data = wbl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.15795 -0.29336 -0.06374  0.31979  1.38336 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -20.453913   2.907675  -7.034 1.29e-10 ***
## log_gdp_per_cap           3.436586   0.631115   5.445 2.76e-07 ***
## log_gdp_per_cap_squared  -0.122359   0.033929  -3.606 0.000452 ***
## ren_energy               -0.017875   0.003946  -4.530 1.39e-05 ***
## e4coal                    0.005888   0.001761   3.343 0.001102 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4729 on 121 degrees of freedom
##   (32 observations deleted due to missingness)
## Multiple R-squared:  0.8727, Adjusted R-squared:  0.8685 
## F-statistic: 207.5 on 4 and 121 DF,  p-value: < 2.2e-16

For each model perform diagnostics

sjstats::check_assumptions(lm2)
## 
## # Checking Model-Assumptions
## 
##   Model: log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared
## 
##                           violated    statistic
##   Heteroskedasticity            no    p = 0.630
##   Non-normal residuals         yes    p = 0.023
##   Autocorrelated residuals      no    p = 0.932
##   Multicollinearity            yes  vif = 227.859
sjstats::check_assumptions(lm3)
## 
## # Checking Model-Assumptions
## 
##   Model: co2 ~ gdp_per_cap + gdp_per_cap_squared
## 
##                           violated    statistic
##   Heteroskedasticity           yes    p = 0.000
##   Non-normal residuals         yes    p = 0.000
##   Autocorrelated residuals      no    p = 0.694
##   Multicollinearity            yes  vif = 7.196
sjstats::check_assumptions(lm4)
## 
## # Checking Model-Assumptions
## 
##   Model: co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + e4coal + livestock
## 
##                           violated    statistic
##   Heteroskedasticity           yes    p = 0.000
##   Non-normal residuals         yes    p = 0.000
##   Autocorrelated residuals      no    p = 0.968
##   Multicollinearity            yes  vif = 9.743
sjstats::check_assumptions(lm5)
## 
## # Checking Model-Assumptions
## 
##   Model: co2 ~ e4coal + euse
## 
##                           violated    statistic
##   Heteroskedasticity            no    p = 0.183
##   Non-normal residuals          no    p = 0.158
##   Autocorrelated residuals      no    p = 0.744
##   Multicollinearity             no  vif = 1.026
sjstats::check_assumptions(lm6)
## 
## # Checking Model-Assumptions
## 
##   Model: log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + ren_energy + e4coal
## 
##                           violated    statistic
##   Heteroskedasticity            no    p = 0.807
##   Non-normal residuals          no    p = 0.604
##   Autocorrelated residuals      no    p = 0.964
##   Multicollinearity            yes  vif = 240.274

Select between nested models with anova

(significance of difference of variance explainded)

The main problems with using anova function are the following:

  1. different defendant variables

  2. because some columns contain NA’s, the models are trained on different number of rows => cannot be compared directly

anova(lm0, lm3, test="F")
## Analysis of Variance Table
## 
## Model 1: co2 ~ gdp_per_cap
## Model 2: co2 ~ gdp_per_cap + gdp_per_cap_squared
##   Res.Df    RSS Df Sum of Sq     F Pr(>F)
## 1    147 1652.0                          
## 2    146 1635.3  1    16.756 1.496 0.2233
anova(lm1, lm2, test="F")
## Analysis of Variance Table
## 
## Model 1: log_co2 ~ log_gdp_per_cap
## Model 2: log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared
##   Res.Df    RSS Df Sum of Sq     F    Pr(>F)    
## 1    147 46.652                                 
## 2    146 39.547  1    7.1049 26.23 9.431e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Concluding remarks: However, we can use another approach - rmse for the models with the same dependant variables:

qpcR::RMSE(lm1)
## [1] 0.5595554
qpcR::RMSE(lm2)
## [1] 0.5151879
qpcR::RMSE(lm6)
## [1] 0.4633952
qpcR::RMSE(lm0)
## [1] 3.329765
qpcR::RMSE(lm3)
## [1] 3.312835
qpcR::RMSE(lm4)
## [1] 3.308463
qpcR::RMSE(lm5)
## [1] 1.602986

Judging by the combination of the following metrics: 1) R2adjusted 2) statistical significance of the coefficients 3) RMSE

we can conclude that the model lm6: log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + ren_energy + e4coal is the best model for explaining co2 emissions

Repeat for other sample

wb.highmid <- read.csv('https://raw.githubusercontent.com/hrpunio/Erasmus_2021EE/main/wb_groups.csv', sep = ';',  header=T, na.string="NA") %>%
  ## about 30 countries:
  ##filter ( GroupName == 'Low income')
  ##filter ( GroupName == 'High income')
  dplyr::filter ( GroupName == 'High income' | GroupName == 'Middle income')

nrow(wb.highmid)
## [1] 189
wbl_2 <- wb %>% dplyr::filter (code %in% wb.highmid$code ) %>%
  pivot_wider( names_from = indicatorcode, values_from = value) %>%
  dplyr::select (countryname, code, year, 
          co2=EN.ATM.CO2E.PC, 
          gdp_per_cap=NY.GDP.PCAP.PP.CD, 
          e4coal=EG.ELC.COAL.ZS,
          ren_energy = EG.ELC.RNWX.ZS,
          fcons=AG.CON.FERT.ZS, 
          euse=EG.USE.PCAP.KG.OE, 
          livestock = AG.PRD.LVSK.XD,
          gdpt=NY.GDP.MKTP.KD,  
          pop=SP.POP.TOTL) %>%
  mutate(log_gdp_per_cap = log(gdp_per_cap),
                   log_co2 = log(co2),
                   log_pop = log(pop),
                   log_gdp = log(gdpt),
                   gdp_per_cap_squared = gdp_per_cap^2,
                   log_gdp_per_cap_squared = log(gdp_per_cap)^2) %>%
  dplyr::filter (year == 2015)
## Warning in log(co2): wyprodukowano wartości NaN
nrow(wbl)
## [1] 158
lm_1_new  <-  lm(log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared, data=wbl_2 ) 
lm_1_new %>% summary()
## 
## Call:
## lm(formula = log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared, 
##     data = wbl_2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.02931 -0.35221 -0.05029  0.30005  1.42955 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -19.8443     3.9737  -4.994 1.58e-06 ***
## log_gdp_per_cap           3.3877     0.8410   4.028 8.79e-05 ***
## log_gdp_per_cap_squared  -0.1241     0.0442  -2.807  0.00564 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.516 on 155 degrees of freedom
##   (30 observations deleted due to missingness)
## Multiple R-squared:  0.7813, Adjusted R-squared:  0.7784 
## F-statistic: 276.8 on 2 and 155 DF,  p-value: < 2.2e-16
lm_2_new  <-  lm(log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + 
    ren_energy + e4coal, data=wbl_2 ) 
lm_2_new %>% summary()
## 
## Call:
## lm(formula = log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + 
##     ren_energy + e4coal, data = wbl_2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.15818 -0.28447 -0.06621  0.31976  1.37973 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -21.534717   4.495868  -4.790 4.90e-06 ***
## log_gdp_per_cap           3.656003   0.940767   3.886 0.000169 ***
## log_gdp_per_cap_squared  -0.133442   0.048865  -2.731 0.007287 ** 
## ren_energy               -0.017847   0.003848  -4.638 9.15e-06 ***
## e4coal                    0.005835   0.001758   3.320 0.001197 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4687 on 118 degrees of freedom
##   (65 observations deleted due to missingness)
## Multiple R-squared:  0.8206, Adjusted R-squared:  0.8145 
## F-statistic: 134.9 on 4 and 118 DF,  p-value: < 2.2e-16
sjstats::check_assumptions(lm_1_new)
## 
## # Checking Model-Assumptions
## 
##   Model: log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared
## 
##                           violated    statistic
##   Heteroskedasticity            no    p = 0.637
##   Non-normal residuals         yes    p = 0.021
##   Autocorrelated residuals      no    p = 0.206
##   Multicollinearity            yes  vif = 363.993
sjstats::check_assumptions(lm_2_new)
## 
## # Checking Model-Assumptions
## 
##   Model: log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + ren_energy + e4coal
## 
##                           violated    statistic
##   Heteroskedasticity            no    p = 0.704
##   Non-normal residuals          no    p = 0.769
##   Autocorrelated residuals      no    p = 0.548
##   Multicollinearity            yes  vif = 402.062
qpcR::RMSE(lm_1_new)
## [1] 0.511089
qpcR::RMSE(lm_2_new)
## [1] 0.4590642

Judging by the combination of the following metrics:

  1. R2adjusted

  2. statistical significance of the coefficients

  3. RMSE

we can conclude that the model lm_2_new:

log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + ren_energy + e4coal is the best model for explaining co2 emissions in the High income and Middle income countries