Put some economics theory here
Describe the data
rm(list = ls())
### Read all the data
##'NY.GDP.MKTP.KD', = GDP constant US$ 2015
##'SP.POP.TOTL', = Population total
##'EN.ATM.CO2E.PC', = CO2 emission in tons per/capita
##"EG.ELC.RNWX.ZS", = Electricity from renovable sources % of total
##'NY.GDP.PCAP.CD', = GDP per capita (current US$)
##"NY.GDP.PCAP.PP.CD", = GDP per capita, PPP (current international $)
##'EG.ELC.COAL.ZS', = Electricity produced from coal % of total
##'AG.PRD.LVSK.XD', = Livstock production index (2014--2016=100)
##"AG.CON.FERT.ZS", = Fertilizer consumption (kg/hectare of arable land)
##"EG.USE.PCAP.KG.OE" = Energy use (kg of oil equivalent per capita)
wb <- read.csv("https://raw.githubusercontent.com/hrpunio/Erasmus_2021EE/main/WBdata.csv", sep = ';', header=T, na.string="NA")
### Clean the data
##unwated <- c('OSS', 'LTE', 'AFW', 'LCN', 'LCD')
wb.unwanted <- read.csv("https://raw.githubusercontent.com/hrpunio/Erasmus_2021EE/main/wb_groups_list.csv",
sep = ';', header=T, na.string="NA")
More transformations
## Transform to _wider_ format
## change colum names to something less complicated
## Crossectional data: year == 2015
## Remove small countries: year >= 1990
wbl <- wb %>% filter (! code %in% wb.unwanted$code ) %>%
pivot_wider( names_from = indicatorcode, values_from = value) %>%
dplyr::select (countryname, code, year,
co2=EN.ATM.CO2E.PC,
gdp_per_cap=NY.GDP.PCAP.PP.CD,
e4coal=EG.ELC.COAL.ZS,
ren_energy = EG.ELC.RNWX.ZS,
fcons=AG.CON.FERT.ZS,
euse=EG.USE.PCAP.KG.OE,
livestock = AG.PRD.LVSK.XD,
gdpt=NY.GDP.MKTP.KD,
pop=SP.POP.TOTL) %>%
dplyr::filter (year == 2015) %>%
dplyr::filter(pop > 1000000) %>%
mutate(log_gdp_per_cap = log(gdp_per_cap),
log_co2 = log(co2),
gdp_per_cap_squared = gdp_per_cap^2,
log_gdp_per_cap_squared = log(gdp_per_cap)^2)
## check sample size:
nrow(wbl)
## [1] 158
Scatter-plot matrix
## scatter-plot matrix for co2, gdp, fcons, euse
wbl %>% dplyr::select(co2, gdp_per_cap, fcons, euse, e4coal) %>% pairs()
Correlation matrix for
## NOTE: na.omit function omits rows with NA values
wb.corr <- wbl %>% dplyr::select (co2, gdp_per_cap, fcons, euse, e4coal) %>%
na.omit() %>%
cor()
wb.corr
## co2 gdp_per_cap fcons euse e4coal
## co2 1.00000000 0.2953878 -0.04254805 0.75092882 0.3193346
## gdp_per_cap 0.29538777 1.0000000 0.21445252 0.49515965 -0.2790240
## fcons -0.04254805 0.2144525 1.00000000 -0.01101101 -0.1402689
## euse 0.75092882 0.4951597 -0.01101101 1.00000000 -0.1585325
## e4coal 0.31933460 -0.2790240 -0.14026886 -0.15853248 1.0000000
##
\(CO2 = a + b GDP\)
lin-lin model
lm0 <- lm(co2 ~ gdp_per_cap, data=wbl)
summary(lm0)
##
## Call:
## lm(formula = co2 ~ gdp_per_cap, data = wbl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.2944 -1.1120 -0.5472 0.2284 12.1932
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.566e-01 3.873e-01 0.921 0.359
## gdp_per_cap 2.203e-04 1.494e-05 14.743 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.352 on 147 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.5966, Adjusted R-squared: 0.5938
## F-statistic: 217.4 on 1 and 147 DF, p-value: < 2.2e-16
\(\ln CO2 = a + b \ln GDP\)
log-log model
lm1 <- lm(log_co2 ~ log_gdp_per_cap, data=wbl)
summary(lm1)
##
## Call:
## lm(formula = log_co2 ~ log_gdp_per_cap, data = wbl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.35159 -0.35330 -0.03269 0.33589 1.58092
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.65235 0.36179 -29.44 <2e-16 ***
## log_gdp_per_cap 1.22135 0.03887 31.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5633 on 147 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.8704, Adjusted R-squared: 0.8695
## F-statistic: 987.3 on 1 and 147 DF, p-value: < 2.2e-16
$CO2 = a + b GDP + c ^2 GDP $
the environmental Kuznets curve with logs as the theory of the Kuznets curves suggests, the sign of the coefficient before log_gdp_per_cap is positive and log_gdp_per_cap_squared is negative, resulting in the U-shape The model is great in terms of R2adj and p-values of the coefficients
lm2 <- lm(log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared, data=wbl)
summary(lm2)
##
## Call:
## lm(formula = log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared,
## data = wbl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.04757 -0.37222 -0.06513 0.31844 1.39431
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -22.97153 2.42851 -9.459 < 2e-16 ***
## log_gdp_per_cap 3.99139 0.54206 7.363 1.21e-11 ***
## log_gdp_per_cap_squared -0.15298 0.02987 -5.121 9.43e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5205 on 146 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.8901, Adjusted R-squared: 0.8886
## F-statistic: 591.5 on 2 and 146 DF, p-value: < 2.2e-16
plot(lm2, which = 1)
Breusch-Pagan Test For Homoscedasticity
#high p = no heteroscedasticity
bptest(lm2)
##
## studentized Breusch-Pagan test
##
## data: lm2
## BP = 0.33513, df = 2, p-value = 0.8457
## Check autocorrelation of error term
## Durbin Watson Test for Autocorrelation
## high p = errors not autocorrelated
durbinWatsonTest(lm2)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.01873193 2.017379 0.934
## Alternative hypothesis: rho != 0
## visually looking at qq-plot
plot(lm2, which = 2)
qqPlot(lm2)
## [1] 80 138
shapiro.test(lm2$residuals)
##
## Shapiro-Wilk normality test
##
## data: lm2$residuals
## W = 0.97942, p-value = 0.02466
omcdiag(lm2)
##
## Call:
## omcdiag(mod = lm2)
##
##
## Overall Multicollinearity Diagnostics
##
## MC Results detection
## Determinant |X'X|: 0.0044 1
## Farrar Chi-Square: 795.3085 1
## Red Indicator: 0.9978 1
## Sum of Lambda Inverse: 455.7179 1
## Theil's Method: 1.1011 1
## Condition Number: 250.4906 1
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
vif(lm2)
## log_gdp_per_cap log_gdp_per_cap_squared
## 227.8589 227.8589
$ CO2 = a + b GDP + c GDP^2 $
the classical environmental Kuznets curve the sings of the coefficients are concurring with the theory, but the values are very hard to work with - the previous model is clearly better (both by R2adj and p-values of the coefficients)
lm3 <- lm(co2 ~ gdp_per_cap + gdp_per_cap_squared, data=wbl)
summary(lm3)
##
## Call:
## lm(formula = co2 ~ gdp_per_cap + gdp_per_cap_squared, data = wbl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7463 -1.2155 -0.3414 0.2384 13.8834
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.987e-03 4.833e-01 0.004 0.997
## gdp_per_cap 2.657e-04 4.001e-05 6.640 5.76e-10 ***
## gdp_per_cap_squared -7.075e-10 5.784e-10 -1.223 0.223
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.347 on 146 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.6007, Adjusted R-squared: 0.5952
## F-statistic: 109.8 on 2 and 146 DF, p-value: < 2.2e-16
after Cederborg, J., & Snöbohm, S. (2016). Is there a relationship between economic growth and carbon dioxide emissions?.
lm4 <- lm(co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + e4coal + livestock , data=wbl)
summary(lm4)
##
## Call:
## lm(formula = co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy +
## e4coal + livestock, data = wbl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.3740 -1.3602 -0.5832 0.7180 12.8173
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.128e+00 9.904e+00 -0.417 0.6776
## gdp_per_cap 3.130e-04 5.055e-05 6.192 8.60e-09 ***
## gdp_per_cap_squared -1.194e-09 6.952e-10 -1.718 0.0884 .
## ren_energy -1.297e-01 2.891e-02 -4.487 1.67e-05 ***
## e4coal 1.318e-02 1.313e-02 1.004 0.3176
## livestock 4.303e-02 9.828e-02 0.438 0.6623
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.39 on 120 degrees of freedom
## (32 observations deleted due to missingness)
## Multiple R-squared: 0.6243, Adjusted R-squared: 0.6086
## F-statistic: 39.87 on 5 and 120 DF, p-value: < 2.2e-16
Another, less theory-driven approach - let the step function find the best model using AIC
lm_no_log_full <- lm(co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + e4coal + livestock + fcons + euse + log(gdpt) + log(pop), data=wbl)
lm5 <- stats::step(lm_no_log_full, direction = "backward")
## Start: AIC=48.75
## co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + e4coal +
## livestock + fcons + euse + log(gdpt) + log(pop)
##
## Df Sum of Sq RSS AIC
## - log(gdpt) 1 0.031 78.607 46.759
## - log(pop) 1 0.080 78.656 46.779
## - ren_energy 1 0.240 78.817 46.844
## - fcons 1 0.250 78.826 46.848
## - livestock 1 0.256 78.832 46.851
## - gdp_per_cap 1 0.659 79.236 47.014
## - gdp_per_cap_squared 1 1.145 79.721 47.210
## <none> 78.576 48.747
## - e4coal 1 57.311 135.887 64.275
## - euse 1 115.994 194.570 75.762
##
## Step: AIC=46.76
## co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + e4coal +
## livestock + fcons + euse + log(pop)
##
## Df Sum of Sq RSS AIC
## - fcons 1 0.231 78.839 44.853
## - ren_energy 1 0.240 78.847 44.857
## - livestock 1 0.267 78.874 44.868
## - log(pop) 1 0.732 79.339 45.056
## - gdp_per_cap 1 2.077 80.684 45.594
## - gdp_per_cap_squared 1 2.197 80.805 45.642
## <none> 78.607 46.759
## - e4coal 1 59.840 138.447 62.872
## - euse 1 115.988 194.595 73.766
##
## Step: AIC=44.85
## co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + e4coal +
## livestock + euse + log(pop)
##
## Df Sum of Sq RSS AIC
## - livestock 1 0.211 79.050 42.939
## - ren_energy 1 0.309 79.148 42.979
## - log(pop) 1 0.606 79.445 43.099
## - gdp_per_cap 1 2.318 81.157 43.781
## - gdp_per_cap_squared 1 2.528 81.366 43.863
## <none> 78.839 44.853
## - e4coal 1 59.708 138.547 60.895
## - euse 1 116.300 195.138 71.855
##
## Step: AIC=42.94
## co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + e4coal +
## euse + log(pop)
##
## Df Sum of Sq RSS AIC
## - ren_energy 1 0.250 79.300 41.040
## - log(pop) 1 0.646 79.695 41.199
## - gdp_per_cap 1 2.373 81.422 41.885
## - gdp_per_cap_squared 1 2.607 81.657 41.977
## <none> 79.050 42.939
## - e4coal 1 59.629 138.678 58.925
## - euse 1 116.561 195.611 69.933
##
## Step: AIC=41.04
## co2 ~ gdp_per_cap + gdp_per_cap_squared + e4coal + euse + log(pop)
##
## Df Sum of Sq RSS AIC
## - log(pop) 1 0.524 79.824 39.251
## - gdp_per_cap 1 2.177 81.477 39.907
## - gdp_per_cap_squared 1 2.403 81.702 39.995
## <none> 79.300 41.040
## - e4coal 1 59.390 138.690 56.928
## - euse 1 145.088 224.388 72.325
##
## Step: AIC=39.25
## co2 ~ gdp_per_cap + gdp_per_cap_squared + e4coal + euse
##
## Df Sum of Sq RSS AIC
## - gdp_per_cap 1 2.099 81.923 38.081
## - gdp_per_cap_squared 1 2.311 82.135 38.164
## <none> 79.824 39.251
## - e4coal 1 67.225 147.049 56.801
## - euse 1 145.775 225.600 70.497
##
## Step: AIC=38.08
## co2 ~ gdp_per_cap_squared + e4coal + euse
##
## Df Sum of Sq RSS AIC
## - gdp_per_cap_squared 1 0.303 82.226 36.200
## <none> 81.923 38.081
## - e4coal 1 65.836 147.759 54.955
## - euse 1 186.576 268.499 74.068
##
## Step: AIC=36.2
## co2 ~ e4coal + euse
##
## Df Sum of Sq RSS AIC
## <none> 82.226 36.200
## - e4coal 1 67.829 150.056 53.449
## - euse 1 226.767 308.993 76.563
summary(lm5)
##
## Call:
## lm(formula = co2 ~ e4coal + euse, data = wbl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4217 -0.5804 0.0393 0.6691 4.5850
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.8210550 0.9363703 -0.877 0.388
## e4coal 0.0755322 0.0154429 4.891 3.43e-05 ***
## euse 0.0018015 0.0002014 8.943 7.81e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.684 on 29 degrees of freedom
## (126 observations deleted due to missingness)
## Multiple R-squared: 0.761, Adjusted R-squared: 0.7445
## F-statistic: 46.18 on 2 and 29 DF, p-value: 9.684e-10
The same as the previous model, but with logs
lm_log_full <- lm(log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + ren_energy + e4coal + livestock, data=wbl)
lm6 <- stats::step(lm_log_full, direction = "backward")
## Start: AIC=-181.89
## log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + ren_energy +
## e4coal + livestock
##
## Df Sum of Sq RSS AIC
## - livestock 1 0.0118 27.057 -183.83
## <none> 27.045 -181.89
## - e4coal 1 2.3113 29.356 -173.56
## - log_gdp_per_cap_squared 1 2.8907 29.936 -171.09
## - ren_energy 1 4.5999 31.645 -164.10
## - log_gdp_per_cap 1 6.5376 33.582 -156.61
##
## Step: AIC=-183.83
## log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + ren_energy +
## e4coal
##
## Df Sum of Sq RSS AIC
## <none> 27.057 -183.83
## - e4coal 1 2.4994 29.556 -174.70
## - log_gdp_per_cap_squared 1 2.9081 29.965 -172.97
## - ren_energy 1 4.5896 31.646 -166.09
## - log_gdp_per_cap 1 6.6302 33.687 -158.22
summary(lm6)
##
## Call:
## lm(formula = log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared +
## ren_energy + e4coal, data = wbl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.15795 -0.29336 -0.06374 0.31979 1.38336
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20.453913 2.907675 -7.034 1.29e-10 ***
## log_gdp_per_cap 3.436586 0.631115 5.445 2.76e-07 ***
## log_gdp_per_cap_squared -0.122359 0.033929 -3.606 0.000452 ***
## ren_energy -0.017875 0.003946 -4.530 1.39e-05 ***
## e4coal 0.005888 0.001761 3.343 0.001102 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4729 on 121 degrees of freedom
## (32 observations deleted due to missingness)
## Multiple R-squared: 0.8727, Adjusted R-squared: 0.8685
## F-statistic: 207.5 on 4 and 121 DF, p-value: < 2.2e-16
sjstats::check_assumptions(lm2)
##
## # Checking Model-Assumptions
##
## Model: log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared
##
## violated statistic
## Heteroskedasticity no p = 0.630
## Non-normal residuals yes p = 0.023
## Autocorrelated residuals no p = 0.932
## Multicollinearity yes vif = 227.859
sjstats::check_assumptions(lm3)
##
## # Checking Model-Assumptions
##
## Model: co2 ~ gdp_per_cap + gdp_per_cap_squared
##
## violated statistic
## Heteroskedasticity yes p = 0.000
## Non-normal residuals yes p = 0.000
## Autocorrelated residuals no p = 0.694
## Multicollinearity yes vif = 7.196
sjstats::check_assumptions(lm4)
##
## # Checking Model-Assumptions
##
## Model: co2 ~ gdp_per_cap + gdp_per_cap_squared + ren_energy + e4coal + livestock
##
## violated statistic
## Heteroskedasticity yes p = 0.000
## Non-normal residuals yes p = 0.000
## Autocorrelated residuals no p = 0.968
## Multicollinearity yes vif = 9.743
sjstats::check_assumptions(lm5)
##
## # Checking Model-Assumptions
##
## Model: co2 ~ e4coal + euse
##
## violated statistic
## Heteroskedasticity no p = 0.183
## Non-normal residuals no p = 0.158
## Autocorrelated residuals no p = 0.744
## Multicollinearity no vif = 1.026
sjstats::check_assumptions(lm6)
##
## # Checking Model-Assumptions
##
## Model: log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + ren_energy + e4coal
##
## violated statistic
## Heteroskedasticity no p = 0.807
## Non-normal residuals no p = 0.604
## Autocorrelated residuals no p = 0.964
## Multicollinearity yes vif = 240.274
The main problems with using anova function are the following:
different defendant variables
because some columns contain NA’s, the models are trained on different number of rows => cannot be compared directly
anova(lm0, lm3, test="F")
## Analysis of Variance Table
##
## Model 1: co2 ~ gdp_per_cap
## Model 2: co2 ~ gdp_per_cap + gdp_per_cap_squared
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 147 1652.0
## 2 146 1635.3 1 16.756 1.496 0.2233
anova(lm1, lm2, test="F")
## Analysis of Variance Table
##
## Model 1: log_co2 ~ log_gdp_per_cap
## Model 2: log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 147 46.652
## 2 146 39.547 1 7.1049 26.23 9.431e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Concluding remarks: However, we can use another approach - rmse for the models with the same dependant variables:
qpcR::RMSE(lm1)
## [1] 0.5595554
qpcR::RMSE(lm2)
## [1] 0.5151879
qpcR::RMSE(lm6)
## [1] 0.4633952
qpcR::RMSE(lm0)
## [1] 3.329765
qpcR::RMSE(lm3)
## [1] 3.312835
qpcR::RMSE(lm4)
## [1] 3.308463
qpcR::RMSE(lm5)
## [1] 1.602986
Judging by the combination of the following metrics: 1) R2adjusted 2) statistical significance of the coefficients 3) RMSE
we can conclude that the model lm6: log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + ren_energy + e4coal is the best model for explaining co2 emissions
wb.highmid <- read.csv('https://raw.githubusercontent.com/hrpunio/Erasmus_2021EE/main/wb_groups.csv', sep = ';', header=T, na.string="NA") %>%
## about 30 countries:
##filter ( GroupName == 'Low income')
##filter ( GroupName == 'High income')
dplyr::filter ( GroupName == 'High income' | GroupName == 'Middle income')
nrow(wb.highmid)
## [1] 189
wbl_2 <- wb %>% dplyr::filter (code %in% wb.highmid$code ) %>%
pivot_wider( names_from = indicatorcode, values_from = value) %>%
dplyr::select (countryname, code, year,
co2=EN.ATM.CO2E.PC,
gdp_per_cap=NY.GDP.PCAP.PP.CD,
e4coal=EG.ELC.COAL.ZS,
ren_energy = EG.ELC.RNWX.ZS,
fcons=AG.CON.FERT.ZS,
euse=EG.USE.PCAP.KG.OE,
livestock = AG.PRD.LVSK.XD,
gdpt=NY.GDP.MKTP.KD,
pop=SP.POP.TOTL) %>%
mutate(log_gdp_per_cap = log(gdp_per_cap),
log_co2 = log(co2),
log_pop = log(pop),
log_gdp = log(gdpt),
gdp_per_cap_squared = gdp_per_cap^2,
log_gdp_per_cap_squared = log(gdp_per_cap)^2) %>%
dplyr::filter (year == 2015)
## Warning in log(co2): wyprodukowano wartości NaN
nrow(wbl)
## [1] 158
lm_1_new <- lm(log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared, data=wbl_2 )
lm_1_new %>% summary()
##
## Call:
## lm(formula = log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared,
## data = wbl_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.02931 -0.35221 -0.05029 0.30005 1.42955
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -19.8443 3.9737 -4.994 1.58e-06 ***
## log_gdp_per_cap 3.3877 0.8410 4.028 8.79e-05 ***
## log_gdp_per_cap_squared -0.1241 0.0442 -2.807 0.00564 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.516 on 155 degrees of freedom
## (30 observations deleted due to missingness)
## Multiple R-squared: 0.7813, Adjusted R-squared: 0.7784
## F-statistic: 276.8 on 2 and 155 DF, p-value: < 2.2e-16
lm_2_new <- lm(log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared +
ren_energy + e4coal, data=wbl_2 )
lm_2_new %>% summary()
##
## Call:
## lm(formula = log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared +
## ren_energy + e4coal, data = wbl_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.15818 -0.28447 -0.06621 0.31976 1.37973
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -21.534717 4.495868 -4.790 4.90e-06 ***
## log_gdp_per_cap 3.656003 0.940767 3.886 0.000169 ***
## log_gdp_per_cap_squared -0.133442 0.048865 -2.731 0.007287 **
## ren_energy -0.017847 0.003848 -4.638 9.15e-06 ***
## e4coal 0.005835 0.001758 3.320 0.001197 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4687 on 118 degrees of freedom
## (65 observations deleted due to missingness)
## Multiple R-squared: 0.8206, Adjusted R-squared: 0.8145
## F-statistic: 134.9 on 4 and 118 DF, p-value: < 2.2e-16
sjstats::check_assumptions(lm_1_new)
##
## # Checking Model-Assumptions
##
## Model: log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared
##
## violated statistic
## Heteroskedasticity no p = 0.637
## Non-normal residuals yes p = 0.021
## Autocorrelated residuals no p = 0.206
## Multicollinearity yes vif = 363.993
sjstats::check_assumptions(lm_2_new)
##
## # Checking Model-Assumptions
##
## Model: log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + ren_energy + e4coal
##
## violated statistic
## Heteroskedasticity no p = 0.704
## Non-normal residuals no p = 0.769
## Autocorrelated residuals no p = 0.548
## Multicollinearity yes vif = 402.062
qpcR::RMSE(lm_1_new)
## [1] 0.511089
qpcR::RMSE(lm_2_new)
## [1] 0.4590642
Judging by the combination of the following metrics:
R2adjusted
statistical significance of the coefficients
RMSE
we can conclude that the model lm_2_new:
log_co2 ~ log_gdp_per_cap + log_gdp_per_cap_squared + ren_energy + e4coal is the best model for explaining co2 emissions in the High income and Middle income countries