Put some economics theory here
Describe the data
### Read all the data
##'NY.GDP.MKTP.KD', = GDP constant US$ 2015
##'SP.POP.TOTL', = Population total
##'EN.ATM.CO2E.PC', = CO2 emission in tons per/capita
##"EG.ELC.RNWX.ZS", = Electricity from renovable sources % of total
##'NY.GDP.PCAP.CD', = GDP per capita (current US$)
##"NY.GDP.PCAP.PP.CD", = GDP per capita, PPP (current international $)
##'EG.ELC.COAL.ZS', = Electricity produced from coal % of total
##'AG.PRD.LVSK.XD', = Livstock production index (2014--2016=100)
##"AG.CON.FERT.ZS", = Fertilizer consumption (kg/hectare of arable land)
##"EG.USE.PCAP.KG.OE" = Energy use (kg of oil equivalent per capita)
wb <- read.csv("WBdata.csv", sep = ';', header=T, na.string="NA")
### Clean the data
##unwated <- c('OSS', 'LTE', 'AFW', 'LCN', 'LCD')
wb.unwanted <- read.csv("wb_groups_list.csv",
sep = ';', header=T, na.string="NA")
More transformations
## Transform to _wider_ format
## change colum names to something less complicated
## Crossectional data: year == 2015
## Remove small countries: year >= 1990
wbl <- wb %>% filter (! code %in% wb.unwanted$code ) %>%
pivot_wider( names_from = indicatorcode, values_from = value) %>%
select (countryname, code, year,
co2=EN.ATM.CO2E.PC,
gdp=NY.GDP.PCAP.PP.CD,
e4coal=EG.ELC.COAL.ZS,
fcons=AG.CON.FERT.ZS,
euse=EG.USE.PCAP.KG.OE,
gdpt=NY.GDP.MKTP.KD,
pop=SP.POP.TOTL) %>%
filter (year == 2015) %>%
filter (pop > 1000000)
## check sample sime:
nrow(wbl)
## [1] 158
Scatter-plot matrix
## scatter-plot matrix for co2, gdp, fcons, euse
wbl %>% select (co2, gdp, fcons, euse, e4coal) %>% pairs()
Correlation matrix for
## NOTE: na.omit function omits rows with NA values
wb.corr <- wbl %>% select (co2, gdp, fcons, euse, e4coal) %>%
na.omit() %>% cor()
wb.corr
## co2 gdp fcons euse e4coal
## co2 1.00000000 0.2953878 -0.04254805 0.75092882 0.3193346
## gdp 0.29538777 1.0000000 0.21445252 0.49515965 -0.2790240
## fcons -0.04254805 0.2144525 1.00000000 -0.01101101 -0.1402689
## euse 0.75092882 0.4951597 -0.01101101 1.00000000 -0.1585325
## e4coal 0.31933460 -0.2790240 -0.14026886 -0.15853248 1.0000000
##
\(CO2 = a + b GDP\)
lm0 <- lm(co2 ~ gdp, data=wbl)
summary(lm0)
##
## Call:
## lm(formula = co2 ~ gdp, data = wbl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.2944 -1.1120 -0.5472 0.2284 12.1932
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.35658795 0.38730518 0.921 0.359
## gdp 0.00022027 0.00001494 14.743 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.352 on 147 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.5966, Adjusted R-squared: 0.5938
## F-statistic: 217.4 on 1 and 147 DF, p-value: < 0.00000000000000022
\(\ln CO2 = a + b \ln GDP\)
lm1 <- lm(log(co2) ~ log(gdp), data=wbl)
summary(lm1)
##
## Call:
## lm(formula = log(co2) ~ log(gdp), data = wbl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.35159 -0.35330 -0.03269 0.33589 1.58092
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.65235 0.36179 -29.44 <0.0000000000000002 ***
## log(gdp) 1.22135 0.03887 31.42 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5633 on 147 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.8704, Adjusted R-squared: 0.8695
## F-statistic: 987.3 on 1 and 147 DF, p-value: < 0.00000000000000022
$CO2 = a + b GDP $
lm2 <- lm(log(co2) ~ log(gdp) + I(log(gdp)^2 ), data=wbl)
summary(lm2)
##
## Call:
## lm(formula = log(co2) ~ log(gdp) + I(log(gdp)^2), data = wbl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.04757 -0.37222 -0.06513 0.31844 1.39431
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -22.97153 2.42851 -9.459 < 0.0000000000000002 ***
## log(gdp) 3.99139 0.54206 7.363 0.0000000000121 ***
## I(log(gdp)^2) -0.15298 0.02987 -5.121 0.0000009430544 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5205 on 146 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.8901, Adjusted R-squared: 0.8886
## F-statistic: 591.5 on 2 and 146 DF, p-value: < 0.00000000000000022
plot(lm2, which = 1)
Breusch-Pagan Test For Homoscedasticity
#high p = no heteroscedasticity
bptest(lm2)
##
## studentized Breusch-Pagan test
##
## data: lm2
## BP = 0.33513, df = 2, p-value = 0.8457
## Check autocorrelation of error term
## Durbin Watson Test for Autocorrelation
## high p = errors not autocorrelated
durbinWatsonTest(lm2)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.01873193 2.017379 0.912
## Alternative hypothesis: rho != 0
## visually looking at qq-plot
plot(lm2, which = 2)
qqPlot(lm2)
## [1] 80 138
shapiro.test(lm2$residuals)
##
## Shapiro-Wilk normality test
##
## data: lm2$residuals
## W = 0.97942, p-value = 0.02466
vif(lm2)
anova(lm1, lm2, test=“F”)
wb.highmid <- read.csv("wb_groups.csv",
sep = ';', header=T, na.string="NA") %>%
## about 30 countries:
##filter ( GroupName == 'Low income')
##filter ( GroupName == 'High income')
filter ( GroupName == 'High income' | GroupName == 'Middle income')
nrow(wb.highmid)
## [1] 189
wbl <- wb %>% filter (code %in% wb.highmid$code ) %>%
pivot_wider( names_from = indicatorcode, values_from = value) %>%
select (countryname, code, year,
co2=EN.ATM.CO2E.PC,
gdp=NY.GDP.PCAP.PP.CD,
e4coal=EG.ELC.COAL.ZS,
fcons=AG.CON.FERT.ZS,
euse=EG.USE.PCAP.KG.OE,
gdpt=NY.GDP.MKTP.KD,
pop=SP.POP.TOTL) %>%
filter (year == 2015)
nrow(wbl)
## [1] 188