The Problem

Put some economics theory here

The data

Describe the data

### Read all the data
##'NY.GDP.MKTP.KD',    = GDP constant US$ 2015
##'SP.POP.TOTL',       = Population total
##'EN.ATM.CO2E.PC',    = CO2 emission in tons per/capita
##"EG.ELC.RNWX.ZS",    =  Electricity from renovable sources % of total
##'NY.GDP.PCAP.CD',    = GDP per capita (current US$) 
##"NY.GDP.PCAP.PP.CD", = GDP per capita, PPP (current international $) 
##'EG.ELC.COAL.ZS',    = Electricity produced from coal % of total
##'AG.PRD.LVSK.XD',    = Livstock production index (2014--2016=100)
##"AG.CON.FERT.ZS",    = Fertilizer consumption (kg/hectare of arable land)
##"EG.USE.PCAP.KG.OE"  = Energy use (kg of oil equivalent per capita)

wb <- read.csv("WBdata.csv", sep = ';',  header=T, na.string="NA")


### Clean the data
##unwated <- c('OSS', 'LTE', 'AFW', 'LCN', 'LCD')
wb.unwanted <- read.csv("wb_groups_list.csv", 
    sep = ';',  header=T, na.string="NA")

More transformations

## Transform to _wider_ format
## change colum names to something less complicated
## Crossectional data: year == 2015
## Remove small countries: year >= 1990
wbl <- wb %>% filter (! code %in% wb.unwanted$code ) %>%
  pivot_wider( names_from = indicatorcode, values_from = value) %>%
  select (countryname, code, year, 
          co2=EN.ATM.CO2E.PC, 
          gdp=NY.GDP.PCAP.PP.CD, 
          e4coal=EG.ELC.COAL.ZS,
          fcons=AG.CON.FERT.ZS, 
          euse=EG.USE.PCAP.KG.OE, 
          gdpt=NY.GDP.MKTP.KD,  
          pop=SP.POP.TOTL) %>%
  filter (year == 2015) %>%
  filter  (pop > 1000000)

## check sample sime:
nrow(wbl)
## [1] 158

Scatter-plot matrix

## scatter-plot matrix for co2, gdp, fcons, euse
wbl %>% select (co2, gdp, fcons, euse, e4coal) %>% pairs()

Correlation matrix for

## NOTE: na.omit function omits rows with NA values
wb.corr <- wbl %>% select (co2, gdp, fcons, euse, e4coal) %>% 
  na.omit() %>% cor()
wb.corr
##                co2        gdp       fcons        euse     e4coal
## co2     1.00000000  0.2953878 -0.04254805  0.75092882  0.3193346
## gdp     0.29538777  1.0000000  0.21445252  0.49515965 -0.2790240
## fcons  -0.04254805  0.2144525  1.00000000 -0.01101101 -0.1402689
## euse    0.75092882  0.4951597 -0.01101101  1.00000000 -0.1585325
## e4coal  0.31933460 -0.2790240 -0.14026886 -0.15853248  1.0000000
##

Model 0

\(CO2 = a + b GDP\)

lm0 <- lm(co2 ~ gdp, data=wbl)
summary(lm0)
## 
## Call:
## lm(formula = co2 ~ gdp, data = wbl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.2944  -1.1120  -0.5472   0.2284  12.1932 
## 
## Coefficients:
##               Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 0.35658795 0.38730518   0.921               0.359    
## gdp         0.00022027 0.00001494  14.743 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.352 on 147 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.5966, Adjusted R-squared:  0.5938 
## F-statistic: 217.4 on 1 and 147 DF,  p-value: < 0.00000000000000022

Model 1

\(\ln CO2 = a + b \ln GDP\)

lm1 <- lm(log(co2) ~ log(gdp), data=wbl)
summary(lm1)
## 
## Call:
## lm(formula = log(co2) ~ log(gdp), data = wbl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.35159 -0.35330 -0.03269  0.33589  1.58092 
## 
## Coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) -10.65235    0.36179  -29.44 <0.0000000000000002 ***
## log(gdp)      1.22135    0.03887   31.42 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5633 on 147 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.8704, Adjusted R-squared:  0.8695 
## F-statistic: 987.3 on 1 and 147 DF,  p-value: < 0.00000000000000022

Model 2

$CO2 = a + b GDP $

lm2 <- lm(log(co2) ~ log(gdp) + I(log(gdp)^2 ), data=wbl)
summary(lm2)
## 
## Call:
## lm(formula = log(co2) ~ log(gdp) + I(log(gdp)^2), data = wbl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.04757 -0.37222 -0.06513  0.31844  1.39431 
## 
## Coefficients:
##                Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)   -22.97153    2.42851  -9.459 < 0.0000000000000002 ***
## log(gdp)        3.99139    0.54206   7.363      0.0000000000121 ***
## I(log(gdp)^2)  -0.15298    0.02987  -5.121      0.0000009430544 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5205 on 146 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.8901, Adjusted R-squared:  0.8886 
## F-statistic: 591.5 on 2 and 146 DF,  p-value: < 0.00000000000000022

chart residuals vs fitted

plot(lm2, which = 1) 

Check for heteroscedasticity

Breusch-Pagan Test For Homoscedasticity

#high p = no heteroscedasticity
bptest(lm2)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm2
## BP = 0.33513, df = 2, p-value = 0.8457

Check for autocorrelation of error term

## Check autocorrelation of error term
## Durbin Watson Test for Autocorrelation
## high p = errors not autocorrelated
durbinWatsonTest(lm2)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.01873193      2.017379   0.912
##  Alternative hypothesis: rho != 0

Check for normality of residuals

## visually looking at qq-plot
plot(lm2, which = 2) 

qqPlot(lm2)

## [1]  80 138
shapiro.test(lm2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm2$residuals
## W = 0.97942, p-value = 0.02466

Check multicollinearity

for models with

vif(lm2)

lm2 is highly collinear but it is OK

Perhaps some other models …

For each model perform diagnostics

Select between nested models with anova

(significance of difference of variance explainded)

anova(lm1, lm2, test=“F”)

Model 2 is significantly better than model 1

Repeat for other sample

wb.highmid <- read.csv("wb_groups.csv", 
                        sep = ';',  header=T, na.string="NA") %>%
  ## about 30 countries:
  ##filter ( GroupName == 'Low income')
  ##filter ( GroupName == 'High income')
  filter ( GroupName == 'High income' | GroupName == 'Middle income')

nrow(wb.highmid)
## [1] 189
wbl <- wb %>% filter (code %in% wb.highmid$code ) %>%
  pivot_wider( names_from = indicatorcode, values_from = value) %>%
  select (countryname, code, year, 
          co2=EN.ATM.CO2E.PC, 
          gdp=NY.GDP.PCAP.PP.CD, 
          e4coal=EG.ELC.COAL.ZS,
          fcons=AG.CON.FERT.ZS, 
          euse=EG.USE.PCAP.KG.OE, 
          gdpt=NY.GDP.MKTP.KD,  
          pop=SP.POP.TOTL) %>%
  filter (year == 2015)
nrow(wbl)
## [1] 188