\(~\)

1 - We are interested here in whether we can observe significant differences in means between colonized countries and the rest of the world, and between colonized countries grouped by their level of settler mortality:

\(~\)

Loading libraries and data set :

\(~\)

library(haven)
library(tidyverse)
library(reshape2)
library(data.table)
library(janitor)
library(xtable)
library(kableExtra)
library(stargazer)
library(ivreg)
library(RColorBrewer)
library(plotly)

# Load the Stata file data_AJR_2001.dta into `data_AJR_2001`.
# NOTE(review): the path below is absolute and machine-specific, so the script
# only runs as-is on a computer with this exact directory layout; consider a
# project-relative path.
data_AJR_2001 <- read_dta("/Users/bastienpatras/Desktop/ENS - Master of Economics/Econometrics/Tutorial 5-20210322/data_AJR_2001.dta")

\(~\)

1-3) Building the first column :

\(~\)

# Computation of the first column: mean and standard deviation of each
# variable over the whole sample (missing values are ignored)

data_AJR_2001_sum <- data_AJR_2001 %>%
  select(
    logpgp95, loghjypl, avexpr, cons00a, cons1, democ00a, euro1900, logem4
  ) %>%
  # Rescale euro1900 by 1/100 (presumably from percent to share)
  mutate(euro1900 = euro1900 / 100) %>%
  summarize(across(
    .cols = everything(),
    .fns = list(
      Mean = ~ mean(.x, na.rm = TRUE),
      SD = ~ sd(.x, na.rm = TRUE)
    ),
    .names = "{.col}_{.fn}"
  ))

# Formatting: reshape to long format, one row per "<variable>_<statistic>".
# The column name `Decriptive` (sic) is kept unchanged because later chunks
# select it by that exact name.

column1 <- reshape2::melt(data_AJR_2001_sum) %>%
  rename(Decriptive = variable, `Whole sample` = value)

# Output

column1 %>%
  kbl(booktabs = TRUE) %>%
  kable_styling(latex_options = "striped", full_width = TRUE)
Decriptive Whole sample
logpgp95_Mean 8.3025091
logpgp95_SD 1.1053422
loghjypl_Mean -1.7311057
loghjypl_SD 1.0837256
avexpr_Mean 7.0664914
avexpr_SD 1.8042868
cons00a_Mean 1.8571429
cons00a_SD 1.8231318
cons1_Mean 3.5909091
cons1_SD 2.4146886
democ00a_Mean 1.1494253
democ00a_SD 2.5768592
euro1900_Mean 0.3046623
euro1900_SD 0.4238986
logem4_Mean 4.5959838
logem4_SD 1.3033335

\(~\)

Building the second column :

\(~\)

# Computation of the second column: same statistics as the first column,
# restricted to the base sample (baseco == 1)

data_AJR_2001_sum_2 <- data_AJR_2001 %>%
  # Filter first, so that baseco never enters the summary and does not have
  # to be dropped from the result afterwards
  filter(baseco == 1) %>%
  select(
    logpgp95, loghjypl, avexpr, cons00a, cons1, democ00a, euro1900, logem4
  ) %>%
  # Rescale euro1900 by 1/100 (presumably from percent to share)
  mutate(euro1900 = euro1900 / 100) %>%
  summarize(across(
    .cols = everything(),
    .fns = list(
      Mean = ~ mean(.x, na.rm = TRUE),
      SD = ~ sd(.x, na.rm = TRUE)
    ),
    .names = "{.col}_{.fn}"
  ))

# Formatting: reshape to long format, one row per "<variable>_<statistic>"

column2 <- reshape2::melt(data_AJR_2001_sum_2) %>%
  rename(`Base sample` = value)

# Output

column2 %>%
  kbl(booktabs = TRUE) %>%
  kable_styling(latex_options = "striped", full_width = TRUE)
variable Base sample
logpgp95_Mean 8.0622369
logpgp95_SD 1.0433593
loghjypl_Mean -1.9340524
loghjypl_SD 0.9807444
avexpr_Mean 6.5156250
avexpr_SD 1.4686472
cons00a_Mean 2.2500000
cons00a_SD 2.1123126
cons1_Mean 3.4000000
cons1_SD 2.3949099
democ00a_Mean 1.6440678
democ00a_SD 3.0043802
euro1900_Mean 0.1618095
euro1900_SD 0.2553334
logem4_Mean 4.6570311
logem4_SD 1.2579836

\(~\)

Building the first table :

\(~\)

# Merging the first two columns.
# The rows are matched on the statistic name (column1$Decriptive against
# column2$variable) rather than bound by position, so a change in row order
# in either input cannot silently misalign the two samples.

Table_1 <- column1 %>%
  left_join(column2, by = c("Decriptive" = "variable")) %>%
  select(Decriptive, `Whole sample`, `Base sample`)

# Output

Table_1 %>%
  kbl(booktabs = TRUE) %>%
  kable_styling(latex_options = "striped", full_width = TRUE)
Decriptive Whole sample Base sample
logpgp95_Mean 8.3025091 8.0622369
logpgp95_SD 1.1053422 1.0433593
loghjypl_Mean -1.7311057 -1.9340524
loghjypl_SD 1.0837256 0.9807444
avexpr_Mean 7.0664914 6.5156250
avexpr_SD 1.8042868 1.4686472
cons00a_Mean 1.8571429 2.2500000
cons00a_SD 1.8231318 2.1123126
cons1_Mean 3.5909091 3.4000000
cons1_SD 2.4146886 2.3949099
democ00a_Mean 1.1494253 1.6440678
democ00a_SD 2.5768592 3.0043802
euro1900_Mean 0.3046623 0.1618095
euro1900_SD 0.4238986 0.2553334
logem4_Mean 4.5959838 4.6570311
logem4_SD 1.3033335 1.2579836

\(~\)

Building the second table (base sample by quartile of settler mortality) :

\(~\)

# Computation of the second table: mean and standard deviation of each
# variable in the base sample (baseco == 1), by quartile of logem4

data_AJR_2001_sum_3 <- data_AJR_2001 %>%
  filter(baseco == 1) %>%
  select(
    logpgp95, loghjypl, avexpr, cons00a, cons1, democ00a, euro1900, logem4
  ) %>%
  mutate(
    # Rescale euro1900 by 1/100 (presumably from percent to share)
    euro1900 = euro1900 / 100,
    # case_when() evaluates its conditions in order, so each bin only needs
    # its upper bound. The bounds are closed (<=) so that observations exactly
    # equal to a quartile cut-off, as well as the maximum, are assigned to a
    # bin; with strict inequalities on both sides they matched no condition
    # and ended up in a spurious NA group.
    quantile = case_when(
      logem4 <= quantile(logem4, probs = 0.25) ~ "q1",
      logem4 <= quantile(logem4, probs = 0.5) ~ "q2",
      logem4 <= quantile(logem4, probs = 0.75) ~ "q3",
      TRUE ~ "q4"
    )
  ) %>%
  group_by(quantile) %>%
  summarize(across(
    .cols = everything(),
    .fns = list(
      Mean = ~ mean(.x, na.rm = TRUE),
      SD = ~ sd(.x, na.rm = TRUE)
    ),
    .names = "{.col}_{.fn}"
  ))

# Formatting second table: transpose so that statistics are rows and
# quartiles are columns, then promote the first row (the quartile labels)
# to column names. Going through as.matrix() turns every cell into text.

Table_2 <- data_AJR_2001_sum_3 %>%
  as.matrix() %>%
  t() %>%
  as.data.frame() %>%
  janitor::row_to_names(row_number = 1)

# Output

Table_2 %>%
  kbl(booktabs = TRUE) %>%
  kable_styling(latex_options = "striped", full_width = TRUE)
q1 q2 q3 q4 NA
logpgp95_Mean 8.825758 8.327826 7.873064 7.178371 8.344939
logpgp95_SD 1.2659394 0.6124916 0.7218007 0.6532207 1.5445691
loghjypl_Mean -1.071928 -1.496654 -2.106015 -3.031726 -1.853427
loghjypl_SD 0.8450932 0.4305819 0.7506511 0.4815506 1.3186635
avexpr_Mean 7.828788 6.363636 5.931818 6.066667 6.068182
avexpr_SD 1.5197167 0.9777634 1.3518684 1.1959527 1.9288748
cons00a_Mean 3.923077 2.800000 1.133333 1.000000 3.666667
cons00a_SD 3.0127932 1.7808505 0.5163978 0.0000000 3.0550505
cons1_Mean 4.846154 2.733333 3.133333 3.428571 1.666667
cons1_SD 2.882307 1.980861 2.356349 2.138090 1.154701
democ00a_Mean 4.153846 2.285714 0.200000 0.000000 2.666667
democ00a_SD 4.4692683 2.8937219 0.4140393 0.0000000 2.5166115
euro1900_Mean 0.294642857 0.233333340 0.086812500 0.005333333 0.366666679
euro1900_SD 0.43043120 0.14597783 0.11991064 0.02065591 0.32145504
logem4_Mean 3.089550 4.293698 4.874785 6.190215 5.483826
logem4_SD 0.65109792 0.04729285 0.36773350 0.62147923 2.16708946

\(~\)

Merging the two tables :

\(~\)

# Merging the two tables: the quartile columns of Table_2 are appended to the
# right of Table_1. The binding is positional, so both tables must list the
# statistics in the same row order.

Table <- Table_1 %>% cbind(Table_2)

# Output

kable_styling(
  kbl(Table, booktabs = TRUE),
  latex_options = "striped",
  full_width = TRUE
)
Decriptive Whole sample Base sample q1 q2 q3 q4 NA
logpgp95_Mean logpgp95_Mean 8.3025091 8.0622369 8.825758 8.327826 7.873064 7.178371 8.344939
logpgp95_SD logpgp95_SD 1.1053422 1.0433593 1.2659394 0.6124916 0.7218007 0.6532207 1.5445691
loghjypl_Mean loghjypl_Mean -1.7311057 -1.9340524 -1.071928 -1.496654 -2.106015 -3.031726 -1.853427
loghjypl_SD loghjypl_SD 1.0837256 0.9807444 0.8450932 0.4305819 0.7506511 0.4815506 1.3186635
avexpr_Mean avexpr_Mean 7.0664914 6.5156250 7.828788 6.363636 5.931818 6.066667 6.068182
avexpr_SD avexpr_SD 1.8042868 1.4686472 1.5197167 0.9777634 1.3518684 1.1959527 1.9288748
cons00a_Mean cons00a_Mean 1.8571429 2.2500000 3.923077 2.800000 1.133333 1.000000 3.666667
cons00a_SD cons00a_SD 1.8231318 2.1123126 3.0127932 1.7808505 0.5163978 0.0000000 3.0550505
cons1_Mean cons1_Mean 3.5909091 3.4000000 4.846154 2.733333 3.133333 3.428571 1.666667
cons1_SD cons1_SD 2.4146886 2.3949099 2.882307 1.980861 2.356349 2.138090 1.154701
democ00a_Mean democ00a_Mean 1.1494253 1.6440678 4.153846 2.285714 0.200000 0.000000 2.666667
democ00a_SD democ00a_SD 2.5768592 3.0043802 4.4692683 2.8937219 0.4140393 0.0000000 2.5166115
euro1900_Mean euro1900_Mean 0.3046623 0.1618095 0.294642857 0.233333340 0.086812500 0.005333333 0.366666679
euro1900_SD euro1900_SD 0.4238986 0.2553334 0.43043120 0.14597783 0.11991064 0.02065591 0.32145504
logem4_Mean logem4_Mean 4.5959838 4.6570311 3.089550 4.293698 4.874785 6.190215 5.483826
logem4_SD logem4_SD 1.3033335 1.2579836 0.65109792 0.04729285 0.36773350 0.62147923 2.16708946
# Latex output : xtable(Table)

\(~\)

3 - We first investigate the relationship between settler mortality and current economic performance. Briefly recall the series of mechanisms linking these two variables. Then, reproduce Figure 1 in Acemoglu et al. (2001).

\(~\)

# Slicing data: keep the two plotted variables and the country short name

data_AJR_20011 <- select(data_AJR_2001, logpgp95, logem4, shortnam)

# Plotting data: scatter of logpgp95 against logem4 with a fitted linear
# regression line and its confidence band.
# NOTE(review): shortnam is selected but not mapped to any aesthetic, and the
# plot uses every row of data_AJR_2001 — confirm whether country labels and a
# restriction to the base sample were intended.

plot_ACEM_2001 <- ggplot(data_AJR_20011, aes(x = logem4, y = logpgp95)) +
  geom_point(color = "#C38D94") +
  geom_smooth(
    method = lm,
    color = "#D81E5B",
    fill = "#FFE3DC",
    se = TRUE
  ) +
  labs(x = "Log settler mortality", y = "Log GDP (1995)") +
  theme(
    panel.background = element_rect(fill = "white"),
    axis.line = element_line(size = 0.2, colour = "#2F2D2E", linetype = 1)
  ) +
  scale_color_brewer()

# Output

show(plot_ACEM_2001)

\(~\)

4 - While we know that the estimation of a linear model of economic performance as a function of the measure of the quality of institutions is likely to suffer from endogeneity issues, we still implement this approach as a first step. It is an interesting first step to better investigate the correlation between our variables and how it is affected by some control variables :

\(~\)

c) Now, reproduce the OLS estimations presented in Table 2 of Acemoglu et al. (2001), and interpret your results regarding your estimated \(\hat{\alpha}\) across estimations.

\(~\)

# OLS regressions of logpgp95 on avexpr, with and without controls.
# Columns 1, 3 and 4 use every row of data_AJR_2001; columns 2, 5 and 6 use
# the base sample (baseco == 1).

data_AJR_2001_baseco1 <- filter(data_AJR_2001, baseco == 1)

# Whole sample -- Column 1: no control
c1 <- lm(logpgp95 ~ avexpr, data = data_AJR_2001)

# Whole sample -- Column 3: latitude control
c3 <- lm(logpgp95 ~ avexpr + lat_abst, data = data_AJR_2001)

# Whole sample -- Column 4: latitude and continent dummies
c4 <- lm(
  logpgp95 ~ avexpr + lat_abst + as.factor(africa) + as.factor(asia) + as.factor(other),
  data = data_AJR_2001
)

# Base sample -- Column 2: no control
c2 <- lm(logpgp95 ~ avexpr, data = data_AJR_2001_baseco1)

# Base sample -- Column 5: latitude control
c5 <- lm(logpgp95 ~ avexpr + lat_abst, data = data_AJR_2001_baseco1)

# Base sample -- Column 6: latitude and continent dummies
c6 <- lm(
  logpgp95 ~ avexpr + lat_abst + as.factor(africa) + as.factor(asia) + as.factor(other),
  data = data_AJR_2001_baseco1
)

# stargazer(c1, c2, c3, c4, c5, c6)

\(~\)

5 - To document our IV strategy we consider the models in Equations (3) :

\(~\)

a) Estimate the models in Equations (3) on the sample of colonized countries and using latitude as a control variable.

\(~\)

# Conditioning the data set: base sample with a non-zero extmort4

data_AJR_2001_baseco1 <- filter(data_AJR_2001, baseco == 1, extmort4 != 0)

# Each regression below links one step of the chain to the next, always
# controlling for latitude (lat_abst).

# Model in Equation(4): avexpr explained by cons00a
m1 <- lm(avexpr ~ lat_abst + cons00a, data = data_AJR_2001_baseco1)

# Model in Equation(5): cons00a explained by euro1900
m2 <- lm(cons00a ~ euro1900 + lat_abst, data = data_AJR_2001_baseco1)

# Model in Equation(6): euro1900 explained by logem4
m3 <- lm(euro1900 ~ lat_abst + logem4, data = data_AJR_2001_baseco1)

# stargazer(m1, m2, m3)

\(~\)

b) Estimate the first stage of the 2-SLS approach, that is the estimation of the model in Equation (5) where you control for the latitude of country \(i\). Store your predicted values in a new variable.

\(~\)

# Conditioning the data set: base sample with a non-zero extmort4

data_AJR_2001_baseco1 <- filter(data_AJR_2001, baseco == 1, extmort4 != 0)

# First stage: avexpr regressed on the instrument logem4 and on latitude

First_stage <- lm(avexpr ~ lat_abst + logem4, data = data_AJR_2001_baseco1)

# Storing the predicted values of avexpr from the first stage

First_stage_fitted <- fitted(First_stage)

# Output: first six fitted values

kable_styling(
  kbl(head(First_stage_fitted), booktabs = TRUE),
  latex_options = "striped",
  full_width = TRUE
)
x
5.927755
7.125869
8.034964
5.943324
6.885191
6.799632
# Output: print the summary of the first-stage regression (coefficients,
# residual quantiles and fit statistics)

summary(First_stage)
## 
## Call:
## lm(formula = avexpr ~ lat_abst + logem4, data = data_AJR_2001_baseco1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7410 -0.9299  0.0393  0.8553  3.1693 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.5294     0.8123  10.500 2.67e-15 ***
## lat_abst      2.0018     1.3372   1.497 0.139546    
## logem4       -0.5103     0.1410  -3.618 0.000603 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.252 on 61 degrees of freedom
## Multiple R-squared:  0.296,  Adjusted R-squared:  0.2729 
## F-statistic: 12.82 on 2 and 61 DF,  p-value: 2.244e-05
# Latex output : stargazer(First_stage)

\(~\)

c) Estimate the second stage of the 2-SLS approach. Interpret your estimation of the \(\hat{\alpha}\) coefficient. How does it compare to the estimation in the naive approach in question 2.(c)?

\(~\)

# Merging: add the first-stage fitted values as the first column of the
# sample. The binding is positional, so First_stage_fitted must have one
# value per row of data_AJR_2001_baseco1, in the same order.

data_AJR_2001_baseco1 <- cbind(
  First_stage_fitted,
  data_AJR_2001_baseco1
)

# Second stage: logpgp95 regressed on the fitted avexpr and on latitude.
# This is a plain lm() on a generated regressor; its standard errors are the
# ordinary OLS ones.

Second_stage <- lm(
  logpgp95 ~ First_stage_fitted + lat_abst,
  data = data_AJR_2001_baseco1
)

# Output

summary(Second_stage)
## 
## Call:
## lm(formula = logpgp95 ~ First_stage_fitted + lat_abst, data = data_AJR_2001_baseco1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5396 -0.4654  0.1097  0.4632  1.5712 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.6918     0.9647   1.754   0.0845 .  
## First_stage_fitted   0.9957     0.1654   6.020 1.08e-07 ***
## lat_abst            -0.6472     0.9961  -0.650   0.5183    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7495 on 61 degrees of freedom
## Multiple R-squared:  0.5004, Adjusted R-squared:  0.484 
## F-statistic: 30.55 on 2 and 61 DF,  p-value: 6.42e-10
# Latex output : stargazer(First_stage, Second_stage)

\(~\)

6 - Replicate the estimation done in question 5 using this STATA command. What do you observe?

\(~\)

# Conditioning the data set: base sample only

data_AJR_2001_baseco1 <- filter(data_AJR_2001, baseco == 1)

# 2SLS estimation using ivreg() command.
# Left of `|`: regressors of the structural equation; right of `|`: the
# instrument set (lat_abst instruments itself, logem4 instruments avexpr).

m_iv <- ivreg(
  logpgp95 ~ lat_abst + avexpr | lat_abst + logem4,
  data = data_AJR_2001_baseco1
)

# Output

summary(m_iv)
## 
## Call:
## ivreg(formula = logpgp95 ~ lat_abst + avexpr | lat_abst + logem4, 
##     data = data_AJR_2001_baseco1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5611 -0.6557  0.0732  0.7572  1.8803 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.6918     1.2930   1.308    0.196    
## lat_abst     -0.6472     1.3351  -0.485    0.630    
## avexpr        0.9957     0.2217   4.492 3.21e-05 ***
## 
## Diagnostic tests:
##                  df1 df2 statistic  p-value    
## Weak instruments   1  61     13.09 0.000603 ***
## Wu-Hausman         1  60     18.75 5.75e-05 ***
## Sargan             0  NA        NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.005 on 61 degrees of freedom
## Multiple R-Squared: 0.1025,  Adjusted R-squared: 0.07305 
## Wald test: 17.01 on 2 and 61 DF,  p-value: 1.351e-06
# Latex output : stargazer(m_iv)

\(~\)

7 - Add these variables as control variables in your 2SLS estimation done in question 6, and check whether the addition of these variables affects your estimates.

\(~\)

# Conditioning the data set: base sample, and base sample restricted to
# f_brit == 1 (presumably former British colonies)

data_AJR_2001_baseco1 <- data_AJR_2001 %>% filter(baseco == 1)
data_AJR_2001_baseco12 <- data_AJR_2001 %>% filter(baseco == 1 & f_brit == 1)

# 2SLS estimation using ivreg() command.
# In every model avexpr is instrumented by logem4; all other regressors are
# repeated on the right of `|` so that they instrument themselves.

### Table 1 : controlling for latitude, colonial origin, legal origin and religion

# Column 1: f_brit and f_french dummies
m_iv_1 <- ivreg(
  logpgp95 ~ avexpr + lat_abst + as.factor(f_brit) + as.factor(f_french) |
    lat_abst + logem4 + as.factor(f_french) + as.factor(f_brit),
  data = data_AJR_2001_baseco1
)

# Column 2: sub-sample with f_brit == 1
m_iv_2 <- ivreg(
  logpgp95 ~ avexpr + lat_abst |
    lat_abst + logem4,
  data = data_AJR_2001_baseco12
)

# Column 3: sjlofr dummy
m_iv_3 <- ivreg(
  logpgp95 ~ avexpr + lat_abst + as.factor(sjlofr) |
    lat_abst + logem4 + as.factor(sjlofr),
  data = data_AJR_2001_baseco1
)

# Column 4: religion variables (catho80, muslim80, no_cpm80)
m_iv_4 <- ivreg(
  logpgp95 ~ avexpr + lat_abst + catho80 + muslim80 + no_cpm80 |
    lat_abst + logem4 + catho80 + muslim80 + no_cpm80,
  data = data_AJR_2001_baseco1
)

# Column 5: multiple controls at once (f_french, sjlofr and the religion
# variables). This column used to be an exact copy of column 4, so the table
# reported the same model twice.
# NOTE(review): specification chosen to mirror the "multiple controls" column
# of the replicated table -- confirm against the assignment.
m_iv_5 <- ivreg(
  logpgp95 ~ avexpr + lat_abst + as.factor(f_french) + as.factor(sjlofr) +
    catho80 + muslim80 + no_cpm80 |
    lat_abst + logem4 + as.factor(f_french) + as.factor(sjlofr) +
    catho80 + muslim80 + no_cpm80,
  data = data_AJR_2001_baseco1
)

# Latex output : stargazer(m_iv_1, m_iv_2, m_iv_3, m_iv_4, m_iv_5)

### Table 2
# Same 2SLS specification (avexpr instrumented by logem4, latitude control),
# adding one extra exogenous control per column.

# Column 11: control for malfal94
m_iv_11 <- ivreg(
  logpgp95 ~ avexpr + lat_abst + malfal94 | lat_abst + logem4 + malfal94,
  data = data_AJR_2001_baseco1
)

# Column 12: control for yellow
m_iv_12 <- ivreg(
  logpgp95 ~ avexpr + lat_abst + yellow | lat_abst + logem4 + yellow,
  data = data_AJR_2001_baseco1
)

# Column 13: control for leb95
m_iv_13 <- ivreg(
  logpgp95 ~ avexpr + lat_abst + leb95 | lat_abst + logem4 + leb95,
  data = data_AJR_2001_baseco1
)


# Latex output : stargazer(m_iv_11, m_iv_12, m_iv_13)

\(~\)

8 - Replicate the estimation done in question 6 when excluding these countries (identified with the dummy variable rich4). How does it affect your results?

\(~\)

# Conditioning the data set: base sample without the rows flagged by rich4

data_AJR_2001_baseco13 <- filter(data_AJR_2001, baseco == 1, rich4 == 0)

# 2SLS estimation using ivreg() command: same specification as m_iv
# (avexpr instrumented by logem4, latitude control) on the restricted sample

m_iv_poor <- ivreg(
  logpgp95 ~ lat_abst + avexpr | lat_abst + logem4,
  data = data_AJR_2001_baseco13
)

# Output

summary(m_iv_poor)
## 
## Call:
## ivreg(formula = logpgp95 ~ lat_abst + avexpr | lat_abst + logem4, 
##     data = data_AJR_2001_baseco13)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.05145 -0.70560 -0.07305  0.90445  2.48150 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   0.1442     2.1835   0.066  0.94757   
## lat_abst      0.9385     1.4631   0.641  0.52378   
## avexpr        1.2118     0.3543   3.420  0.00116 **
## 
## Diagnostic tests:
##                  df1 df2 statistic  p-value    
## Weak instruments   1  57     7.826  0.00701 ** 
## Wu-Hausman         1  56    17.633 9.69e-05 ***
## Sargan             0  NA        NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.178 on 57 degrees of freedom
## Multiple R-Squared: -0.4918, Adjusted R-squared: -0.5442 
## Wald test: 7.252 on 2 and 57 DF,  p-value: 0.001564
# Latex output : stargazer(m_iv, m_iv_poor)