# Clear the workplace
rm(list = ls())
# Working directory
#setwd("C:/Users/s15885/OneDrive - Norges Handelshøyskole/Desktop/Econometrics/Lab2_update")
setwd("/Users/cepremap/Dropbox/Mac/Documents/For_Ire")
# Required packages
required <- c("haven", "dplyr", "ggplot2", "AER", "stargazer", "ivreg", "lmtest", "tidyverse", "kableExtra", "sandwich")
# Install missing packages
new.packages <- required[!(required %in% installed.packages()[, "Package"])]
if (length(new.packages)) {
install.packages(new.packages, dependencies = TRUE)
}
# Load packages
invisible(lapply(required, library, character.only = TRUE))
# Clean environment
rm(new.packages)
rm(required)
# Assuming 'cig85_95.dta' is in your working directory
mydata <- read_dta("/Users/cepremap/Dropbox/Mac/Documents/For_Ire/cig85_95.dta")
# Generate additional variables: THEY ARE ALREADY CREATED IN THE DATA.
mydata <- mydata %>%
mutate(lpackpc = log(packpc),
lincome = log(income / (pop * cpi)),
lavgprs = log(avgprs / cpi),
rtax = tax / cpi,
rtaxs = taxs / cpi,
rtaxso = rtaxs - rtax)
# Analyze the mean values by year
mean_values_by_year <- mydata %>%
group_by(year) %>%
summarize(mean_packpc = mean(packpc, na.rm = TRUE),
mean_income = mean(income, na.rm = TRUE),
mean_tax = mean(tax, na.rm = TRUE),
mean_avgprs = mean(avgprs, na.rm = TRUE))
# Melt the data to long format for ggplot
mean_values_long <- mean_values_by_year %>%
gather(key = "variable", value = "value", -year)
# Plotting with facet_wrap for separate plots by variable
ggplot(mean_values_long, aes(x = year, y = value)) +
geom_line() +
geom_point() +
labs(title = "Mean values by year",
x = "\nYear",
y = "Mean value\n") +
facet_wrap(~ variable, scales = "free_y") + # Separate plots for each variable
theme_minimal()
Mean value of interest decreasing for quantity, but increasing for price, taxes and income.
# Analyze the mean values by state
mean_values_by_state <- mydata %>%
group_by(state) %>%
summarize(mean_packpc = mean(packpc, na.rm = TRUE),
mean_income = mean(income, na.rm = TRUE),
mean_tax = mean(tax, na.rm = TRUE),
mean_avgprs = mean(avgprs, na.rm = TRUE))
# Melt the data to long format for ggplot
mean_values_long <- mean_values_by_state %>%
gather(key = "variable", value = "value", -state)
# Plotting with facet_wrap for separate plots
ggplot(mean_values_long, aes(x = state, y = value)) +
geom_line() +
geom_point() +
labs(title = "Mean values by state",
x = "\nState",
y = "Mean value\n") +
facet_wrap(~ variable, scales = "free_y") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
There is an important heterogeneity across states, as can be seen from sd and differences between min and max.
# Filter data for 1995
data_1995 <- filter(mydata, year == 1995)
# First stage: regressing lavgprs on rtaxso
first_stage <- lm(lavgprs ~ rtaxso, data = data_1995)
# Save the fitted values for the second stage
data_1995$lavgprs_fitted <- predict(first_stage)
# Second stage: regressing lpackpc on the fitted values from the first stage
second_stage <- ivreg(lpackpc ~ lavgprs | rtaxso, data = data_1995)
# Extract estimated coefficient and its standard error
estimated_coefficient <- summary(second_stage)$coefficients["lavgprs", "Estimate"]
standard_error <- summary(second_stage)$coefficients["lavgprs", "Std. Error"]
# HTML output
stargazer(first_stage, second_stage,
type = "html",
title = "Two-Stage Least Squares Regression Results",
dep.var.labels = c("First Stage: lavgprs", "Second Stage: lpackpc"),
covariate.labels = c("rtaxso (First Stage)", "lavgprs (Second Stage)"),
column.labels = c("First Stage", "Second Stage"),
keep.stat = c("n", "rsq", "adj.rsq"), # Show sample size & R²
out = "qbi.html")
| Dependent variable: | ||
| First Stage: lavgprs | Second Stage: lpackpc | |
| OLS | instrumental | |
| variable | ||
| First Stage | Second Stage | |
| (1) | (2) | |
| rtaxso (First Stage) | 0.031*** | |
| (0.005) | ||
| lavgprs (Second Stage) | -1.084*** | |
| (0.317) | ||
| Constant | 4.617*** | 9.720*** |
| (0.029) | (1.514) | |
| Observations | 48 | 48 |
| R2 | 0.471 | 0.401 |
| Adjusted R2 | 0.459 | 0.388 |
| Note: | p<0.1; p<0.05; p<0.01 | |
1% increase in the price reduce the demand for cigarettes by 1.08%
# Calculate the t-value for the hypothesis test
t_value <- (estimated_coefficient + 1) / standard_error # Note the +1 since we are testing against -1
# Calculate the p-value
p_value <- 2 * pt(-abs(t_value), df = df.residual(second_stage)) # Two-tailed test
print(t_value)
[1] -0.2640017
print(p_value)
[1] 0.7929578 #### NB: the Stata solutions perform a Wald test here
We fail to reject the null (slope coeff equal to -1).
The instrument is relevant, as can be seen from 1st stage statistically significant. We can never conclude for exogeneity, but here it seems unlikly. Indeed, taxes are decided by the state, whom is likely to take into account the state of the economy to take such decision.
# OLS
ols <- lm(lpackpc ~ lavgprs, data = data_1995)
# HTML output
stargazer(ols, second_stage, type = "html", out = "qb.html")
| Dependent variable: | ||
| lpackpc | ||
| OLS | instrumental | |
| variable | ||
| (1) | (2) | |
| lavgprs | -1.213*** | -1.084*** |
| (0.216) | (0.317) | |
| Constant | 10.339*** | 9.720*** |
| (1.035) | (1.514) | |
| Observations | 48 | 48 |
| R2 | 0.406 | 0.401 |
| Adjusted R2 | 0.393 | 0.388 |
| Residual Std. Error (df = 46) | 0.190 | 0.190 |
| F Statistic | 31.409*** (df = 1; 46) | |
| Note: | p<0.1; p<0.05; p<0.01 | |
No, taxes level are directly correlated with income level
# First-stage regression (OLS)
iv_q1c_1st <- lm(lavgprs ~ lincome + rtaxso, data = data_1995)
# Second-stage regression (IV)
iv_q1c_2nd <- ivreg(lpackpc ~ lincome + lavgprs | lincome + rtaxso, data = data_1995)
# Export results
stargazer(iv_q1c_1st, iv_q1c_2nd,
type = "html", out = "q1c.html",
title = "2SLS Q1c - Regression Results",
dep.var.labels = c("First Stage: lavgprs", "Second Stage: lpacpc"),
covariate.labels = c("lincome", "rtaxso", "lavgprs"),
keep.stat = c("n", "rsq", "adj.rsq"))
| Dependent variable: | ||
| First Stage: lavgprs | Second Stage: lpacpc | |
| OLS | instrumental | |
| variable | ||
| (1) | (2) | |
| lincome | 0.389*** | 0.215 |
| (0.085) | (0.269) | |
| rtaxso | 0.027*** | |
| (0.004) | ||
| lavgprs | -1.143*** | |
| (0.359) | ||
| Constant | 3.591*** | 9.431*** |
| (0.226) | (1.358) | |
| Observations | 48 | 48 |
| R2 | 0.639 | 0.419 |
| Adjusted R2 | 0.623 | 0.393 |
| Note: | p<0.1; p<0.05; p<0.01 | |
# First-stage regressions
iv_q1d_1st_rtaxso <- lm(lavgprs ~ lincome + rtaxso, data = data_1995)
iv_q1d_1st_rtax <- lm(lavgprs ~ lincome + rtax, data = data_1995)
# Second-stage IV regression using both instruments
iv_q1d_2nd <- ivreg(lpackpc ~ lincome + lavgprs | lincome + rtaxso + rtax, data = data_1995)
# Export results (similar to esttab)
stargazer(iv_q1d_1st_rtaxso, iv_q1d_1st_rtax, iv_q1d_2nd,
type = "html", out = "q1di.html",
title = "2SLS Q1d - Regression Results",
dep.var.labels = c("First Stage with rtaxso", "First Stage with rtax", "Second Stage"),
covariate.labels = c("lincome", "rtaxso", "rtax", "lavgprs"),
keep.stat = c("n", "rsq", "adj.rsq"))
| Dependent variable: | |||
| First Stage with rtaxso | First Stage with rtax | ||
| OLS | instrumental | ||
| variable | |||
| (1) | (2) | (3) | |
| lincome | 0.389*** | 0.080 | 0.280 |
| (0.085) | (0.050) | (0.239) | |
| rtaxso | 0.027*** | ||
| (0.004) | |||
| rtax | 0.011*** | ||
| (0.001) | |||
| lavgprs | -1.277*** | ||
| (0.263) | |||
| Constant | 3.591*** | 4.171*** | 9.895*** |
| (0.226) | (0.125) | (1.059) | |
| Observations | 48 | 48 | 48 |
| R2 | 0.639 | 0.900 | 0.429 |
| Adjusted R2 | 0.623 | 0.896 | 0.404 |
| Note: | p<0.1; p<0.05; p<0.01 | ||
summary(iv_q1d_2nd)
##
## Call:
## ivreg(formula = lpackpc ~ lincome + lavgprs | lincome + rtaxso +
## rtax, data = data_1995)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6006931 -0.0862222 -0.0009999 0.1164699 0.3734227
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.8950 1.0586 9.348 4.12e-12 ***
## lincome 0.2804 0.2386 1.175 0.246
## lavgprs -1.2774 0.2632 -4.853 1.50e-05 ***
##
## Diagnostic tests:
## df1 df2 statistic p-value
## Weak instruments 2 44 244.734 <2e-16 ***
## Wu-Hausman 1 44 3.068 0.0868 .
## Sargan 1 NA 0.333 0.5641
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1879 on 45 degrees of freedom
## Multiple R-Squared: 0.4294, Adjusted R-squared: 0.4041
## Wald test: 13.28 on 2 and 45 DF, p-value: 2.931e-05
sd reduce with both instruments compared to single one
1% increase in the price reduce the demand for cigarettes by 1.277%
# Extract coefficient and standard error
beta_1 <- coef(summary(iv_q1d_2nd))["lavgprs", "Estimate"]
se_beta_1 <- coef(summary(iv_q1d_2nd))["lavgprs", "Std. Error"]
# Compute t-value
t_value <- (beta_1 + 1) / se_beta_1 # Testing against -1
# Compute two-tailed p-value
p_value <- 2 * pt(-abs(t_value), df = df.residual(iv_q1d_2nd))
print(t_value)
[1] -1.054049
print(p_value)
[1] 0.2974874
No, because of the exogeneity assumption is not credible
# Assuming 'brumm.dta' is in your working directory
brumm_data <- read_dta("/Users/cepremap/Dropbox/Mac/Documents/For_Ire/brumm.dta")
# Least squares with robust standard errors
model_brumm <- lm(inflat ~ money + output, data = brumm_data)
# Use vcovHC from the 'sandwich' package for robust standard errors
library(sandwich)
coeftest(model_brumm, vcov = vcovHC(model_brumm, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.234214 0.619615 -0.3780 0.7065
## money 1.033131 0.023694 43.6027 < 2.2e-16 ***
## output -1.662006 0.175914 -9.4478 2.706e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Strong joing hypothesis
# Additional package
library(car)
# Strong joint hypothesis: β1 = 0, β2 = 1, β3 = -1
strong_hypothesis <- linearHypothesis(model_brumm, c("(Intercept) = 0", "money = 1", "output = -1"), vcov. = vcovHC(model_brumm, type = "HC1"))
print(strong_hypothesis)
##
## Linear hypothesis test:
## (Intercept) = 0
## money = 1
## output = - 1
##
## Model 1: restricted model
## Model 2: inflat ~ money + output
##
## Note: Coefficient covariance matrix supplied.
##
## Res.Df Df F Pr(>F)
## 1 76
## 2 73 3 12.267 1.383e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Weak joint hypothesis: β2 = 1, β3 = -1
weak_hypothesis <- linearHypothesis(model_brumm, c("money = 1", "output = -1"), vcov. = vcovHC(model_brumm, type = "HC1"))
print(weak_hypothesis)
##
## Linear hypothesis test:
## money = 1
## output = - 1
##
## Model 1: restricted model
## Model 2: inflat ~ money + output
##
## Note: Coefficient covariance matrix supplied.
##
## Res.Df Df F Pr(>F)
## 1 75
## 2 73 2 7.1276 0.001487 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("Strong Hypothesis - F-Statistic:", strong_hypothesis$F[2], " P-Value:", strong_hypothesis$`Pr(>F)`[2], "\n")
## Strong Hypothesis - F-Statistic: 12.26724 P-Value: 1.382844e-06
cat("Weak Hypothesis - F-Statistic:", weak_hypothesis$F[2], " P-Value:", weak_hypothesis$`Pr(>F)`[2], "\n")
## Weak Hypothesis - F-Statistic: 7.12761 P-Value: 0.001487297
Testing the same hypothesis using robust standard errors gives an F statistic of 12.27 and a p value of 0.0000. The test statistic has an approximate F-distribution in large samples if the null hypothesis is true. There are J = 3 hypotheses, the numerator degrees of freedom, and N – K = 73, the denominator degrees of freedom. Since the p value is less than the level of significance, 0.05, we reject the strong hypothesis.
The F-value for the weak hypothesis is 7.127. The test statistic has an approximate F-distribution if the null hypothesis is true. There are J = 2 hypotheses, the numerator degrees of freedom, and N – K = 73, the denominator degrees of freedom. The p-value is 0.002. Thus, we reject the weak hypothesis.
N.B. The F statistics might be slightly different, the important thing is that they lead to the same conclusion.
Brumm argues that OUTPUT may be endogenous. Four instrumental variables
are proposed, INITIAL = initial level of real GDP, SCHOOL = a measure of
the population’s educational attainment, INV = avr. investment share of
GDP, and POPRATE = avr population growth rate. ##### c) Use the proposed
instruments, obtain instrumental variables (2SLS) estimates of the
INFLAT equation (i.e. reproduce the results reported in Table 2, column
Y)
# IV Regression
iv_model <- ivreg(inflat ~ money + output | money + initial + school + inv + poprate,
data = brumm_data)
# Display summary with diagnostics
summary(iv_model, diagnostics = TRUE)
Call: ivreg(formula = inflat ~ money + output | money + initial + school + inv + poprate, data = brumm_data)
Residuals: Min 1Q Median 3Q Max -24.633 -1.487 0.204 1.937 9.498
Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.093985 1.858208 -0.589 0.5579
money 1.035059 0.009773 105.914 <2e-16 ** output -1.394200
0.551503 -2.528 0.0136
Diagnostic tests: df1 df2 statistic p-value
Weak instruments 4 70 4.642 0.00221 ** Wu-Hausman 1 72 0.300
0.58551
Sargan 3 NA 2.455 0.48344
— Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05
‘.’ 0.1 ’ ’ 1
Residual standard error: 4.344 on 73 degrees of freedom Multiple R-Squared: 0.9947, Adjusted R-squared: 0.9946 Wald test: 6852 on 2 and 73 DF, p-value: < 2.2e-16
# Capture stargazer output and print it correctly
stargazer(iv_model,
type = "html",
title = "Inflation and Output Growth Regressions: 2SLS Results",
se = list(sqrt(diag(vcovHC(iv_model, type = "HC1")))),
digits = 3,
star.cutoffs = c(0.10, 0.05, 0.01))
| Dependent variable: | |
| inflat | |
| money | 1.035*** |
| (0.025) | |
| output | -1.394*** |
| (0.334) | |
| Constant | -1.094 |
| (1.219) | |
| Observations | 76 |
| R2 | 0.995 |
| Adjusted R2 | 0.995 |
| Residual Std. Error | 4.344 (df = 73) |
| Note: | p<0.1; p<0.05; p<0.01 |
#####R and stata have a bit different coefficents
Here we look at the Wu-Hausman result from the model in point c, we get stas: 0.3 and p-value: 0.58. Test is not significant, so according to Durbin-Wu-Hausman endogeneity is not a problem here, and we can rely on OLS. This means the whole point of Brumm article seems to be weak when we apply the test procedure taught in the lecture.
N.B. the tandard implementation of the Hausman test assumes homoskedasticity (no robust standard errors), it is possible to perform it manually with heteroskedasticity but it is not asked here.