This week, we will be learning how to work with panel data. For this tutorial, we will be using the seatbelts dataset from the AER package.
library(AER)
library(ggplot2)
library(dplyr)
library(plm)
library(sandwich)
library(lmtest)
library(tidyr)
data("USSeatBelts")
data("PSID7682")
data <- as.data.frame(USSeatBelts)
#histograms
data_small <- data %>%
dplyr::select(-c(state, speed65, speed70, drinkage, alcohol, enforce)) %>%
drop_na() %>%
mutate(year = as.numeric(as.character(year)))
library(tidyr)
ggplot(gather(data_small), aes(value)) +
geom_histogram(bins = 30, fill = "skyblue") +
facet_wrap(~key, scales = 'free')+
theme_dark()
Why is the panel not balanced? The issue here is that seatbelt is missing a lot of data (about 200 observations). If we want a balanced panel, we can either skip using seatbelt in our regression, or find another variable with more coverage.
#look at how fatalities correlate with other variables
data_small %>%
mutate(id = row_number()) %>%
gather(variable, value, -fatalities, -id) %>%
ggplot(aes(y = fatalities, x = value))+
geom_point(color = "skyblue")+
geom_smooth(method = "lm", color = "black")+
facet_wrap(~variable, scales = "free_x")+
theme_dark()+
labs(y = "Total Patents", x = "Variable")
#we'll make a simple linear regression model first
model1 <- lm(fatalities ~ log(miles) + income + age + speed65 + alcohol, data = data)
summary(model1)
Call:
lm(formula = fatalities ~ log(miles) + income + age + speed65 +
alcohol, data = data)
Residuals:
Min 1Q Median 3Q Max
-0.0115376 -0.0027850 -0.0003352 0.0022816 0.0197495
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.242e-02 3.494e-03 15.001 < 2e-16 ***
log(miles) -1.304e-04 1.578e-04 -0.827 0.408709
income -8.197e-07 3.770e-08 -21.743 < 2e-16 ***
age -4.146e-04 1.030e-04 -4.024 6.29e-05 ***
speed65yes -1.116e-04 3.555e-04 -0.314 0.753735
alcoholyes -1.848e-03 5.006e-04 -3.691 0.000239 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.004315 on 759 degrees of freedom
Multiple R-squared: 0.5143, Adjusted R-squared: 0.5111
F-statistic: 160.7 on 5 and 759 DF, p-value: < 2.2e-16
plot(model1)
model2 <- lm(fatalities ~ log(miles) + income + age + speed65 + alcohol + as.factor(year) + as.factor(state), data = data)
summary(model2)
Call:
lm(formula = fatalities ~ log(miles) + income + age + speed65 +
alcohol + as.factor(year) + as.factor(state), data = data)
Residuals:
Min 1Q Median 3Q Max
-0.0082202 -0.0011065 0.0000759 0.0010584 0.0119474
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.310e-01 1.585e-02 8.264 7.13e-16
log(miles) -1.377e-02 1.351e-03 -10.192 < 2e-16
income 5.915e-07 9.745e-08 6.070 2.11e-09
age 1.024e-04 3.040e-04 0.337 0.736409
speed65yes 3.129e-04 4.155e-04 0.753 0.451626
alcoholyes -6.239e-04 3.775e-04 -1.653 0.098787
as.factor(year)1984 -9.013e-04 4.258e-04 -2.117 0.034653
as.factor(year)1985 -2.107e-03 4.790e-04 -4.398 1.26e-05
as.factor(year)1986 -1.748e-03 5.444e-04 -3.212 0.001379
as.factor(year)1987 -3.005e-03 7.566e-04 -3.971 7.89e-05
as.factor(year)1988 -3.626e-03 8.781e-04 -4.129 4.08e-05
as.factor(year)1989 -5.642e-03 1.002e-03 -5.632 2.59e-08
as.factor(year)1990 -6.614e-03 1.127e-03 -5.869 6.77e-09
as.factor(year)1991 -8.193e-03 1.194e-03 -6.865 1.48e-11
as.factor(year)1992 -9.739e-03 1.315e-03 -7.408 3.73e-13
as.factor(year)1993 -9.972e-03 1.409e-03 -7.076 3.64e-12
as.factor(year)1994 -1.049e-02 1.521e-03 -6.896 1.21e-11
as.factor(year)1995 -1.053e-02 1.651e-03 -6.375 3.33e-10
as.factor(year)1996 -1.130e-02 1.786e-03 -6.328 4.46e-10
as.factor(year)1997 -1.166e-02 1.920e-03 -6.076 2.04e-09
as.factor(state)AL 3.466e-02 3.899e-03 8.890 < 2e-16
as.factor(state)AR 2.876e-02 3.273e-03 8.786 < 2e-16
as.factor(state)AZ 3.331e-02 3.509e-03 9.493 < 2e-16
as.factor(state)CA 5.033e-02 6.027e-03 8.350 3.69e-16
as.factor(state)CO 2.168e-02 3.301e-03 6.567 1.00e-10
as.factor(state)CT 1.144e-02 3.914e-03 2.922 0.003590
as.factor(state)DC -1.407e-02 2.393e-03 -5.878 6.45e-09
as.factor(state)DE -3.588e-04 2.122e-03 -0.169 0.865796
as.factor(state)FL 4.605e-02 5.738e-03 8.024 4.35e-15
as.factor(state)GA 3.673e-02 4.309e-03 8.524 < 2e-16
as.factor(state)HI 1.359e-03 2.016e-03 0.674 0.500449
as.factor(state)IA 2.072e-02 3.477e-03 5.960 4.01e-09
as.factor(state)ID 1.571e-02 1.914e-03 8.205 1.12e-15
as.factor(state)IL 3.450e-02 4.810e-03 7.173 1.88e-12
as.factor(state)IN 3.030e-02 4.184e-03 7.241 1.18e-12
as.factor(state)KS 1.967e-02 3.256e-03 6.042 2.48e-09
as.factor(state)KY 3.035e-02 3.649e-03 8.315 4.82e-16
as.factor(state)LA 3.301e-02 3.359e-03 9.828 < 2e-16
as.factor(state)MA 1.885e-02 4.377e-03 4.308 1.89e-05
as.factor(state)MD 2.234e-02 3.911e-03 5.712 1.65e-08
as.factor(state)ME 9.358e-03 2.695e-03 3.472 0.000549
as.factor(state)MI 3.506e-02 4.661e-03 7.522 1.68e-13
as.factor(state)MN 2.075e-02 3.807e-03 5.452 6.94e-08
as.factor(state)MO 3.143e-02 4.238e-03 7.415 3.54e-13
as.factor(state)MS 3.497e-02 2.979e-03 11.741 < 2e-16
as.factor(state)MT 1.328e-02 2.165e-03 6.134 1.44e-09
as.factor(state)NC 3.763e-02 4.400e-03 8.553 < 2e-16
as.factor(state)ND -1.696e-03 1.946e-03 -0.871 0.383848
as.factor(state)NE 1.233e-02 2.752e-03 4.483 8.63e-06
as.factor(state)NH 2.500e-03 2.356e-03 1.061 0.289097
as.factor(state)NJ 2.367e-02 4.746e-03 4.989 7.69e-07
as.factor(state)NM 2.787e-02 2.430e-03 11.468 < 2e-16
as.factor(state)NV 1.548e-02 2.335e-03 6.631 6.71e-11
as.factor(state)NY 3.668e-02 5.330e-03 6.882 1.32e-11
as.factor(state)OH 3.581e-02 4.906e-03 7.299 7.92e-13
as.factor(state)OK 2.667e-02 3.668e-03 7.272 9.55e-13
as.factor(state)OR 2.376e-02 3.529e-03 6.733 3.49e-11
as.factor(state)PA 3.667e-02 5.192e-03 7.061 4.01e-12
as.factor(state)RI -4.727e-03 2.537e-03 -1.863 0.062876
as.factor(state)SC 3.477e-02 3.447e-03 10.086 < 2e-16
as.factor(state)SD 5.895e-03 2.049e-03 2.877 0.004141
as.factor(state)TN 3.528e-02 4.099e-03 8.607 < 2e-16
as.factor(state)TX 4.796e-02 5.318e-03 9.018 < 2e-16
as.factor(state)UT 1.728e-02 1.937e-03 8.922 < 2e-16
as.factor(state)VA 2.822e-02 4.311e-03 6.546 1.15e-10
as.factor(state)VT -3.800e-04 1.923e-03 -0.198 0.843370
as.factor(state)WA 2.399e-02 3.923e-03 6.115 1.61e-09
as.factor(state)WI 2.568e-02 4.021e-03 6.387 3.10e-10
as.factor(state)WV 2.399e-02 3.097e-03 7.744 3.41e-14
as.factor(state)WY 5.525e-03 1.478e-03 3.740 0.000200
(Intercept) ***
log(miles) ***
income ***
age
speed65yes
alcoholyes .
as.factor(year)1984 *
as.factor(year)1985 ***
as.factor(year)1986 **
as.factor(year)1987 ***
as.factor(year)1988 ***
as.factor(year)1989 ***
as.factor(year)1990 ***
as.factor(year)1991 ***
as.factor(year)1992 ***
as.factor(year)1993 ***
as.factor(year)1994 ***
as.factor(year)1995 ***
as.factor(year)1996 ***
as.factor(year)1997 ***
as.factor(state)AL ***
as.factor(state)AR ***
as.factor(state)AZ ***
as.factor(state)CA ***
as.factor(state)CO ***
as.factor(state)CT **
as.factor(state)DC ***
as.factor(state)DE
as.factor(state)FL ***
as.factor(state)GA ***
as.factor(state)HI
as.factor(state)IA ***
as.factor(state)ID ***
as.factor(state)IL ***
as.factor(state)IN ***
as.factor(state)KS ***
as.factor(state)KY ***
as.factor(state)LA ***
as.factor(state)MA ***
as.factor(state)MD ***
as.factor(state)ME ***
as.factor(state)MI ***
as.factor(state)MN ***
as.factor(state)MO ***
as.factor(state)MS ***
as.factor(state)MT ***
as.factor(state)NC ***
as.factor(state)ND
as.factor(state)NE ***
as.factor(state)NH
as.factor(state)NJ ***
as.factor(state)NM ***
as.factor(state)NV ***
as.factor(state)NY ***
as.factor(state)OH ***
as.factor(state)OK ***
as.factor(state)OR ***
as.factor(state)PA ***
as.factor(state)RI .
as.factor(state)SC ***
as.factor(state)SD **
as.factor(state)TN ***
as.factor(state)TX ***
as.factor(state)UT ***
as.factor(state)VA ***
as.factor(state)VT
as.factor(state)WA ***
as.factor(state)WI ***
as.factor(state)WV ***
as.factor(state)WY ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.002013 on 695 degrees of freedom
Multiple R-squared: 0.9033, Adjusted R-squared: 0.8936
F-statistic: 94.04 on 69 and 695 DF, p-value: < 2.2e-16
plot(model2)
model3 <- plm(fatalities ~ log(miles) + income + age + speed65 + alcohol,
data = data,
effect = "twoways",
index = c("state", "year"),
model = "within")
summary(model3)
Twoways effects Within Model
Call:
plm(formula = fatalities ~ log(miles) + income + age + speed65 +
alcohol, data = data, effect = "twoways", model = "within",
index = c("state", "year"))
Balanced Panel: n = 51, T = 15, N = 765
Residuals:
Min. 1st Qu. Median 3rd Qu. Max.
-8.2202e-03 -1.1065e-03 7.5899e-05 1.0584e-03 1.1947e-02
Coefficients:
Estimate Std. Error t-value Pr(>|t|)
log(miles) -1.3773e-02 1.3513e-03 -10.1924 < 2.2e-16 ***
income 5.9150e-07 9.7454e-08 6.0696 2.109e-09 ***
age 1.0237e-04 3.0399e-04 0.3367 0.73641
speed65yes 3.1289e-04 4.1546e-04 0.7531 0.45163
alcoholyes -6.2392e-04 3.7745e-04 -1.6530 0.09879 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 0.0035702
Residual Sum of Squares: 0.0028151
R-Squared: 0.21149
Adj. R-Squared: 0.13321
F-statistic: 37.282 on 5 and 695 DF, p-value: < 2.22e-16
coeftest(model3, vcov. = vcovHC, type = "HC1")
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
log(miles) -1.3773e-02 2.6170e-03 -5.2629 1.892e-07 ***
income 5.9150e-07 2.6606e-07 2.2232 0.02652 *
age 1.0237e-04 5.1695e-04 0.1980 0.84309
speed65yes 3.1289e-04 9.5179e-04 0.3287 0.74245
alcoholyes -6.2392e-04 5.1009e-04 -1.2231 0.22169
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
summary(model3)
Twoways effects Within Model
Call:
plm(formula = fatalities ~ log(miles) + income + age + speed65 +
alcohol, data = data, effect = "twoways", model = "within",
index = c("state", "year"))
Balanced Panel: n = 51, T = 15, N = 765
Residuals:
Min. 1st Qu. Median 3rd Qu. Max.
-8.2202e-03 -1.1065e-03 7.5899e-05 1.0584e-03 1.1947e-02
Coefficients:
Estimate Std. Error t-value Pr(>|t|)
log(miles) -1.3773e-02 1.3513e-03 -10.1924 < 2.2e-16 ***
income 5.9150e-07 9.7454e-08 6.0696 2.109e-09 ***
age 1.0237e-04 3.0399e-04 0.3367 0.73641
speed65yes 3.1289e-04 4.1546e-04 0.7531 0.45163
alcoholyes -6.2392e-04 3.7745e-04 -1.6530 0.09879 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 0.0035702
Residual Sum of Squares: 0.0028151
R-Squared: 0.21149
Adj. R-Squared: 0.13321
F-statistic: 37.282 on 5 and 695 DF, p-value: < 2.22e-16
#what if we still want the fixed effects? We can print them out. Here, I'm printing them out as deviations from the overall mean:
fixef(model3, type = "dmean")
AK AL AR AZ CA
-2.2374e-02 1.2289e-02 6.3849e-03 1.0935e-02 2.7951e-02
CO CT DC DE FL
-6.9419e-04 -1.0938e-02 -3.6439e-02 -2.2733e-02 2.3674e-02
GA HI IA ID IL
1.4357e-02 -2.1015e-02 -1.6514e-03 -6.6675e-03 1.2129e-02
IN KS KY LA MA
7.9230e-03 -2.7008e-03 7.9715e-03 1.0640e-02 -3.5202e-03
MD ME MI MN MO
-3.1272e-05 -1.3016e-02 1.2688e-02 -1.6212e-03 9.0526e-03
MS MT NC ND NE
1.2598e-02 -9.0916e-03 1.5261e-02 -2.4070e-02 -1.0039e-02
NH NJ NM NV NY
-1.9874e-02 1.3005e-03 5.4981e-03 -6.8902e-03 1.4307e-02
OH OK OR PA RI
1.3438e-02 4.2996e-03 1.3849e-03 1.4292e-02 -2.7101e-02
SC SD TN TX UT
1.2392e-02 -1.6479e-02 1.2910e-02 2.5583e-02 -5.0917e-03
VA VT WA WI WV
5.8458e-03 -2.2754e-02 1.6150e-03 3.3084e-03 1.6132e-03
WY
-1.6849e-02
#are the fixed effects signficant?
#We can use PLM test to run a Lagrange multiplier test which tests whether the two way fixed effects are significant
#null: no effects are significant
#alternative: the effects are significant
plmtest(model3, effect="twoways")
Lagrange Multiplier Test - two-ways effects (Honda)
data: fatalities ~ log(miles) + income + age + speed65 + alcohol
normal = 34.031, p-value < 2.2e-16
alternative hypothesis: significant effects
model4 <- plm(fatalities ~ log(miles) + income + age + speed65 + alcohol + seatbelt,
data = data,
index = c("state", "year"),
effect = "twoways",
model = "within")
coeftest(model4, vcov. = vcovHC, type = "HC1")
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
log(miles) -1.1145e-02 2.1176e-03 -5.2629 2.133e-07 ***
income 2.0512e-07 2.6116e-07 0.7854 0.43259
age 7.3632e-04 6.1294e-04 1.2013 0.23022
speed65yes -1.0546e-03 4.7117e-04 -2.2383 0.02565 *
alcoholyes -6.0788e-04 4.3086e-04 -1.4109 0.15893
seatbelt -2.7482e-03 1.4169e-03 -1.9396 0.05301 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
summary(model4)
Twoways effects Within Model
Call:
plm(formula = fatalities ~ log(miles) + income + age + speed65 +
alcohol + seatbelt, data = data, effect = "twoways", model = "within",
index = c("state", "year"))
Unbalanced Panel: n = 51, T = 8-15, N = 556
Residuals:
Min. 1st Qu. Median 3rd Qu. Max.
-4.7714e-03 -7.9252e-04 -2.8379e-05 7.7272e-04 7.2144e-03
Coefficients:
Estimate Std. Error t-value Pr(>|t|)
log(miles) -1.1145e-02 1.5638e-03 -7.1269 3.741e-12 ***
income 2.0512e-07 1.3704e-07 1.4968 0.135083
age 7.3632e-04 3.6428e-04 2.0213 0.043798 *
speed65yes -1.0546e-03 4.0095e-04 -2.6303 0.008803 **
alcoholyes -6.0788e-04 3.3182e-04 -1.8320 0.067568 .
seatbelt -2.7482e-03 1.0838e-03 -2.5357 0.011536 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 0.0013806
Residual Sum of Squares: 0.0011558
R-Squared: 0.16283
Adj. R-Squared: 0.041998
F-statistic: 15.7218 on 6 and 485 DF, p-value: < 2.22e-16
Next, we’ll run the random effects version fo the model
modelr <- plm(fatalities ~ log(miles) + income + age + speed65 + alcohol + seatbelt,
data = data,
index = c("state", "year"),
model = "random")
summary(modelr)
Oneway (individual) effect Random Effect Model
(Swamy-Arora's transformation)
Call:
plm(formula = fatalities ~ log(miles) + income + age + speed65 +
alcohol + seatbelt, data = data, model = "random", index = c("state",
"year"))
Unbalanced Panel: n = 51, T = 8-15, N = 556
Effects:
var std.dev share
idiosyncratic 2.976e-06 1.725e-03 0.239
individual 9.460e-06 3.076e-03 0.761
theta:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.8055 0.8162 0.8402 0.8337 0.8463 0.8567
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-5.71e-03 -1.16e-03 -1.48e-04 -6.69e-07 9.21e-04 7.72e-03
Coefficients:
Estimate Std. Error z-value Pr(>|z|)
(Intercept) 4.2942e-02 8.7764e-03 4.8929 9.937e-07 ***
log(miles) -9.8507e-04 4.5304e-04 -2.1744 0.029677 *
income -4.6551e-07 5.3390e-08 -8.7190 < 2.2e-16 ***
age -2.4412e-06 2.4400e-04 -0.0100 0.992017
speed65yes -6.6930e-04 3.1838e-04 -2.1022 0.035535 *
alcoholyes -1.1404e-03 3.6950e-04 -3.0864 0.002026 **
seatbelt -5.9381e-03 1.0824e-03 -5.4862 4.106e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 0.005404
Residual Sum of Squares: 0.0018924
R-Squared: 0.64982
Adj. R-Squared: 0.64599
Chisq: 1000.64 on 6 DF, p-value: < 2.22e-16
Which is a better fit for this data? Check with the Hausman test:
phtest(model4, modelr)
Hausman Test
data: fatalities ~ log(miles) + income + age + speed65 + alcohol + ...
chisq = 100.63, df = 6, p-value < 2.2e-16
alternative hypothesis: one model is inconsistent
A significant result confirms that the fixed effects model is a better fit!
#make a residual vs fitted values plot to check for overall fit and heteroskedasticity
residuals <- as.numeric(model3$residuals)
fitted <- as.numeric(model3$model[[1]] - model3$residuals)
diagnostic <- data.frame(fitted, residuals)
ggplot(data= diagnostic, aes(x = fitted, y = residuals))+
geom_point()+
geom_smooth(method = "lm")+
theme_minimal()
#qqplot with the plm package
qqnorm(residuals(model3), ylab = 'Residuals')
qqline(residuals(model3))