This week, we will be learning how to work with panel data. For this tutorial, we will be using the seatbelts dataset from the AER package.
library(AER)
Loading required package: car
Loading required package: carData
Attaching package: ‘car’
The following object is masked from ‘package:purrr’:
some
The following object is masked from ‘package:dplyr’:
recode
Loading required package: sandwich
Loading required package: survival
library(ggplot2)
library(dplyr)
library(plm)
library(tidyr)
library(sandwich)
library(lmtest)
data("USSeatBelts")
data("PSID7682")
data <- as.data.frame(USSeatBelts)
#histograms
data_small <- data %>%
dplyr::select(-c(state, speed65, speed70, drinkage, alcohol, enforce)) %>%
drop_na() %>%
mutate(year = as.numeric(as.character(year)))
ggplot(gather(data_small), aes(value)) +
geom_histogram(bins = 30, fill = "skyblue") +
facet_wrap(~key, scales = 'free')+
theme_dark()
Why is the panel not balanced? The issue here is that seatbelt is missing a lot of data (about 200 observations). If we want a balanced panel, we can either skip using seatbelt in our regression, or find another variable with more coverage.
#look at how fatalities correlate with other variables
data_small %>%
mutate(id = row_number()) %>%
gather(variable, value, -fatalities, -id) %>%
ggplot(aes(y = fatalities, x = value))+
geom_point(color = "skyblue")+
geom_smooth(method = "lm", color = "black")+
facet_wrap(~variable, scales = "free_x")+
theme_dark()+
labs(y = "Total Patents", x = "Variable")
#we'll make a simple linear regression model first
model1 <- lm(fatalities ~ log(miles) + income + age + speed65 + alcohol, data = data)
summary(model1)
Call:
lm(formula = fatalities ~ log(miles) + income + age + speed65 +
alcohol, data = data)
Residuals:
Min 1Q Median 3Q Max
-0.0115376 -0.0027850 -0.0003352 0.0022816 0.0197495
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.242e-02 3.494e-03 15.001 < 2e-16 ***
log(miles) -1.304e-04 1.578e-04 -0.827 0.408709
income -8.197e-07 3.770e-08 -21.743 < 2e-16 ***
age -4.146e-04 1.030e-04 -4.024 6.29e-05 ***
speed65yes -1.116e-04 3.555e-04 -0.314 0.753735
alcoholyes -1.848e-03 5.006e-04 -3.691 0.000239 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.004315 on 759 degrees of freedom
Multiple R-squared: 0.5143, Adjusted R-squared: 0.5111
F-statistic: 160.7 on 5 and 759 DF, p-value: < 2.2e-16
plot(model1)
model2 <- lm(fatalities ~ log(miles) + income + age + speed65 + alcohol + as.factor(year) + as.factor(state), data = data)
summary(model2)
Call:
lm(formula = fatalities ~ log(miles) + income + age + speed65 +
alcohol + as.factor(year) + as.factor(state), data = data)
Residuals:
Min 1Q Median 3Q Max
-0.0082202 -0.0011065 0.0000759 0.0010584 0.0119474
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.310e-01 1.585e-02 8.264 7.13e-16 ***
log(miles) -1.377e-02 1.351e-03 -10.192 < 2e-16 ***
income 5.915e-07 9.745e-08 6.070 2.11e-09 ***
age 1.024e-04 3.040e-04 0.337 0.736409
speed65yes 3.129e-04 4.155e-04 0.753 0.451626
alcoholyes -6.239e-04 3.775e-04 -1.653 0.098787 .
as.factor(year)1984 -9.013e-04 4.258e-04 -2.117 0.034653 *
as.factor(year)1985 -2.107e-03 4.790e-04 -4.398 1.26e-05 ***
as.factor(year)1986 -1.748e-03 5.444e-04 -3.212 0.001379 **
as.factor(year)1987 -3.005e-03 7.566e-04 -3.971 7.89e-05 ***
as.factor(year)1988 -3.626e-03 8.781e-04 -4.129 4.08e-05 ***
as.factor(year)1989 -5.642e-03 1.002e-03 -5.632 2.59e-08 ***
as.factor(year)1990 -6.614e-03 1.127e-03 -5.869 6.77e-09 ***
as.factor(year)1991 -8.193e-03 1.194e-03 -6.865 1.48e-11 ***
as.factor(year)1992 -9.739e-03 1.315e-03 -7.408 3.73e-13 ***
as.factor(year)1993 -9.972e-03 1.409e-03 -7.076 3.64e-12 ***
as.factor(year)1994 -1.049e-02 1.521e-03 -6.896 1.21e-11 ***
as.factor(year)1995 -1.053e-02 1.651e-03 -6.375 3.33e-10 ***
as.factor(year)1996 -1.130e-02 1.786e-03 -6.328 4.46e-10 ***
as.factor(year)1997 -1.166e-02 1.920e-03 -6.076 2.04e-09 ***
as.factor(state)AL 3.466e-02 3.899e-03 8.890 < 2e-16 ***
as.factor(state)AR 2.876e-02 3.273e-03 8.786 < 2e-16 ***
as.factor(state)AZ 3.331e-02 3.509e-03 9.493 < 2e-16 ***
as.factor(state)CA 5.033e-02 6.027e-03 8.350 3.69e-16 ***
as.factor(state)CO 2.168e-02 3.301e-03 6.567 1.00e-10 ***
as.factor(state)CT 1.144e-02 3.914e-03 2.922 0.003590 **
as.factor(state)DC -1.407e-02 2.393e-03 -5.878 6.45e-09 ***
as.factor(state)DE -3.588e-04 2.122e-03 -0.169 0.865796
as.factor(state)FL 4.605e-02 5.738e-03 8.024 4.35e-15 ***
as.factor(state)GA 3.673e-02 4.309e-03 8.524 < 2e-16 ***
as.factor(state)HI 1.359e-03 2.016e-03 0.674 0.500449
as.factor(state)IA 2.072e-02 3.477e-03 5.960 4.01e-09 ***
as.factor(state)ID 1.571e-02 1.914e-03 8.205 1.12e-15 ***
as.factor(state)IL 3.450e-02 4.810e-03 7.173 1.88e-12 ***
as.factor(state)IN 3.030e-02 4.184e-03 7.241 1.18e-12 ***
as.factor(state)KS 1.967e-02 3.256e-03 6.042 2.48e-09 ***
as.factor(state)KY 3.035e-02 3.649e-03 8.315 4.82e-16 ***
as.factor(state)LA 3.301e-02 3.359e-03 9.828 < 2e-16 ***
as.factor(state)MA 1.885e-02 4.377e-03 4.308 1.89e-05 ***
as.factor(state)MD 2.234e-02 3.911e-03 5.712 1.65e-08 ***
as.factor(state)ME 9.358e-03 2.695e-03 3.472 0.000549 ***
as.factor(state)MI 3.506e-02 4.661e-03 7.522 1.68e-13 ***
as.factor(state)MN 2.075e-02 3.807e-03 5.452 6.94e-08 ***
as.factor(state)MO 3.143e-02 4.238e-03 7.415 3.54e-13 ***
as.factor(state)MS 3.497e-02 2.979e-03 11.741 < 2e-16 ***
as.factor(state)MT 1.328e-02 2.165e-03 6.134 1.44e-09 ***
as.factor(state)NC 3.763e-02 4.400e-03 8.553 < 2e-16 ***
as.factor(state)ND -1.696e-03 1.946e-03 -0.871 0.383848
as.factor(state)NE 1.233e-02 2.752e-03 4.483 8.63e-06 ***
as.factor(state)NH 2.500e-03 2.356e-03 1.061 0.289097
as.factor(state)NJ 2.367e-02 4.746e-03 4.989 7.69e-07 ***
as.factor(state)NM 2.787e-02 2.430e-03 11.468 < 2e-16 ***
as.factor(state)NV 1.548e-02 2.335e-03 6.631 6.71e-11 ***
as.factor(state)NY 3.668e-02 5.330e-03 6.882 1.32e-11 ***
as.factor(state)OH 3.581e-02 4.906e-03 7.299 7.92e-13 ***
as.factor(state)OK 2.667e-02 3.668e-03 7.272 9.55e-13 ***
as.factor(state)OR 2.376e-02 3.529e-03 6.733 3.49e-11 ***
as.factor(state)PA 3.667e-02 5.192e-03 7.061 4.01e-12 ***
as.factor(state)RI -4.727e-03 2.537e-03 -1.863 0.062876 .
as.factor(state)SC 3.477e-02 3.447e-03 10.086 < 2e-16 ***
as.factor(state)SD 5.895e-03 2.049e-03 2.877 0.004141 **
as.factor(state)TN 3.528e-02 4.099e-03 8.607 < 2e-16 ***
as.factor(state)TX 4.796e-02 5.318e-03 9.018 < 2e-16 ***
as.factor(state)UT 1.728e-02 1.937e-03 8.922 < 2e-16 ***
as.factor(state)VA 2.822e-02 4.311e-03 6.546 1.15e-10 ***
as.factor(state)VT -3.800e-04 1.923e-03 -0.198 0.843370
as.factor(state)WA 2.399e-02 3.923e-03 6.115 1.61e-09 ***
as.factor(state)WI 2.568e-02 4.021e-03 6.387 3.10e-10 ***
as.factor(state)WV 2.399e-02 3.097e-03 7.744 3.41e-14 ***
as.factor(state)WY 5.525e-03 1.478e-03 3.740 0.000200 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.002013 on 695 degrees of freedom
Multiple R-squared: 0.9033, Adjusted R-squared: 0.8936
F-statistic: 94.04 on 69 and 695 DF, p-value: < 2.2e-16
plot(model2)
model3 <- plm(fatalities ~ log(miles) + income + age + speed65 + alcohol,
data = data,
effect = "twoways",
index = c("state", "year"),
model = "within")
summary(model3)
Twoways effects Within Model
Call:
plm(formula = fatalities ~ log(miles) + income + age + speed65 +
alcohol, data = data, effect = "twoways", model = "within",
index = c("state", "year"))
Balanced Panel: n = 51, T = 15, N = 765
Residuals:
Min. 1st Qu. Median 3rd Qu. Max.
-8.2202e-03 -1.1065e-03 7.5899e-05 1.0584e-03 1.1947e-02
Coefficients:
Estimate Std. Error t-value Pr(>|t|)
log(miles) -1.3773e-02 1.3513e-03 -10.1924 < 2.2e-16 ***
income 5.9150e-07 9.7454e-08 6.0696 2.109e-09 ***
age 1.0237e-04 3.0399e-04 0.3367 0.73641
speed65yes 3.1289e-04 4.1546e-04 0.7531 0.45163
alcoholyes -6.2392e-04 3.7745e-04 -1.6530 0.09879 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 0.0035702
Residual Sum of Squares: 0.0028151
R-Squared: 0.21149
Adj. R-Squared: 0.13321
F-statistic: 37.282 on 5 and 695 DF, p-value: < 2.22e-16
coeftest(model3, vcov. = vcovHC, type = "HC1")
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
log(miles) -1.3773e-02 2.6170e-03 -5.2629 1.892e-07 ***
income 5.9150e-07 2.6606e-07 2.2232 0.02652 *
age 1.0237e-04 5.1695e-04 0.1980 0.84309
speed65yes 3.1289e-04 9.5179e-04 0.3287 0.74245
alcoholyes -6.2392e-04 5.1009e-04 -1.2231 0.22169
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
summary(model3)
Twoways effects Within Model
Call:
plm(formula = fatalities ~ log(miles) + income + age + speed65 +
alcohol, data = data, effect = "twoways", model = "within",
index = c("state", "year"))
Balanced Panel: n = 51, T = 15, N = 765
Residuals:
Min. 1st Qu. Median 3rd Qu. Max.
-8.2202e-03 -1.1065e-03 7.5899e-05 1.0584e-03 1.1947e-02
Coefficients:
Estimate Std. Error t-value Pr(>|t|)
log(miles) -1.3773e-02 1.3513e-03 -10.1924 < 2.2e-16 ***
income 5.9150e-07 9.7454e-08 6.0696 2.109e-09 ***
age 1.0237e-04 3.0399e-04 0.3367 0.73641
speed65yes 3.1289e-04 4.1546e-04 0.7531 0.45163
alcoholyes -6.2392e-04 3.7745e-04 -1.6530 0.09879 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 0.0035702
Residual Sum of Squares: 0.0028151
R-Squared: 0.21149
Adj. R-Squared: 0.13321
F-statistic: 37.282 on 5 and 695 DF, p-value: < 2.22e-16
#what if we still want the fixed effects? We can print them out. Here, I'm printing them out as deviations from the overall mean:
fixef(model3, type = "dmean")
AK AL AR AZ CA CO CT DC DE FL
-2.2374e-02 1.2289e-02 6.3849e-03 1.0935e-02 2.7951e-02 -6.9419e-04 -1.0938e-02 -3.6439e-02 -2.2733e-02 2.3674e-02
GA HI IA ID IL IN KS KY LA MA
1.4357e-02 -2.1015e-02 -1.6514e-03 -6.6675e-03 1.2129e-02 7.9230e-03 -2.7008e-03 7.9715e-03 1.0640e-02 -3.5202e-03
MD ME MI MN MO MS MT NC ND NE
-3.1272e-05 -1.3016e-02 1.2688e-02 -1.6212e-03 9.0526e-03 1.2598e-02 -9.0916e-03 1.5261e-02 -2.4070e-02 -1.0039e-02
NH NJ NM NV NY OH OK OR PA RI
-1.9874e-02 1.3005e-03 5.4981e-03 -6.8902e-03 1.4307e-02 1.3438e-02 4.2996e-03 1.3849e-03 1.4292e-02 -2.7101e-02
SC SD TN TX UT VA VT WA WI WV
1.2392e-02 -1.6479e-02 1.2910e-02 2.5583e-02 -5.0917e-03 5.8458e-03 -2.2754e-02 1.6150e-03 3.3084e-03 1.6132e-03
WY
-1.6849e-02
#are the fixed effects signficant?
#We can use PLM test to run a Lagrange multiplier test which tests whether the two way fixed effects are significant
#null: no effects are significant
#alternative: the effects are significant
plmtest(model3, effect="twoways")
Lagrange Multiplier Test - two-ways effects (Honda)
data: fatalities ~ log(miles) + income + age + speed65 + alcohol
normal = 34.031, p-value < 2.2e-16
alternative hypothesis: significant effects
model4 <- plm(fatalities ~ log(miles) + income + age + speed65 + alcohol + seatbelt,
data = data,
index = c("state", "year"),
effect = "twoways",
model = "within")
coeftest(model4, vcov. = vcovHC, type = "HC1")
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
log(miles) -1.1145e-02 2.1176e-03 -5.2629 2.133e-07 ***
income 2.0512e-07 2.6116e-07 0.7854 0.43259
age 7.3632e-04 6.1294e-04 1.2013 0.23022
speed65yes -1.0546e-03 4.7117e-04 -2.2383 0.02565 *
alcoholyes -6.0788e-04 4.3086e-04 -1.4109 0.15893
seatbelt -2.7482e-03 1.4169e-03 -1.9396 0.05301 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
summary(model4)
Twoways effects Within Model
Call:
plm(formula = fatalities ~ log(miles) + income + age + speed65 +
alcohol + seatbelt, data = data, effect = "twoways", model = "within",
index = c("state", "year"))
Unbalanced Panel: n = 51, T = 8-15, N = 556
Residuals:
Min. 1st Qu. Median 3rd Qu. Max.
-4.7714e-03 -7.9252e-04 -2.8379e-05 7.7272e-04 7.2144e-03
Coefficients:
Estimate Std. Error t-value Pr(>|t|)
log(miles) -1.1145e-02 1.5638e-03 -7.1269 3.741e-12 ***
income 2.0512e-07 1.3704e-07 1.4968 0.135083
age 7.3632e-04 3.6428e-04 2.0213 0.043798 *
speed65yes -1.0546e-03 4.0095e-04 -2.6303 0.008803 **
alcoholyes -6.0788e-04 3.3182e-04 -1.8320 0.067568 .
seatbelt -2.7482e-03 1.0838e-03 -2.5357 0.011536 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 0.0013806
Residual Sum of Squares: 0.0011558
R-Squared: 0.16283
Adj. R-Squared: 0.041998
F-statistic: 15.7218 on 6 and 485 DF, p-value: < 2.22e-16
Next, we’ll run the random effects version fo the model
modelr <- plm(fatalities ~ log(miles) + income + age + speed65 + alcohol + seatbelt,
data = data,
index = c("state", "year"),
model = "random")
summary(modelr)
Oneway (individual) effect Random Effect Model
(Swamy-Arora's transformation)
Call:
plm(formula = fatalities ~ log(miles) + income + age + speed65 +
alcohol + seatbelt, data = data, model = "random", index = c("state",
"year"))
Unbalanced Panel: n = 51, T = 8-15, N = 556
Effects:
var std.dev share
idiosyncratic 2.976e-06 1.725e-03 0.239
individual 9.460e-06 3.076e-03 0.761
theta:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.8055 0.8162 0.8402 0.8337 0.8463 0.8567
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-5.71e-03 -1.16e-03 -1.48e-04 -7.00e-07 9.21e-04 7.72e-03
Coefficients:
Estimate Std. Error z-value Pr(>|z|)
(Intercept) 4.2942e-02 8.7764e-03 4.8929 9.937e-07 ***
log(miles) -9.8507e-04 4.5304e-04 -2.1744 0.029677 *
income -4.6551e-07 5.3390e-08 -8.7190 < 2.2e-16 ***
age -2.4412e-06 2.4400e-04 -0.0100 0.992017
speed65yes -6.6930e-04 3.1838e-04 -2.1022 0.035535 *
alcoholyes -1.1404e-03 3.6950e-04 -3.0864 0.002026 **
seatbelt -5.9381e-03 1.0824e-03 -5.4862 4.106e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 0.005404
Residual Sum of Squares: 0.0018924
R-Squared: 0.64982
Adj. R-Squared: 0.64599
Chisq: 1000.64 on 6 DF, p-value: < 2.22e-16
Which is a better fit for this data? Check with the Hausman test:
phtest(model4, modelr)
Hausman Test
data: fatalities ~ log(miles) + income + age + speed65 + alcohol + ...
chisq = 100.63, df = 6, p-value < 2.2e-16
alternative hypothesis: one model is inconsistent
A significant result confirms that the fixed effects model is a better fit!
#make a residual vs fitted values plot to check for overall fit and heteroskedasticity
residuals <- as.numeric(model3$residuals)
fitted <- as.numeric(model3$model[[1]] - model3$residuals)
diagnostic <- data.frame(fitted, residuals)
ggplot(data= diagnostic, aes(x = fitted, y = residuals))+
geom_point()+
geom_smooth(method = "lm")+
theme_minimal()
#qqplot with the plm package
qqnorm(residuals(model3), ylab = 'Residuals')
qqline(residuals(model3))