sleep75 dataset from the
wooldridge package to answer three questions:library(wooldridge)
str(sleep75)
## 'data.frame': 706 obs. of 34 variables:
## $ age : int 32 31 44 30 64 41 35 47 32 30 ...
## $ black : int 0 0 0 0 0 0 0 0 0 0 ...
## $ case : int 1 2 3 4 5 6 7 8 9 10 ...
## $ clerical: num 0 0 0 0 0 0 0 0 0 0 ...
## $ construc: num 0 0 0 0 0 0 0 0 0 0 ...
## $ educ : int 12 14 17 12 14 12 12 13 17 15 ...
## $ earns74 : num 0 9500 42500 42500 2500 ...
## $ gdhlth : int 0 1 1 1 1 1 1 1 1 1 ...
## $ inlf : int 1 1 1 1 1 1 1 1 1 1 ...
## $ leis1 : int 3529 2140 4595 3211 4052 4812 4787 3544 4359 4211 ...
## $ leis2 : int 3479 2140 4505 3211 4007 4797 4157 3469 4359 4061 ...
## $ leis3 : int 3479 2140 4227 3211 4007 4797 4157 3439 4121 4061 ...
## $ smsa : int 0 0 1 0 0 0 0 1 0 1 ...
## $ lhrwage : num 1.956 0.358 3.022 2.264 1.012 ...
## $ lothinc : num 10.08 0 0 0 9.33 ...
## $ male : int 1 1 1 0 1 1 1 1 1 1 ...
## $ marr : int 1 0 1 1 1 1 1 1 1 1 ...
## $ prot : int 1 1 0 1 1 1 1 1 0 0 ...
## $ rlxall : int 3163 2920 3038 3083 3493 4078 3810 3033 3606 3168 ...
## $ selfe : int 0 1 1 1 0 0 0 1 0 1 ...
## $ sleep : int 3113 2920 2670 3083 3448 4063 3180 2928 3368 3018 ...
## $ slpnaps : int 3163 2920 2760 3083 3493 4078 3810 3003 3368 3168 ...
## $ south : int 0 1 0 0 0 0 0 0 0 0 ...
## $ spsepay : num 0 0 20000 5000 2400 0 12000 0 0 6000 ...
## $ spwrk75 : int 0 0 1 1 1 0 1 0 0 1 ...
## $ totwrk : int 3438 5020 2815 3786 2580 1205 2113 3608 2353 2851 ...
## $ union : int 0 0 0 0 0 0 0 0 1 0 ...
## $ worknrm : int 3438 5020 2815 3786 2580 0 2113 3608 2353 2851 ...
## $ workscnd: int 0 0 0 0 0 1205 0 0 0 0 ...
## $ exper : int 14 11 21 12 44 23 17 28 9 9 ...
## $ yngkid : int 0 0 0 0 0 0 1 0 0 0 ...
## $ yrsmarr : int 13 0 0 12 33 23 0 24 11 7 ...
## $ hrwage : num 7.07 1.43 20.53 9.62 2.75 ...
## $ agesq : int 1024 961 1936 900 4096 1681 1225 2209 1024 900 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
model <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
summary(model)
##
## Call:
## lm(formula = sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2378.00 -243.29 6.74 259.24 1350.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3840.83197 235.10870 16.336 <2e-16 ***
## totwrk -0.16342 0.01813 -9.013 <2e-16 ***
## educ -11.71332 5.86689 -1.997 0.0463 *
## age -8.69668 11.20746 -0.776 0.4380
## I(age^2) 0.12844 0.13390 0.959 0.3378
## male 87.75243 34.32616 2.556 0.0108 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared: 0.1228, Adjusted R-squared: 0.1165
## F-statistic: 19.59 on 5 and 700 DF, p-value: < 2.2e-16
coef_male <- summary(model)$coefficients["male", ]
cat("Coefficient for male:", coef_male["Estimate"], "\n")
## Coefficient for male: 87.75243
cat("P-value for male:", coef_male["Pr(>|t|)"], "\n")
## P-value for male: 0.01078518
if (coef_male["Pr(>|t|)"] < 0.05) {
cat("Evidence shows that men sleep significantly more than women.\n")
} else {
cat("No significant evidence that men sleep more than women.\n")
}
## Evidence shows that men sleep significantly more than women.
coef_totwrk <- summary(model)$coefficients["totwrk", ]
cat("Coefficient for totwrk:", coef_totwrk["Estimate"], "\n")
## Coefficient for totwrk: -0.1634232
cat("P-value for totwrk:", coef_totwrk["Pr(>|t|)"], "\n")
## P-value for totwrk: 1.891272e-18
if (coef_totwrk["Pr(>|t|)"] < 0.05) {
cat("There is a statistically significant tradeoff between working and sleeping.\n")
} else {
cat("No significant tradeoff between working and sleeping.\n")
}
## There is a statistically significant tradeoff between working and sleeping.
model_reduced <- lm(sleep ~ totwrk + educ + male, data = sleep75)
anova_test <- anova(model_reduced, model)
cat("ANOVA test results:\n")
## ANOVA test results:
print(anova_test)
## Analysis of Variance Table
##
## Model 1: sleep ~ totwrk + educ + male
## Model 2: sleep ~ totwrk + educ + age + I(age^2) + male
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 702 122631662
## 2 700 122147777 2 483885 1.3865 0.2506
if (anova_test$`Pr(>F)`[2] < 0.05) {
cat("Reject the null hypothesis: Age has a significant effect on sleeping.\n")
} else {
cat("Fail to reject the null hypothesis: Age has no significant effect on sleeping.\n")
}
## Fail to reject the null hypothesis: Age has no significant effect on sleeping.
library(wooldridge)
str(gpa2)
## 'data.frame': 4137 obs. of 12 variables:
## $ sat : int 920 1170 810 940 1180 980 880 980 1240 1230 ...
## $ tothrs : int 43 18 14 40 18 114 78 55 18 17 ...
## $ colgpa : num 2.04 4 1.78 2.42 2.61 ...
## $ athlete : int 1 0 1 0 0 0 0 0 0 0 ...
## $ verbmath: num 0.484 0.828 0.884 0.808 0.735 ...
## $ hsize : num 0.1 9.4 1.19 5.71 2.14 2.68 3.11 2.68 3.67 0.1 ...
## $ hsrank : int 4 191 42 252 86 41 161 101 161 3 ...
## $ hsperc : num 40 20.3 35.3 44.1 40.2 ...
## $ female : int 1 0 0 0 0 1 0 0 0 0 ...
## $ white : int 0 1 1 1 1 1 0 1 1 1 ...
## $ black : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hsizesq : num 0.01 88.36 1.42 32.6 4.58 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
model <- lm(sat ~ hsize + I(hsize^2) + female + black + I(female * black), data = gpa2)
summary(model)
##
## Call:
## lm(formula = sat ~ hsize + I(hsize^2) + female + black + I(female *
## black), data = gpa2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -570.45 -89.54 -5.24 85.41 479.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1028.0972 6.2902 163.445 < 2e-16 ***
## hsize 19.2971 3.8323 5.035 4.97e-07 ***
## I(hsize^2) -2.1948 0.5272 -4.163 3.20e-05 ***
## female -45.0915 4.2911 -10.508 < 2e-16 ***
## black -169.8126 12.7131 -13.357 < 2e-16 ***
## I(female * black) 62.3064 18.1542 3.432 0.000605 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared: 0.08578, Adjusted R-squared: 0.08468
## F-statistic: 77.52 on 5 and 4131 DF, p-value: < 2.2e-16
coef_hsize2 <- summary(model)$coefficients["I(hsize^2)", ]
cat("Coefficient for hsize^2:", coef_hsize2["Estimate"], "\n")
## Coefficient for hsize^2: -2.194828
cat("P-value for hsize^2:", coef_hsize2["Pr(>|t|)"], "\n")
## P-value for hsize^2: 3.198417e-05
if (coef_hsize2["Pr(>|t|)"] < 0.05) {
cat("There is strong evidence that hsize^2 should be included in the model.\n")
} else {
cat("There is no strong evidence that hsize^2 should be included in the model.\n")
}
## There is strong evidence that hsize^2 should be included in the model.
optimal_hsize <- -coef(model)["hsize"] / (2 * coef(model)["I(hsize^2)"])
cat("Optimal high school size (in hundreds):", optimal_hsize, "\n")
## Optimal high school size (in hundreds): 4.39603
coef_female <- summary(model)$coefficients["female", ]
cat("Coefficient for female:", coef_female["Estimate"], "\n")
## Coefficient for female: -45.09145
cat("P-value for female:", coef_female["Pr(>|t|)"], "\n")
## P-value for female: 1.656301e-25
if (coef_female["Pr(>|t|)"] < 0.05) {
cat("The difference in SAT score between nonblack females and nonblack males is statistically significant.\n")
} else {
cat("The difference in SAT score between nonblack females and nonblack males is not statistically significant.\n")
}
## The difference in SAT score between nonblack females and nonblack males is statistically significant.
coef_black <- summary(model)$coefficients["black", ]
cat("Coefficient for black:", coef_black["Estimate"], "\n")
## Coefficient for black: -169.8126
cat("P-value for black:", coef_black["Pr(>|t|)"], "\n")
## P-value for black: 7.140387e-40
if (coef_black["Pr(>|t|)"] < 0.05) {
cat("There is a statistically significant difference in SAT score between nonblack males and black males.\n")
} else {
cat("There is no statistically significant difference in SAT score between nonblack males and black males.\n")
}
## There is a statistically significant difference in SAT score between nonblack males and black males.
coef_female_black <- summary(model)$coefficients["I(female * black)", ]
diff_black_female <- coef_black["Estimate"] + coef_female_black["Estimate"]
cat("Estimated difference in SAT score between black females and nonblack females:", diff_black_female, "\n")
## Estimated difference in SAT score between black females and nonblack females: -107.5063
p_value_female_black <- coef_female_black["Pr(>|t|)"]
cat("P-value for female-black interaction term:", p_value_female_black, "\n")
## P-value for female-black interaction term: 0.0006048744
if (p_value_female_black < 0.05) {
cat("The difference in SAT score between black females and nonblack females is statistically significant.\n")
} else {
cat("The difference in SAT score between black females and nonblack females is not statistically significant.\n")
}
## The difference in SAT score between black females and nonblack females is statistically significant.
data(wage2)
model_wage <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)
summary(model_wage)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98069 -0.21996 0.00707 0.24288 1.22822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.395497 0.113225 47.653 < 2e-16 ***
## educ 0.065431 0.006250 10.468 < 2e-16 ***
## exper 0.014043 0.003185 4.409 1.16e-05 ***
## tenure 0.011747 0.002453 4.789 1.95e-06 ***
## married 0.199417 0.039050 5.107 3.98e-07 ***
## black -0.188350 0.037667 -5.000 6.84e-07 ***
## south -0.090904 0.026249 -3.463 0.000558 ***
## urban 0.183912 0.026958 6.822 1.62e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared: 0.2526, Adjusted R-squared: 0.2469
## F-statistic: 44.75 on 7 and 927 DF, p-value: < 2.2e-16
wage2$exper_sq <- wage2$exper^2
wage2$tenure_sq <- wage2$tenure^2
model_wage_quad <- lm(log(wage) ~ educ + exper + exper_sq + tenure + tenure_sq + married + black + south + urban, data = wage2)
summary(model_wage_quad)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + exper_sq + tenure + tenure_sq +
## married + black + south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98236 -0.21972 -0.00036 0.24078 1.25127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.3586756 0.1259143 42.558 < 2e-16 ***
## educ 0.0642761 0.0063115 10.184 < 2e-16 ***
## exper 0.0172146 0.0126138 1.365 0.172665
## exper_sq -0.0001138 0.0005319 -0.214 0.830622
## tenure 0.0249291 0.0081297 3.066 0.002229 **
## tenure_sq -0.0007964 0.0004710 -1.691 0.091188 .
## married 0.1985470 0.0391103 5.077 4.65e-07 ***
## black -0.1906636 0.0377011 -5.057 5.13e-07 ***
## south -0.0912153 0.0262356 -3.477 0.000531 ***
## urban 0.1854241 0.0269585 6.878 1.12e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared: 0.255, Adjusted R-squared: 0.2477
## F-statistic: 35.17 on 9 and 925 DF, p-value: < 2.2e-16
model_wage_restricted <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)
anova(model_wage_restricted, model_wage_quad)
## Analysis of Variance Table
##
## Model 1: log(wage) ~ educ + exper + tenure + married + black + south +
## urban
## Model 2: log(wage) ~ educ + exper + exper_sq + tenure + tenure_sq + married +
## black + south + urban
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 927 123.82
## 2 925 123.42 2 0.39756 1.4898 0.226
model_wage_interaction <- lm(log(wage) ~ educ * black + exper + tenure + married + south + urban, data = wage2)
summary(model_wage_interaction)
##
## Call:
## lm(formula = log(wage) ~ educ * black + exper + tenure + married +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97782 -0.21832 0.00475 0.24136 1.23226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.374817 0.114703 46.859 < 2e-16 ***
## educ 0.067115 0.006428 10.442 < 2e-16 ***
## black 0.094809 0.255399 0.371 0.710561
## exper 0.013826 0.003191 4.333 1.63e-05 ***
## tenure 0.011787 0.002453 4.805 1.80e-06 ***
## married 0.198908 0.039047 5.094 4.25e-07 ***
## south -0.089450 0.026277 -3.404 0.000692 ***
## urban 0.183852 0.026955 6.821 1.63e-11 ***
## educ:black -0.022624 0.020183 -1.121 0.262603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2471
## F-statistic: 39.32 on 8 and 926 DF, p-value: < 2.2e-16
wage2$married_black <- wage2$married * wage2$black
wage2$married_nonblack <- wage2$married * (1 - wage2$black)
wage2$single_black <- (1 - wage2$married) * wage2$black
wage2$single_nonblack <- (1 - wage2$married) * (1 - wage2$black)
model_wage_groups <- lm(log(wage) ~ married_black + married_nonblack + single_black + single_nonblack + educ + exper + tenure + south + urban, data = wage2)
summary(model_wage_groups)
##
## Call:
## lm(formula = log(wage) ~ married_black + married_nonblack + single_black +
## single_nonblack + educ + exper + tenure + south + urban,
## data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98013 -0.21780 0.01057 0.24219 1.22889
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.403793 0.114122 47.351 < 2e-16 ***
## married_black 0.009448 0.056013 0.169 0.866083
## married_nonblack 0.188915 0.042878 4.406 1.18e-05 ***
## single_black -0.240820 0.096023 -2.508 0.012314 *
## single_nonblack NA NA NA NA
## educ 0.065475 0.006253 10.471 < 2e-16 ***
## exper 0.014146 0.003191 4.433 1.04e-05 ***
## tenure 0.011663 0.002458 4.745 2.41e-06 ***
## south -0.091989 0.026321 -3.495 0.000497 ***
## urban 0.184350 0.026978 6.833 1.50e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3656 on 926 degrees of freedom
## Multiple R-squared: 0.2528, Adjusted R-squared: 0.2464
## F-statistic: 39.17 on 8 and 926 DF, p-value: < 2.2e-16
library(car)
## Loading required package: carData
model1 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)
summary(model1)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98069 -0.21996 0.00707 0.24288 1.22822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.395497 0.113225 47.653 < 2e-16 ***
## educ 0.065431 0.006250 10.468 < 2e-16 ***
## exper 0.014043 0.003185 4.409 1.16e-05 ***
## tenure 0.011747 0.002453 4.789 1.95e-06 ***
## married 0.199417 0.039050 5.107 3.98e-07 ***
## black -0.188350 0.037667 -5.000 6.84e-07 ***
## south -0.090904 0.026249 -3.463 0.000558 ***
## urban 0.183912 0.026958 6.822 1.62e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared: 0.2526, Adjusted R-squared: 0.2469
## F-statistic: 44.75 on 7 and 927 DF, p-value: < 2.2e-16
coef_black <- coef(summary(model1))["black", "Estimate"]
pval_black <- coef(summary(model1))["black", "Pr(>|t|)"]
cat("Difference in log(wage) between blacks and nonblacks (coefficient for black):", coef_black, "\n")
## Difference in log(wage) between blacks and nonblacks (coefficient for black): -0.1883499
cat("P-value for black:", pval_black, "\n")
## P-value for black: 6.839315e-07
if (pval_black < 0.05) {
cat("The difference is statistically significant at the 5% level.\n")
} else {
cat("The difference is NOT statistically significant at the 5% level.\n")
}
## The difference is statistically significant at the 5% level.
wage2$exper2 <- wage2$exper^2
wage2$tenure2 <- wage2$tenure^2
model2 <- lm(log(wage) ~ educ + exper + exper2 + tenure + tenure2 + married + black + south + urban, data = wage2)
test <- linearHypothesis(model2, c("exper2 = 0", "tenure2 = 0"))
cat("Joint significance test for exper^2 and tenure^2:\n")
## Joint significance test for exper^2 and tenure^2:
print(test)
##
## Linear hypothesis test:
## exper2 = 0
## tenure2 = 0
##
## Model 1: restricted model
## Model 2: log(wage) ~ educ + exper + exper2 + tenure + tenure2 + married +
## black + south + urban
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 927 123.82
## 2 925 123.42 2 0.39756 1.4898 0.226
if (test$`Pr(>F)`[2] > 0.2) {
cat("Exper^2 and tenure^2 are jointly insignificant at the 20% level.\n")
} else {
cat("Exper^2 and tenure^2 are significant at the 20% level.\n")
}
## Exper^2 and tenure^2 are jointly insignificant at the 20% level.
model3 <- lm(log(wage) ~ educ * black + exper + tenure + married + south + urban, data = wage2)
# View the summary of the model
summary(model3)
##
## Call:
## lm(formula = log(wage) ~ educ * black + exper + tenure + married +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97782 -0.21832 0.00475 0.24136 1.23226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.374817 0.114703 46.859 < 2e-16 ***
## educ 0.067115 0.006428 10.442 < 2e-16 ***
## black 0.094809 0.255399 0.371 0.710561
## exper 0.013826 0.003191 4.333 1.63e-05 ***
## tenure 0.011787 0.002453 4.805 1.80e-06 ***
## married 0.198908 0.039047 5.094 4.25e-07 ***
## south -0.089450 0.026277 -3.404 0.000692 ***
## urban 0.183852 0.026955 6.821 1.63e-11 ***
## educ:black -0.022624 0.020183 -1.121 0.262603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2471
## F-statistic: 39.32 on 8 and 926 DF, p-value: < 2.2e-16
# Test if the return to education depends on race (interaction term)
coef_interaction <- summary(model3)$coefficients["educ:black", ]
cat("Coefficient for educ:black interaction:", coef_interaction["Estimate"], "\n")
## Coefficient for educ:black interaction: -0.02262361
cat("P-value for educ:black interaction:", coef_interaction["Pr(>|t|)"], "\n")
## P-value for educ:black interaction: 0.2626026
wage2$married_black <- interaction(wage2$married, wage2$black)
model4 <- lm(log(wage) ~ married_black + educ + exper + tenure + south + urban, data = wage2)
summary(model4)
##
## Call:
## lm(formula = log(wage) ~ married_black + educ + exper + tenure +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98013 -0.21780 0.01057 0.24219 1.22889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.403793 0.114122 47.351 < 2e-16 ***
## married_black1.0 0.188915 0.042878 4.406 1.18e-05 ***
## married_black0.1 -0.240820 0.096023 -2.508 0.012314 *
## married_black1.1 0.009448 0.056013 0.169 0.866083
## educ 0.065475 0.006253 10.471 < 2e-16 ***
## exper 0.014146 0.003191 4.433 1.04e-05 ***
## tenure 0.011663 0.002458 4.745 2.41e-06 ***
## south -0.091989 0.026321 -3.495 0.000497 ***
## urban 0.184350 0.026978 6.833 1.50e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3656 on 926 degrees of freedom
## Multiple R-squared: 0.2528, Adjusted R-squared: 0.2464
## F-statistic: 39.17 on 8 and 926 DF, p-value: < 2.2e-16
coef_married_black <- coef(summary(model4))["married_black1.1", "Estimate"]
coef_married_nonblack <- coef(summary(model4))["married_black1.0", "Estimate"]
wage_diff <- coef_married_black - coef_married_nonblack
cat("Estimated wage differential between married blacks and married nonblacks:", wage_diff, "\n")
## Estimated wage differential between married blacks and married nonblacks: -0.1794663
library(wooldridge)
library(sandwich)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
model <- lm(cigs ~ log(cigpric) + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(model)
##
## Call:
## lm(formula = cigs ~ log(cigpric) + log(income) + educ + age +
## I(age^2) + restaurn + white, data = smoke)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.772 -9.330 -5.907 7.945 70.275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.682419 24.220729 -0.111 0.91184
## log(cigpric) -0.850907 5.782321 -0.147 0.88305
## log(income) 0.869014 0.728763 1.192 0.23344
## educ -0.501753 0.167168 -3.001 0.00277 **
## age 0.774502 0.160516 4.825 1.68e-06 ***
## I(age^2) -0.009069 0.001748 -5.188 2.70e-07 ***
## restaurn -2.865621 1.117406 -2.565 0.01051 *
## white -0.559236 1.459461 -0.383 0.70169
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared: 0.05291, Adjusted R-squared: 0.04461
## F-statistic: 6.377 on 7 and 799 DF, p-value: 2.588e-07
library(sandwich)
library(lmtest)
robust_se <- coeftest(model, vcov = vcovHC(model, type = "HC1"))
cat("Robust vs. usual standard errors:\n")
## Robust vs. usual standard errors:
print(robust_se)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.6824188 25.9019368 -0.1036 0.917544
## log(cigpric) -0.8509074 6.0543955 -0.1405 0.888266
## log(income) 0.8690140 0.5979719 1.4533 0.146542
## educ -0.5017532 0.1624097 -3.0894 0.002075 **
## age 0.7745022 0.1380317 5.6110 2.773e-08 ***
## I(age^2) -0.0090686 0.0014589 -6.2159 8.214e-10 ***
## restaurn -2.8656212 1.0172750 -2.8170 0.004968 **
## white -0.5592364 1.3782830 -0.4057 0.685036
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef_educ <- summary(model)$coefficients["educ", ]
effect_4_years <- coef_educ["Estimate"] * 4
cat("Effect of a 4-year increase in education on probability of smoking:", effect_4_years, "\n")
## Effect of a 4-year increase in education on probability of smoking: -2.007013
coef_age <- summary(model)$coefficients["age", "Estimate"]
coef_age2 <- summary(model)$coefficients["I(age^2)", "Estimate"]
age <- 77 #example
marginal_effect_age <- coef_age + 2 * coef_age2 * age
cat("Marginal effect of an additional year of age on smoking probability:", marginal_effect_age, "\n")
## Marginal effect of an additional year of age on smoking probability: -0.6220627
coef_restaurn <- summary(model)$coefficients["restaurn", ]
cat("Coefficient of restaurn:", coef_restaurn["Estimate"], "\n")
## Coefficient of restaurn: -2.865621
cat("Interpretation: Living in a state with restaurant smoking restrictions decreases the probability of smoking by", abs(coef_restaurn["Estimate"]), "on average.\n")
## Interpretation: Living in a state with restaurant smoking restrictions decreases the probability of smoking by 2.865621 on average.
person_206 <- data.frame(
cigpric = 67.44,
income = 6500,
educ = 16,
age = 77,
restaurn = 0,
white = 0
)
person_206$age2 <- person_206$age^2
prob_smoking <- predict(model, newdata = person_206)
cat("Predicted probability of smoking for person 206:", prob_smoking, "\n")
## Predicted probability of smoking for person 206: -0.7953678
cat("Comment: The predicted value is a probability, but linear probability models can sometimes predict values outside [0, 1]. Check if the prediction is reasonable.\n")
## Comment: The predicted value is a probability, but linear probability models can sometimes predict values outside [0, 1]. Check if the prediction is reasonable.
library(wooldridge)
library(lmtest) # For Breusch-Pagan and White tests
str(vote1)
## 'data.frame': 173 obs. of 10 variables:
## $ state : chr "AL" "AK" "AZ" "AZ" ...
## $ district: int 7 1 2 3 3 4 2 3 5 6 ...
## $ democA : int 1 0 1 0 0 1 0 1 1 1 ...
## $ voteA : int 68 62 73 69 75 69 59 71 76 73 ...
## $ expendA : num 328.3 626.4 99.6 319.7 159.2 ...
## $ expendB : num 8.74 402.48 3.07 26.28 60.05 ...
## $ prtystrA: int 41 60 55 64 66 46 58 49 71 64 ...
## $ lexpendA: num 5.79 6.44 4.6 5.77 5.07 ...
## $ lexpendB: num 2.17 6 1.12 3.27 4.1 ...
## $ shareA : num 97.4 60.9 97 92.4 72.6 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
model <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
summary(model)
##
## Call:
## lm(formula = voteA ~ prtystrA + democA + log(expendA) + log(expendB),
## data = vote1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.576 -4.864 -1.146 4.903 24.566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.66141 4.73604 7.952 2.56e-13 ***
## prtystrA 0.25192 0.07129 3.534 0.00053 ***
## democA 3.79294 1.40652 2.697 0.00772 **
## log(expendA) 5.77929 0.39182 14.750 < 2e-16 ***
## log(expendB) -6.23784 0.39746 -15.694 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared: 0.8012, Adjusted R-squared: 0.7964
## F-statistic: 169.2 on 4 and 168 DF, p-value: < 2.2e-16
residuals <- resid(model)
residuals_model <- lm(residuals ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
summary(residuals_model)
##
## Call:
## lm(formula = residuals ~ prtystrA + democA + log(expendA) + log(expendB),
## data = vote1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.576 -4.864 -1.146 4.903 24.566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.029e-15 4.736e+00 0 1
## prtystrA 3.001e-17 7.129e-02 0 1
## democA -2.151e-15 1.407e+00 0 1
## log(expendA) 7.193e-17 3.918e-01 0 1
## log(expendB) -1.305e-15 3.975e-01 0 1
##
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared: 8.097e-32, Adjusted R-squared: -0.02381
## F-statistic: 3.401e-30 on 4 and 168 DF, p-value: 1
cat("Explanation: The R-squared is 0 because the residuals are orthogonal to the fitted values of the independent variables by construction in OLS.\n")
## Explanation: The R-squared is 0 because the residuals are orthogonal to the fitted values of the independent variables by construction in OLS.
bp_test <- bptest(model)
cat("Breusch-Pagan test results:\n")
## Breusch-Pagan test results:
print(bp_test)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 9.0934, df = 4, p-value = 0.05881
white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2), data = vote1)
cat("White test results:\n")
## White test results:
print(white_test)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 5.49, df = 2, p-value = 0.06425
library(wooldridge)
library(lmtest)
library(sandwich)
library(car)
str(fertil2)
## 'data.frame': 4361 obs. of 27 variables:
## $ mnthborn: int 5 1 7 11 5 8 7 9 12 9 ...
## $ yearborn: int 64 56 58 45 45 52 51 70 53 39 ...
## $ age : int 24 32 30 42 43 36 37 18 34 49 ...
## $ electric: int 1 1 1 1 1 1 1 1 0 1 ...
## $ radio : int 1 1 0 0 1 0 1 1 1 1 ...
## $ tv : int 1 1 0 1 1 0 1 1 0 0 ...
## $ bicycle : int 1 1 0 0 1 0 1 1 0 0 ...
## $ educ : int 12 13 5 4 11 7 16 10 5 4 ...
## $ ceb : int 0 3 1 3 2 1 4 0 1 0 ...
## $ agefbrth: int NA 25 27 17 24 26 20 NA 19 NA ...
## $ children: int 0 3 1 2 2 1 4 0 1 0 ...
## $ knowmeth: int 1 1 1 1 1 1 1 1 1 1 ...
## $ usemeth : int 0 1 0 0 1 1 1 1 1 0 ...
## $ monthfm : int NA 11 6 1 3 11 5 NA 7 11 ...
## $ yearfm : int NA 80 83 61 66 76 78 NA 72 61 ...
## $ agefm : int NA 24 24 15 20 24 26 NA 18 22 ...
## $ idlnchld: int 2 3 5 3 2 4 4 4 4 4 ...
## $ heduc : int NA 12 7 11 14 9 17 NA 3 1 ...
## $ agesq : int 576 1024 900 1764 1849 1296 1369 324 1156 2401 ...
## $ urban : int 1 1 1 1 1 1 1 1 1 1 ...
## $ urb_educ: int 12 13 5 4 11 7 16 10 5 4 ...
## $ spirit : int 0 0 1 0 0 0 0 0 0 1 ...
## $ protest : int 0 0 0 0 1 0 0 0 1 0 ...
## $ catholic: int 0 0 0 0 0 0 1 1 0 0 ...
## $ frsthalf: int 1 1 0 0 1 0 0 0 0 0 ...
## $ educ0 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ evermarr: int 0 1 1 1 1 1 1 0 1 1 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
model1 <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
summary(model1)
##
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban,
## data = fertil2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9012 -0.7136 -0.0039 0.7119 7.4318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.2225162 0.2401888 -17.580 < 2e-16 ***
## age 0.3409255 0.0165082 20.652 < 2e-16 ***
## I(age^2) -0.0027412 0.0002718 -10.086 < 2e-16 ***
## educ -0.0752323 0.0062966 -11.948 < 2e-16 ***
## electric -0.3100404 0.0690045 -4.493 7.20e-06 ***
## urban -0.2000339 0.0465062 -4.301 1.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.452 on 4352 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.5734, Adjusted R-squared: 0.5729
## F-statistic: 1170 on 5 and 4352 DF, p-value: < 2.2e-16
robust_se <- coeftest(model1, vcov = vcovHC(model1, type = "HC1"))
cat("Heteroskedasticity-robust standard errors:\n")
## Heteroskedasticity-robust standard errors:
print(robust_se)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.22251623 0.24385099 -17.3160 < 2.2e-16 ***
## age 0.34092552 0.01917466 17.7800 < 2.2e-16 ***
## I(age^2) -0.00274121 0.00035051 -7.8206 6.549e-15 ***
## educ -0.07523232 0.00630771 -11.9270 < 2.2e-16 ***
## electric -0.31004041 0.06394815 -4.8483 1.289e-06 ***
## urban -0.20003386 0.04547093 -4.3992 1.113e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("Compare the robust and non-robust standard errors. Robust standard errors are generally larger when heteroskedasticity is present.\n")
## Compare the robust and non-robust standard errors. Robust standard errors are generally larger when heteroskedasticity is present.
model2 <- lm(children ~ age + I(age^2) + educ + electric + urban + catholic + protest + spirit, data = fertil2)
summary(model2)
##
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban +
## catholic + protest + spirit, data = fertil2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9514 -0.7041 -0.0101 0.7213 7.4435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.3147175 0.2433391 -17.731 < 2e-16 ***
## age 0.3418668 0.0165186 20.696 < 2e-16 ***
## I(age^2) -0.0027596 0.0002722 -10.139 < 2e-16 ***
## educ -0.0762130 0.0064608 -11.796 < 2e-16 ***
## electric -0.3057189 0.0690241 -4.429 9.69e-06 ***
## urban -0.2034131 0.0465864 -4.366 1.29e-05 ***
## catholic 0.1173539 0.0834249 1.407 0.1596
## protest 0.0753961 0.0652158 1.156 0.2477
## spirit 0.1404603 0.0558052 2.517 0.0119 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.451 on 4349 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.574, Adjusted R-squared: 0.5733
## F-statistic: 732.6 on 8 and 4349 DF, p-value: < 2.2e-16
joint_test <- linearHypothesis(model2, c("catholic = 0", "protest = 0", "spirit = 0"))
cat("Joint significance test for religious dummy variables:\n")
## Joint significance test for religious dummy variables:
print(joint_test)
##
## Linear hypothesis test:
## catholic = 0
## protest = 0
## spirit = 0
##
## Model 1: restricted model
## Model 2: children ~ age + I(age^2) + educ + electric + urban + catholic +
## protest + spirit
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4352 9176.4
## 2 4349 9162.5 3 13.88 2.1961 0.08641 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("P-values from non-robust tests are in the joint significance test above.\n")
## P-values from non-robust tests are in the joint significance test above.
robust_joint_test <- linearHypothesis(model2, c("catholic = 0", "protest = 0", "spirit = 0"), vcov = vcovHC(model2, type = "HC1"))
cat("Robust joint significance test for religious dummy variables:\n")
## Robust joint significance test for religious dummy variables:
print(robust_joint_test)
##
## Linear hypothesis test:
## catholic = 0
## protest = 0
## spirit = 0
##
## Model 1: restricted model
## Model 2: children ~ age + I(age^2) + educ + electric + urban + catholic +
## protest + spirit
##
## Note: Coefficient covariance matrix supplied.
##
## Res.Df Df F Pr(>F)
## 1 4352
## 2 4349 3 2.1559 0.0911 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fitted_values <- fitted(model2)
residuals_squared <- residuals(model2)^2
heteroskedasticity_model <- lm(residuals_squared ~ fitted_values + I(fitted_values^2))
summary(heteroskedasticity_model)
##
## Call:
## lm(formula = residuals_squared ~ fitted_values + I(fitted_values^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.873 -1.290 -0.302 0.357 47.684
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3126 0.1114 2.807 0.00503 **
## fitted_values -0.1489 0.1018 -1.462 0.14381
## I(fitted_values^2) 0.2668 0.0196 13.607 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.632 on 4355 degrees of freedom
## Multiple R-squared: 0.2501, Adjusted R-squared: 0.2497
## F-statistic: 726.1 on 2 and 4355 DF, p-value: < 2.2e-16
heteroskedasticity_test <- linearHypothesis(heteroskedasticity_model, c("fitted_values = 0", "I(fitted_values^2) = 0"))
cat("Heteroskedasticity test:\n")
## Heteroskedasticity test:
print(heteroskedasticity_test)
##
## Linear hypothesis test:
## fitted_values = 0
## I(fitted_values^2) = 0
##
## Model 1: restricted model
## Model 2: residuals_squared ~ fitted_values + I(fitted_values^2)
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4357 76589
## 2 4355 57436 2 19153 726.11 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("Conclude that heteroskedasticity is present if the p-value is below 0.05.\n")
## Conclude that heteroskedasticity is present if the p-value is below 0.05.
cat("Practical importance depends on the size of the residual variance relative to the model's predictions and whether robust standard errors significantly change inference.\n")
## Practical importance depends on the size of the residual variance relative to the model's predictions and whether robust standard errors significantly change inference.
# Load necessary library
library(wooldridge)
# View the structure of the CEOSAL2 dataset
str(ceosal2)
## 'data.frame': 177 obs. of 15 variables:
## $ salary : int 1161 600 379 651 497 1067 945 1261 503 1094 ...
## $ age : int 49 43 51 55 44 64 59 63 47 64 ...
## $ college : int 1 1 1 1 1 1 1 1 1 1 ...
## $ grad : int 1 1 1 0 1 1 0 1 1 1 ...
## $ comten : int 9 10 9 22 8 7 35 32 4 39 ...
## $ ceoten : int 2 10 3 22 6 7 10 8 4 5 ...
## $ sales : num 6200 283 169 1100 351 19000 536 4800 610 2900 ...
## $ profits : int 966 48 40 -54 28 614 24 191 7 230 ...
## $ mktval : num 23200 1100 1100 1000 387 3900 623 2100 454 3900 ...
## $ lsalary : num 7.06 6.4 5.94 6.48 6.21 ...
## $ lsales : num 8.73 5.65 5.13 7 5.86 ...
## $ lmktval : num 10.05 7 7 6.91 5.96 ...
## $ comtensq: int 81 100 81 484 64 49 1225 1024 16 1521 ...
## $ ceotensq: int 4 100 9 484 36 49 100 64 16 25 ...
## $ profmarg: num 15.58 16.96 23.67 -4.91 7.98 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
# Question: Functional form misspecification
# Step 1: Estimate the base model
model_base <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten, data = ceosal2)
# Summary of the base model
summary(model_base)
##
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg +
## ceoten + comten, data = ceosal2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5436 -0.2796 -0.0164 0.2857 1.9879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.571977 0.253466 18.038 < 2e-16 ***
## log(sales) 0.187787 0.040003 4.694 5.46e-06 ***
## log(mktval) 0.099872 0.049214 2.029 0.04397 *
## profmarg -0.002211 0.002105 -1.050 0.29514
## ceoten 0.017104 0.005540 3.087 0.00236 **
## comten -0.009238 0.003337 -2.768 0.00626 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4947 on 171 degrees of freedom
## Multiple R-squared: 0.3525, Adjusted R-squared: 0.3336
## F-statistic: 18.62 on 5 and 171 DF, p-value: 9.488e-15
# Extract R-squared for the base model
r2_base <- summary(model_base)$r.squared
cat("R-squared for the base model:", r2_base, "\n")
## R-squared for the base model: 0.3525374
# Step 2: Add ceoten^2 and comten^2 to the model
model_extended <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + I(ceoten^2) + comten + I(comten^2), data = ceosal2)
# Summary of the extended model
summary(model_extended)
##
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg +
## ceoten + I(ceoten^2) + comten + I(comten^2), data = ceosal2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.47661 -0.26615 -0.03878 0.27787 1.89344
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.424e+00 2.656e-01 16.655 < 2e-16 ***
## log(sales) 1.857e-01 3.980e-02 4.665 6.25e-06 ***
## log(mktval) 1.018e-01 4.872e-02 2.089 0.038228 *
## profmarg -2.575e-03 2.088e-03 -1.233 0.219137
## ceoten 4.772e-02 1.416e-02 3.371 0.000929 ***
## I(ceoten^2) -1.119e-03 4.814e-04 -2.324 0.021329 *
## comten -6.063e-03 1.189e-02 -0.510 0.610815
## I(comten^2) -5.389e-05 2.528e-04 -0.213 0.831474
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4892 on 169 degrees of freedom
## Multiple R-squared: 0.3745, Adjusted R-squared: 0.3486
## F-statistic: 14.45 on 7 and 169 DF, p-value: 1.136e-14
# Extract R-squared for the extended model
r2_extended <- summary(model_extended)$r.squared
cat("R-squared for the extended model:", r2_extended, "\n")
## R-squared for the extended model: 0.3744746
# Step 3: Test for functional form misspecification
# Calculate the difference in R-squared
diff_r2 <- r2_extended - r2_base
cat("Difference in R-squared:", diff_r2, "\n")
## Difference in R-squared: 0.02193721
# Perform an F-test to test the joint significance of ceoten^2 and comten^2
anova_test <- anova(model_base, model_extended)
cat("ANOVA test results:\n")
## ANOVA test results:
print(anova_test)
## Analysis of Variance Table
##
## Model 1: log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten +
## comten
## Model 2: log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten +
## I(ceoten^2) + comten + I(comten^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 171 41.856
## 2 169 40.438 2 1.4182 2.9634 0.05433 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Conclusion
cat("If the p-value from the F-test is less than 0.05, we reject the null hypothesis that ceoten^2 and comten^2 have no effect, indicating evidence of functional form misspecification.\n")
## If the p-value from the F-test is less than 0.05, we reject the null hypothesis that ceoten^2 and comten^2 have no effect, indicating evidence of functional form misspecification.
# Explanation of Exogenous Sample Selection
cat("Exogenous sample selection occurs when inclusion/exclusion from the sample\n")
## Exogenous sample selection occurs when inclusion/exclusion from the sample
cat("is unrelated to the dependent variable (e.g., campus crimes).\n\n")
## is unrelated to the dependent variable (e.g., campus crimes).
cat("- If colleges fail to report crimes due to random factors (e.g., lack of resources),\n")
## - If colleges fail to report crimes due to random factors (e.g., lack of resources),
cat(" the selection is exogenous, and regression results remain unbiased.\n\n")
## the selection is exogenous, and regression results remain unbiased.
cat("- If failure to report is related to crime rates (e.g., high-crime colleges avoid reporting),\n")
## - If failure to report is related to crime rates (e.g., high-crime colleges avoid reporting),
cat(" the selection is endogenous, leading to biased results (sample selection bias).\n\n")
## the selection is endogenous, leading to biased results (sample selection bias).
cat("Endogenous selection can be addressed with methods like the Heckman selection model.\n")
## Endogenous selection can be addressed with methods like the Heckman selection model.
library(wooldridge)
data_1990 <- subset(infmrt, year == 1990)
str(data_1990)
## 'data.frame': 51 obs. of 12 variables:
## $ year : int 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ infmort: num 6.2 7.1 6.4 7 8.1 ...
## $ afdcprt: int 62 21 25 282 52 135 1031 323 549 657 ...
## $ popul : int 1228 1109 563 6016 1003 3287 17990 7730 11882 10847 ...
## $ pcinc : int 17125 21051 17630 22558 18771 25528 22068 24977 18725 17422 ...
## $ physic : int 178 200 253 337 254 305 315 246 235 196 ...
## $ afdcper: num 5.05 1.89 4.44 4.69 5.18 ...
## $ d90 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ lpcinc : num 9.75 9.95 9.78 10.02 9.84 ...
## $ lphysic: num 5.18 5.3 5.53 5.82 5.54 ...
## $ DC : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lpopul : num 7.11 7.01 6.33 8.7 6.91 ...
model_with_dc <- lm(infmort ~ log(pcinc) + afdcper + physic + d90 + DC, data = data_1990)
summary(model_with_dc)
##
## Call:
## lm(formula = infmort ~ log(pcinc) + afdcper + physic + d90 +
## DC, data = data_1990)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4020 -0.7924 -0.1626 0.9818 2.8058
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.122458 16.646868 0.548 0.5863
## log(pcinc) 0.058412 1.773035 0.033 0.9739
## afdcper 0.377671 0.150912 2.503 0.0159 *
## physic -0.011217 0.005873 -1.910 0.0624 .
## d90 NA NA NA NA
## DC 14.527523 2.352392 6.176 1.58e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.304 on 46 degrees of freedom
## Multiple R-squared: 0.662, Adjusted R-squared: 0.6326
## F-statistic: 22.52 on 4 and 46 DF, p-value: 2.374e-10
# Interpret the coefficient for DC
coef_dc <- summary(model_with_dc)$coefficients["DC", ]
cat("Coefficient for DC:", coef_dc["Estimate"], "\n")
## Coefficient for DC: 14.52752
cat("Standard Error for DC:", coef_dc["Std. Error"], "\n")
## Standard Error for DC: 2.352392
cat("P-value for DC:", coef_dc["Pr(>|t|)"], "\n")
## P-value for DC: 1.578493e-07
cat("Interpretation: The coefficient for DC represents the difference in infant mortality\n")
## Interpretation: The coefficient for DC represents the difference in infant mortality
cat("relative to other states after controlling for other variables.\n")
## relative to other states after controlling for other variables.
model_without_dc <- lm(infmort ~ log(pcinc) + afdcper + physic + d90, data = data_1990)
summary(model_without_dc)
##
## Call:
## lm(formula = infmort ~ log(pcinc) + afdcper + physic + d90, data = data_1990)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7333 -0.9933 0.0199 1.0997 4.4608
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.471078 20.094084 2.661 0.01063 *
## log(pcinc) -5.029289 2.100645 -2.394 0.02070 *
## afdcper 0.377488 0.201916 1.870 0.06779 .
## physic 0.016439 0.005085 3.233 0.00224 **
## d90 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.744 on 47 degrees of freedom
## Multiple R-squared: 0.3818, Adjusted R-squared: 0.3423
## F-statistic: 9.674 on 3 and 47 DF, p-value: 4.383e-05
cat("\nComparison of estimates and standard errors:\n")
##
## Comparison of estimates and standard errors:
common_vars <- intersect(
rownames(summary(model_without_dc)$coefficients),
rownames(summary(model_with_dc)$coefficients)
)
comparison <- data.frame(
Variable = common_vars,
Estimate_Without_DC = summary(model_without_dc)$coefficients[common_vars, "Estimate"],
Std_Error_Without_DC = summary(model_without_dc)$coefficients[common_vars, "Std. Error"],
Estimate_With_DC = summary(model_with_dc)$coefficients[common_vars, "Estimate"],
Std_Error_With_DC = summary(model_with_dc)$coefficients[common_vars, "Std. Error"]
)
# Print the comparison
print(comparison)
## Variable Estimate_Without_DC Std_Error_Without_DC
## (Intercept) (Intercept) 53.47107751 20.094083512
## log(pcinc) log(pcinc) -5.02928918 2.100644629
## afdcper afdcper 0.37748839 0.201916424
## physic physic 0.01643928 0.005084557
## Estimate_With_DC Std_Error_With_DC
## (Intercept) 9.12245833 16.646867718
## log(pcinc) 0.05841225 1.773035150
## afdcper 0.37767143 0.150911800
## physic -0.01121737 0.005873413
cat("\nConclusion:\n")
##
## Conclusion:
cat("Including a dummy variable for a single observation (DC) can significantly\n")
## Including a dummy variable for a single observation (DC) can significantly
cat("alter the coefficients and standard errors of the model. This happens because\n")
## alter the coefficients and standard errors of the model. This happens because
cat("the dummy absorbs variability specific to that single observation, which can\n")
## the dummy absorbs variability specific to that single observation, which can
cat("affect the estimates for other variables.\n")
## affect the estimates for other variables.
library(wooldridge)
library(quantreg)
## Loading required package: SparseM
str(rdchem)
## 'data.frame': 32 obs. of 8 variables:
## $ rd : num 430.6 59 23.5 3.5 1.7 ...
## $ sales : num 4570 2830 597 134 42 ...
## $ profits : num 186.9 467 107.4 -4.3 8 ...
## $ rdintens: num 9.42 2.08 3.94 2.62 4.05 ...
## $ profmarg: num 4.09 16.5 18 -3.22 19.05 ...
## $ salessq : num 20886730 8008900 356170 17849 1764 ...
## $ lsales : num 8.43 7.95 6.39 4.89 3.74 ...
## $ lrd : num 6.065 4.078 3.157 1.253 0.531 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
rdchem$sales_bil <- rdchem$sales / 1000
model_ols_all <- lm(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem)
summary_ols_all <- summary(model_ols_all)
cat("OLS model with all firms:\n")
## OLS model with all firms:
print(summary_ols_all)
##
## Call:
## lm(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg,
## data = rdchem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0371 -1.1238 -0.4547 0.7165 5.8522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.058967 0.626269 3.288 0.00272 **
## sales_bil 0.316611 0.138854 2.280 0.03041 *
## I(sales_bil^2) -0.007390 0.003716 -1.989 0.05657 .
## profmarg 0.053322 0.044210 1.206 0.23787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared: 0.1905, Adjusted R-squared: 0.1037
## F-statistic: 2.196 on 3 and 28 DF, p-value: 0.1107
rdchem_trimmed <- subset(rdchem, sales_bil < 40)
model_ols_trimmed <- lm(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem_trimmed)
summary_ols_trimmed <- summary(model_ols_trimmed)
cat("\nOLS model without the largest firm:\n")
##
## OLS model without the largest firm:
print(summary_ols_trimmed)
##
## Call:
## lm(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg,
## data = rdchem_trimmed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0371 -1.1238 -0.4547 0.7165 5.8522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.058967 0.626269 3.288 0.00272 **
## sales_bil 0.316611 0.138854 2.280 0.03041 *
## I(sales_bil^2) -0.007390 0.003716 -1.989 0.05657 .
## profmarg 0.053322 0.044210 1.206 0.23787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared: 0.1905, Adjusted R-squared: 0.1037
## F-statistic: 2.196 on 3 and 28 DF, p-value: 0.1107
cat("\nComparison of OLS estimates:\n")
##
## Comparison of OLS estimates:
comparison_ols <- data.frame(
Variable = rownames(summary_ols_all$coefficients),
Estimate_All = summary_ols_all$coefficients[, "Estimate"],
Estimate_Trimmed = summary_ols_trimmed$coefficients[, "Estimate"]
)
print(comparison_ols)
## Variable Estimate_All Estimate_Trimmed
## (Intercept) (Intercept) 2.058966676 2.058966676
## sales_bil sales_bil 0.316610977 0.316610977
## I(sales_bil^2) I(sales_bil^2) -0.007389624 -0.007389624
## profmarg profmarg 0.053322172 0.053322172
model_lad_all <- rq(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem, tau = 0.5)
summary_lad_all <- summary(model_lad_all)
cat("\nLAD model with all firms:\n")
##
## LAD model with all firms:
print(summary_lad_all)
##
## Call: rq(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg,
## tau = 0.5, data = rdchem)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 1.40428 0.87031 2.66628
## sales_bil 0.26346 -0.13508 0.75753
## I(sales_bil^2) -0.00600 -0.01679 0.00344
## profmarg 0.11400 0.01376 0.16427
model_lad_trimmed <- rq(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem_trimmed, tau = 0.5)
summary_lad_trimmed <- summary(model_lad_trimmed)
cat("\nLAD model without the largest firm:\n")
##
## LAD model without the largest firm:
print(summary_lad_trimmed)
##
## Call: rq(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg,
## tau = 0.5, data = rdchem_trimmed)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 1.40428 0.87031 2.66628
## sales_bil 0.26346 -0.13508 0.75753
## I(sales_bil^2) -0.00600 -0.01679 0.00344
## profmarg 0.11400 0.01376 0.16427
cat("\nComparison of LAD estimates:\n")
##
## Comparison of LAD estimates:
comparison_lad <- data.frame(
Variable = rownames(summary_lad_all$coefficients),
Estimate_All = summary_lad_all$coefficients[, 1],
Estimate_Trimmed = summary_lad_trimmed$coefficients[, 1]
)
print(comparison_lad)
## Variable Estimate_All Estimate_Trimmed
## (Intercept) (Intercept) 1.404279415 1.404279415
## sales_bil sales_bil 0.263463789 0.263463789
## I(sales_bil^2) I(sales_bil^2) -0.006001127 -0.006001127
## profmarg profmarg 0.114003661 0.114003661
cat("\nConclusion:\n")
##
## Conclusion:
cat("Compare the differences in estimates between models with and without the largest firm for OLS and LAD.\n")
## Compare the differences in estimates between models with and without the largest firm for OLS and LAD.
cat("If LAD estimates change less than OLS estimates, it demonstrates that LAD is more resilient to outliers.\n")
## If LAD estimates change less than OLS estimates, it demonstrates that LAD is more resilient to outliers.
cat("Question (i):\n")
## Question (i):
cat("Disagree. Time series data are typically not independently distributed because\n")
## Disagree. Time series data are typically not independently distributed because
cat("observations are often correlated over time (e.g., due to trends, cycles, or autocorrelation).\n")
## observations are often correlated over time (e.g., due to trends, cycles, or autocorrelation).
cat("Independence is a common assumption in cross-sectional data but is rarely valid in time series data.\n\n")
## Independence is a common assumption in cross-sectional data but is rarely valid in time series data.
cat("Question (ii):\n")
## Question (ii):
cat("Agree. The first three Gauss-Markov assumptions are:\n")
## Agree. The first three Gauss-Markov assumptions are:
cat("1. Linearity in parameters.\n")
## 1. Linearity in parameters.
cat("2. No perfect multicollinearity.\n")
## 2. No perfect multicollinearity.
cat("3. Zero conditional mean of errors (no endogeneity).\n")
## 3. Zero conditional mean of errors (no endogeneity).
cat("Under these conditions, OLS remains unbiased in time series regression.\n\n")
## Under these conditions, OLS remains unbiased in time series regression.
cat("Question (iii):\n")
## Question (iii):
cat("Disagree. A trending variable can be used as the dependent variable, but\n")
## Disagree. A trending variable can be used as the dependent variable, but
cat("the regression model must account for the trend (e.g., include a time trend variable or differences).\n")
## the regression model must account for the trend (e.g., include a time trend variable or differences).
cat("Failing to account for the trend can lead to spurious regression results.\n\n")
## Failing to account for the trend can lead to spurious regression results.
cat("Question (iv):\n")
## Question (iv):
cat("Agree. Seasonality typically affects higher-frequency data such as monthly or quarterly observations.\n")
## Agree. Seasonality typically affects higher-frequency data such as monthly or quarterly observations.
cat("In annual time series data, seasonal effects are less relevant because they average out over the year.\n\n")
## In annual time series data, seasonal effects are less relevant because they average out over the year.
# Print the model using the cat function
cat("Model Specification:\n")
## Model Specification:
cat("housing_starts = β0 + β1 * time + β2 * Q1 + β3 * Q2 + β4 * Q3 + β5 * interest_rate + β6 * income + u\n")
## housing_starts = β0 + β1 * time + β2 * Q1 + β3 * Q2 + β4 * Q3 + β5 * interest_rate + β6 * income + u
cat("\nWhere:\n")
##
## Where:
cat("- 'time' captures the trend in housing starts.\n")
## - 'time' captures the trend in housing starts.
cat("- 'Q1', 'Q2', 'Q3' are quarterly dummies to account for seasonality (Q4 is the reference category).\n")
## - 'Q1', 'Q2', 'Q3' are quarterly dummies to account for seasonality (Q4 is the reference category).
cat("- 'interest_rate' captures the relationship between borrowing costs and housing starts.\n")
## - 'interest_rate' captures the relationship between borrowing costs and housing starts.
cat("- 'income' captures the effect of real per capita income on housing starts.\n")
## - 'income' captures the effect of real per capita income on housing starts.
library(wooldridge)
str(intdef)
## 'data.frame': 56 obs. of 13 variables:
## $ year : int 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 ...
## $ i3 : num 1.04 1.1 1.22 1.55 1.77 ...
## $ inf : num 8.1 -1.2 1.3 7.9 1.9 ...
## $ rec : num 16.2 14.5 14.4 16.1 19 ...
## $ out : num 11.6 14.3 15.6 14.2 19.4 ...
## $ def : num -4.6 -0.2 1.2 -1.9 0.4 ...
## $ i3_1 : num NA 1.04 1.1 1.22 1.55 ...
## $ inf_1: num NA 8.1 -1.2 1.3 7.9 ...
## $ def_1: num NA -4.6 -0.2 1.2 -1.9 ...
## $ ci3 : num NA 0.06 0.12 0.33 0.22 ...
## $ cinf : num NA -9.3 2.5 6.6 -6 ...
## $ cdef : num NA 4.4 1.4 -3.1 2.3 ...
## $ y77 : int 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
intdef$after1979 <- ifelse(intdef$year > 1979, 1, 0)
model <- lm(i3 ~ inf + def + after1979, data = intdef)
summary(model)
##
## Call:
## lm(formula = i3 ~ inf + def + after1979, data = intdef)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4674 -0.8407 0.2388 1.0148 3.9654
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.29623 0.42535 3.047 0.00362 **
## inf 0.60842 0.07625 7.979 1.37e-10 ***
## def 0.36266 0.12025 3.016 0.00396 **
## after1979 1.55877 0.50577 3.082 0.00329 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.711 on 52 degrees of freedom
## Multiple R-squared: 0.6635, Adjusted R-squared: 0.6441
## F-statistic: 34.18 on 3 and 52 DF, p-value: 2.408e-12
coef_after1979 <- summary(model)$coefficients["after1979", ]
cat("\nCoefficient for after1979:\n")
##
## Coefficient for after1979:
cat("Estimate:", coef_after1979["Estimate"], "\n")
## Estimate: 1.558766
cat("Standard Error:", coef_after1979["Std. Error"], "\n")
## Standard Error: 0.5057717
cat("P-value:", coef_after1979["Pr(>|t|)"], "\n")
## P-value: 0.003285646
if (coef_after1979["Pr(>|t|)"] < 0.05) {
cat("\nConclusion:\nThe coefficient for 'after1979' is statistically significant, indicating a shift\n")
cat("in the interest rate equation after 1979. The Federal Reserve's policy change likely\n")
cat("altered the relationship between interest rates and other variables.\n")
} else {
cat("\nConclusion:\nThe coefficient for 'after1979' is not statistically significant, suggesting\n")
cat("no substantial shift in the interest rate equation after 1979.\n")
}
##
## Conclusion:
## The coefficient for 'after1979' is statistically significant, indicating a shift
## in the interest rate equation after 1979. The Federal Reserve's policy change likely
## altered the relationship between interest rates and other variables.
library(wooldridge)
str(volat)
## 'data.frame': 558 obs. of 17 variables:
## $ date : num 1947 1947 1947 1947 1947 ...
## $ sp500 : num 15.2 15.8 15.2 14.6 14.3 ...
## $ divyld: num 4.49 4.38 4.61 4.75 5.05 ...
## $ i3 : num 0.38 0.38 0.38 0.38 0.38 ...
## $ ip : num 22.4 22.5 22.6 22.5 22.6 ...
## $ pcsp : num NA 46.5 -48.6 -44.3 -21.4 ...
## $ rsp500: num NA 50.9 -44 -39.6 -16.3 ...
## $ pcip : num NA 5.36 5.33 -5.31 5.33 ...
## $ ci3 : num NA 0 0 0 0 ...
## $ ci3_1 : num NA NA 0 0 0 ...
## $ ci3_2 : num NA NA NA 0 0 ...
## $ pcip_1: num NA NA 5.36 5.33 -5.31 ...
## $ pcip_2: num NA NA NA 5.36 5.33 ...
## $ pcip_3: num NA NA NA NA 5.36 ...
## $ pcsp_1: num NA NA 46.5 -48.6 -44.3 ...
## $ pcsp_2: num NA NA NA 46.5 -48.6 ...
## $ pcsp_3: num NA NA NA NA 46.5 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
cat("Question (i):\n")
## Question (i):
cat("β1 (pcip): Expected to have a positive sign, as increases in industrial production\n")
## β1 (pcip): Expected to have a positive sign, as increases in industrial production
cat(" often signal economic growth, which could positively impact stock returns.\n")
## often signal economic growth, which could positively impact stock returns.
cat("β2 (i3): Expected to have a negative sign, as higher T-bill returns might make\n")
## β2 (i3): Expected to have a negative sign, as higher T-bill returns might make
cat(" risk-free investments more attractive, reducing stock returns.\n\n")
## risk-free investments more attractive, reducing stock returns.
model <- lm(rsp500 ~ pcip + i3, data = volat)
summary_model <- summary(model)
cat("Question (ii):\n")
## Question (ii):
print(summary_model)
##
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -157.871 -22.580 2.103 25.524 138.137
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.84306 3.27488 5.754 1.44e-08 ***
## pcip 0.03642 0.12940 0.281 0.7785
## i3 -1.36169 0.54072 -2.518 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.13 on 554 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01189, Adjusted R-squared: 0.008325
## F-statistic: 3.334 on 2 and 554 DF, p-value: 0.03637
cat("\nInterpretation of Coefficients:\n")
##
## Interpretation of Coefficients:
cat("β1 (pcip): Indicates the change in S&P 500 returns for a 1% change in industrial production.\n")
## β1 (pcip): Indicates the change in S&P 500 returns for a 1% change in industrial production.
cat("β2 (i3): Indicates the change in S&P 500 returns for a 1% change in the T-bill rate.\n")
## β2 (i3): Indicates the change in S&P 500 returns for a 1% change in the T-bill rate.
cat("\nQuestion (iii):\n")
##
## Question (iii):
significance <- summary_model$coefficients[, "Pr(>|t|)"]
significant_vars <- names(significance[significance < 0.05])
cat("Statistically significant variables (p < 0.05):", significant_vars, "\n")
## Statistically significant variables (p < 0.05): (Intercept) i3
cat("\nQuestion (iv):\n")
##
## Question (iv):
if ("pcip" %in% significant_vars || "i3" %in% significant_vars) {
cat("The statistical significance of predictors suggests that returns on the S&P 500\n")
cat("are predictable to some extent based on changes in industrial production (pcip)\n")
cat("or T-bill returns (i3). However, this does not guarantee strong predictability,\n")
cat("as financial markets are generally efficient.\n")
} else {
cat("If no variables are statistically significant, it implies that the S&P 500 returns\n")
cat("may not be predictable using these variables, consistent with market efficiency.\n")
}
## The statistical significance of predictors suggests that returns on the S&P 500
## are predictable to some extent based on changes in industrial production (pcip)
## or T-bill returns (i3). However, this does not guarantee strong predictability,
## as financial markets are generally efficient.
library(wooldridge)
str(consump)
## 'data.frame': 37 obs. of 24 variables:
## $ year : int 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 ...
## $ i3 : num 3.41 2.93 2.38 2.78 3.16 ...
## $ inf : num 0.7 1.7 1 1 1.3 ...
## $ rdisp : num 1530 1565 1616 1694 1756 ...
## $ rnondc : num 606 615 627 646 660 ...
## $ rserv : num 687 717 746 783 819 ...
## $ pop : num 177830 180671 183691 186538 189242 ...
## $ y : num 8604 8664 8796 9080 9276 ...
## $ rcons : num 1294 1333 1373 1430 1479 ...
## $ c : num 7275 7377 7476 7665 7814 ...
## $ r3 : num 2.71 1.23 1.38 1.78 1.86 ...
## $ lc : num 8.89 8.91 8.92 8.94 8.96 ...
## $ ly : num 9.06 9.07 9.08 9.11 9.14 ...
## $ gc : num NA 0.0139 0.0133 0.0251 0.0192 ...
## $ gy : num NA 0.00696 0.01511 0.03171 0.02145 ...
## $ gc_1 : num NA NA 0.0139 0.0133 0.0251 ...
## $ gy_1 : num NA NA 0.00696 0.01511 0.03171 ...
## $ r3_1 : num NA 2.71 1.23 1.38 1.78 ...
## $ lc_ly : num -0.168 -0.161 -0.163 -0.169 -0.172 ...
## $ lc_ly_1: num NA -0.168 -0.161 -0.163 -0.169 ...
## $ gc_2 : num NA NA NA 0.0139 0.0133 ...
## $ gy_2 : num NA NA NA 0.00696 0.01511 ...
## $ r3_2 : num NA NA 2.71 1.23 1.38 ...
## $ lc_ly_2: num NA NA -0.168 -0.161 -0.163 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
model1 <- lm(gc ~ gc_1, data = consump)
summary_model1 <- summary(model1)
cat("Question (i):\n")
## Question (i):
print(summary_model1)
##
## Call:
## lm(formula = gc ~ gc_1, data = consump)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.027878 -0.005974 -0.001450 0.007142 0.020227
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.011431 0.003778 3.026 0.00478 **
## gc_1 0.446133 0.156047 2.859 0.00731 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01161 on 33 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1985, Adjusted R-squared: 0.1742
## F-statistic: 8.174 on 1 and 33 DF, p-value: 0.007311
cat("\nNull Hypothesis (H0): β1 = 0, indicating no relationship between gc_t and gc_(t-1).\n")
##
## Null Hypothesis (H0): β1 = 0, indicating no relationship between gc_t and gc_(t-1).
cat("Alternative Hypothesis (H1): β1 ≠ 0, indicating a relationship exists.\n")
## Alternative Hypothesis (H1): β1 ≠ 0, indicating a relationship exists.
if (summary_model1$coefficients["gc_1", "Pr(>|t|)"] < 0.05) {
cat("Conclusion: Reject H0 at the 5% level. There is evidence of a relationship.\n")
} else {
cat("Conclusion: Fail to reject H0. The PIH is supported by the data.\n")
}
## Conclusion: Reject H0 at the 5% level. There is evidence of a relationship.
model2 <- lm(gc ~ gc_1 + gy_1 + i3 + inf, data = consump)
summary_model2 <- summary(model2)
cat("\nQuestion (ii):\n")
##
## Question (ii):
print(summary_model2)
##
## Call:
## lm(formula = gc ~ gc_1 + gy_1 + i3 + inf, data = consump)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0262462 -0.0076662 0.0009066 0.0063953 0.0174975
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.007e-02 5.839e-03 3.438 0.00174 **
## gc_1 5.398e-01 2.604e-01 2.073 0.04689 *
## gy_1 -1.017e-01 1.792e-01 -0.567 0.57463
## i3 -2.523e-05 1.028e-03 -0.025 0.98059
## inf -1.701e-03 8.825e-04 -1.928 0.06341 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01069 on 30 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3819, Adjusted R-squared: 0.2995
## F-statistic: 4.635 on 4 and 30 DF, p-value: 0.004938
cat("\nSignificance of new variables (gy_1, i3_1, inf_1):\n")
##
## Significance of new variables (gy_1, i3_1, inf_1):
p_values <- summary_model2$coefficients[c("gy_1", "i3", "inf"), "Pr(>|t|)"]
print(p_values)
## gy_1 i3 inf
## 0.57463167 0.98058921 0.06340824
cat("\nJoint Significance of gy_1, i3_1, inf_1 (at 5% level):\n")
##
## Joint Significance of gy_1, i3_1, inf_1 (at 5% level):
if (all(p_values < 0.05)) {
cat("All new variables are individually significant.\n")
} else if (any(p_values < 0.05)) {
cat("Some of the new variables are individually significant.\n")
} else {
cat("None of the new variables are individually significant.\n")
}
## None of the new variables are individually significant.
cat("\nQuestion (iii):\n")
##
## Question (iii):
p_value_gc1 <- summary_model2$coefficients["gc_1", "Pr(>|t|)"]
cat("P-value for gc_1 in the extended model:", p_value_gc1, "\n")
## P-value for gc_1 in the extended model: 0.04689048
if (p_value_gc1 < 0.05) {
cat("Conclusion: gc_1 is still significant after including the new variables.\n")
} else {
cat("Conclusion: gc_1 is no longer significant, providing more support for the PIH.\n")
}
## Conclusion: gc_1 is still significant after including the new variables.
anova_test <- anova(model1, model2)
cat("\nQuestion (iv): F-test for joint significance:\n")
##
## Question (iv): F-test for joint significance:
print(anova_test)
## Analysis of Variance Table
##
## Model 1: gc ~ gc_1
## Model 2: gc ~ gc_1 + gy_1 + i3 + inf
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 33 0.0044447
## 2 30 0.0034276 3 0.0010171 2.9675 0.04767 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\nF-statistic and associated p-value for the extended model:\n")
##
## F-statistic and associated p-value for the extended model:
f_stat <- anova_test$`F`[2]
p_value_f <- anova_test$`Pr(>F)`[2]
cat("F-statistic:", f_stat, "\n")
## F-statistic: 2.967484
cat("P-value:", p_value_f, "\n")
## P-value: 0.04766985
if (p_value_f < 0.05) {
cat("Conclusion: The explanatory variables are jointly significant, suggesting PIH is less supported.\n")
} else {
cat("Conclusion: The explanatory variables are not jointly significant, supporting the PIH.\n")
}
## Conclusion: The explanatory variables are jointly significant, suggesting PIH is less supported.
##C12
library(wooldridge)
str(minwage)
## 'data.frame': 612 obs. of 58 variables:
## $ emp232 : num 271 272 269 263 258 ...
## $ wage232 : num 0.86 0.86 0.86 0.86 0.87 ...
## $ emp236 : num 53.1 54.8 52.9 48.3 47.6 ...
## $ wage236 : num 0.99 1 0.98 0.91 0.93 ...
## $ emp234 : num 93.4 94.6 95.4 95.2 96.3 ...
## $ wage234 : num 0.92 0.92 0.91 0.92 0.92 ...
## $ emp314 : num 254 257 257 254 246 ...
## $ wage314 : num 0.99 0.98 0.99 0.99 0.99 ...
## $ emp228 : num 185 185 182 180 174 ...
## $ wage228 : num 0.91 0.94 0.98 0.98 0.98 ...
## $ emp233 : num 346 358 358 327 303 ...
## $ wage233 : num 1.55 1.57 1.54 1.42 1.38 ...
## $ emp394 : num 79.5 79.5 79.6 78.2 76.2 ...
## $ wage394 : num 1.03 1.04 1.05 1.05 1.06 ...
## $ emp231 : num 144 146 147 146 146 ...
## $ wage231 : num 1.27 1.27 1.27 1.25 1.25 ...
## $ emp226 : num 90.6 90.7 90.6 89.4 88.6 ...
## $ wage226 : num 1.07 1.08 1.1 1.13 1.12 ...
## $ emp387 : num 41.9 42.4 42 41.7 41.6 ...
## $ wage387 : num 1.07 1.07 1.08 1.09 1.1 ...
## $ emp056 : num 560 544 572 582 578 ...
## $ wage056 : num 0.94 0.94 0.96 0.98 0.97 ...
## $ unem : num NA NA NA NA NA NA NA NA NA NA ...
## $ cpi : num 21.5 21.5 21.9 21.9 21.9 ...
## $ minwage : num 0.4 0.4 0.4 0.4 0.4 ...
## $ lemp232 : num 5.6 5.61 5.59 5.57 5.55 ...
## $ lwage232 : num -0.151 -0.151 -0.151 -0.151 -0.139 ...
## $ gemp232 : num NA 0.00663 -0.01405 -0.02259 -0.01767 ...
## $ gwage232 : num NA 0 0 0 0.0116 ...
## $ lminwage : num -0.916 -0.916 -0.916 -0.916 -0.916 ...
## $ gmwage : num NA 0 0 0 0 0 0 0 0 0 ...
## $ gmwage_1 : num NA NA 0 0 0 0 0 0 0 0 ...
## $ gmwage_2 : num NA NA NA 0 0 0 0 0 0 0 ...
## $ gmwage_3 : num NA NA NA NA 0 0 0 0 0 0 ...
## $ gmwage_4 : num NA NA NA NA NA 0 0 0 0 0 ...
## $ gmwage_5 : num NA NA NA NA NA NA 0 0 0 0 ...
## $ gmwage_6 : num NA NA NA NA NA NA NA 0 0 0 ...
## $ gmwage_7 : num NA NA NA NA NA NA NA NA 0 0 ...
## $ gmwage_8 : num NA NA NA NA NA NA NA NA NA 0 ...
## $ gmwage_9 : num NA NA NA NA NA NA NA NA NA NA ...
## $ gmwage_10: num NA NA NA NA NA NA NA NA NA NA ...
## $ gmwage_11: num NA NA NA NA NA NA NA NA NA NA ...
## $ gmwage_12: num NA NA NA NA NA NA NA NA NA NA ...
## $ lemp236 : num 3.97 4 3.97 3.88 3.86 ...
## $ gcpi : num NA 0 0.0184 0 0 ...
## $ lcpi : num 3.07 3.07 3.09 3.09 3.09 ...
## $ lwage236 : num -0.0101 0 -0.0202 -0.0943 -0.0726 ...
## $ gemp236 : num NA 0.0315 -0.0353 -0.091 -0.0146 ...
## $ gwage236 : num NA 0.0101 -0.0202 -0.0741 0.0217 ...
## $ lemp234 : num 4.54 4.55 4.56 4.56 4.57 ...
## $ lwage234 : num -0.0834 -0.0834 -0.0943 -0.0834 -0.0834 ...
## $ gemp234 : num NA 0.01277 0.00842 -0.0021 0.01149 ...
## $ gwage234 : num NA 0 -0.0109 0.0109 0 ...
## $ lemp314 : num 5.54 5.55 5.55 5.54 5.5 ...
## $ lwage314 : num -0.0101 -0.0202 -0.0101 -0.0101 -0.0101 ...
## $ gemp314 : num NA 0.01058 0.00117 -0.01214 -0.03203 ...
## $ gwage314 : num NA -0.0102 0.0102 0 0 ...
## $ t : int 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
acf_result <- acf(minwage$wage232, lag.max = 1, plot = FALSE)
first_order_ac <- acf_result$acf[2]
cat("Question (i):\n")
## Question (i):
cat("First-order autocorrelation in gwage232:", first_order_ac, "\n")
## First-order autocorrelation in gwage232: 0.9951486
if (abs(first_order_ac) > 0.5) {
cat("The series appears to be weakly dependent, as the autocorrelation is significant.\n")
} else {
cat("The series does not appear to be strongly dependent.\n")
}
## The series appears to be weakly dependent, as the autocorrelation is significant.
model_dynamic <- lm(gwage232 ~ lag(gwage232, 1) + gmwage + gcpi, data = minwage)
cat("\nQuestion (ii):\n")
##
## Question (ii):
summary_dynamic <- summary(model_dynamic)
print(summary_dynamic)
##
## Call:
## lm(formula = gwage232 ~ lag(gwage232, 1) + gmwage + gcpi, data = minwage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.138e-17 -1.537e-18 -3.990e-19 1.031e-18 3.063e-16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.369e-18 6.877e-19 4.899e+00 1.24e-06 ***
## lag(gwage232, 1) 1.000e+00 6.479e-17 1.544e+16 < 2e-16 ***
## gmwage -4.748e-20 1.825e-17 -3.000e-03 0.998
## gcpi 5.317e-16 1.321e-16 4.024e+00 6.46e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.264e-17 on 607 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.123e+32 on 3 and 607 DF, p-value: < 2.2e-16
cat("Interpretation: The coefficient on gmwage indicates the contemporaneous impact of minimum wage growth\n")
## Interpretation: The coefficient on gmwage indicates the contemporaneous impact of minimum wage growth
cat("on wage growth in sector 232, controlling for past wage growth and CPI growth.\n")
## on wage growth in sector 232, controlling for past wage growth and CPI growth.
model_extended <- lm(gwage232 ~ lag(gwage232, 1) + gmwage + gcpi + lag(gemp232, 1), data = minwage)
cat("\nQuestion (iii):\n")
##
## Question (iii):
summary_extended <- summary(model_extended)
print(summary_extended)
##
## Call:
## lm(formula = gwage232 ~ lag(gwage232, 1) + gmwage + gcpi + lag(gemp232,
## 1), data = minwage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.123e-17 -1.582e-18 -4.110e-19 8.800e-19 3.060e-16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.369e-18 6.880e-19 4.897e+00 1.25e-06 ***
## lag(gwage232, 1) 1.000e+00 6.483e-17 1.543e+16 < 2e-16 ***
## gmwage 1.042e-19 1.826e-17 6.000e-03 0.995
## gcpi 5.319e-16 1.322e-16 4.024e+00 6.45e-05 ***
## lag(gemp232, 1) -2.146e-17 2.747e-17 -7.810e-01 0.435
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.264e-17 on 606 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 8.416e+31 on 4 and 606 DF, p-value: < 2.2e-16
p_value_gemp <- summary_extended$coefficients["lag(gemp232, 1)", "Pr(>|t|)"]
cat("\nP-value for lagged growth in employment (gemp232_1):", p_value_gemp, "\n")
##
## P-value for lagged growth in employment (gemp232_1): 0.4350518
if (p_value_gemp < 0.05) {
cat("Lagged growth in employment is statistically significant.\n")
} else {
cat("Lagged growth in employment is not statistically significant.\n")
}
## Lagged growth in employment is not statistically significant.
model_without_lagged <- lm(gwage232 ~ gmwage + gcpi, data = minwage)
cat("\nQuestion (iv):\n")
##
## Question (iv):
summary_without_lagged <- summary(model_without_lagged)
comparison <- data.frame(
Variable = rownames(summary_without_lagged$coefficients),
Coefficient_Without_Lagged = summary_without_lagged$coefficients[, "Estimate"],
Coefficient_With_Lagged = summary_extended$coefficients[rownames(summary_without_lagged$coefficients), "Estimate"]
)
print(comparison)
## Variable Coefficient_Without_Lagged Coefficient_With_Lagged
## (Intercept) (Intercept) 0.002184222 3.368611e-18
## gmwage gmwage 0.150571350 1.042475e-19
## gcpi gcpi 0.243522252 5.318641e-16
cat("Adding lagged variables affects the coefficient on gmwage if the lagged variables\n")
## Adding lagged variables affects the coefficient on gmwage if the lagged variables
cat("are significant and explain part of the variation in gwage232.\n")
## are significant and explain part of the variation in gwage232.
model_rsq <- lm(gmwage ~ lag(gwage232, 1) + lag(gemp232, 1), data = minwage)
cat("\nQuestion (v):\n")
##
## Question (v):
summary_rsq <- summary(model_rsq)
cat("R-squared of the regression of gmwage on lagged variables:", summary_rsq$r.squared, "\n")
## R-squared of the regression of gmwage on lagged variables: 0.2825375
cat("Interpretation: A high R-squared indicates that gmwage is predictable based on lagged variables,\n")
## Interpretation: A high R-squared indicates that gmwage is predictable based on lagged variables,
cat("which could affect its coefficient in the model.\n")
## which could affect its coefficient in the model.
library(wooldridge)
library(lmtest) # For testing heteroskedasticity
library(tseries) # For ARCH models
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(fGarch) # For ARCH/GARCH estimation
## NOTE: Packages 'fBasics', 'timeDate', and 'timeSeries' are no longer
## attached to the search() path when 'fGarch' is attached.
##
## If needed attach them yourself in your R script by e.g.,
## require("timeSeries")
data("nyse")
model_ols <- lm(return ~ return_1, data = nyse)
summary(model_ols)
##
## Call:
## lm(formula = return ~ return_1, data = nyse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.261 -1.302 0.098 1.316 8.065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.17963 0.08074 2.225 0.0264 *
## return_1 0.05890 0.03802 1.549 0.1218
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.11 on 687 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.003481, Adjusted R-squared: 0.00203
## F-statistic: 2.399 on 1 and 687 DF, p-value: 0.1218
residuals_ols <- residuals(model_ols)
squared_residuals <- residuals_ols^2
cat("Question (i):\n")
## Question (i):
cat("Average of squared residuals:", mean(squared_residuals), "\n")
## Average of squared residuals: 4.440839
cat("Minimum of squared residuals:", min(squared_residuals), "\n")
## Minimum of squared residuals: 7.35465e-06
cat("Maximum of squared residuals:", max(squared_residuals), "\n")
## Maximum of squared residuals: 232.8946
nyse_clean <- nyse[complete.cases(nyse$return, nyse$return_1), ]
# Add squared residuals to the cleaned dataset
nyse_clean$squared_residuals <- residuals(lm(return ~ return_1, data = nyse_clean))^2
nyse_clean <- nyse[complete.cases(nyse$return_1, nyse$squared_residuals), ]
model_arch1 <- lm(squared_residuals ~ return_1 + I(return_1^2), data = nyse_clean)
cat("\nQuestion (ii):\n")
##
## Question (ii):
summary_arch1 <- summary(model_arch1)
print(summary_arch1)
##
## Call:
## lm(formula = squared_residuals ~ return_1 + I(return_1^2), data = nyse_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.459 -3.011 -1.975 0.676 221.469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.25734 0.44085 7.389 4.32e-13 ***
## return_1 -0.78946 0.19569 -4.034 6.09e-05 ***
## I(return_1^2) 0.29666 0.03552 8.351 3.75e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.66 on 686 degrees of freedom
## Multiple R-squared: 0.1303, Adjusted R-squared: 0.1278
## F-statistic: 51.4 on 2 and 686 DF, p-value: < 2.2e-16
nyse_clean <- nyse[complete.cases(nyse$return_1, nyse$squared_residuals), ]
cat("Length of return_1:", length(nyse_clean$return_1), "\n")
## Length of return_1: 689
cat("Length of squared_residuals:", length(nyse_clean$squared_residuals), "\n")
## Length of squared_residuals: 0
cat("\nQuestion (iii):\n")
##
## Question (iii):
cat("The conditional variance is a function of return_1. Sketching requires plotting.\n")
## The conditional variance is a function of return_1. Sketching requires plotting.
plot(
nyse_clean$return_1,
nyse_clean$squared_residuals,
main = "Conditional Variance vs. Lagged Return",
xlab = "Lagged Return (return_1)",
ylab = "Squared Residuals",
pch = 20
)
conditional_variance <- fitted(model_arch1)
cat("Length of conditional_variance:", length(conditional_variance), "\n")
## Length of conditional_variance: 689
cat("Minimum variance:", min(conditional_variance), "\n")
## Minimum variance: 2.732132
cat("Maximum variance:", max(conditional_variance), "\n")
## Maximum variance: 84.99515