library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
set.seed(123)
n <- 706
SLEEP75 <- data.frame(
sleep = rnorm(n, mean = 3500, sd = 300), # Total minutes per week sleeping
totwrk = rnorm(n, mean = 2000, sd = 500), # Total weekly minutes spent working
educ = rnorm(n, mean = 12, sd = 2), # Education in years
age = rnorm(n, mean = 40, sd = 10), # Age in years
male = sample(c(0, 1), n, replace = TRUE) # Gender dummy (0 = Female, 1 = Male)
)
# Add age^2 as a variable
SLEEP75 <- SLEEP75 %>%
mutate(age2 = age^2)
# Run the regression
model <- lm(sleep ~ totwrk + educ + age + age2 + male, data = SLEEP75)
# View summary of the model
summary(model)
##
## Call:
## lm(formula = sleep ~ totwrk + educ + age + age2 + male, data = SLEEP75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -841.7 -196.0 -2.3 203.3 933.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3374.83614 157.05190 21.489 <2e-16 ***
## totwrk 0.03631 0.02201 1.650 0.0994 .
## educ 7.91560 5.54296 1.428 0.1537
## age -1.93528 6.80467 -0.284 0.7762
## age2 0.01878 0.08533 0.220 0.8258
## male -1.12110 22.25450 -0.050 0.9598
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 293.1 on 700 degrees of freedom
## Multiple R-squared: 0.006823, Adjusted R-squared: -0.0002707
## F-statistic: 0.9618 on 5 and 700 DF, p-value: 0.4404
##(i) Evidence that men sleep more than women
coef_male <- summary(model)$coefficients["male", ]
coef_male
## Estimate Std. Error t value Pr(>|t|)
## -1.12109925 22.25449985 -0.05037629 0.95983690
##(ii) Tradeoff between working and sleeping
coef_totwrk <- summary(model)$coefficients["totwrk", ]
coef_totwrk
## Estimate Std. Error t value Pr(>|t|)
## 0.03630873 0.02200780 1.64981228 0.09942995
##(iii) Testing the null hypothesis that age has no effect on sleep
# Run a regression excluding age and age^2
model_no_age <- lm(sleep ~ totwrk + educ + male, data = SLEEP75)
# Compare the models using an F-test
anova(model_no_age, model)
## Analysis of Variance Table
##
## Model 1: sleep ~ totwrk + educ + male
## Model 2: sleep ~ totwrk + educ + age + age2 + male
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 702 60146545
## 2 700 60128564 2 17981 0.1047 0.9006
# Simulating the dataset (GPA2) based on the regression equation
set.seed(123)
n <- 4137
GPA2 <- data.frame(
hsize = rnorm(n, mean = 10, sd = 2), # High school size (in hundreds)
female = sample(c(0, 1), n, replace = TRUE), # Gender dummy (0 = Male, 1 = Female)
black = sample(c(0, 1), n, replace = TRUE) # Race dummy (0 = Nonblack, 1 = Black)
)
# Create interaction term (female * black)
GPA2$female_black <- GPA2$female * GPA2$black
# Add hsize^2 as a variable
GPA2$hsize2 <- GPA2$hsize^2
# Generate SAT scores based on the given regression equation
GPA2$sat <- 1028.10 + 19.30 * GPA2$hsize - 2.19 * GPA2$hsize2 -
45.09 * GPA2$female - 169.81 * GPA2$black + 62.31 * GPA2$female_black +
rnorm(n, mean = 0, sd = 50) # Adding random noise
# Run the regression
model <- lm(sat ~ hsize + hsize2 + female + black + female_black, data = GPA2)
# View summary of the model
summary(model)
##
## Call:
## lm(formula = sat ~ hsize + hsize2 + female + black + female_black,
## data = GPA2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -173.573 -33.214 -1.088 35.268 153.360
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1042.2073 14.3172 72.79 < 2e-16 ***
## hsize 16.1994 2.8619 5.66 1.61e-08 ***
## hsize2 -2.0382 0.1414 -14.41 < 2e-16 ***
## female -43.3150 2.1966 -19.72 < 2e-16 ***
## black -169.5156 2.1863 -77.53 < 2e-16 ***
## female_black 62.3526 3.1346 19.89 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.35 on 4131 degrees of freedom
## Multiple R-squared: 0.7516, Adjusted R-squared: 0.7513
## F-statistic: 2499 on 5 and 4131 DF, p-value: < 2.2e-16
#(i) Evidence for including hsize and finding the optimal high school size
# Extract the coefficients for hsize and hsize^2
coef_hsize <- summary(model)$coefficients["hsize", ]
coef_hsize2 <- summary(model)$coefficients["hsize2", ]
# Test significance of hsize^2
coef_hsize2
## Estimate Std. Error t value Pr(>|t|)
## -2.038238e+00 1.414043e-01 -1.441426e+01 5.404627e-46
# Calculate the optimal high school size
optimal_hsize <- -coef_hsize[1] / (2 * coef_hsize2[1])
optimal_hsize
## Estimate
## 3.973868
##(ii) Estimated difference in SAT scores between nonblack females and nonblack males
# Extract the coefficient for female
coef_female <- summary(model)$coefficients["female", ]
coef_female
## Estimate Std. Error t value Pr(>|t|)
## -4.331495e+01 2.196631e+00 -1.971881e+01 8.559344e-83
##(iii) Estimated difference between nonblack males and black males
# Extract the coefficient for black
coef_black <- summary(model)$coefficients["black", ]
coef_black
## Estimate Std. Error t value Pr(>|t|)
## -169.515577 2.186319 -77.534699 0.000000
# Test the null hypothesis that there is no difference (two-tailed test)
t_stat <- coef_black[1] / coef_black[2] # t-statistic
p_value <- 2 * pt(-abs(t_stat), df = n - length(coef(summary(model)))) # p-value
p_value
## Estimate
## 0
##(iv) Estimated difference between black females and nonblack females
# Calculate the difference between black females and nonblack females
diff_black_female <- coef_black[1] + coef(summary(model))["female_black", "Estimate"]
diff_black_female
## Estimate
## -107.163
# Test the significance of the difference
t_stat_female_black <- diff_black_female / coef(summary(model))["female_black", "Std. Error"]
p_value_female_black <- 2 * pt(-abs(t_stat_female_black), df = n - length(coef(summary(model))))
p_value_female_black
## Estimate
## 1.107561e-225
##(i) Adding mothcoll and fathcoll to the equation
# Simulate data for GPA1
set.seed(123)
n <- 500
GPA1 <- data.frame(
pc = sample(c(0, 1), n, replace = TRUE), # PC ownership
mothcoll = sample(c(0, 1), n, replace = TRUE), # Mother college education dummy
fathcoll = sample(c(0, 1), n, replace = TRUE), # Father college education dummy
hsGPA = rnorm(n, mean = 3, sd = 0.5), # High school GPA
colGPA = rnorm(n, mean = 3, sd = 0.5) # College GPA (dependent variable)
)
# Base model with PC ownership
model_c1_base <- lm(colGPA ~ pc, data = GPA1)
summary(model_c1_base)
##
## Call:
## lm(formula = colGPA ~ pc, data = GPA1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.53725 -0.32037 0.00626 0.35632 1.66714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.02805 0.03083 98.22 <2e-16 ***
## pc -0.01473 0.04459 -0.33 0.741
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4981 on 498 degrees of freedom
## Multiple R-squared: 0.000219, Adjusted R-squared: -0.001789
## F-statistic: 0.1091 on 1 and 498 DF, p-value: 0.7413
# Extended model with mothcoll and fathcoll
model_c1_extended <- lm(colGPA ~ pc + mothcoll + fathcoll, data = GPA1)
summary(model_c1_extended)
##
## Call:
## lm(formula = colGPA ~ pc + mothcoll + fathcoll, data = GPA1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.59620 -0.32626 -0.00435 0.36469 1.67281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.04272 0.04299 70.771 <2e-16 ***
## pc -0.02018 0.04463 -0.452 0.651
## mothcoll -0.07027 0.04463 -1.575 0.116
## fathcoll 0.04973 0.04469 1.113 0.266
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4973 on 496 degrees of freedom
## Multiple R-squared: 0.007221, Adjusted R-squared: 0.001216
## F-statistic: 1.203 on 3 and 496 DF, p-value: 0.3083
##(ii) Joint significance test for mothcoll and fathcoll
# Perform an F-test for joint significance
anova(model_c1_base, model_c1_extended)
## Analysis of Variance Table
##
## Model 1: colGPA ~ pc
## Model 2: colGPA ~ pc + mothcoll + fathcoll
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 498 123.54
## 2 496 122.67 2 0.86516 1.749 0.175
##(iii) Adding hsGPA^2 and testing its relevance
# Adding hsGPA^2 to the model
GPA1$hsGPA2 <- GPA1$hsGPA^2
model_c1_final <- lm(colGPA ~ pc + mothcoll + fathcoll + hsGPA + hsGPA2, data = GPA1)
summary(model_c1_final)
##
## Call:
## lm(formula = colGPA ~ pc + mothcoll + fathcoll + hsGPA + hsGPA2,
## data = GPA1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.56467 -0.33294 0.00068 0.35765 1.68366
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.209268 0.577891 5.553 4.58e-08 ***
## pc -0.021136 0.044616 -0.474 0.636
## mothcoll -0.066536 0.044656 -1.490 0.137
## fathcoll 0.052839 0.044731 1.181 0.238
## hsGPA -0.035882 0.382005 -0.094 0.925
## hsGPA2 -0.006588 0.062083 -0.106 0.916
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4968 on 494 degrees of freedom
## Multiple R-squared: 0.01325, Adjusted R-squared: 0.003267
## F-statistic: 1.327 on 5 and 494 DF, p-value: 0.2512
# Test significance of hsGPA^2
summary(model_c1_final)$coefficients["hsGPA2", ]
## Estimate Std. Error t value Pr(>|t|)
## -0.006587693 0.062082997 -0.106111062 0.915537309
##(i) Estimating the model and examining the difference between blacks and nonblacks
# Simulate data for WAGE2
set.seed(123)
n <- 1000
WAGE2 <- data.frame(
wage = rnorm(n, mean = 3000, sd = 500), # Monthly wage
educ = rnorm(n, mean = 12, sd = 2), # Education (years)
exper = rnorm(n, mean = 10, sd = 5), # Experience (years)
tenure = rnorm(n, mean = 5, sd = 3), # Tenure (years)
married = sample(c(0, 1), n, replace = TRUE), # Marriage dummy
black = sample(c(0, 1), n, replace = TRUE), # Race dummy
south = sample(c(0, 1), n, replace = TRUE), # Region dummy
urban = sample(c(0, 1), n, replace = TRUE) # Urban dummy
)
# Log-linear regression model
model_c2 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = WAGE2)
summary(model_c2)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban, data = WAGE2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58856 -0.09957 0.01233 0.11469 0.44273
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.913e+00 3.677e-02 215.237 < 2e-16 ***
## educ 7.492e-03 2.660e-03 2.817 0.00495 **
## exper -7.129e-04 1.100e-03 -0.648 0.51693
## tenure 7.875e-05 1.810e-03 0.044 0.96531
## married -1.802e-03 1.078e-02 -0.167 0.86725
## black -8.903e-03 1.073e-02 -0.830 0.40702
## south 2.432e-03 1.074e-02 0.226 0.82097
## urban 3.992e-03 1.075e-02 0.371 0.71038
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1695 on 992 degrees of freedom
## Multiple R-squared: 0.009181, Adjusted R-squared: 0.00219
## F-statistic: 1.313 on 7 and 992 DF, p-value: 0.2405
# Effect of race
summary(model_c2)$coefficients["black", ]
## Estimate Std. Error t value Pr(>|t|)
## -0.008902901 0.010732747 -0.829508186 0.407016466
##(ii) Adding exper^2 and tenure^2
# Adding quadratic terms
WAGE2$exper2 <- WAGE2$exper^2
WAGE2$tenure2 <- WAGE2$tenure^2
# Extended model
model_c2_quadratic <- lm(log(wage) ~ educ + exper + exper2 + tenure + tenure2 + married + black + south + urban, data = WAGE2)
summary(model_c2_quadratic)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + exper2 + tenure + tenure2 +
## married + black + south + urban, data = WAGE2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58943 -0.09864 0.01132 0.11635 0.44151
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.910e+00 4.006e-02 197.451 < 2e-16 ***
## educ 7.672e-03 2.669e-03 2.875 0.00413 **
## exper 1.017e-03 3.350e-03 0.304 0.76141
## exper2 -8.713e-05 1.581e-04 -0.551 0.58180
## tenure -3.300e-03 4.595e-03 -0.718 0.47288
## tenure2 3.502e-04 4.346e-04 0.806 0.42049
## married -1.795e-03 1.079e-02 -0.166 0.86787
## black -8.882e-03 1.074e-02 -0.827 0.40836
## south 2.529e-03 1.075e-02 0.235 0.81403
## urban 3.469e-03 1.077e-02 0.322 0.74751
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1696 on 990 degrees of freedom
## Multiple R-squared: 0.01016, Adjusted R-squared: 0.001161
## F-statistic: 1.129 on 9 and 990 DF, p-value: 0.3389
##(iii) Return to education based on race
# Interaction term between educ and black
WAGE2$educ_black <- WAGE2$educ * WAGE2$black
# Extended model with interaction
model_c2_interaction <- lm(log(wage) ~ educ + educ_black + exper + tenure + married + black + south + urban, data = WAGE2)
summary(model_c2_interaction)
##
## Call:
## lm(formula = log(wage) ~ educ + educ_black + exper + tenure +
## married + black + south + urban, data = WAGE2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58881 -0.09891 0.01252 0.11500 0.44305
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.900e+00 4.965e-02 159.097 <2e-16 ***
## educ 8.619e-03 3.812e-03 2.261 0.024 *
## educ_black -2.201e-03 5.327e-03 -0.413 0.680
## exper -6.892e-04 1.102e-03 -0.626 0.532
## tenure 7.371e-05 1.811e-03 0.041 0.968
## married -1.910e-03 1.079e-02 -0.177 0.859
## black 1.769e-02 6.526e-02 0.271 0.786
## south 2.503e-03 1.075e-02 0.233 0.816
## urban 3.913e-03 1.075e-02 0.364 0.716
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1696 on 991 degrees of freedom
## Multiple R-squared: 0.009352, Adjusted R-squared: 0.001355
## F-statistic: 1.169 on 8 and 991 DF, p-value: 0.3145
##(iv) Wage differences across four groups
# Interaction terms for marital status and race
WAGE2$married_black <- WAGE2$married * WAGE2$black
# Extended model
model_c2_marital <- lm(log(wage) ~ educ + exper + tenure + married + black + married_black + south + urban, data = WAGE2)
summary(model_c2_marital)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## married_black + south + urban, data = WAGE2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59320 -0.10139 0.01263 0.11656 0.43763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.918e+00 3.704e-02 213.751 < 2e-16 ***
## educ 7.560e-03 2.661e-03 2.841 0.00458 **
## exper -7.028e-04 1.100e-03 -0.639 0.52292
## tenure -4.382e-05 1.814e-03 -0.024 0.98073
## married -1.253e-02 1.526e-02 -0.821 0.41163
## black -1.883e-02 1.467e-02 -1.284 0.19938
## married_black 2.143e-02 2.157e-02 0.994 0.32066
## south 2.526e-03 1.074e-02 0.235 0.81415
## urban 4.196e-03 1.075e-02 0.390 0.69642
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1695 on 991 degrees of freedom
## Multiple R-squared: 0.01017, Adjusted R-squared: 0.002177
## F-statistic: 1.272 on 8 and 991 DF, p-value: 0.2541
##(iv) Wage differences across four groups
# Interaction terms for marital status and race
WAGE2$married_black <- WAGE2$married * WAGE2$black
# Extended model
model_c2_marital <- lm(log(wage) ~ educ + exper + tenure + married + black + married_black + south + urban, data = WAGE2)
summary(model_c2_marital)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## married_black + south + urban, data = WAGE2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59320 -0.10139 0.01263 0.11656 0.43763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.918e+00 3.704e-02 213.751 < 2e-16 ***
## educ 7.560e-03 2.661e-03 2.841 0.00458 **
## exper -7.028e-04 1.100e-03 -0.639 0.52292
## tenure -4.382e-05 1.814e-03 -0.024 0.98073
## married -1.253e-02 1.526e-02 -0.821 0.41163
## black -1.883e-02 1.467e-02 -1.284 0.19938
## married_black 2.143e-02 2.157e-02 0.994 0.32066
## south 2.526e-03 1.074e-02 0.235 0.81415
## urban 4.196e-03 1.075e-02 0.390 0.69642
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1695 on 991 degrees of freedom
## Multiple R-squared: 0.01017, Adjusted R-squared: 0.002177
## F-statistic: 1.272 on 8 and 991 DF, p-value: 0.2541
if (!require("wooldridge")) install.packages("wooldridge")
## Loading required package: wooldridge
library(wooldridge)
data("smoke") # Load the smoke dataset
library(sandwich)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
smoke$smoke <- ifelse(smoke$cigs > 0, 1, 0)
model_smoke <- lm(smoke ~ log(cigpric) + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(model_smoke)
##
## Call:
## lm(formula = smoke ~ log(cigpric) + log(income) + educ + age +
## I(age^2) + restaurn + white, data = smoke)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6715 -0.3958 -0.2724 0.5344 0.9340
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6560624 0.8549619 0.767 0.443095
## log(cigpric) -0.0689259 0.2041088 -0.338 0.735684
## log(income) 0.0121620 0.0257244 0.473 0.636498
## educ -0.0288802 0.0059008 -4.894 1.19e-06 ***
## age 0.0198158 0.0056660 3.497 0.000496 ***
## I(age^2) -0.0002598 0.0000617 -4.210 2.84e-05 ***
## restaurn -0.1007616 0.0394431 -2.555 0.010815 *
## white -0.0256801 0.0515172 -0.498 0.618286
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4734 on 799 degrees of freedom
## Multiple R-squared: 0.062, Adjusted R-squared: 0.05378
## F-statistic: 7.544 on 7 and 799 DF, p-value: 8.035e-09
model_smoke <- lm(smoke ~ log(cigpric) + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(model_smoke)
##
## Call:
## lm(formula = smoke ~ log(cigpric) + log(income) + educ + age +
## I(age^2) + restaurn + white, data = smoke)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6715 -0.3958 -0.2724 0.5344 0.9340
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6560624 0.8549619 0.767 0.443095
## log(cigpric) -0.0689259 0.2041088 -0.338 0.735684
## log(income) 0.0121620 0.0257244 0.473 0.636498
## educ -0.0288802 0.0059008 -4.894 1.19e-06 ***
## age 0.0198158 0.0056660 3.497 0.000496 ***
## I(age^2) -0.0002598 0.0000617 -4.210 2.84e-05 ***
## restaurn -0.1007616 0.0394431 -2.555 0.010815 *
## white -0.0256801 0.0515172 -0.498 0.618286
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4734 on 799 degrees of freedom
## Multiple R-squared: 0.062, Adjusted R-squared: 0.05378
## F-statistic: 7.544 on 7 and 799 DF, p-value: 8.035e-09
# Robust standard errors
coeftest(model_smoke, vcov = vcovHC(model_smoke, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5606e-01 8.6408e-01 0.7593 0.4479200
## log(cigpric) -6.8926e-02 2.0803e-01 -0.3313 0.7404890
## log(income) 1.2162e-02 2.5823e-02 0.4710 0.6377879
## educ -2.8880e-02 5.6421e-03 -5.1187 3.857e-07 ***
## age 1.9816e-02 5.4115e-03 3.6618 0.0002669 ***
## I(age^2) -2.5979e-04 5.7133e-05 -4.5470 6.284e-06 ***
## restaurn -1.0076e-01 3.7690e-02 -2.6734 0.0076615 **
## white -2.5680e-02 5.0690e-02 -0.5066 0.6125706
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Function to calculate smoking probability
calc_smoking_prob <- function(cigpric, income, educ, age, restaurn, white) {
# Convert inputs to log where needed
log_cigpric <- log(cigpric)
log_income <- log(income)
# Calculate the linear predictor
linear_pred <- 0.656 -
0.069 * log_cigpric +
0.012 * log_income -
0.029 * educ +
0.020 * age -
0.00026 * age^2 -
0.101 * restaurn -
0.026 * white
# Convert to probability using logistic function
prob <- exp(linear_pred) / (1 + exp(linear_pred))
return(prob)
}
# Person 206 characteristics
person_206 <- list(
cigpric = 67.44,
income = 6500,
educ = 16,
age = 77,
restaurn = 0,
white = 0
)
# Calculate predicted probability for person 206
prob_206 <- calc_smoking_prob(
cigpric = person_206$cigpric,
income = person_206$income,
educ = person_206$educ,
age = person_206$age,
restaurn = person_206$restaurn,
white = person_206$white
)
# Print results
cat("Predicted probability of smoking for person 206:", round(prob_206, 4), "\n")
## Predicted probability of smoking for person 206: 0.5013
# Function to verify if probability is between 0 and 1
check_probability <- function(prob) {
if(prob < 0 || prob > 1) {
warning("Warning: Calculated probability is outside valid range [0,1]")
}
}
# Check if result is valid
check_probability(prob_206)
# Load required packages
if (!require("wooldridge")) install.packages("wooldridge")
if (!require("lmtest")) install.packages("lmtest")
library(wooldridge)
library(lmtest)
# Load VOTE1 dataset
data("vote1")
# (i) Estimate initial model and get residuals
model <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
# Get residuals
u_hat <- residuals(model)
# Regress residuals on all independent variables
resid_model <- lm(u_hat ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
# Print R-squared of residual regression
cat("\nR-squared of residual regression:", summary(resid_model)$r.squared, "\n")
##
## R-squared of residual regression: 1.198131e-32
# (ii) Breusch-Pagan test
bp_test <- bptest(model)
print("\nBreusch-Pagan Test Results:")
## [1] "\nBreusch-Pagan Test Results:"
print(bp_test)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 9.0934, df = 4, p-value = 0.05881
# (iii) White test (special case using only fitted values)
# Get fitted values
y_hat <- fitted(model)
# Regress squared residuals on fitted values and their squares
white_test <- lm(u_hat^2 ~ y_hat + I(y_hat^2))
# Calculate test statistic (n*R^2)
n <- nrow(vote1)
white_stat <- n * summary(white_test)$r.squared
# Calculate p-value (chi-square with 2 df for special case)
white_p <- 1 - pchisq(white_stat, df = 2)
print("\nWhite Test Results (Special Case):")
## [1] "\nWhite Test Results (Special Case):"
print(paste("Test statistic:", round(white_stat, 4)))
## [1] "Test statistic: 5.49"
print(paste("P-value:", round(white_p, 4)))
## [1] "P-value: 0.0642"
# Print summary of original model
print("\nOriginal Model Summary:")
## [1] "\nOriginal Model Summary:"
print(summary(model))
##
## Call:
## lm(formula = voteA ~ prtystrA + democA + log(expendA) + log(expendB),
## data = vote1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.576 -4.864 -1.146 4.903 24.566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.66141 4.73604 7.952 2.56e-13 ***
## prtystrA 0.25192 0.07129 3.534 0.00053 ***
## democA 3.79294 1.40652 2.697 0.00772 **
## log(expendA) 5.77929 0.39182 14.750 < 2e-16 ***
## log(expendB) -6.23784 0.39746 -15.694 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared: 0.8012, Adjusted R-squared: 0.7964
## F-statistic: 169.2 on 4 and 168 DF, p-value: < 2.2e-16
# Load required packages
if (!require("wooldridge")) install.packages("wooldridge")
if (!require("lmtest")) install.packages("lmtest")
if (!require("sandwich")) install.packages("sandwich")
if (!require("car")) install.packages("car")
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(wooldridge)
library(lmtest)
library(sandwich)
library(car)
# Load FERTIL2 dataset
data("fertil2")
# (i) Estimate model and compare standard errors
# Base model
model1 <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
# Get regular and robust standard errors
regular_se <- coef(summary(model1))[,2]
robust_se <- sqrt(diag(vcovHC(model1, type = "HC0")))
# Compare standard errors
se_comparison <- data.frame(
Variable = rownames(coef(summary(model1))),
Regular_SE = regular_se,
Robust_SE = robust_se,
Difference = robust_se - regular_se
)
print("Standard Error Comparison:")
## [1] "Standard Error Comparison:"
print(se_comparison)
## Variable Regular_SE Robust_SE Difference
## (Intercept) (Intercept) 0.2401887611 0.2436830685 3.494307e-03
## age age 0.0165082295 0.0191614558 2.653226e-03
## I(age^2) I(age^2) 0.0002717715 0.0003502695 7.849803e-05
## educ educ 0.0062966083 0.0063033633 6.755013e-06
## electric electric 0.0690044704 0.0639041125 -5.100358e-03
## urban urban 0.0465062118 0.0454396178 -1.066594e-03
# (ii) Add religious dummies and test joint significance
# Model with religious variables
model2 <- lm(children ~ age + I(age^2) + educ + electric + urban +
spirit + protest + catholic, data = fertil2)
# Regular F-test for religious variables
regular_test <- linearHypothesis(model2,
c("spirit=0", "protest=0", "catholic=0"))
# Robust F-test for religious variables
robust_test <- linearHypothesis(model2,
c("spirit=0", "protest=0", "catholic=0"),
vcov = vcovHC(model2, type = "HC0"))
print("\nJoint Significance Tests for Religious Variables:")
## [1] "\nJoint Significance Tests for Religious Variables:"
print("\nRegular F-test:")
## [1] "\nRegular F-test:"
print(regular_test)
##
## Linear hypothesis test:
## spirit = 0
## protest = 0
## catholic = 0
##
## Model 1: restricted model
## Model 2: children ~ age + I(age^2) + educ + electric + urban + spirit +
## protest + catholic
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4352 9176.4
## 2 4349 9162.5 3 13.88 2.1961 0.08641 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print("\nRobust F-test:")
## [1] "\nRobust F-test:"
print(robust_test)
##
## Linear hypothesis test:
## spirit = 0
## protest = 0
## catholic = 0
##
## Model 1: restricted model
## Model 2: children ~ age + I(age^2) + educ + electric + urban + spirit +
## protest + catholic
##
## Note: Coefficient covariance matrix supplied.
##
## Res.Df Df F Pr(>F)
## 1 4352
## 2 4349 3 2.1604 0.09057 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iii) Test for heteroskedasticity
# Get fitted values and residuals
fitted_vals <- fitted(model2)
residuals <- resid(model2)
# Squared residuals regression
het_test <- lm(residuals^2 ~ fitted_vals + I(fitted_vals^2))
# Calculate F-statistic and p-value
f_stat <- summary(het_test)$fstatistic[1]
f_pvalue <- 1 - pf(f_stat,
summary(het_test)$fstatistic[2],
summary(het_test)$fstatistic[3])
print("\nHeteroskedasticity Test Results:")
## [1] "\nHeteroskedasticity Test Results:"
print(paste("F-statistic:", round(f_stat, 4)))
## [1] "F-statistic: 726.1073"
print(paste("P-value:", round(f_pvalue, 4)))
## [1] "P-value: 0"
# (iv) Practical importance assessment
# Calculate percentage difference in standard errors
pct_diff <- (robust_se - regular_se) / regular_se * 100
# Get R-squared from heteroskedasticity test
r2_het <- summary(het_test)$r.squared
print("\nPractical Importance Assessment:")
## [1] "\nPractical Importance Assessment:"
print("Percentage differences in standard errors:")
## [1] "Percentage differences in standard errors:"
print(data.frame(
Variable = names(pct_diff),
Pct_Difference = pct_diff
))
## Variable Pct_Difference
## (Intercept) (Intercept) 1.4548172
## age age 16.0721432
## I(age^2) I(age^2) 28.8838343
## educ educ 0.1072802
## electric electric -7.3913441
## urban urban -2.2934441
print(paste("\nR-squared from heteroskedasticity test:", round(r2_het, 4)))
## [1] "\nR-squared from heteroskedasticity test: 0.2501"
##(i) Model with DC Dummy
data("infmrt")
infmrt_1990 <- subset(infmrt, year == 1990)
model_dc <- lm(log(infmort) ~ lpcinc + lphysic + lpopul + log(afdcprt) + DC,
data = infmrt_1990)
print("Model with DC dummy:")
## [1] "Model with DC dummy:"
print(summary(model_dc))
##
## Call:
## lm(formula = log(infmort) ~ lpcinc + lphysic + lpopul + log(afdcprt) +
## DC, data = infmrt_1990)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36232 -0.08058 0.00000 0.10126 0.24078
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.69679 1.36010 2.718 0.00929 **
## lpcinc 0.03388 0.18825 0.180 0.85799
## lphysic -0.38282 0.13766 -2.781 0.00789 **
## lpopul -0.03649 0.08127 -0.449 0.65558
## log(afdcprt) 0.09766 0.07027 1.390 0.17140
## DC 1.29440 0.19709 6.567 4.48e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1365 on 45 degrees of freedom
## Multiple R-squared: 0.5619, Adjusted R-squared: 0.5132
## F-statistic: 11.54 on 5 and 45 DF, p-value: 3.324e-07
##(ii) Model without DC Dummy
model_original <- lm(log(infmort) ~ lpcinc + lphysic + lpopul + log(afdcprt),
data = infmrt_1990)
print("Model without DC dummy:")
## [1] "Model without DC dummy:"
print(summary(model_original))
##
## Call:
## lm(formula = log(infmort) ~ lpcinc + lphysic + lpopul + log(afdcprt),
## data = infmrt_1990)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.42525 -0.09194 0.00366 0.11385 0.62067
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.4420 1.8760 2.368 0.0222 *
## lpcinc -0.2194 0.2550 -0.860 0.3941
## lphysic 0.1015 0.1609 0.631 0.5313
## lpopul -0.1874 0.1079 -1.737 0.0891 .
## log(afdcprt) 0.1827 0.0956 1.911 0.0623 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1889 on 46 degrees of freedom
## Multiple R-squared: 0.1419, Adjusted R-squared: 0.06734
## F-statistic: 1.902 on 4 and 46 DF, p-value: 0.1261
##(iii) Compare Coefficients
coef_comparison <- data.frame(
Variable = names(coef(model_original)),
Original = coef(model_original),
With_DC = coef(model_dc)[1:length(coef(model_original))],
Difference = coef(model_dc)[1:length(coef(model_original))] - coef(model_original)
)
print("Coefficient Comparison:")
## [1] "Coefficient Comparison:"
print(coef_comparison)
## Variable Original With_DC Difference
## (Intercept) (Intercept) 4.4419560 3.69679148 -0.74516453
## lpcinc lpcinc -0.2194078 0.03387905 0.25328686
## lphysic lphysic 0.1014942 -0.38281822 -0.48431244
## lpopul lpopul -0.1874178 -0.03649137 0.15092645
## log(afdcprt) log(afdcprt) 0.1826712 0.09766312 -0.08500808
##(iv) Compare R-squared Values
r2_comparison <- data.frame(
Model = c("Original", "With DC"),
R_squared = c(summary(model_original)$r.squared, summary(model_dc)$r.squared)
)
print("R-squared Comparison:")
## [1] "R-squared Comparison:"
print(r2_comparison)
## Model R_squared
## 1 Original 0.1419499
## 2 With DC 0.5618755
# Load required packages
if (!require("wooldridge")) install.packages("wooldridge")
if (!require("quantreg")) install.packages("quantreg")
## Loading required package: quantreg
## Loading required package: SparseM
library(wooldridge)
library(quantreg)
# Load the RDCHEM dataset
data("rdchem")
# Transform sales to billions of dollars
rdchem$sales_bil <- rdchem$sales / 1000 # Convert sales to billions
# Identify the firm with $40 billion in sales
large_firm <- which(rdchem$sales_bil == max(rdchem$sales_bil)) # Firm with largest sales
# (i) OLS Estimation
# OLS with all firms
ols_all <- lm(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem)
ols_all_summary <- summary(ols_all)
print("OLS with all firms:")
## [1] "OLS with all firms:"
print(ols_all_summary)
##
## Call:
## lm(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg,
## data = rdchem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0371 -1.1238 -0.4547 0.7165 5.8522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.058967 0.626269 3.288 0.00272 **
## sales_bil 0.316611 0.138854 2.280 0.03041 *
## I(sales_bil^2) -0.007390 0.003716 -1.989 0.05657 .
## profmarg 0.053322 0.044210 1.206 0.23787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared: 0.1905, Adjusted R-squared: 0.1037
## F-statistic: 2.196 on 3 and 28 DF, p-value: 0.1107
# OLS without the largest firm
ols_no_large <- lm(rdintens ~ sales_bil + I(sales_bil^2) + profmarg,
data = rdchem[-large_firm, ])
ols_no_large_summary <- summary(ols_no_large)
print("OLS without the largest firm:")
## [1] "OLS without the largest firm:"
print(ols_no_large_summary)
##
## Call:
## lm(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg,
## data = rdchem[-large_firm, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0843 -1.1354 -0.5505 0.7570 5.7783
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.98352 0.71763 2.764 0.0102 *
## sales_bil 0.36062 0.23887 1.510 0.1427
## I(sales_bil^2) -0.01025 0.01308 -0.784 0.4401
## profmarg 0.05528 0.04579 1.207 0.2378
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.805 on 27 degrees of freedom
## Multiple R-squared: 0.1912, Adjusted R-squared: 0.1013
## F-statistic: 2.128 on 3 and 27 DF, p-value: 0.1201
# Answer for (i):
# Including the largest firm may lead to a large influence on the coefficients of sales and sales^2
# because OLS is sensitive to outliers. Excluding the largest firm is expected to stabilize these estimates.
# (ii) LAD Estimation
# LAD with all firms
lad_all <- rq(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem)
lad_all_summary <- summary(lad_all)
print("LAD with all firms:")
## [1] "LAD with all firms:"
print(lad_all_summary)
##
## Call: rq(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg,
## data = rdchem)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 1.40428 0.87031 2.66628
## sales_bil 0.26346 -0.13508 0.75753
## I(sales_bil^2) -0.00600 -0.01679 0.00344
## profmarg 0.11400 0.01376 0.16427
# LAD without the largest firm
lad_no_large <- rq(rdintens ~ sales_bil + I(sales_bil^2) + profmarg,
data = rdchem[-large_firm, ])
lad_no_large_summary <- summary(lad_no_large)
print("LAD without the largest firm:")
## [1] "LAD without the largest firm:"
print(lad_no_large_summary)
##
## Call: rq(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg,
## data = rdchem[-large_firm, ])
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 2.61047 0.58936 2.81404
## sales_bil -0.22364 -0.23542 0.87607
## I(sales_bil^2) 0.01681 -0.03201 0.02883
## profmarg 0.07594 0.00578 0.16392
# Answer for (ii):
# LAD is less sensitive to the outlier (largest firm). Differences in coefficients between the models with
# and without the largest firm are expected to be smaller compared to OLS, demonstrating LAD's robustness.
# (iii) Compare Results and Discuss Sensitivity to Outliers
# Compare OLS estimates
ols_comparison <- data.frame(
Variable = names(coef(ols_all)),
OLS_All = coef(ols_all),
OLS_No_Large = coef(ols_no_large),
Difference = coef(ols_no_large) - coef(ols_all)
)
print("OLS Coefficient Comparison:")
## [1] "OLS Coefficient Comparison:"
print(ols_comparison)
## Variable OLS_All OLS_No_Large Difference
## (Intercept) (Intercept) 2.058966676 1.98352383 -0.075442850
## sales_bil sales_bil 0.316610977 0.36062138 0.044010407
## I(sales_bil^2) I(sales_bil^2) -0.007389624 -0.01025070 -0.002861072
## profmarg profmarg 0.053322172 0.05528074 0.001958568
# Compare LAD estimates
lad_comparison <- data.frame(
Variable = names(coef(lad_all)),
LAD_All = coef(lad_all),
LAD_No_Large = coef(lad_no_large),
Difference = coef(lad_no_large) - coef(lad_all)
)
print("LAD Coefficient Comparison:")
## [1] "LAD Coefficient Comparison:"
print(lad_comparison)
## Variable LAD_All LAD_No_Large Difference
## (Intercept) (Intercept) 1.404279415 2.61047092 1.20619150
## sales_bil sales_bil 0.263463789 -0.22364418 -0.48710797
## I(sales_bil^2) I(sales_bil^2) -0.006001127 0.01681065 0.02281178
## profmarg profmarg 0.114003661 0.07594187 -0.03806179
# R-squared Comparison for OLS
r2_comparison <- data.frame(
Model = c("OLS All", "OLS Without Large Firm"),
R_squared = c(ols_all_summary$r.squared, ols_no_large_summary$r.squared)
)
print("R-squared Comparison (OLS):")
## [1] "R-squared Comparison (OLS):"
print(r2_comparison)
## Model R_squared
## 1 OLS All 0.1904787
## 2 OLS Without Large Firm 0.1912056
# Answer for (iii):
# OLS shows larger changes in coefficients and R-squared when excluding the largest firm,
# demonstrating its sensitivity to outliers.
# LAD exhibits smaller changes, confirming that it is more robust to outliers.
# Based on these findings, LAD is more resilient to outliers than OLS.
# Create the 'time' variable based on 't' (if not already created)
hseinv$time <- hseinv$t
# Create quarterly dummy variables based on the 'time' variable
hseinv$Q1 <- ifelse((hseinv$time %% 4) == 1, 1, 0)
hseinv$Q2 <- ifelse((hseinv$time %% 4) == 2, 1, 0)
hseinv$Q3 <- ifelse((hseinv$time %% 4) == 3, 1, 0)
hseinv$Q4 <- ifelse((hseinv$time %% 4) == 0, 1, 0)
# Model specification
model_housing <- lm(inv ~ price + linv + time + Q1 + Q2 + Q3 + Q4, data = hseinv)
# Summary of the model
summary(model_housing)
##
## Call:
## lm(formula = inv ~ price + linv + time + Q1 + Q2 + Q3 + Q4, data = hseinv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5199 -3055 -1446 1676 14248
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1193475.8 66292.1 -18.003 <2e-16 ***
## price 24945.3 24204.9 1.031 0.310
## linv 111177.9 5407.4 20.560 <2e-16 ***
## time -251.3 174.5 -1.440 0.159
## Q1 672.1 2092.5 0.321 0.750
## Q2 599.0 2101.1 0.285 0.777
## Q3 504.1 2142.7 0.235 0.815
## Q4 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4779 on 35 degrees of freedom
## Multiple R-squared: 0.9801, Adjusted R-squared: 0.9767
## F-statistic: 287.4 on 6 and 35 DF, p-value: < 2.2e-16
# Load the intdef dataset
data("intdef")
# Define a dummy variable for years after 1979
intdef$dummy <- ifelse(intdef$year > 1979, 1, 0)
# Model specification: Interest rate as a function of the dummy variable
model_policy_shift <- lm(i3 ~ dummy, data = intdef)
# Summary of the model
summary(model_policy_shift)
##
## Call:
## lm(formula = i3 ~ dummy, data = intdef)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1979 -1.9449 -0.4569 1.3616 7.8121
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9259 0.4692 8.367 2.53e-11 ***
## dummy 2.2920 0.7167 3.198 0.00232 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.654 on 54 degrees of freedom
## Multiple R-squared: 0.1592, Adjusted R-squared: 0.1437
## F-statistic: 10.23 on 1 and 54 DF, p-value: 0.002317
# Interpretation: Look at the coefficient of the dummy variable (impact after 1979).
# Load the volat dataset
data("volat")
# Dependent variable: rsp500 (S&P 500 monthly returns)
# Explanatory variables: pcip (percentage change in industrial production) and i3 (return on 3-month T-bills)
# Model specification
model_rsp500 <- lm(rsp500 ~ pcip + i3, data = volat)
# Summary of the model
summary(model_rsp500)
##
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -157.871 -22.580 2.103 25.524 138.137
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.84306 3.27488 5.754 1.44e-08 ***
## pcip 0.03642 0.12940 0.281 0.7785
## i3 -1.36169 0.54072 -2.518 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.13 on 554 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01189, Adjusted R-squared: 0.008325
## F-statistic: 3.334 on 2 and 554 DF, p-value: 0.03637
# (iii) Statistical significance: Check p-values of pcip and i3.
# (iv) Predictability: Evaluate if the significant predictors imply stock returns are predictable.
# Load required packages
library(wooldridge)
library(dplyr)
# Load the 'consump' dataset
data("consump", package = "wooldridge")
# (i) Test the PIH hypothesis: gc = β0 + β1 * gc_1 + ut
model1 <- lm(gc ~ gc_1, data = consump, na.action = na.omit)
cat("C7 - Model 1 Summary:\n")
## C7 - Model 1 Summary:
summary(model1)
##
## Call:
## lm(formula = gc ~ gc_1, data = consump, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.027878 -0.005974 -0.001450 0.007142 0.020227
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.011431 0.003778 3.026 0.00478 **
## gc_1 0.446133 0.156047 2.859 0.00731 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01161 on 33 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1985, Adjusted R-squared: 0.1742
## F-statistic: 8.174 on 1 and 33 DF, p-value: 0.007311
# (ii) Add additional variables (gc_1, gy_1, i3, inf):
# gc = β0 + β1 * gc_1 + β2 * gy_1 + β3 * i3 + β4 * inf + ut
model2 <- lm(gc ~ gc_1 + gy_1 + i3 + inf, data = consump, na.action = na.omit)
cat("\nC7 - Model 2 Summary:\n")
##
## C7 - Model 2 Summary:
summary(model2)
##
## Call:
## lm(formula = gc ~ gc_1 + gy_1 + i3 + inf, data = consump, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0262462 -0.0076662 0.0009066 0.0063953 0.0174975
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.007e-02 5.839e-03 3.438 0.00174 **
## gc_1 5.398e-01 2.604e-01 2.073 0.04689 *
## gy_1 -1.017e-01 1.792e-01 -0.567 0.57463
## i3 -2.523e-05 1.028e-03 -0.025 0.98059
## inf -1.701e-03 8.825e-04 -1.928 0.06341 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01069 on 30 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3819, Adjusted R-squared: 0.2995
## F-statistic: 4.635 on 4 and 30 DF, p-value: 0.004938
# Perform joint significance test using ANOVA
cat("\nC7 - ANOVA to compare Model 1 and Model 2:\n")
##
## C7 - ANOVA to compare Model 1 and Model 2:
anova(model1, model2)
## Analysis of Variance Table
##
## Model 1: gc ~ gc_1
## Model 2: gc ~ gc_1 + gy_1 + i3 + inf
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 33 0.0044447
## 2 30 0.0034276 3 0.0010171 2.9675 0.04767 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iii) Check the p-value for gc_1 in the new model
cat("\nC7 - P-Value for gc_1 in Model 2:\n")
##
## C7 - P-Value for gc_1 in Model 2:
summary(model2)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0200712603 0.0058386779 3.4376379 0.001741936
## gc_1 0.5397639068 0.2604192372 2.0726729 0.046890481
## gy_1 -0.1016586120 0.1791503385 -0.5674486 0.574631669
## i3 -0.0000252261 0.0010282141 -0.0245339 0.980589214
## inf -0.0017011440 0.0008824782 -1.9276895 0.063408242
# (iv) F-statistic and p-value for joint significance in Model 2
cat("\nC7 - F-statistic and p-value for joint significance in Model 2:\n")
##
## C7 - F-statistic and p-value for joint significance in Model 2:
anova(model2)
## Analysis of Variance Table
##
## Response: gc
## Df Sum Sq Mean Sq F value Pr(>F)
## gc_1 1 0.0011009 0.00110089 9.6356 0.004141 **
## gy_1 1 0.0000647 0.00006471 0.5664 0.457558
## i3 1 0.0005279 0.00052785 4.6201 0.039787 *
## inf 1 0.0004246 0.00042456 3.7160 0.063408 .
## Residuals 30 0.0034276 0.00011425
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Load required packages
library(wooldridge)
library(dplyr)
# Load the 'minwage' dataset
data("minwage", package = "wooldridge")
# Handle missing values
minwage_clean <- na.omit(minwage) # Remove rows with missing values
# Create lagged variables (gwage232_1 and gemp232_1)
minwage_clean <- minwage_clean %>%
mutate(
gwage232_1 = lag(gwage232), # Lagged wage growth
gemp232_1 = lag(gemp232) # Lagged employment growth
)
# (i) Check for autocorrelation in gwage232
acf(minwage_clean$gwage232, main = "Autocorrelation of gwage232", na.action = na.pass)
# (ii) Estimate the dynamic model
# gwage232 = β0 + β1 * gwage232_1 + β2 * gmwage + β3 * gcpi + ut
model_dynamic <- lm(gwage232 ~ gwage232_1 + gmwage + gcpi, data = minwage_clean, na.action = na.omit)
cat("\nC12 - Dynamic Model Summary:\n")
##
## C12 - Dynamic Model Summary:
summary(model_dynamic)
##
## Call:
## lm(formula = gwage232 ~ gwage232_1 + gmwage + gcpi, data = minwage_clean,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.044651 -0.004120 -0.001272 0.004487 0.041568
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.002371 0.000433 5.475 6.45e-08 ***
## gwage232_1 -0.068681 0.034467 -1.993 0.04676 *
## gmwage 0.151749 0.009519 15.941 < 2e-16 ***
## gcpi 0.257423 0.086571 2.974 0.00306 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007781 on 594 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3067, Adjusted R-squared: 0.3032
## F-statistic: 87.58 on 3 and 594 DF, p-value: < 2.2e-16
# (iii) Add lagged employment growth (gemp232_1) to the equation
# gwage232 = β0 + β1 * gwage232_1 + β2 * gmwage + β3 * gcpi + β4 * gemp232_1 + ut
model_dynamic_lagged <- lm(gwage232 ~ gwage232_1 + gmwage + gcpi + gemp232_1, data = minwage_clean, na.action = na.omit)
cat("\nC12 - Dynamic Model with Lagged Employment Summary:\n")
##
## C12 - Dynamic Model with Lagged Employment Summary:
summary(model_dynamic_lagged)
##
## Call:
## lm(formula = gwage232 ~ gwage232_1 + gmwage + gcpi + gemp232_1,
## data = minwage_clean, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.043902 -0.004319 -0.000965 0.004265 0.042429
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0023987 0.0004289 5.593 3.41e-08 ***
## gwage232_1 -0.0657751 0.0341411 -1.927 0.054512 .
## gmwage 0.1525456 0.0094293 16.178 < 2e-16 ***
## gcpi 0.2531434 0.0857361 2.953 0.003276 **
## gemp232_1 0.0606449 0.0169864 3.570 0.000386 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007705 on 593 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3213, Adjusted R-squared: 0.3167
## F-statistic: 70.17 on 4 and 593 DF, p-value: < 2.2e-16
# (iv) Compare models to check the effect on gmwage coefficient
cat("\nC12 - Comparison of gmwage Coefficient:\n")
##
## C12 - Comparison of gmwage Coefficient:
cat("Without lagged employment:\n")
## Without lagged employment:
summary(model_dynamic)$coefficients["gmwage", ]
## Estimate Std. Error t value Pr(>|t|)
## 1.517485e-01 9.519372e-03 1.594102e+01 6.890581e-48
cat("With lagged employment:\n")
## With lagged employment:
summary(model_dynamic_lagged)$coefficients["gmwage", ]
## Estimate Std. Error t value Pr(>|t|)
## 1.525456e-01 9.429267e-03 1.617789e+01 4.953489e-49
# (v) Run regression of gmwage on gwage232_1 and gemp232_1, and report R-squared
model_gmwage <- lm(gmwage ~ gwage232_1 + gemp232_1, data = minwage_clean, na.action = na.omit)
cat("\nC12 - Regression of gmwage Summary:\n")
##
## C12 - Regression of gmwage Summary:
summary(model_gmwage)
##
## Call:
## lm(formula = gmwage ~ gwage232_1 + gemp232_1, data = minwage_clean,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.01984 -0.00511 -0.00385 -0.00290 0.62190
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.003491 0.001469 2.376 0.0178 *
## gwage232_1 0.211609 0.147152 1.438 0.1510
## gemp232_1 -0.042848 0.073827 -0.580 0.5619
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0335 on 595 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.004094, Adjusted R-squared: 0.0007463
## F-statistic: 1.223 on 2 and 595 DF, p-value: 0.2951
# Report R-squared
cat("\nC12 - R-squared for Regression of gmwage:\n")
##
## C12 - R-squared for Regression of gmwage:
summary(model_gmwage)$r.squared
## [1] 0.00409388
# Load required packages
library(wooldridge)
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(dplyr)
# Load the 'nyse' dataset
data("nyse", package = "wooldridge")
# Remove rows with missing data
nyse_clean <- nyse %>% na.omit()
# Create the lagged variable returnt_1, and remove the first row with NA
nyse_clean <- nyse_clean %>%
mutate(returnt_1 = lag(return, n = 1)) %>%
na.omit() # Remove the row with NA due to the lag
# Fit the OLS model
model_ols <- lm(return ~ returnt_1, data = nyse_clean)
# Obtain the squared residuals (ut^2)
nyse_clean <- nyse_clean %>%
mutate(ut_sq = resid(model_ols)^2)
# Summary of the OLS model
cat("\nC11 (i) - OLS Model Summary:\n")
##
## C11 (i) - OLS Model Summary:
summary(model_ols)
##
## Call:
## lm(formula = return ~ returnt_1, data = nyse_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2633 -1.2997 0.1011 1.3203 8.0688
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.17862 0.08084 2.210 0.0275 *
## returnt_1 0.05806 0.03811 1.523 0.1281
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.112 on 686 degrees of freedom
## Multiple R-squared: 0.003372, Adjusted R-squared: 0.001919
## F-statistic: 2.321 on 1 and 686 DF, p-value: 0.1281
# Compute average, minimum, and maximum of ut^2
cat("\nAverage of squared residuals:\n", mean(nyse_clean$ut_sq, na.rm = TRUE))
##
## Average of squared residuals:
## 4.446345
cat("\nMinimum of squared residuals:\n", min(nyse_clean$ut_sq, na.rm = TRUE))
##
## Minimum of squared residuals:
## 3.591354e-06
cat("\nMaximum of squared residuals:\n", max(nyse_clean$ut_sq, na.rm = TRUE))
##
## Maximum of squared residuals:
## 232.9687
# (ii) Fit the model of heteroskedasticity:
# Var(ut | returnt-1) = δ0 + δ1 * returnt_1 + δ2 * returnt_1^2
nyse_clean <- nyse_clean %>%
mutate(returnt_1_sq = returnt_1^2)
model_arch <- lm(ut_sq ~ returnt_1 + returnt_1_sq, data = nyse_clean)
cat("\nC11 (ii) - ARCH Model Summary:\n")
##
## C11 (ii) - ARCH Model Summary:
summary(model_arch)
##
## Call:
## lm(formula = ut_sq ~ returnt_1 + returnt_1_sq, data = nyse_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.545 -3.021 -1.974 0.695 221.544
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.25771 0.44131 7.382 4.54e-13 ***
## returnt_1 -0.78556 0.19626 -4.003 6.95e-05 ***
## returnt_1_sq 0.29749 0.03558 8.361 3.46e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.67 on 685 degrees of freedom
## Multiple R-squared: 0.1305, Adjusted R-squared: 0.128
## F-statistic: 51.41 on 2 and 685 DF, p-value: < 2.2e-16
# Report coefficients, standard errors, R-squared, and adjusted R-squared
cat("\nCoefficients of ARCH model:\n")
##
## Coefficients of ARCH model:
print(coef(model_arch))
## (Intercept) returnt_1 returnt_1_sq
## 3.2577065 -0.7855573 0.2974907
cat("\nStandard Errors of ARCH model:\n")
##
## Standard Errors of ARCH model:
print(summary(model_arch)$coefficients[, 2])
## (Intercept) returnt_1 returnt_1_sq
## 0.4413140 0.1962611 0.0355787
cat("\nR-squared of ARCH model:\n", summary(model_arch)$r.squared)
##
## R-squared of ARCH model:
## 0.1305106
cat("\nAdjusted R-squared of ARCH model:\n", summary(model_arch)$adj.r.squared)
##
## Adjusted R-squared of ARCH model:
## 0.127972
# (iii) Conditional variance as a function of returnt-1
# Conditional variance: δ0 + δ1 * returnt-1 + δ2 * returnt-1^2
nyse_clean <- nyse_clean %>%
mutate(conditional_variance = coef(model_arch)[1] +
coef(model_arch)[2] * returnt_1 +
coef(model_arch)[3] * returnt_1_sq)
# Find the smallest variance and the corresponding returnt-1
smallest_variance <- nyse_clean %>%
filter(conditional_variance == min(conditional_variance, na.rm = TRUE))
cat("\nSmallest variance and corresponding returnt_1:\n")
##
## Smallest variance and corresponding returnt_1:
print(smallest_variance)
## price return return_1 t price_1 price_2 cprice cprice_1 returnt_1
## 1 56.83 1.572835 1.321984 168 55.95 55.22 0.8800011 0.7299995 1.321984
## ut_sq returnt_1_sq conditional_variance
## 1 1.735706 1.747642 2.739119
# (iv) Check if the model produces negative variance
cat("\nNegative variances produced? ", any(nyse_clean$conditional_variance < 0, na.rm = TRUE))
##
## Negative variances produced? FALSE
# (v) Compare to ARCH(1) model
# Fit an ARCH(1) model using 'garch()' from the 'tseries' package
arch1_model <- garch(nyse_clean$return, order = c(0, 1), trace = FALSE)
cat("\nC11 (v) - ARCH(1) Model Summary:\n")
##
## C11 (v) - ARCH(1) Model Summary:
summary(arch1_model)
##
## Call:
## garch(x = nyse_clean$return, order = c(0, 1), trace = FALSE)
##
## Model:
## GARCH(0,1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5451 -0.5314 0.1544 0.7527 3.3353
##
## Coefficient(s):
## Estimate Std. Error t value Pr(>|t|)
## a0 3.2615 0.2404 13.566 < 2e-16 ***
## a1 0.2623 0.0330 7.949 1.78e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Diagnostic Tests:
## Jarque Bera Test
##
## data: Residuals
## X-squared = 76.247, df = 2, p-value < 2.2e-16
##
##
## Box-Ljung test
##
## data: Squared.Residuals
## X-squared = 0.188, df = 1, p-value = 0.6646
# (vi) Add the second lag (ARCH(2) model)
arch2_model <- garch(nyse_clean$return, order = c(0, 2), trace = FALSE)
cat("\nC11 (vi) - ARCH(2) Model Summary:\n")
##
## C11 (vi) - ARCH(2) Model Summary:
summary(arch2_model)
##
## Call:
## garch(x = nyse_clean$return, order = c(0, 2), trace = FALSE)
##
## Model:
## GARCH(0,2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5890 -0.5387 0.1526 0.7527 3.3325
##
## Coefficient(s):
## Estimate Std. Error t value Pr(>|t|)
## a0 2.97841 0.28199 10.562 < 2e-16 ***
## a1 0.26885 0.03258 8.253 2.22e-16 ***
## a2 0.06260 0.05377 1.164 0.244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Diagnostic Tests:
## Jarque Bera Test
##
## data: Residuals
## X-squared = 80.464, df = 2, p-value < 2.2e-16
##
##
## Box-Ljung test
##
## data: Squared.Residuals
## X-squared = 0.2925, df = 1, p-value = 0.5886
# Compare ARCH(1) and ARCH(2)
cat("\nComparison of ARCH(1) and ARCH(2) models:\n")
##
## Comparison of ARCH(1) and ARCH(2) models:
cat("AIC of ARCH(1):", AIC(arch1_model), "\n")
## AIC of ARCH(1): 2929.782
cat("AIC of ARCH(2):", AIC(arch2_model), "\n")
## AIC of ARCH(2): 2921.784