library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(wooldridge)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(dslabs)
library(ISLR2)
library(matlib)
## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning: 'rgl.init' failed, running with 'rgl.useNULL = TRUE'.
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
data("sleep75")
sleep_model <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
summary(sleep_model)
##
## Call:
## lm(formula = sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2378.00 -243.29 6.74 259.24 1350.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3840.83197 235.10870 16.336 <2e-16 ***
## totwrk -0.16342 0.01813 -9.013 <2e-16 ***
## educ -11.71332 5.86689 -1.997 0.0463 *
## age -8.69668 11.20746 -0.776 0.4380
## I(age^2) 0.12844 0.13390 0.959 0.3378
## male 87.75243 34.32616 2.556 0.0108 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared: 0.1228, Adjusted R-squared: 0.1165
## F-statistic: 19.59 on 5 and 700 DF, p-value: < 2.2e-16
# (i) Is there evidence that men sleep more than women?
coef_male <- coef(summary(sleep_model))["male", "Estimate"]
p_value_male <- coef(summary(sleep_model))["male", "Pr(>|t|)"]
cat("Coefficient for 'male':", coef_male, "\n")
## Coefficient for 'male': 87.75243
cat("p-value for 'male':", p_value_male, "\n")
## p-value for 'male': 0.01078518
if (p_value_male < 0.05) {
cat("Conclusion: The 'male' coefficient is positive and significant, so there is evidence that men sleep more than women, other factors held fixed.\n")
} else {
cat("Conclusion: There is no significant evidence of a sleep difference between men and women.\n")
}
## Conclusion: The 'male' coefficient is positive and significant, so there is evidence that men sleep more than women, other factors held fixed.
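In sleep75, sleep is recorded in minutes per week, so a quick conversion sketch (assuming that unit, per the wooldridge documentation) puts the estimated gap on a per-night scale:
cat("Implied per-night difference for men:", coef_male / 7, "minutes\n")  # ≈ 12.5 more minutes of sleep per night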
# (ii) Tradeoff between working and sleeping
coef_totwrk <- coef(summary(sleep_model))["totwrk", "Estimate"]
cat("Coefficient for 'totwrk':", coef_totwrk, "\n")
## Coefficient for 'totwrk': -0.1634232
cat("Interpretation: Each additional minute of work reduces sleep by", abs(coef_totwrk), "minutes.\n")
## Interpretation: Each additional minute of work reduces sleep by 0.1634232 minutes.
# (iii) Test whether age has a significant effect on sleep
sleep_model_no_age <- lm(sleep ~ totwrk + educ + male, data = sleep75)
anova(sleep_model_no_age, sleep_model)
## Analysis of Variance Table
##
## Model 1: sleep ~ totwrk + educ + male
## Model 2: sleep ~ totwrk + educ + age + I(age^2) + male
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 702 122631662
## 2 700 122147777 2 483885 1.3865 0.2506
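As a cross-check sketch, the same joint test of the age terms can be run with car::linearHypothesis (car is already attached); it should reproduce the F statistic above:
linearHypothesis(sleep_model, c("age = 0", "I(age^2) = 0"))  # expect F ≈ 1.39, p ≈ 0.25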
data("gpa2")
sat_model <- lm(sat ~ hsize + I(hsize^2) + female + black + female:black, data = gpa2)
summary(sat_model)
##
## Call:
## lm(formula = sat ~ hsize + I(hsize^2) + female + black + female:black,
## data = gpa2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -570.45 -89.54 -5.24 85.41 479.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1028.0972 6.2902 163.445 < 2e-16 ***
## hsize 19.2971 3.8323 5.035 4.97e-07 ***
## I(hsize^2) -2.1948 0.5272 -4.163 3.20e-05 ***
## female -45.0915 4.2911 -10.508 < 2e-16 ***
## black -169.8126 12.7131 -13.357 < 2e-16 ***
## female:black 62.3064 18.1542 3.432 0.000605 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared: 0.08578, Adjusted R-squared: 0.08468
## F-statistic: 77.52 on 5 and 4131 DF, p-value: < 2.2e-16
# (i) Is hsize^2 important in the model? Its t statistic is about -4.16 (p < 0.001), so yes;
# the implied optimal size is at the turning point of the quadratic:
beta1 <- coef(sat_model)["hsize"]
beta2 <- coef(sat_model)["I(hsize^2)"]
optimal_hsize <- -beta1 / (2 * beta2)
cat("Optimal high school size:", optimal_hsize, "\n")
## Optimal high school size: 4.39603
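In gpa2, hsize is measured in hundreds of students (per the wooldridge documentation), so a one-line sketch converts the turning point into a headcount:
cat("Implied optimal graduating class size:", 100 * optimal_hsize, "students\n")  # ≈ 440 students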
# (ii) SAT score difference between nonblack females and nonblack males
coef_female <- coef(summary(sat_model))["female", "Estimate"]
p_value_female <- coef(summary(sat_model))["female", "Pr(>|t|)"]
cat("Coefficient for 'female' (SAT score difference between nonblack females and nonblack males):", coef_female, "\n")
## Coefficient for 'female' (SAT score difference between nonblack females and nonblack males): -45.09145
cat("p-value for 'female':", p_value_female, "\n")
## p-value for 'female': 1.656301e-25
if (p_value_female < 0.05) {
cat("Conclusion: There is significant evidence of a difference in SAT scores between nonblack females and nonblack males.\n")
} else {
cat("Conclusion: There is no significant evidence of a difference in SAT scores between nonblack females and nonblack males.\n")
}
## Conclusion: There is significant evidence of a difference in SAT scores between nonblack females and nonblack males.
# (iii) SAT score difference between nonblack males and black males
linearHypothesis(sat_model, "black = 0")
##
## Linear hypothesis test:
## black = 0
##
## Model 1: restricted model
## Model 2: sat ~ hsize + I(hsize^2) + female + black + female:black
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4132 76652652
## 2 4131 73479121 1 3173531 178.42 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iv) SAT score difference between black females and nonblack females
linearHypothesis(sat_model, "black + female:black = 0")
##
## Linear hypothesis test:
## black + female:black = 0
##
## Model 1: restricted model
## Model 2: sat ~ hsize + I(hsize^2) + female + black + female:black
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4132 74700518
## 2 4131 73479121 1 1221397 68.667 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
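The F test above only asks whether the black-female vs. nonblack-female gap is zero; a short sketch also reports its point estimate, the sum of the black and female:black coefficients:
gap_bf <- coef(sat_model)["black"] + coef(sat_model)["female:black"]
cat("Estimated SAT gap, black females vs. nonblack females:", gap_bf, "points\n")  # ≈ -107.5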
library(wooldridge)
data1 <- wooldridge::gpa1
head(data1)
## age soph junior senior senior5 male campus business engineer colGPA hsGPA ACT
## 1 21 0 0 1 0 0 0 1 0 3.0 3.0 21
## 2 21 0 0 1 0 0 0 1 0 3.4 3.2 24
## 3 20 0 1 0 0 0 0 1 0 3.0 3.6 26
## 4 19 1 0 0 0 1 1 1 0 3.5 3.5 27
## 5 20 0 1 0 0 0 0 1 0 3.6 3.9 28
## 6 20 0 0 1 0 1 1 1 0 3.0 3.4 25
## job19 job20 drive bike walk voluntr PC greek car siblings bgfriend clubs
## 1 0 1 1 0 0 0 0 0 1 1 0 0
## 2 0 1 1 0 0 0 0 0 1 0 1 1
## 3 1 0 0 0 1 0 0 0 1 1 0 1
## 4 1 0 0 0 1 0 0 0 0 1 0 0
## 5 0 1 0 1 0 0 0 0 1 1 1 0
## 6 0 0 0 0 1 0 0 0 1 1 0 0
## skipped alcohol gradMI fathcoll mothcoll
## 1 2 1.0 1 0 0
## 2 0 1.0 1 1 1
## 3 0 1.0 1 1 1
## 4 0 0.0 0 0 0
## 5 0 1.5 1 1 0
## 6 0 0.0 0 1 0
data_C1 <- gpa1
model_C1 <- lm(colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll, data = data_C1)
summary(model_C1)
##
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll,
## data = data_C1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78149 -0.25726 -0.02121 0.24691 0.74432
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.255554 0.335392 3.744 0.000268 ***
## PC 0.151854 0.058716 2.586 0.010762 *
## hsGPA 0.450220 0.094280 4.775 4.61e-06 ***
## ACT 0.007724 0.010678 0.723 0.470688
## mothcoll -0.003758 0.060270 -0.062 0.950376
## fathcoll 0.041800 0.061270 0.682 0.496265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared: 0.2222, Adjusted R-squared: 0.1934
## F-statistic: 7.713 on 5 and 135 DF, p-value: 2.083e-06
library(wooldridge)
data1 <- wooldridge::gpa1
# Perform the regression analysis
model_7C1_ur <- lm(colGPA ~ PC + hsGPA + ACT, data = data1)
summary(model_7C1_ur)
##
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7901 -0.2622 -0.0107 0.2334 0.7570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.263520 0.333125 3.793 0.000223 ***
## PC 0.157309 0.057287 2.746 0.006844 **
## hsGPA 0.447242 0.093647 4.776 4.54e-06 ***
## ACT 0.008659 0.010534 0.822 0.412513
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3325 on 137 degrees of freedom
## Multiple R-squared: 0.2194, Adjusted R-squared: 0.2023
## F-statistic: 12.83 on 3 and 137 DF, p-value: 1.932e-07
F_stat <- ((0.2222 - 0.2194) / 2) / ((1 - 0.2222) / 135)
F_stat
## [1] 0.2429931
critical_value <- qf(0.95, 2, 135)  # note: the original call passed TRUE as a 4th positional argument, which qf() reads as a noncentrality parameter
critical_value
## [1] 3.063204
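The same two exclusion restrictions can be tested directly from the fitted models; as a cross-check sketch, anova() should give an F statistic close to the hand computation (small differences reflect rounding of the R-squareds):
anova(model_7C1_ur, model_C1)  # expect F ≈ 0.24, p ≈ 0.78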
# Same specification as model_C1 above; refit as model6 for the tests below
model6 <- lm(colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll, data = data1)
summary(model6)
##
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll,
## data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78149 -0.25726 -0.02121 0.24691 0.74432
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.255554 0.335392 3.744 0.000268 ***
## PC 0.151854 0.058716 2.586 0.010762 *
## hsGPA 0.450220 0.094280 4.775 4.61e-06 ***
## ACT 0.007724 0.010678 0.723 0.470688
## mothcoll -0.003758 0.060270 -0.062 0.950376
## fathcoll 0.041800 0.061270 0.682 0.496265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared: 0.2222, Adjusted R-squared: 0.1934
## F-statistic: 7.713 on 5 and 135 DF, p-value: 2.083e-06
library(car)
hypotheses <- c("mothcoll=0", "fathcoll=0")
joint_test <- linearHypothesis(model6, hypotheses)
joint_test
##
## Linear hypothesis test:
## mothcoll = 0
## fathcoll = 0
##
## Model 1: restricted model
## Model 2: colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 137 15.149
## 2 135 15.094 2 0.054685 0.2446 0.7834
# Note: calling summary() on the linearHypothesis table merely summarizes its columns,
# which is not meaningful; the table above already reports the joint test
# (F = 0.2446, p = 0.7834), so mothcoll and fathcoll are jointly insignificant.
model3 <- lm(colGPA ~ PC + hsGPA + I(hsGPA^2) + ACT + mothcoll + fathcoll, data = gpa1)
summary(model3)
##
## Call:
## lm(formula = colGPA ~ PC + hsGPA + I(hsGPA^2) + ACT + mothcoll +
## fathcoll, data = gpa1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78998 -0.24327 -0.00648 0.26179 0.72231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.040328 2.443038 2.063 0.0410 *
## PC 0.140446 0.058858 2.386 0.0184 *
## hsGPA -1.802520 1.443552 -1.249 0.2140
## I(hsGPA^2) 0.337341 0.215711 1.564 0.1202
## ACT 0.004786 0.010786 0.444 0.6580
## mothcoll 0.003091 0.060110 0.051 0.9591
## fathcoll 0.062761 0.062401 1.006 0.3163
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3326 on 134 degrees of freedom
## Multiple R-squared: 0.2361, Adjusted R-squared: 0.2019
## F-statistic: 6.904 on 6 and 134 DF, p-value: 2.088e-06
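Because the hsGPA coefficient is negative while the quadratic term is positive, the fitted relationship is U-shaped; a quick sketch locates the turning point:
turn_hsGPA <- -coef(model3)["hsGPA"] / (2 * coef(model3)["I(hsGPA^2)"])
turn_hsGPA  # ≈ 2.67; predicted colGPA is decreasing in hsGPA below this value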
data3 <- wooldridge::wage2
head(data3)
## wage hours IQ KWW educ exper tenure age married black south urban sibs
## 1 769 40 93 35 12 11 2 31 1 0 0 1 1
## 2 808 50 119 41 18 11 16 37 1 0 0 1 1
## 3 825 40 108 46 14 11 9 33 1 0 0 1 1
## 4 650 40 96 32 12 13 7 32 1 0 0 1 4
## 5 562 40 74 27 11 14 5 34 1 0 0 1 10
## 6 1400 40 116 43 16 14 2 35 1 1 0 1 1
## brthord meduc feduc lwage
## 1 2 8 8 6.645091
## 2 NA 14 14 6.694562
## 3 2 14 14 6.715384
## 4 3 12 12 6.476973
## 5 6 6 11 6.331502
## 6 2 8 NA 7.244227
data("wage2")
model4 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)
summary(model4)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98069 -0.21996 0.00707 0.24288 1.22822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.395497 0.113225 47.653 < 2e-16 ***
## educ 0.065431 0.006250 10.468 < 2e-16 ***
## exper 0.014043 0.003185 4.409 1.16e-05 ***
## tenure 0.011747 0.002453 4.789 1.95e-06 ***
## married 0.199417 0.039050 5.107 3.98e-07 ***
## black -0.188350 0.037667 -5.000 6.84e-07 ***
## south -0.090904 0.026249 -3.463 0.000558 ***
## urban 0.183912 0.026958 6.822 1.62e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared: 0.2526, Adjusted R-squared: 0.2469
## F-statistic: 44.75 on 7 and 927 DF, p-value: < 2.2e-16
model4 <- lm(log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) + married + black + south + urban, data = wage2)
summary(model4)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + I(exper^2) + tenure +
## I(tenure^2) + married + black + south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98236 -0.21972 -0.00036 0.24078 1.25127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.3586756 0.1259143 42.558 < 2e-16 ***
## educ 0.0642761 0.0063115 10.184 < 2e-16 ***
## exper 0.0172146 0.0126138 1.365 0.172665
## I(exper^2) -0.0001138 0.0005319 -0.214 0.830622
## tenure 0.0249291 0.0081297 3.066 0.002229 **
## I(tenure^2) -0.0007964 0.0004710 -1.691 0.091188 .
## married 0.1985470 0.0391103 5.077 4.65e-07 ***
## black -0.1906636 0.0377011 -5.057 5.13e-07 ***
## south -0.0912153 0.0262356 -3.477 0.000531 ***
## urban 0.1854241 0.0269585 6.878 1.12e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared: 0.255, Adjusted R-squared: 0.2477
## F-statistic: 35.17 on 9 and 925 DF, p-value: < 2.2e-16
linearHypothesis(model4, c("I(exper^2)=0", "I(tenure^2)=0" ))
##
## Linear hypothesis test:
## I(exper^2) = 0
## I(tenure^2) = 0
##
## Model 1: restricted model
## Model 2: log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) +
## married + black + south + urban
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 927 123.82
## 2 925 123.42 2 0.39756 1.4898 0.226
F_stat <- ((0.255 - 0.2526) / 2) / ((1 - 0.255) / 925)
F_stat
## [1] 1.489933
1 - pf(F_stat, 2, 925)
## [1] 0.2259282
# Interaction entered as I(black*educ); this is numerically identical to the educ:black term used below
model4 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban + I(black * educ), data = wage2)
summary(model4)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban + I(black * educ), data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97782 -0.21832 0.00475 0.24136 1.23226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.374817 0.114703 46.859 < 2e-16 ***
## educ 0.067115 0.006428 10.442 < 2e-16 ***
## exper 0.013826 0.003191 4.333 1.63e-05 ***
## tenure 0.011787 0.002453 4.805 1.80e-06 ***
## married 0.198908 0.039047 5.094 4.25e-07 ***
## black 0.094809 0.255399 0.371 0.710561
## south -0.089450 0.026277 -3.404 0.000692 ***
## urban 0.183852 0.026955 6.821 1.63e-11 ***
## I(black * educ) -0.022624 0.020183 -1.121 0.262603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2471
## F-statistic: 39.32 on 8 and 926 DF, p-value: < 2.2e-16
# Same model using the formula interaction operator (educ*black expands to educ + black + educ:black)
model4 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban + educ * black, data = wage2)
summary(model4)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban + educ * black, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97782 -0.21832 0.00475 0.24136 1.23226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.374817 0.114703 46.859 < 2e-16 ***
## educ 0.067115 0.006428 10.442 < 2e-16 ***
## exper 0.013826 0.003191 4.333 1.63e-05 ***
## tenure 0.011787 0.002453 4.805 1.80e-06 ***
## married 0.198908 0.039047 5.094 4.25e-07 ***
## black 0.094809 0.255399 0.371 0.710561
## south -0.089450 0.026277 -3.404 0.000692 ***
## urban 0.183852 0.026955 6.821 1.63e-11 ***
## educ:black -0.022624 0.020183 -1.121 0.262603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2471
## F-statistic: 39.32 on 8 and 926 DF, p-value: < 2.2e-16
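With the interaction included, the return to education differs by race; a small sketch computes the implied return for Black men:
ret_educ_black <- coef(model4)["educ"] + coef(model4)["educ:black"]
cat("Return to education for Black men:", ret_educ_black, "per year\n")  # ≈ 0.0445, i.e., about 4.5%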
model_7C2_4 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban + married * black, data = wage2)
summary(model_7C2_4)  # the original mistakenly re-printed summary(model4) here
Based on the married × black interaction in model_7C2_4, married Black individuals are estimated to earn about 17.94% less in monthly salary, on average, than comparable married non-Black individuals.
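As a sketch of how that differential can be tested formally with the already-attached car package: the sum of the black and married:black coefficients in model_7C2_4 is the married-Black vs. married-non-Black gap.
linearHypothesis(model_7C2_4, "black + married:black = 0")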
data("smoke")
ols_model <- lm(cigs ~ cigpric + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(ols_model)
##
## Call:
## lm(formula = cigs ~ cigpric + log(income) + educ + age + I(age^2) +
## restaurn + white, data = smoke)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.758 -9.336 -5.895 7.927 70.267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.782521 8.852068 -0.653 0.51379
## cigpric -0.005724 0.101064 -0.057 0.95485
## log(income) 0.865360 0.728766 1.187 0.23541
## educ -0.501908 0.167169 -3.002 0.00276 **
## age 0.774357 0.160516 4.824 1.68e-06 ***
## I(age^2) -0.009068 0.001748 -5.187 2.71e-07 ***
## restaurn -2.880176 1.116067 -2.581 0.01004 *
## white -0.553286 1.459490 -0.379 0.70472
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared: 0.05289, Adjusted R-squared: 0.04459
## F-statistic: 6.374 on 7 and 799 DF, p-value: 2.609e-07
bptest(ols_model)
##
## studentized Breusch-Pagan test
##
## data: ols_model
## BP = 32.431, df = 7, p-value = 3.379e-05
coeftest(ols_model, vcov = vcovHC(ols_model, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.7825205 8.8831245 -0.6510 0.515262
## cigpric -0.0057240 0.1062869 -0.0539 0.957064
## log(income) 0.8653600 0.5985769 1.4457 0.148655
## educ -0.5019084 0.1624052 -3.0905 0.002068 **
## age 0.7743569 0.1380708 5.6084 2.814e-08 ***
## I(age^2) -0.0090678 0.0014596 -6.2127 8.375e-10 ***
## restaurn -2.8801757 1.0155265 -2.8361 0.004682 **
## white -0.5532861 1.3782955 -0.4014 0.688213
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
waldtest(ols_model, vcov = vcovHC(ols_model, type = "HC1"))
## Wald test
##
## Model 1: cigs ~ cigpric + log(income) + educ + age + I(age^2) + restaurn +
## white
## Model 2: cigs ~ 1
## Res.Df Df F Pr(>F)
## 1 799
## 2 806 -7 9.3617 3.49e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
usual_se <- summary(ols_model)$coefficients[, "Std. Error"]
robust_se <- sqrt(diag(vcovHC(ols_model, type = "HC1")))
comparison <- data.frame(Variable = names(coef(ols_model)), Usual_SE = usual_se, Robust_SE = robust_se)
print(comparison)
## Variable Usual_SE Robust_SE
## (Intercept) (Intercept) 8.852067639 8.883124494
## cigpric cigpric 0.101063853 0.106286855
## log(income) log(income) 0.728765519 0.598576892
## educ educ 0.167169257 0.162405199
## age age 0.160516185 0.138070842
## I(age^2) I(age^2) 0.001748065 0.001459558
## restaurn restaurn 1.116067336 1.015526517
## white white 1.459489762 1.378295460
data4 <- wooldridge::smoke
head(data4)
## educ cigpric white age income cigs restaurn lincome agesq lcigpric
## 1 16.0 60.506 1 46 20000 0 0 9.903487 2116 4.102743
## 2 16.0 57.883 1 40 30000 0 0 10.308952 1600 4.058424
## 3 12.0 57.664 1 58 30000 3 0 10.308952 3364 4.054633
## 4 13.5 57.883 1 30 20000 0 0 9.903487 900 4.058424
## 5 10.0 58.320 1 17 20000 0 0 9.903487 289 4.065945
## 6 6.0 59.340 1 86 6500 0 0 8.779557 7396 4.083283
model11 <- lm(cigs ~ log(cigpric) + log(income) + educ + age + agesq + restaurn + white , data = data4)
summary(model11)
##
## Call:
## lm(formula = cigs ~ log(cigpric) + log(income) + educ + age +
## agesq + restaurn + white, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.772 -9.330 -5.907 7.945 70.275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.682419 24.220729 -0.111 0.91184
## log(cigpric) -0.850907 5.782321 -0.147 0.88305
## log(income) 0.869014 0.728763 1.192 0.23344
## educ -0.501753 0.167168 -3.001 0.00277 **
## age 0.774502 0.160516 4.825 1.68e-06 ***
## agesq -0.009069 0.001748 -5.188 2.70e-07 ***
## restaurn -2.865621 1.117406 -2.565 0.01051 *
## white -0.559236 1.459461 -0.383 0.70169
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared: 0.05291, Adjusted R-squared: 0.04461
## F-statistic: 6.377 on 7 and 799 DF, p-value: 2.588e-07
# (i) The standard errors for each coefficient, whether calculated using the usual method or heteroskedasticity-robust approach, yield very similar results in practice. There are no notable differences between them.
s_error <- coef(summary(model11))
s_error
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.682418774 24.220728831 -0.1107489 9.118433e-01
## log(cigpric) -0.850907441 5.782321084 -0.1471567 8.830454e-01
## log(income) 0.869013971 0.728763480 1.1924499 2.334389e-01
## educ -0.501753247 0.167167689 -3.0014966 2.770186e-03
## age 0.774502156 0.160515805 4.8250835 1.676279e-06
## agesq -0.009068603 0.001748055 -5.1878253 2.699355e-07
## restaurn -2.865621212 1.117405936 -2.5645301 1.051320e-02
## white -0.559236403 1.459461034 -0.3831801 7.016882e-01
# (ii) Estimated change in daily cigarette consumption if education increases by 4 years
edu_4 <- coef(model11)["educ"] * 4
edu_4
## educ
## -2.007013
# (iii) Find the age at which predicted cigarette consumption starts to decrease
age_turn <- -coef(model11)["age"] / (2 * coef(model11)["agesq"])  # the quadratic is stored as 'agesq' in this model
age_turn
## age
## 42.7024
print(paste("For the estimated cigs equation the turning point is 0.7745/[2(0.009069)] ≈ 42.7 years; the text's linear probability model for smokes gives 0.020/[2(0.00026)] ≈ 38.46 years."))
## [1] "For the estimated cigs equation the turning point is 0.7745/[2(0.009069)] ≈ 42.7 years; the text's linear probability model for smokes gives 0.020/[2(0.00026)] ≈ 38.46 years."
# (iv) Interpret the coefficient on restaurn: living in a state with restaurant
# smoking restrictions is associated with about 2.87 fewer cigarettes per day
coef_res <- coef(model11)["restaurn"]
coef_res
## restaurn
## -2.865621
# (v) Heteroskedasticity test; then evaluate the text's LPM for smokes at person 206's values
person_206 <- data.frame(cigpric = 67.44, income = 6500, educ = 16, age = 77,
                         agesq = 77^2, restaurn = 0, white = 0)
het_test <- bptest(model11)
het_test
##
## studentized Breusch-Pagan test
##
## data: model11
## BP = 32.377, df = 7, p-value = 3.458e-05
print(paste("Evaluating the text's LPM at person 206's values: smokes-hat = 0.656 - 0.069*log(67.44) + 0.012*log(6500) - 0.029*(16) + 0.020*(77) - 0.00026*(77^2) ≈ -0.0052, essentially zero."))
## [1] "Evaluating the text's LPM at person 206's values: smokes-hat = 0.656 - 0.069*log(67.44) + 0.012*log(6500) - 0.029*(16) + 0.020*(77) - 0.00026*(77^2) ≈ -0.0052, essentially zero."
data("vote1")
model <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
# (i) Regress the OLS residuals on the regressors; by construction this R^2 is zero
residual_model <- lm(resid(model) ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
r_squared <- summary(residual_model)$r.squared
print(paste("R^2 of residual regression:", r_squared))  # numerically zero, as expected
## [1] "R^2 of residual regression: 1.19813088272216e-32"
# (ii) Breusch-Pagan test for heteroskedasticity
bp_test <- bptest(model)
print(bp_test)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 9.0934, df = 4, p-value = 0.05881
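For intuition, the BP statistic can be reproduced by hand as n·R² from an auxiliary regression of the squared residuals on the regressors; a sketch:
u2 <- resid(model)^2
bp_aux <- lm(u2 ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
nobs(model) * summary(bp_aux)$r.squared  # should match BP ≈ 9.09
1 - pchisq(nobs(model) * summary(bp_aux)$r.squared, df = 4)  # ≈ 0.059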
# (iii) White test for heteroskedasticity (special case based on the fitted values; LM version via bptest)
white_test <- bptest(model, ~ poly(fitted(model), 2), data = vote1)
print(white_test)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 5.49, df = 2, p-value = 0.06425
library(sandwich)
library(lmtest)
data6 <- wooldridge::fertil2
head(data6)
## mnthborn yearborn age electric radio tv bicycle educ ceb agefbrth children
## 1 5 64 24 1 1 1 1 12 0 NA 0
## 2 1 56 32 1 1 1 1 13 3 25 3
## 3 7 58 30 1 0 0 0 5 1 27 1
## 4 11 45 42 1 0 1 0 4 3 17 2
## 5 5 45 43 1 1 1 1 11 2 24 2
## 6 8 52 36 1 0 0 0 7 1 26 1
## knowmeth usemeth monthfm yearfm agefm idlnchld heduc agesq urban urb_educ
## 1 1 0 NA NA NA 2 NA 576 1 12
## 2 1 1 11 80 24 3 12 1024 1 13
## 3 1 0 6 83 24 5 7 900 1 5
## 4 1 0 1 61 15 3 11 1764 1 4
## 5 1 1 3 66 20 2 14 1849 1 11
## 6 1 1 11 76 24 4 9 1296 1 7
## spirit protest catholic frsthalf educ0 evermarr
## 1 0 0 0 1 0 0
## 2 0 0 0 1 0 1
## 3 1 0 0 0 0 1
## 4 0 0 0 0 0 1
## 5 0 1 0 1 0 1
## 6 0 0 0 0 0 1
model <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
robust_se <- sqrt(diag(vcovHC(model)))
summary_with_robust_se <- cbind(coef(model), "Robust SE" = robust_se)
summary(model)
##
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban,
## data = fertil2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9012 -0.7136 -0.0039 0.7119 7.4318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.2225162 0.2401888 -17.580 < 2e-16 ***
## age 0.3409255 0.0165082 20.652 < 2e-16 ***
## I(age^2) -0.0027412 0.0002718 -10.086 < 2e-16 ***
## educ -0.0752323 0.0062966 -11.948 < 2e-16 ***
## electric -0.3100404 0.0690045 -4.493 7.20e-06 ***
## urban -0.2000339 0.0465062 -4.301 1.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.452 on 4352 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.5734, Adjusted R-squared: 0.5729
## F-statistic: 1170 on 5 and 4352 DF, p-value: < 2.2e-16
print(summary_with_robust_se)
## Robust SE
## (Intercept) -4.222516228 0.2443961935
## age 0.340925520 0.0192199445
## I(age^2) -0.002741209 0.0003513959
## educ -0.075232323 0.0063159137
## electric -0.310040409 0.0640737262
## urban -0.200033857 0.0455162364
# Heteroskedasticity-robust (HC0) inference for the gpa1 model (model6)
coeftest(model6, vcov = vcovHC(model6, type = "HC0"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.2555544 0.3368805 3.7270 0.000284 ***
## PC 0.1518539 0.0596641 2.5451 0.012048 *
## hsGPA 0.4502203 0.0962508 4.6776 6.962e-06 ***
## ACT 0.0077242 0.0103074 0.7494 0.454932
## mothcoll -0.0037579 0.0601271 -0.0625 0.950258
## fathcoll 0.0417999 0.0578749 0.7222 0.471392
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# R^2 form of the F test for three added terms (restricted R^2 = 0.5734 from the model above)
F_stat <- ((0.574 - 0.5734) / 3) / ((1 - 0.574) / 4349)
F_stat
## [1] 2.041784
critical_value <- qf(0.95, 3, 4349)  # as above, the TRUE in the original call was read as a noncentrality parameter
critical_value
## [1] 2.60698
p_value <- 1 - pf(F_stat, 3, 4349)
p_value
## [1] 0.1058349
a <- coeftest(model, vcov = sandwich)
a
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.22251623 0.24368307 -17.3279 < 2.2e-16 ***
## age 0.34092552 0.01916146 17.7923 < 2.2e-16 ***
## I(age^2) -0.00274121 0.00035027 -7.8260 6.278e-15 ***
## educ -0.07523232 0.00630336 -11.9353 < 2.2e-16 ***
## electric -0.31004041 0.06390411 -4.8517 1.267e-06 ***
## urban -0.20003386 0.04543962 -4.4022 1.097e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 1065, df = 5, p-value < 2.2e-16
# (i) Some robust standard errors are smaller than the usual (non-robust) standard errors.
# Special case of the White test for the fertil2 model: regress the squared residuals
# on the fitted values and their squares; the LM statistic is n * R^2
# (the original computed this from model6, the gpa1 model, and multiplied by 4361;
# the 1093.183 plugged into pchisq below corresponds to the fertil2 model with n = 4358)
usqr <- resid(model)^2
yhat <- fitted(model)
yhatsqr <- yhat^2
whiteR2 <- summary(lm(usqr ~ yhat + yhatsqr))$r.squared
whiteR2 * nobs(model)
## [1] 1093.183
1 - pchisq(1093.183, 2)
## [1] 0
robust_tests <- coeftest(model6, vcov = vcovHC)  # robust t tests (not a joint test)
print(robust_tests[, "Pr(>|t|)"])
## (Intercept) PC hsGPA ACT mothcoll fathcoll
## 5.903211e-04 1.682044e-02 1.977193e-05 4.846838e-01 9.528773e-01 4.948069e-01
fv <- fitted(model6)
head(fv,6)
## 1 2 3 4 5 6
## 2.768423 2.919682 3.115218 3.039878 3.269490 3.021208
residuals <- resid(model6)
head(residuals,6)
## 1 2 3 4 5 6
## 0.23157702 0.48031850 -0.11521801 0.46012183 0.33050949 -0.02120773
hetero_test <- lm(residuals^2 ~ fv)
summary(hetero_test)
##
## Call:
## lm(formula = residuals^2 ~ fv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.13813 -0.07873 -0.04322 0.04132 0.51313
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.27680 0.18877 -1.466 0.1448
## fv 0.12558 0.06165 2.037 0.0436 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.128 on 139 degrees of freedom
## Multiple R-squared: 0.02898, Adjusted R-squared: 0.022
## F-statistic: 4.149 on 1 and 139 DF, p-value: 0.04357
print(paste("iii. Ho: There is no heteroskedasticity H1: There is heteroskedasticity."))
## [1] "iii. Ho: There is no heteroskedasticity H1: There is heteroskedasticity."
print(paste("As the p-value is below alpha at a 95% confidence level, there exists adequate statistical evidence to reject the null hypothesis, signifying the presence of heteroskedasticity in the model."))
## [1] "As the p-value is below alpha at a 95% confidence level, there exists adequate statistical evidence to reject the null hypothesis, signifying the presence of heteroskedasticity in the model."
# (iv) Is heteroskedasticity practically important?
print(paste("No, since the differences between the robust and non-robust standard errors are very small."))
## [1] "No, since the differences between the robust and non-robust standard errors are very small."
data7 <- wooldridge::ceosal2
head(data7)
## salary age college grad comten ceoten sales profits mktval lsalary lsales
## 1 1161 49 1 1 9 2 6200 966 23200 7.057037 8.732305
## 2 600 43 1 1 10 10 283 48 1100 6.396930 5.645447
## 3 379 51 1 1 9 3 169 40 1100 5.937536 5.129899
## 4 651 55 1 0 22 22 1100 -54 1000 6.478509 7.003066
## 5 497 44 1 1 8 6 351 28 387 6.208590 5.860786
## 6 1067 64 1 1 7 7 19000 614 3900 6.972606 9.852194
## lmktval comtensq ceotensq profmarg
## 1 10.051908 81 4 15.580646
## 2 7.003066 100 100 16.961130
## 3 7.003066 81 9 23.668638
## 4 6.907755 484 484 -4.909091
## 5 5.958425 64 36 7.977208
## 6 8.268732 49 49 3.231579
model11 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten, data = data7)
summary(model11)
##
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg +
## ceoten + comten, data = data7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5436 -0.2796 -0.0164 0.2857 1.9879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.571977 0.253466 18.038 < 2e-16 ***
## log(sales) 0.187787 0.040003 4.694 5.46e-06 ***
## log(mktval) 0.099872 0.049214 2.029 0.04397 *
## profmarg -0.002211 0.002105 -1.050 0.29514
## ceoten 0.017104 0.005540 3.087 0.00236 **
## comten -0.009238 0.003337 -2.768 0.00626 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4947 on 171 degrees of freedom
## Multiple R-squared: 0.3525, Adjusted R-squared: 0.3336
## F-statistic: 18.62 on 5 and 171 DF, p-value: 9.488e-15
r_squared <- summary(model11)$r.squared
r_squared
## [1] 0.3525374
model12 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceotensq + comtensq, data = data7)
summary(model12)
##
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg +
## ceotensq + comtensq, data = data7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.47481 -0.25933 -0.00511 0.27010 2.07583
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.612e+00 2.524e-01 18.276 < 2e-16 ***
## log(sales) 1.805e-01 4.021e-02 4.489 1.31e-05 ***
## log(mktval) 1.018e-01 4.988e-02 2.040 0.0429 *
## profmarg -2.077e-03 2.135e-03 -0.973 0.3321
## ceotensq 3.761e-04 1.916e-04 1.963 0.0512 .
## comtensq -1.788e-04 7.236e-05 -2.471 0.0144 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5024 on 171 degrees of freedom
## Multiple R-squared: 0.3324, Adjusted R-squared: 0.3129
## F-statistic: 17.03 on 5 and 171 DF, p-value: 1.195e-13
r_squared <- summary(model12)$r.squared
r_squared
## [1] 0.3323998
data9 <- wooldridge::campus
head(data9)
## enroll priv police crime lcrime lenroll lpolice
## 1 21836 0 24 446 6.100319 9.991315 3.178054
## 2 6485 0 13 1 0.000000 8.777247 2.564949
## 3 2123 0 3 1 0.000000 7.660585 1.098612
## 4 8240 0 17 121 4.795791 9.016756 2.833213
## 5 19793 0 30 470 6.152733 9.893084 3.401197
## 6 3256 1 9 25 3.218876 8.088255 2.197225
b0 <- -6.63
se_b0 <- 1.03
b1 <- 1.27
se_b1 <- 0.11
n <- nrow(data9)
t_stat <- (b1 - 1) / se_b1
df <- n - 2
critical_value <- qt(0.95, df)
if (t_stat > critical_value) {
cat("Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.\n")
} else {
cat("Fail to reject the null hypothesis H0: B1 = 1 at the 5% level.\n")
}
## Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.
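The block above only compares t to the critical value; a one-line sketch also reports the statistic and its one-sided p-value:
cat("t statistic:", t_stat, "; one-sided p-value:", 1 - pt(t_stat, df), "\n")  # t ≈ 2.45, p ≈ 0.008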
data("infmrt")
# Add a dummy variable for the District of Columbia (infmrt pools 1987 and 1990,
# two records per state plus DC; here we assume observation 51 is the DC record of interest)
infmrt$dc <- ifelse(row.names(infmrt) == "51", 1, 0)
# (i) Estimate the model including the DC dummy variable
model_with_dc <- lm(log(infmort) ~ pcinc + physic + dc, data = infmrt)
summary(model_with_dc)
##
## Call:
## lm(formula = log(infmort) ~ pcinc + physic + dc, data = infmrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43481 -0.10338 0.00447 0.09096 0.39050
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.456e+00 8.688e-02 28.264 < 2e-16 ***
## pcinc -3.078e-05 6.499e-06 -4.737 7.33e-06 ***
## physic 1.495e-03 2.729e-04 5.478 3.33e-07 ***
## dc -7.940e-02 1.664e-01 -0.477 0.634
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1629 on 98 degrees of freedom
## Multiple R-squared: 0.2537, Adjusted R-squared: 0.2309
## F-statistic: 11.1 on 3 and 98 DF, p-value: 2.444e-06
# (ii) Compare with the model without the DC dummy variable
model_without_dc <- lm(log(infmort) ~ pcinc + physic, data = infmrt)
summary(model_without_dc)
##
## Call:
## lm(formula = log(infmort) ~ pcinc + physic, data = infmrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43432 -0.10235 0.00626 0.09337 0.39064
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.448e+00 8.502e-02 28.794 < 2e-16 ***
## pcinc -3.026e-05 6.378e-06 -4.743 7.07e-06 ***
## physic 1.487e-03 2.713e-04 5.480 3.24e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1623 on 99 degrees of freedom
## Multiple R-squared: 0.252, Adjusted R-squared: 0.2369
## F-statistic: 16.67 on 2 and 99 DF, p-value: 5.744e-07
# Compare both models
anova(model_without_dc, model_with_dc)
## Analysis of Variance Table
##
## Model 1: log(infmort) ~ pcinc + physic
## Model 2: log(infmort) ~ pcinc + physic + dc
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 99 2.6064
## 2 98 2.6004 1 0.0060446 0.2278 0.6342
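Since the rationale for the dummy is that DC may be an influential outlier, a quick diagnostic sketch (using the attached car package) plots leverage and studentized residuals to see whether observation 51 stands out:
influenceIndexPlot(model_without_dc, vars = c("Studentized", "hat"))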
library(quantreg)
## Loading required package: SparseM
data <- as.data.frame(rdchem)
data$sales_billion <- data$sales / 1e3
ols_full <- lm(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data)
summary(ols_full)
##
## Call:
## lm(formula = rdintens ~ sales_billion + I(sales_billion^2) +
## profmarg, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0371 -1.1238 -0.4547 0.7165 5.8522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.058967 0.626269 3.288 0.00272 **
## sales_billion 0.316611 0.138854 2.280 0.03041 *
## I(sales_billion^2) -0.007390 0.003716 -1.989 0.05657 .
## profmarg 0.053322 0.044210 1.206 0.23787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared: 0.1905, Adjusted R-squared: 0.1037
## F-statistic: 2.196 on 3 and 28 DF, p-value: 0.1107
# (i) OLS regression intended to exclude the largest firm
# Note: the largest sales figure in rdchem is about $39.7 billion, so the < 40
# cutoff drops no firms and the "filtered" results below match the full sample;
# a cutoff such as < 39 would actually exclude the outlier.
data_filtered <- data %>% filter(sales_billion < 40)
ols_filtered <- lm(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data_filtered)
summary(ols_filtered)
##
## Call:
## lm(formula = rdintens ~ sales_billion + I(sales_billion^2) +
## profmarg, data = data_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0371 -1.1238 -0.4547 0.7165 5.8522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.058967 0.626269 3.288 0.00272 **
## sales_billion 0.316611 0.138854 2.280 0.03041 *
## I(sales_billion^2) -0.007390 0.003716 -1.989 0.05657 .
## profmarg 0.053322 0.044210 1.206 0.23787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared: 0.1905, Adjusted R-squared: 0.1037
## F-statistic: 2.196 on 3 and 28 DF, p-value: 0.1107
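The filtered estimates are identical to the full-sample ones because the cutoff never binds; a quick check of the largest firm's sales makes this explicit:
max(data$sales_billion)  # ≈ 39.7, so sales_billion < 40 keeps all 32 firms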
# (ii) LAD regression
lad_full <- rq(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data, tau = 0.5)
summary(lad_full)
##
## Call: rq(formula = rdintens ~ sales_billion + I(sales_billion^2) +
## profmarg, tau = 0.5, data = data)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 1.40428 0.87031 2.66628
## sales_billion 0.26346 -0.13508 0.75753
## I(sales_billion^2) -0.00600 -0.01679 0.00344
## profmarg 0.11400 0.01376 0.16427
lad_filtered <- rq(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data_filtered, tau = 0.5)
summary(lad_filtered)
##
## Call: rq(formula = rdintens ~ sales_billion + I(sales_billion^2) +
## profmarg, tau = 0.5, data = data_filtered)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 1.40428 0.87031 2.66628
## sales_billion 0.26346 -0.13508 0.75753
## I(sales_billion^2) -0.00600 -0.01679 0.00344
## profmarg 0.11400 0.01376 0.16427
# (iii) Compare OLS and LAD results
comparison <- data.frame(
Method = c("OLS (Full)", "OLS (Filtered)", "LAD (Full)", "LAD (Filtered)"),
Intercept = c(coef(ols_full)[1], coef(ols_filtered)[1], coef(lad_full)[1], coef(lad_filtered)[1]),
Sales = c(coef(ols_full)[2], coef(ols_filtered)[2], coef(lad_full)[2], coef(lad_filtered)[2]),
Sales_Squared = c(coef(ols_full)[3], coef(ols_filtered)[3], coef(lad_full)[3], coef(lad_filtered)[3]),
Profmarg = c(coef(ols_full)[4], coef(ols_filtered)[4], coef(lad_full)[4], coef(lad_filtered)[4])
)
print(comparison)
## Method Intercept Sales Sales_Squared Profmarg
## 1 OLS (Full) 2.058967 0.3166110 -0.007389624 0.05332217
## 2 OLS (Filtered) 2.058967 0.3166110 -0.007389624 0.05332217
## 3 LAD (Full) 1.404279 0.2634638 -0.006001127 0.11400366
## 4 LAD (Filtered) 1.404279 0.2634638 -0.006001127 0.11400366
# Example code (not run, since the housing dataset is not loaded here):
# create quarterly dummies and a time trend
#housing_data <- housing_data %>%
# mutate(
# quarter = as.factor(quarter),
# time_trend = 1:n()
# )
# Run the regression model
#housing_model <- lm(hs ~ interest_rate + income + time_trend + quarter, data = housing_data)
#summary(housing_model)
library(dplyr)
data("intdef")
intdef <- intdef %>%
mutate(
post_1979 = ifelse(year > 1979, 1, 0) # Dummy variable: 1 if year > 1979, else 0
)
# Run the regression including the dummy variable
policy_model <- lm(i3_1 ~ inf_1 + def_1 + post_1979, data = intdef)
summary(policy_model)
##
## Call:
## lm(formula = i3_1 ~ inf_1 + def_1 + post_1979, data = intdef)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5802 -0.8353 0.1936 0.7983 4.0447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.39151 0.39028 3.565 0.000800 ***
## inf_1 0.56529 0.07144 7.913 1.99e-10 ***
## def_1 0.38269 0.11156 3.430 0.001203 **
## post_1979 1.77623 0.47268 3.758 0.000442 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.59 on 51 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.705, Adjusted R-squared: 0.6876
## F-statistic: 40.63 on 3 and 51 DF, p-value: 1.479e-13
cat("\nConclusion:\n")
##
## Conclusion:
cat("The coefficient on post_1979 indicates whether there was a significant shift in the interest rate equation after 1979.\n")
## The coefficient on post_1979 indicates whether there was a significant shift in the interest rate equation after 1979.
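The dummy shifts only the intercept; as a sketch of a richer (hypothetical) specification, the inflation and deficit slopes can be allowed to change after 1979 as well:
policy_model_slopes <- lm(i3_1 ~ inf_1 * post_1979 + def_1 * post_1979, data = intdef)
summary(policy_model_slopes)  # the interaction terms test whether the slopes shifted after 1979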
data("volat")
# (i) Discuss expected signs of coefficients
cat("Expected signs:\n")
## Expected signs:
cat("β1 (pcip): Positive. An increase in industrial production should generally lead to positive stock market returns.\n")
## β1 (pcip): Positive. An increase in industrial production should generally lead to positive stock market returns.
cat("β2 (i3): Negative. An increase in short-term interest rates often reduces stock market returns as borrowing becomes more expensive.\n\n")
## β2 (i3): Negative. An increase in short-term interest rates often reduces stock market returns as borrowing becomes more expensive.
# (ii) Estimate the equation by OLS
model_c9 <- lm(rsp500 ~ pcip + i3, data = volat)
summary(model_c9)
##
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -157.871 -22.580 2.103 25.524 138.137
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.84306 3.27488 5.754 1.44e-08 ***
## pcip 0.03642 0.12940 0.281 0.7785
## i3 -1.36169 0.54072 -2.518 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.13 on 554 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01189, Adjusted R-squared: 0.008325
## F-statistic: 3.334 on 2 and 554 DF, p-value: 0.03637
cat("\nOLS Results:\n")
##
## OLS Results:
cat("The coefficients for pcip and i3, along with their standard errors, t-values, and p-values, are displayed in the summary above.\n\n")
## The coefficients for pcip and i3, along with their standard errors, t-values, and p-values, are displayed in the summary above.
# (iii)Determine which variables are statistically significant
p_values <- summary(model_c9)$coefficients[, 4]
significant <- p_values < 0.05
cat("P-values for each variable:\n")
## P-values for each variable:
print(p_values)
## (Intercept) pcip i3
## 1.444871e-08 7.784810e-01 1.207402e-02
cat("\nSignificance (TRUE = significant at 5% level):\n")
##
## Significance (TRUE = significant at 5% level):
print(significant)
## (Intercept) pcip i3
## TRUE FALSE TRUE
# (iv)Interpret predictability
if (all(significant[-1] == FALSE)) {
cat("\nThe variables pcip and i3 are NOT statistically significant.\n")
cat("This suggests that the return on the S&P 500 is NOT predictable using these variables.\n")
} else {
cat("\nAt least one variable is statistically significant.\n")
cat("This suggests that the return on the S&P 500 shows SOME level of predictability.\n")
}
##
## At least one variable is statistically significant.
## This suggests that the return on the S&P 500 shows SOME level of predictability.
data("consump")
# (i) Estimate g_ct = β0 + β1 * g_ct-1 + u_t and test the PIH
consump <- consump %>%
mutate(g_ct = log(c) - lag(log(c))) # g_ct = log(c_t) - log(c_t-1)
model_i <- lm(g_ct ~ lag(g_ct, 1), data = consump)
summary(model_i)
##
## Call:
## lm(formula = g_ct ~ lag(g_ct, 1), data = consump)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.027878 -0.005974 -0.001451 0.007142 0.020227
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.011431 0.003778 3.026 0.00478 **
## lag(g_ct, 1) 0.446141 0.156047 2.859 0.00731 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01161 on 33 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1985, Adjusted R-squared: 0.1742
## F-statistic: 8.174 on 1 and 33 DF, p-value: 0.00731
# (ii) Add lagged income growth, interest rate, and inflation
consump <- consump %>%
mutate(gy_lag = lag(gy),
i3_lag = lag(i3),
inf_lag = lag(inf))
model_ii <- lm(g_ct ~ lag(g_ct, 1) + gy_lag + i3_lag + inf_lag, data = consump)
summary(model_ii)
##
## Call:
## lm(formula = g_ct ~ lag(g_ct, 1) + gy_lag + i3_lag + inf_lag,
## data = consump)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0249093 -0.0075869 0.0000855 0.0087234 0.0188617
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0225940 0.0070892 3.187 0.00335 **
## lag(g_ct, 1) 0.4335832 0.2896525 1.497 0.14487
## gy_lag -0.1079087 0.1946371 -0.554 0.58341
## i3_lag -0.0007467 0.0011107 -0.672 0.50654
## inf_lag -0.0008281 0.0010041 -0.825 0.41606
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01134 on 30 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3038, Adjusted R-squared: 0.211
## F-statistic: 3.273 on 4 and 30 DF, p-value: 0.02431
# (iii) Compare p-value of lag(g_ct, 1) in model_i and model_ii
cat("\nPART (iii): P-values for lag(g_ct, 1)\n")
##
## PART (iii): P-values for lag(g_ct, 1)
cat("Model (i) p-value for lag(g_ct, 1):", summary(model_i)$coefficients[2, 4], "\n")
## Model (i) p-value for lag(g_ct, 1): 0.007310181
cat("Model (ii) p-value for lag(g_ct, 1):", summary(model_ii)$coefficients[2, 4], "\n")
## Model (ii) p-value for lag(g_ct, 1): 0.1448651
# (iv) F-statistic for joint significance of all variables
anova_test <- anova(model_i, model_ii)
cat("\nPART (iv): F-statistic for joint significance\n")
##
## PART (iv): F-statistic for joint significance
print(anova_test)
## Analysis of Variance Table
##
## Model 1: g_ct ~ lag(g_ct, 1)
## Model 2: g_ct ~ lag(g_ct, 1) + gy_lag + i3_lag + inf_lag
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 33 0.0044446
## 2 30 0.0038608 3 0.00058384 1.5122 0.2315
data("minwage")
# Create a lagged variable for gwage232
minwage$lag_gwage232 <- dplyr::lag(minwage$gwage232)  # dplyr::lag shifts by one period; base stats::lag would not
# (i)
filtered_data <- minwage[complete.cases(minwage$gwage232, minwage$lag_gwage232), ]
acf(filtered_data$gwage232, lag.max = 1)
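The acf() call above draws a plot; to read the lag-1 autocorrelation off numerically, a sketch with plot = FALSE:
acf(filtered_data$gwage232, lag.max = 1, plot = FALSE)  # prints the first-order autocorrelation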
# (ii)
model1 <- lm(gwage232 ~ lag_gwage232 + gmwage + gcpi, data = filtered_data)
summary(model1)
##
## Call:
## lm(formula = gwage232 ~ lag_gwage232 + gmwage + gcpi, data = filtered_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.044642 -0.004134 -0.001312 0.004482 0.041612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0024003 0.0004308 5.572 3.79e-08 ***
## lag_gwage232 -0.0779092 0.0342851 -2.272 0.02341 *
## gmwage 0.1518459 0.0096485 15.738 < 2e-16 ***
## gcpi 0.2630876 0.0824457 3.191 0.00149 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007889 on 606 degrees of freedom
## Multiple R-squared: 0.2986, Adjusted R-squared: 0.2951
## F-statistic: 85.99 on 3 and 606 DF, p-value: < 2.2e-16
# (iii)
model2 <- lm(gwage232 ~ lag_gwage232 + gmwage + gcpi + lag(gemp232), data = filtered_data)
summary(model2)
##
## Call:
## lm(formula = gwage232 ~ lag_gwage232 + gmwage + gcpi + lag(gemp232),
## data = filtered_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.043786 -0.004384 -0.001022 0.004325 0.042554
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0024248 0.0004268 5.681 2.08e-08 ***
## lag_gwage232 -0.0756372 0.0339207 -2.230 0.026126 *
## gmwage 0.1526838 0.0095403 16.004 < 2e-16 ***
## gcpi 0.2652171 0.0826041 3.211 0.001394 **
## lag(gemp232) 0.0662916 0.0169639 3.908 0.000104 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007798 on 604 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3167, Adjusted R-squared: 0.3122
## F-statistic: 69.98 on 4 and 604 DF, p-value: < 2.2e-16
# (iv)
coef(model1)["gmwage"]
## gmwage
## 0.1518459
coef(model2)["gmwage"]
## gmwage
## 0.1526838
# (v)
model3 <- lm(gmwage ~ lag(gwage232) + lag(gemp232), data = filtered_data)
summary(model3)$r.squared
## [1] 0.003908469
data("nyse")
# (i) Estimate the OLS model in equation (12.47)
nyse <- nyse %>% mutate(return_lag1 = lag(return, 1))
ols_model <- lm(return ~ return_lag1, data = nyse)
cat("OLS Model Summary:\n")
## OLS Model Summary:
summary(ols_model)
##
## Call:
## lm(formula = return ~ return_lag1, data = nyse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.261 -1.302 0.098 1.316 8.065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.17963 0.08074 2.225 0.0264 *
## return_lag1 0.05890 0.03802 1.549 0.1218
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.11 on 687 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.003481, Adjusted R-squared: 0.00203
## F-statistic: 2.399 on 1 and 687 DF, p-value: 0.1218
nyse <- nyse %>% filter(!is.na(return_lag1))  # keeps exactly the 689 rows used in the OLS fit
nyse$residuals_sq <- residuals(ols_model)^2
mean_resid_sq <- mean(nyse$residuals_sq)
min_resid_sq <- min(nyse$residuals_sq)
max_resid_sq <- max(nyse$residuals_sq)
cat("Mean of squared residuals:", mean_resid_sq, "\n")
## Mean of squared residuals: 4.440839
cat("Minimum of squared residuals:", min_resid_sq, "\n")
## Minimum of squared residuals: 7.35465e-06
cat("Maximum of squared residuals:", max_resid_sq, "\n")
## Maximum of squared residuals: 232.8946
# (ii) Estimate heteroskedasticity model using lagged return
hetero_model <- lm(residuals_sq ~ return_lag1 + I(return_lag1^2), data = nyse)
cat("Heteroskedasticity Model Summary:\n")
## Heteroskedasticity Model Summary:
summary(hetero_model)
##
## Call:
## lm(formula = residuals_sq ~ return_lag1 + I(return_lag1^2), data = nyse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.459 -3.011 -1.975 0.676 221.469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.25734 0.44085 7.389 4.32e-13 ***
## return_lag1 -0.78946 0.19569 -4.034 6.09e-05 ***
## I(return_lag1^2) 0.29666 0.03552 8.351 3.75e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.66 on 686 degrees of freedom
## Multiple R-squared: 0.1303, Adjusted R-squared: 0.1278
## F-statistic: 51.4 on 2 and 686 DF, p-value: < 2.2e-16
# (iii) Plot conditional variance as a function of lagged return
plot_data <- data.frame(return_lag1 = seq(min(nyse$return_lag1, na.rm = TRUE),
max(nyse$return_lag1, na.rm = TRUE), length.out = 100))
plot_data$conditional_variance <- predict(hetero_model, newdata = plot_data)
plot(plot_data$return_lag1, plot_data$conditional_variance, type = "l",
main = "Conditional Variance vs. Lagged Return",
xlab = "Lagged Return", ylab = "Conditional Variance")
min_variance_index <- which.min(plot_data$conditional_variance)
cat("Smallest conditional variance:", plot_data$conditional_variance[min_variance_index], "\n")
## Smallest conditional variance: 2.734254
cat("Lagged return value for smallest variance:", plot_data$return_lag1[min_variance_index], "\n")
## Lagged return value for smallest variance: 1.245583
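The grid search is limited by its 100-point resolution; the variance-minimizing lagged return can also be computed analytically from the quadratic's vertex (a sketch):
vertex <- -coef(hetero_model)["return_lag1"] / (2 * coef(hetero_model)["I(return_lag1^2)"])
vertex  # ≈ 1.33, close to the grid-search value of about 1.25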
# (iv) Check for negative variance estimates
if (any(plot_data$conditional_variance < 0)) {
cat("Negative variance estimates are present.\n")
} else {
cat("No negative variance estimates.\n")
}
## No negative variance estimates.
# (v) Compare ARCH(1) model fit
nyse$residuals_sq_lag1 <- lag(nyse$residuals_sq, 1)
arch1_model <- lm(residuals_sq ~ residuals_sq_lag1, data = nyse)
summary(arch1_model)
##
## Call:
## lm(formula = residuals_sq ~ residuals_sq_lag1, data = nyse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.337 -3.292 -2.157 0.556 223.981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.94743 0.44023 6.695 4.49e-11 ***
## residuals_sq_lag1 0.33706 0.03595 9.377 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.76 on 686 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1136, Adjusted R-squared: 0.1123
## F-statistic: 87.92 on 1 and 686 DF, p-value: < 2.2e-16
cat("ARCH(1) model R-squared:", summary(arch1_model)$r.squared, "\n")
## ARCH(1) model R-squared: 0.1136065
cat("Heteroskedasticity model R-squared:", summary(hetero_model)$r.squared, "\n")
## Heteroskedasticity model R-squared: 0.1303208
# (vi) Add second lag to ARCH(1) model for ARCH(2)
nyse$residuals_sq_lag2 <- lag(nyse$residuals_sq, 2)
arch2_model <- lm(residuals_sq ~ residuals_sq_lag1 + residuals_sq_lag2, data = nyse)
summary(arch2_model)
##
## Call:
## lm(formula = residuals_sq ~ residuals_sq_lag1 + residuals_sq_lag2,
## data = nyse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.934 -3.298 -2.158 0.600 224.296
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.82950 0.45495 6.219 8.69e-10 ***
## residuals_sq_lag1 0.32284 0.03820 8.450 < 2e-16 ***
## residuals_sq_lag2 0.04179 0.03820 1.094 0.274
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.76 on 684 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1151, Adjusted R-squared: 0.1125
## F-statistic: 44.47 on 2 and 684 DF, p-value: < 2.2e-16
cat("ARCH(2) model Summary:\n")
## ARCH(2) model Summary:
print(summary(arch2_model))
##
## Call:
## lm(formula = residuals_sq ~ residuals_sq_lag1 + residuals_sq_lag2,
## data = nyse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.934 -3.298 -2.158 0.600 224.296
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.82950 0.45495 6.219 8.69e-10 ***
## residuals_sq_lag1 0.32284 0.03820 8.450 < 2e-16 ***
## residuals_sq_lag2 0.04179 0.03820 1.094 0.274
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.76 on 684 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1151, Adjusted R-squared: 0.1125
## F-statistic: 44.47 on 2 and 684 DF, p-value: < 2.2e-16