CHAPTER 7

Question 1.

library(wooldridge)

# Load the dataset
data <- as.data.frame(sleep75)

# View the first few rows
head(data)
##   age black case clerical construc educ earns74 gdhlth inlf leis1 leis2 leis3
## 1  32     0    1        0        0   12       0      0    1  3529  3479  3479
## 2  31     0    2        0        0   14    9500      1    1  2140  2140  2140
## 3  44     0    3        0        0   17   42500      1    1  4595  4505  4227
## 4  30     0    4        0        0   12   42500      1    1  3211  3211  3211
## 5  64     0    5        0        0   14    2500      1    1  4052  4007  4007
## 6  41     0    6        0        0   12       0      1    1  4812  4797  4797
##   smsa  lhrwage   lothinc male marr prot rlxall selfe sleep slpnaps south
## 1    0 1.955861 10.075380    1    1    1   3163     0  3113    3163     0
## 2    0 0.357674  0.000000    1    0    1   2920     1  2920    2920     1
## 3    1 3.021887  0.000000    1    1    0   3038     1  2670    2760     0
## 4    0 2.263844  0.000000    0    1    1   3083     1  3083    3083     0
## 5    0 1.011601  9.328213    1    1    1   3493     0  3448    3493     0
## 6    0 2.957511 10.657280    1    1    1   4078     0  4063    4078     0
##   spsepay spwrk75 totwrk union worknrm workscnd exper yngkid yrsmarr    hrwage
## 1       0       0   3438     0    3438        0    14      0      13  7.070004
## 2       0       0   5020     0    5020        0    11      0       0  1.429999
## 3   20000       1   2815     0    2815        0    21      0       0 20.529997
## 4    5000       1   3786     0    3786        0    12      0      12  9.619998
## 5    2400       1   2580     0    2580        0    44      0      33  2.750000
## 6       0       0   1205     0       0     1205    23      0      23 19.249998
##   agesq
## 1  1024
## 2   961
## 3  1936
## 4   900
## 5  4096
## 6  1681
# Run regression
model <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = data)

# Display summary
summary(model)
## 
## Call:
## lm(formula = sleep ~ totwrk + educ + age + I(age^2) + male, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2378.00  -243.29     6.74   259.24  1350.19 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3840.83197  235.10870  16.336   <2e-16 ***
## totwrk        -0.16342    0.01813  -9.013   <2e-16 ***
## educ         -11.71332    5.86689  -1.997   0.0463 *  
## age           -8.69668   11.20746  -0.776   0.4380    
## I(age^2)       0.12844    0.13390   0.959   0.3378    
## male          87.75243   34.32616   2.556   0.0108 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared:  0.1228, Adjusted R-squared:  0.1165 
## F-statistic: 19.59 on 5 and 700 DF,  p-value: < 2.2e-16

i. All other factors being equal, is there evidence that men sleep more than women? How strong is the evidence?

# Extract coefficient for male and its p-value
male_coef <- summary(model)$coefficients["male", c("Estimate", "Pr(>|t|)")]
male_coef
##    Estimate    Pr(>|t|) 
## 87.75242861  0.01078518
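# Holding other factors fixed, men are estimated to sleep about 88 minutes
# more per week than women; with p = 0.011 the evidence is statistically
# significant at the 5% level, though not at the 1% level.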

ii. Is there a statistically significant tradeoff between working and sleeping? What is the estimated tradeoff?

totwrk_coef <- summary(model)$coefficients["totwrk", c("Estimate", "Pr(>|t|)")]
totwrk_coef
##      Estimate      Pr(>|t|) 
## -1.634232e-01  1.891272e-18
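
To put the tradeoff in familiar units, the coefficient can be rescaled; a quick check based on the estimate above:

# About 0.163 fewer minutes of sleep per minute of work, i.e. roughly
# 9.8 fewer minutes of sleep per week for each additional hour worked
coef(model)["totwrk"] * 60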

iii. What other regression do you need to run to test the null hypothesis that, holding other factors fixed, age has no effect on sleeping?

# Restricted model without age
model_restricted <- lm(sleep ~ totwrk + educ + male, data = data)

# Compare models
anova(model_restricted, model)
## Analysis of Variance Table
## 
## Model 1: sleep ~ totwrk + educ + male
## Model 2: sleep ~ totwrk + educ + age + I(age^2) + male
##   Res.Df       RSS Df Sum of Sq      F Pr(>F)
## 1    702 122631662                           
## 2    700 122147777  2    483885 1.3865 0.2506
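# F = 1.39 with p-value = 0.25: age and age^2 are jointly insignificant, so
# the null that age has no effect on sleeping is not rejected.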

Question 3.

data <- as.data.frame(gpa2)

head(data)
##    sat tothrs colgpa athlete verbmath hsize hsrank   hsperc female white black
## 1  920     43   2.04       1  0.48387  0.10      4 40.00000      1     0     0
## 2 1170     18   4.00       0  0.82813  9.40    191 20.31915      0     1     0
## 3  810     14   1.78       1  0.88372  1.19     42 35.29412      0     1     0
## 4  940     40   2.42       0  0.80769  5.71    252 44.13310      0     1     0
## 5 1180     18   2.61       0  0.73529  2.14     86 40.18692      0     1     0
## 6  980    114   3.03       0  0.81481  2.68     41 15.29851      1     1     0
##   hsizesq
## 1  0.0100
## 2 88.3600
## 3  1.4161
## 4 32.6041
## 5  4.5796
## 6  7.1824
str(data)
## 'data.frame':    4137 obs. of  12 variables:
##  $ sat     : int  920 1170 810 940 1180 980 880 980 1240 1230 ...
##  $ tothrs  : int  43 18 14 40 18 114 78 55 18 17 ...
##  $ colgpa  : num  2.04 4 1.78 2.42 2.61 ...
##  $ athlete : int  1 0 1 0 0 0 0 0 0 0 ...
##  $ verbmath: num  0.484 0.828 0.884 0.808 0.735 ...
##  $ hsize   : num  0.1 9.4 1.19 5.71 2.14 2.68 3.11 2.68 3.67 0.1 ...
##  $ hsrank  : int  4 191 42 252 86 41 161 101 161 3 ...
##  $ hsperc  : num  40 20.3 35.3 44.1 40.2 ...
##  $ female  : int  1 0 0 0 0 1 0 0 0 0 ...
##  $ white   : int  0 1 1 1 1 1 0 1 1 1 ...
##  $ black   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hsizesq : num  0.01 88.36 1.42 32.6 4.58 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
model <- lm(sat ~ hsize + I(hsize^2) + female + black + female:black, data = data)

summary(model)
## 
## Call:
## lm(formula = sat ~ hsize + I(hsize^2) + female + black + female:black, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -570.45  -89.54   -5.24   85.41  479.13 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1028.0972     6.2902 163.445  < 2e-16 ***
## hsize          19.2971     3.8323   5.035 4.97e-07 ***
## I(hsize^2)     -2.1948     0.5272  -4.163 3.20e-05 ***
## female        -45.0915     4.2911 -10.508  < 2e-16 ***
## black        -169.8126    12.7131 -13.357  < 2e-16 ***
## female:black   62.3064    18.1542   3.432 0.000605 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared:  0.08578,    Adjusted R-squared:  0.08468 
## F-statistic: 77.52 on 5 and 4131 DF,  p-value: < 2.2e-16

i. Is there strong evidence that hsize^2 should be included in that model?

hsize2_coef <- summary(model)$coefficients["I(hsize^2)", c("Estimate", "Pr(>|t|)")]
hsize2_coef
##      Estimate      Pr(>|t|) 
## -2.194828e+00  3.198417e-05
# Turning point of the quadratic; hsize is measured in hundreds, so the
# predicted SAT score peaks at a graduating class of roughly 440 students
optimal_hsize <- -coef(model)["hsize"] / (2 * coef(model)["I(hsize^2)"])
optimal_hsize
##   hsize 
## 4.39603

ii. Holding hsize fixed, what is the estimated difference in SAT score between nonblack females and nonblack males?

# Extract coefficient for female
female_coef <- summary(model)$coefficients["female", c("Estimate", "Pr(>|t|)")]
female_coef
##      Estimate      Pr(>|t|) 
## -4.509145e+01  1.656301e-25

iii. What is the estimated difference in SAT score between nonblack males and black males?

# Extract coefficient for black
black_coef <- summary(model)$coefficients["black", c("Estimate", "Pr(>|t|)")]
black_coef
##      Estimate      Pr(>|t|) 
## -1.698126e+02  7.140387e-40

iv. What is the estimated difference in SAT score between black females and nonblack females?

# Difference between black females and nonblack females: the coefficient on
# black plus the female:black interaction
black_female_diff <- coef(model)["black"] + coef(model)["female:black"]
cat("Estimated difference, black females vs nonblack females:", round(black_female_diff, 2), "\n")
## Estimated difference, black females vs nonblack females: -107.51
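
The individual coefficient p-values do not test this sum directly; a minimal sketch of a proper test of H0: black + female:black = 0 using the car package (output not shown):

library(car)
linearHypothesis(model, "black + female:black = 0")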

Question C1.

data <- as.data.frame(gpa1)

# View the first few rows
head(data)
##   age soph junior senior senior5 male campus business engineer colGPA hsGPA ACT
## 1  21    0      0      1       0    0      0        1        0    3.0   3.0  21
## 2  21    0      0      1       0    0      0        1        0    3.4   3.2  24
## 3  20    0      1      0       0    0      0        1        0    3.0   3.6  26
## 4  19    1      0      0       0    1      1        1        0    3.5   3.5  27
## 5  20    0      1      0       0    0      0        1        0    3.6   3.9  28
## 6  20    0      0      1       0    1      1        1        0    3.0   3.4  25
##   job19 job20 drive bike walk voluntr PC greek car siblings bgfriend clubs
## 1     0     1     1    0    0       0  0     0   1        1        0     0
## 2     0     1     1    0    0       0  0     0   1        0        1     1
## 3     1     0     0    0    1       0  0     0   1        1        0     1
## 4     1     0     0    0    1       0  0     0   0        1        0     0
## 5     0     1     0    1    0       0  0     0   1        1        1     0
## 6     0     0     0    0    1       0  0     0   1        1        0     0
##   skipped alcohol gradMI fathcoll mothcoll
## 1       2     1.0      1        0        0
## 2       0     1.0      1        1        1
## 3       0     1.0      1        1        1
## 4       0     0.0      0        0        0
## 5       0     1.5      1        1        0
## 6       0     0.0      0        1        0
data("gpa1")
#(i)
# Model with mothcoll and fathcoll
model1 <- lm(colGPA ~ hsGPA + ACT + mothcoll + fathcoll + PC, data = gpa1)

# Display summary
summary(model1)
## 
## Call:
## lm(formula = colGPA ~ hsGPA + ACT + mothcoll + fathcoll + PC, 
##     data = gpa1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78149 -0.25726 -0.02121  0.24691  0.74432 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.255554   0.335392   3.744 0.000268 ***
## hsGPA        0.450220   0.094280   4.775 4.61e-06 ***
## ACT          0.007724   0.010678   0.723 0.470688    
## mothcoll    -0.003758   0.060270  -0.062 0.950376    
## fathcoll     0.041800   0.061270   0.682 0.496265    
## PC           0.151854   0.058716   2.586 0.010762 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared:  0.2222, Adjusted R-squared:  0.1934 
## F-statistic: 7.713 on 5 and 135 DF,  p-value: 2.083e-06

i. Add mothcoll and fathcoll to the equation and report the results.

# Regression adding mothcoll and fathcoll (note: the equation in the text also
# includes PC; that version is reported above)
model1 <- lm(colGPA ~ hsGPA + ACT + mothcoll + fathcoll, data = data)
summary(model1)
## 
## Call:
## lm(formula = colGPA ~ hsGPA + ACT + mothcoll + fathcoll, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81126 -0.24179 -0.03234  0.27047  0.84316 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.268408   0.342297   3.706 0.000306 ***
## hsGPA       0.456839   0.096196   4.749 5.12e-06 ***
## ACT         0.007929   0.010898   0.728 0.468119    
## mothcoll    0.016244   0.061009   0.266 0.790441    
## fathcoll    0.057430   0.062233   0.923 0.357739    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3413 on 136 degrees of freedom
## Multiple R-squared:  0.1837, Adjusted R-squared:  0.1597 
## F-statistic:  7.65 on 4 and 136 DF,  p-value: 1.371e-05

ii. Test for joint significance of mothcoll and fathcoll in the equation from part (i) and be sure to report the p-value.

library(car)  
## Loading required package: carData
Anova(model1, test = "F")
## Anova Table (Type II tests)
## 
## Response: colGPA
##            Sum Sq  Df F value   Pr(>F)    
## hsGPA      2.6271   1 22.5534 5.12e-06 ***
## ACT        0.0617   1  0.5294   0.4681    
## mothcoll   0.0083   1  0.0709   0.7904    
## fathcoll   0.0992   1  0.8516   0.3577    
## Residuals 15.8418 136                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
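
Note that the Type II table above tests each coefficient separately; a sketch of the joint F test of H0: mothcoll = fathcoll = 0 (output not shown):

linearHypothesis(model1, c("mothcoll = 0", "fathcoll = 0"))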

iii. Add hsGPA^2 to the equation and check whether it improves the fit.

data$hsGPA2 <- data$hsGPA^2
model2 <- lm(colGPA ~ hsGPA + hsGPA2 + ACT + mothcoll + fathcoll, data = data)
summary(model2)
## 
## Call:
## lm(formula = colGPA ~ hsGPA + hsGPA2 + ACT + mothcoll + fathcoll, 
##     data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.77587 -0.23343 -0.02708  0.27998  0.80816 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  5.767777   2.465721   2.339   0.0208 *
## hsGPA       -2.222512   1.457477  -1.525   0.1296  
## hsGPA2       0.401135   0.217737   1.842   0.0676 .
## ACT          0.004417   0.010971   0.403   0.6879  
## mothcoll     0.022601   0.060577   0.373   0.7097  
## fathcoll     0.080959   0.063001   1.285   0.2010  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3383 on 135 degrees of freedom
## Multiple R-squared:  0.2037, Adjusted R-squared:  0.1742 
## F-statistic: 6.906 on 5 and 135 DF,  p-value: 9.025e-06

Question C2.

data_wage <- as.data.frame(wage2)

# View the first few rows
head(data_wage)
##   wage hours  IQ KWW educ exper tenure age married black south urban sibs
## 1  769    40  93  35   12    11      2  31       1     0     0     1    1
## 2  808    50 119  41   18    11     16  37       1     0     0     1    1
## 3  825    40 108  46   14    11      9  33       1     0     0     1    1
## 4  650    40  96  32   12    13      7  32       1     0     0     1    4
## 5  562    40  74  27   11    14      5  34       1     0     0     1   10
## 6 1400    40 116  43   16    14      2  35       1     1     0     1    1
##   brthord meduc feduc    lwage
## 1       2     8     8 6.645091
## 2      NA    14    14 6.694562
## 3       2    14    14 6.715384
## 4       3    12    12 6.476973
## 5       6     6    11 6.331502
## 6       2     8    NA 7.244227
data("wage2")
#(i)
model1 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)
summary(model1)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98069 -0.21996  0.00707  0.24288  1.22822 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.395497   0.113225  47.653  < 2e-16 ***
## educ         0.065431   0.006250  10.468  < 2e-16 ***
## exper        0.014043   0.003185   4.409 1.16e-05 ***
## tenure       0.011747   0.002453   4.789 1.95e-06 ***
## married      0.199417   0.039050   5.107 3.98e-07 ***
## black       -0.188350   0.037667  -5.000 6.84e-07 ***
## south       -0.090904   0.026249  -3.463 0.000558 ***
## urban        0.183912   0.026958   6.822 1.62e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared:  0.2526, Adjusted R-squared:  0.2469 
## F-statistic: 44.75 on 7 and 927 DF,  p-value: < 2.2e-16

i. Estimate the model

model_wage <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = data_wage)
summary(model_wage)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban, data = data_wage)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98069 -0.21996  0.00707  0.24288  1.22822 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.395497   0.113225  47.653  < 2e-16 ***
## educ         0.065431   0.006250  10.468  < 2e-16 ***
## exper        0.014043   0.003185   4.409 1.16e-05 ***
## tenure       0.011747   0.002453   4.789 1.95e-06 ***
## married      0.199417   0.039050   5.107 3.98e-07 ***
## black       -0.188350   0.037667  -5.000 6.84e-07 ***
## south       -0.090904   0.026249  -3.463 0.000558 ***
## urban        0.183912   0.026958   6.822 1.62e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared:  0.2526, Adjusted R-squared:  0.2469 
## F-statistic: 44.75 on 7 and 927 DF,  p-value: < 2.2e-16

ii. Add exper^2 and tenure^2 to the equation and check their significance.

data_wage$exper2 <- data_wage$exper^2
data_wage$tenure2 <- data_wage$tenure^2

# Run regression with squared terms
model_wage2 <- lm(log(wage) ~ educ + exper + exper2 + tenure + tenure2 + married + black + south + urban, data = data_wage)
summary(model_wage2)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + exper2 + tenure + tenure2 + 
##     married + black + south + urban, data = data_wage)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98236 -0.21972 -0.00036  0.24078  1.25127 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.3586756  0.1259143  42.558  < 2e-16 ***
## educ         0.0642761  0.0063115  10.184  < 2e-16 ***
## exper        0.0172146  0.0126138   1.365 0.172665    
## exper2      -0.0001138  0.0005319  -0.214 0.830622    
## tenure       0.0249291  0.0081297   3.066 0.002229 ** 
## tenure2     -0.0007964  0.0004710  -1.691 0.091188 .  
## married      0.1985470  0.0391103   5.077 4.65e-07 ***
## black       -0.1906636  0.0377011  -5.057 5.13e-07 ***
## south       -0.0912153  0.0262356  -3.477 0.000531 ***
## urban        0.1854241  0.0269585   6.878 1.12e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared:  0.255,  Adjusted R-squared:  0.2477 
## F-statistic: 35.17 on 9 and 925 DF,  p-value: < 2.2e-16

iii. Extend the original model to allow the return to education to depend on race, and test whether the return to education does depend on race.

# Interact educ with black so the return to education can differ by race;
# the t test on educ:black tests whether the return depends on race
model_wage3 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban + educ:black, data = data_wage)
summary(model_wage3)

iv. Allow wages to differ across the four groups defined by marital status and race.

model_wage4 <- lm(log(wage) ~ married * black + exper + tenure + educ + south + urban, data = data_wage)
summary(model_wage4)
## 
## Call:
## lm(formula = log(wage) ~ married * black + exper + tenure + educ + 
##     south + urban, data = data_wage)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98013 -0.21780  0.01057  0.24219  1.22889 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.403793   0.114122  47.351  < 2e-16 ***
## married        0.188915   0.042878   4.406 1.18e-05 ***
## black         -0.240820   0.096023  -2.508 0.012314 *  
## exper          0.014146   0.003191   4.433 1.04e-05 ***
## tenure         0.011663   0.002458   4.745 2.41e-06 ***
## educ           0.065475   0.006253  10.471  < 2e-16 ***
## south         -0.091989   0.026321  -3.495 0.000497 ***
## urban          0.184350   0.026978   6.833 1.50e-11 ***
## married:black  0.061354   0.103275   0.594 0.552602    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3656 on 926 degrees of freedom
## Multiple R-squared:  0.2528, Adjusted R-squared:  0.2464 
## F-statistic: 39.17 on 8 and 926 DF,  p-value: < 2.2e-16

CHAPTER 8

Question 1. Which of the following are consequences of heteroskedasticity?

data("smoke")

ols_model <- lm(cigs ~ cigpric + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(ols_model)
## 
## Call:
## lm(formula = cigs ~ cigpric + log(income) + educ + age + I(age^2) + 
##     restaurn + white, data = smoke)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.758  -9.336  -5.895   7.927  70.267 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.782521   8.852068  -0.653  0.51379    
## cigpric     -0.005724   0.101064  -0.057  0.95485    
## log(income)  0.865360   0.728766   1.187  0.23541    
## educ        -0.501908   0.167169  -3.002  0.00276 ** 
## age          0.774357   0.160516   4.824 1.68e-06 ***
## I(age^2)    -0.009068   0.001748  -5.187 2.71e-07 ***
## restaurn    -2.880176   1.116067  -2.581  0.01004 *  
## white       -0.553286   1.459490  -0.379  0.70472    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared:  0.05289,    Adjusted R-squared:  0.04459 
## F-statistic: 6.374 on 7 and 799 DF,  p-value: 2.609e-07

i. The OLS estimators are inconsistent.

False: Heteroskedasticity does not make OLS estimators inconsistent. They remain unbiased and consistent but inefficient.

ii. The usual F statistic no longer has an F distribution.

True: when heteroskedasticity is present, the usual F statistic no longer has an exact F distribution, so the standard (nonrobust) F test for joint significance is invalid.

iii. The OLS estimators are no longer BLUE (Best Linear Unbiased Estimators).

True: OLS estimators are no longer the best (most efficient) when heteroskedasticity is present. They are still linear and unbiased, but not efficient.
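
Because the usual standard errors are invalid under heteroskedasticity, inference can instead rely on heteroskedasticity-robust standard errors. A minimal sketch using the lmtest and sandwich packages (also loaded later in this document; output not shown):

library(lmtest)
library(sandwich)
coeftest(ols_model, vcov = vcovHC(ols_model, type = "HC0"))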

Question 5.

data(smoke)

smoke$smokes <- ifelse(smoke$cigs > 0, 1, 0)

model <- lm(smokes ~ log(cigpric) + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(model)
## 
## Call:
## lm(formula = smokes ~ log(cigpric) + log(income) + educ + age + 
##     I(age^2) + restaurn + white, data = smoke)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6715 -0.3958 -0.2724  0.5344  0.9340 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.6560624  0.8549619   0.767 0.443095    
## log(cigpric) -0.0689259  0.2041088  -0.338 0.735684    
## log(income)   0.0121620  0.0257244   0.473 0.636498    
## educ         -0.0288802  0.0059008  -4.894 1.19e-06 ***
## age           0.0198158  0.0056660   3.497 0.000496 ***
## I(age^2)     -0.0002598  0.0000617  -4.210 2.84e-05 ***
## restaurn     -0.1007616  0.0394431  -2.555 0.010815 *  
## white        -0.0256801  0.0515172  -0.498 0.618286    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4734 on 799 degrees of freedom
## Multiple R-squared:  0.062,  Adjusted R-squared:  0.05378 
## F-statistic: 7.544 on 7 and 799 DF,  p-value: 8.035e-09

ii. Holding other factors fixed, if education increases by 4 years, what happens to the estimated probability of smoking?

change_educ <- coef(model)["educ"] * 4
print(paste("Change in smoking probability with 4 more years of education:", change_educ))
## [1] "Change in smoking probability with 4 more years of education: -0.115520886961172"

iii. At what point does another year of age reduce the probability of smoking?

age_vertex <- -coef(model)["age"] / (2 * coef(model)["I(age^2)"])
print(paste("Age at which smoking probability starts decreasing:", age_vertex))
## [1] "Age at which smoking probability starts decreasing: 38.1386089151421"

iv. Interpret the coefficient on the binary variable restaurn.

restaurn_coef <- coef(model)["restaurn"]
print(paste("The coefficient on restaurn is:", restaurn_coef))
## [1] "The coefficient on restaurn is: -0.100761630797458"

v. Person number 206 in the data set has the following characteristics.

print("Interpretation: Living in a state with restaurant smoking restrictions reduces the probability of smoking by approximately this value (in percentage points).")
## [1] "Interpretation: Living in a state with restaurant smoking restrictions reduces the probability of smoking by approximately this value (in percentage points)."
predicted <- coef(model)["(Intercept)"] +
  coef(model)["log(cigpric)"] * log(67.44) +
  coef(model)["log(income)"] * log(6500) +
  coef(model)["educ"] * 16 +
  coef(model)["age"] * 77 +
  coef(model)["I(age^2)"] * (77^2) +
  coef(model)["restaurn"] * 0 +
  coef(model)["white"] * 0

print(paste("Predicted probability of smoking for individual 206:", predicted))
## [1] "Predicted probability of smoking for individual 206: -0.00396546766766326"

Question C4.

i. Estimate a model with voteA as the dependent variable.

data("vote1")

#(i)

model <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)

# Get residuals
vote1$residuals <- resid(model)

# Regress residuals on independent variables
resid_model <- lm(residuals ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)

# Display R-squared
summary(resid_model)$r.squared
## [1] 1.198131e-32
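# The R-squared is zero to machine precision: OLS residuals are orthogonal to
# the regressors by construction, so the regressors have no explanatory power
# for the residuals.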

Question C13.

i. Estimate the model

data("fertil2")
model <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
summary(model)
## 
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban, 
##     data = fertil2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9012 -0.7136 -0.0039  0.7119  7.4318 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.2225162  0.2401888 -17.580  < 2e-16 ***
## age          0.3409255  0.0165082  20.652  < 2e-16 ***
## I(age^2)    -0.0027412  0.0002718 -10.086  < 2e-16 ***
## educ        -0.0752323  0.0062966 -11.948  < 2e-16 ***
## electric    -0.3100404  0.0690045  -4.493 7.20e-06 ***
## urban       -0.2000339  0.0465062  -4.301 1.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.452 on 4352 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.5734, Adjusted R-squared:  0.5729 
## F-statistic:  1170 on 5 and 4352 DF,  p-value: < 2.2e-16

iii. From the regression in part ii, obtain the fitted values ŷ and the residuals û.

# Obtain fitted values and residuals (model2 is the regression estimated in
# part ii, which is not shown here)
fitted_vals <- fitted(model2)
residuals_sq <- resid(model2)^2

# Regression of squared residuals on fitted values
het_test <- lm(residuals_sq ~ fitted_vals + I(fitted_vals^2))

# Test joint significance
summary(het_test)
## 
## Call:
## lm(formula = residuals_sq ~ fitted_vals + I(fitted_vals^2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.12411 -0.09344 -0.04890  0.04351  0.54298 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)        3.1469     3.0320   1.038    0.301
## fitted_vals       -1.9883     1.9544  -1.017    0.311
## I(fitted_vals^2)   0.3244     0.3143   1.032    0.304
## 
## Residual standard error: 0.1288 on 138 degrees of freedom
## Multiple R-squared:  0.008996,   Adjusted R-squared:  -0.005366 
## F-statistic: 0.6264 on 2 and 138 DF,  p-value: 0.536
linearHypothesis(het_test, c("fitted_vals = 0", "I(fitted_vals^2) = 0"))
## 
## Linear hypothesis test:
## fitted_vals = 0
## I(fitted_vals^2) = 0
## 
## Model 1: restricted model
## Model 2: residuals_sq ~ fitted_vals + I(fitted_vals^2)
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    140 2.3097                           
## 2    138 2.2889  2  0.020778 0.6264  0.536

iv. Would you say the heteroskedasticity you found in part iii is practically important?

# Based on the joint test above (F = 0.63, p-value = 0.536), the fitted values
# have no systematic relationship with the squared residuals, so the
# heteroskedasticity is neither statistically significant nor practically
# important here.

CHAPTER 9

Question 1.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Exercise 1
# Step 1: Load the CEOSAL2 dataset
data("ceosal2")

# Step 2: Estimate the base model
base_model <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten, data = ceosal2)
summary(base_model)
## 
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg + 
##     ceoten + comten, data = ceosal2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5436 -0.2796 -0.0164  0.2857  1.9879 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.571977   0.253466  18.038  < 2e-16 ***
## log(sales)   0.187787   0.040003   4.694 5.46e-06 ***
## log(mktval)  0.099872   0.049214   2.029  0.04397 *  
## profmarg    -0.002211   0.002105  -1.050  0.29514    
## ceoten       0.017104   0.005540   3.087  0.00236 ** 
## comten      -0.009238   0.003337  -2.768  0.00626 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4947 on 171 degrees of freedom
## Multiple R-squared:  0.3525, Adjusted R-squared:  0.3336 
## F-statistic: 18.62 on 5 and 171 DF,  p-value: 9.488e-15
# Extract R-squared for the base model
R2_base <- summary(base_model)$r.squared
cat("R-squared for the base model: ", R2_base, "\n")
## R-squared for the base model:  0.3525374
# Step 3: Add ceoten^2 and comten^2 and estimate the updated model
CEOSAL2 <- ceosal2 %>%
  mutate(ceoten_sq = ceoten^2, comten_sq = comten^2)

updated_model <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten + ceoten_sq + comten_sq, data = CEOSAL2)
summary(updated_model)
## 
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg + 
##     ceoten + comten + ceoten_sq + comten_sq, data = CEOSAL2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.47661 -0.26615 -0.03878  0.27787  1.89344 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.424e+00  2.656e-01  16.655  < 2e-16 ***
## log(sales)   1.857e-01  3.980e-02   4.665 6.25e-06 ***
## log(mktval)  1.018e-01  4.872e-02   2.089 0.038228 *  
## profmarg    -2.575e-03  2.088e-03  -1.233 0.219137    
## ceoten       4.772e-02  1.416e-02   3.371 0.000929 ***
## comten      -6.063e-03  1.189e-02  -0.510 0.610815    
## ceoten_sq   -1.119e-03  4.814e-04  -2.324 0.021329 *  
## comten_sq   -5.389e-05  2.528e-04  -0.213 0.831474    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4892 on 169 degrees of freedom
## Multiple R-squared:  0.3745, Adjusted R-squared:  0.3486 
## F-statistic: 14.45 on 7 and 169 DF,  p-value: 1.136e-14
# Extract R-squared for the updated model
R2_updated <- summary(updated_model)$r.squared
cat("R-squared for the updated model: ", R2_updated, "\n")
## R-squared for the updated model:  0.3744746
# Step 4: Compare R-squared values
if (R2_updated > R2_base) {
  cat("The R-squared increased after adding ceoten^2 and comten^2, suggesting a possible functional form misspecification in the base model.\n")
} else {
  cat("No evidence of functional form misspecification.\n")
}
## The R-squared increased after adding ceoten^2 and comten^2, suggesting a possible functional form misspecification in the base model.
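
Since R-squared can never decrease when regressors are added, the comparison above is only suggestive; a joint F test of the two quadratic terms is the sharper check (a sketch, output not shown):

anova(base_model, updated_model)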

Question 5.

data("crime4")

missing_crimes <- crime4 %>%
  filter(is.na(crmrte)) 

cat("Number of colleges missing crime data: ", nrow(missing_crimes), "\n")
## Number of colleges missing crime data:  0

Question C4.

i. Reestimate equation (9.43), but now include a dummy variable for the District of Columbia observation.

model_with_dc <- lm(infmort ~ lpcinc + lphysic + lpopul + DC, data = infmrt)
summary(model_with_dc)
## 
## Call:
## lm(formula = infmort ~ lpcinc + lphysic + lpopul + DC, data = infmrt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6894 -0.8973 -0.1412  0.7054  3.1705 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.2125     6.7267   5.383 5.09e-07 ***
## lpcinc       -2.3964     0.8866  -2.703  0.00812 ** 
## lphysic      -1.5548     0.7734  -2.010  0.04718 *  
## lpopul        0.5755     0.1365   4.215 5.60e-05 ***
## DC           13.9632     1.2466  11.201  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.255 on 97 degrees of freedom
## Multiple R-squared:  0.6432, Adjusted R-squared:  0.6285 
## F-statistic: 43.72 on 4 and 97 DF,  p-value: < 2.2e-16
dc_coeff <- summary(model_with_dc)$coefficients["DC", ]
cat("Coefficient on DC:", dc_coeff[1], "\n")
## Coefficient on DC: 13.96317
cat("Standard error for DC:", dc_coeff[2], "\n")
## Standard error for DC: 1.246646
cat("P-value for DC:", dc_coeff[4], "\n")
## P-value for DC: 3.514299e-19
# Keep the estimates and standard errors from the model with DC, for
# comparison with equation (9.44) in part ii
coeff_with_dc <- coef(summary(model_with_dc))[, c(1, 2)]

ii. Compare the estimates and standard errors from part i with those from equation (9.44).

model_without_dc <- lm(infmort ~ lpcinc + lphysic + lpopul, data = infmrt)
summary(model_without_dc)
## 
## Call:
## lm(formula = infmort ~ lpcinc + lphysic + lpopul, data = infmrt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0204 -1.1897 -0.1152  0.9083  8.1890 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.22614   10.13465   3.574 0.000547 ***
## lpcinc      -4.88415    1.29319  -3.777 0.000273 ***
## lphysic      4.02783    0.89101   4.521 1.73e-05 ***
## lpopul      -0.05356    0.18748  -0.286 0.775712    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.891 on 98 degrees of freedom
## Multiple R-squared:  0.1818, Adjusted R-squared:  0.1568 
## F-statistic:  7.26 on 3 and 98 DF,  p-value: 0.0001898
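
A compact way to see how much the single DC observation drives the estimates (a sketch using the fitted models above; output not shown):

cbind(with_DC = coef(model_with_dc)[1:4], without_DC = coef(model_without_dc))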

Question C5.

library(dplyr)
library(quantreg)
## Loading required package: SparseM
library(wooldridge)
data("rdchem")

summary(rdchem)
##        rd              sales            profits           rdintens    
##  Min.   :   1.70   Min.   :   42.0   Min.   :  -4.30   Min.   :1.027  
##  1st Qu.:  10.85   1st Qu.:  507.8   1st Qu.:  42.12   1st Qu.:2.031  
##  Median :  42.60   Median : 1332.5   Median : 111.70   Median :2.641  
##  Mean   : 153.68   Mean   : 3797.0   Mean   : 370.50   Mean   :3.266  
##  3rd Qu.:  78.85   3rd Qu.: 2856.6   3rd Qu.: 295.68   3rd Qu.:3.965  
##  Max.   :1428.00   Max.   :39709.0   Max.   :4154.00   Max.   :9.422  
##     profmarg         salessq              lsales            lrd        
##  Min.   :-3.219   Min.   :1.764e+03   Min.   : 3.738   Min.   :0.5306  
##  1st Qu.: 4.074   1st Qu.:2.579e+05   1st Qu.: 6.230   1st Qu.:2.3839  
##  Median : 8.760   Median :1.790e+06   Median : 7.191   Median :3.7498  
##  Mean   : 9.823   Mean   :7.020e+07   Mean   : 7.165   Mean   :3.6028  
##  3rd Qu.:16.456   3rd Qu.:8.162e+06   3rd Qu.: 7.957   3rd Qu.:4.3631  
##  Max.   :27.187   Max.   :1.577e+09   Max.   :10.589   Max.   :7.2640
head(rdchem)
##      rd  sales profits rdintens  profmarg     salessq   lsales       lrd
## 1 430.6 4570.2   186.9 9.421906  4.089536 20886730.00 8.427312 6.0651798
## 2  59.0 2830.0   467.0 2.084806 16.501766  8008900.00 7.948032 4.0775375
## 3  23.5  596.8   107.4 3.937668 17.995979   356170.22 6.391582 3.1570003
## 4   3.5  133.6    -4.3 2.619760 -3.218563    17848.96 4.894850 1.2527629
## 5   1.7   42.0     8.0 4.047619 19.047619     1764.00 3.737670 0.5306283
## 6   8.4  390.0    47.3 2.153846 12.128205   152100.00 5.966147 2.1282318
# Convert sales to billions
rdchem <- rdchem %>%
  mutate(sales_bil = sales / 1000,
         sales_bil_sq = sales_bil^2)

i. Estimate the equation by OLS.

ols_full <- lm(rdintens ~ sales_bil + sales_bil_sq + profmarg, data = rdchem)
summary(ols_full)
## 
## Call:
## lm(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg, 
##     data = rdchem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0371 -1.1238 -0.4547  0.7165  5.8522 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   2.058967   0.626269   3.288  0.00272 **
## sales_bil     0.316611   0.138854   2.280  0.03041 * 
## sales_bil_sq -0.007390   0.003716  -1.989  0.05657 . 
## profmarg      0.053322   0.044210   1.206  0.23787   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared:  0.1905, Adjusted R-squared:  0.1037 
## F-statistic: 2.196 on 3 and 28 DF,  p-value: 0.1107
largest_firm <- rdchem %>% filter(sales_bil == max(sales_bil))

rdchem_no_outlier <- rdchem %>% filter(sales_bil != max(sales_bil))

ols_no_outlier <- lm(rdintens ~ sales_bil + sales_bil_sq + profmarg, data = rdchem_no_outlier)
summary(ols_no_outlier)
## 
## Call:
## lm(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg, 
##     data = rdchem_no_outlier)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0843 -1.1354 -0.5505  0.7570  5.7783 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   1.98352    0.71763   2.764   0.0102 *
## sales_bil     0.36062    0.23887   1.510   0.1427  
## sales_bil_sq -0.01025    0.01308  -0.784   0.4401  
## profmarg      0.05528    0.04579   1.207   0.2378  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.805 on 27 degrees of freedom
## Multiple R-squared:  0.1912, Adjusted R-squared:  0.1013 
## F-statistic: 2.128 on 3 and 27 DF,  p-value: 0.1201

ii. Estimate the same equation by LAD.

lad_full <- rq(rdintens ~ sales_bil + sales_bil_sq + profmarg, data = rdchem, tau = 0.5)
summary(lad_full)
## 
## Call: rq(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg, 
##     tau = 0.5, data = rdchem)
## 
## tau: [1] 0.5
## 
## Coefficients:
##              coefficients lower bd upper bd
## (Intercept)   1.40428      0.87031  2.66628
## sales_bil     0.26346     -0.13508  0.75753
## sales_bil_sq -0.00600     -0.01679  0.00344
## profmarg      0.11400      0.01376  0.16427
lad_no_outlier <- rq(rdintens ~ sales_bil + sales_bil_sq + profmarg, data = rdchem_no_outlier, tau = 0.5)
summary(lad_no_outlier)
## 
## Call: rq(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg, 
##     tau = 0.5, data = rdchem_no_outlier)
## 
## tau: [1] 0.5
## 
## Coefficients:
##              coefficients lower bd upper bd
## (Intercept)   2.61047      0.58936  2.81404
## sales_bil    -0.22364     -0.23542  0.87607
## sales_bil_sq  0.01681     -0.03201  0.02883
## profmarg      0.07594      0.00578  0.16392

iii. Based on your findings in i and ii, would you say OLS or LAD is more resilient to outliers?

cat("LAD is more resilient to outliers than OLS as it shows smaller variations after removing the largest firm.")
## LAD is more resilient to outliers than OLS as it shows smaller variations after removing the largest firm.
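
A side-by-side comparison makes each estimator's sensitivity visible (a sketch; coef() works for both lm and rq fits, output not shown):

cbind(OLS_full = coef(ols_full), OLS_trim = coef(ols_no_outlier),
      LAD_full = coef(lad_full), LAD_trim = coef(lad_no_outlier))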

CHAPTER 10

Question 1.

i. Like cross-sectional observations, we can assume that most time series observations are independently distributed.

#Disagree. Time series data often exhibit autocorrelation, where observations are not independent over time. For example, the value of a variable in one period is often correlated with its value in the previous period (e.g., stock prices, GDP growth). This violates the assumption of independence.

ii. The OLS estimator in a time series regression is unbiased under the first 3 Gauss-Markov assumptions.

#Agree. The first three Gauss-Markov assumptions are linearity, no perfect multicollinearity, and zero conditional mean of errors. If these assumptions hold in time series data, OLS is unbiased. However, unbiasedness does not imply efficiency if there is heteroskedasticity or autocorrelation.
# OLS residuals average to zero by construction when an intercept is included
# (illustrated here with a model fitted earlier in this document)
mean(residuals(model))
## [1] -6.393285e-18

iii. A trending variable cannot be used as the dependent variable in multiple regression analysis.

#Disagree. Trending variables can be used as the dependent variable if the regression includes controls for the trend (e.g., time dummies, polynomial trends, or differencing). Failing to control for the trend, however, can lead to spurious regression results.
# Generate a trending variable
set.seed(123)
time <- 1:100
trending_var <- 0.5 * time + rnorm(100)

# Check for trend
trend_model <- lm(trending_var ~ time)
summary(trend_model)
## 
## Call:
## lm(formula = trending_var ~ time)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.45356 -0.55236 -0.03462  0.64850  2.09487 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.036404   0.184287  -0.198    0.844    
## time         0.502511   0.003168 158.611   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9145 on 98 degrees of freedom
## Multiple R-squared:  0.9961, Adjusted R-squared:  0.9961 
## F-statistic: 2.516e+04 on 1 and 98 DF,  p-value: < 2.2e-16

iv. Seasonality is not an issue when using annual time series observations.

#Agree. Seasonality refers to systematic patterns within shorter time intervals (e.g., months or quarters). With annual data, such intra-year patterns are not present, so seasonality is not a concern.
annual_data <- ts(rnorm(10), frequency = 1) 

Question 5.

# Explanation:
# - time: Captures overall trend
# - interest_rate: Reflects interest rate effects
# - income: Reflects income effects
# - Q1, Q2, Q3: Seasonal dummies for the first three quarters (Q4 as the reference)
# - housing_starts: Dependent variable representing quarterly housing starts
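
A sketch of the implied specification; the data frame housing and all variable names here are hypothetical, not from a wooldridge dataset:

# Hypothetical quarterly model: trend, economic controls, and seasonal dummies
housing_model <- lm(housing_starts ~ time + interest_rate + income + Q1 + Q2 + Q3,
                    data = housing)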

Question C1.

# Create a dummy variable for years after 1979
intdef$after1979 <- ifelse(intdef$year > 1979, 1, 0)

# Fit the regression model
interest_model <- lm(i3 ~ inf + after1979, data = intdef)

# Display the summary of the model
summary(interest_model)
## 
## Call:
## lm(formula = i3 ~ inf + after1979, data = intdef)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5906 -0.7579  0.1414  1.1752  3.8358 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.52833    0.44914   3.403  0.00128 ** 
## inf          0.62991    0.08151   7.728 3.05e-10 ***
## after1979    2.17781    0.49630   4.388 5.48e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.837 on 53 degrees of freedom
## Multiple R-squared:  0.6047, Adjusted R-squared:  0.5898 
## F-statistic: 40.53 on 2 and 53 DF,  p-value: 2.086e-11
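# Holding inflation fixed, the T-bill rate averaged about 2.18 percentage
# points higher after 1979, consistent with the shift in Federal Reserve
# policy in late 1979.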

Question C9.

i. Consider the equation.

β₁ (associated with pcip): Likely positive. Higher industrial production growth (pcip) indicates economic growth, which tends to positively influence stock market returns.

β₂ (associated with i3): Likely negative. Higher returns on T-bills (i3) might attract investors away from stocks (substitution effect), reducing stock returns.

ii. Estimate the previous equation by OLS, reporting the results in standard form.

volat_model <- lm(rsp500 ~ pcip + i3, data = volat)
summary(volat_model)
## 
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.871  -22.580    2.103   25.524  138.137 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.84306    3.27488   5.754 1.44e-08 ***
## pcip         0.03642    0.12940   0.281   0.7785    
## i3          -1.36169    0.54072  -2.518   0.0121 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.13 on 554 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.01189,    Adjusted R-squared:  0.008325 
## F-statistic: 3.334 on 2 and 554 DF,  p-value: 0.03637

iii. Which of the variables is statistically significant?

# From the summary(volat_model) output above:
# pcip: estimate = 0.0364, p-value = 0.779 (not statistically significant)
# i3:   estimate = -1.3617, p-value = 0.012 (significant at the 5% level)

iv. Does your finding from part iii imply that the return on the S&P 500 is predictable?

#Since i3 is statistically significant, there is some evidence that S&P 500 returns are predictable. However:
#In-sample statistical significance may not imply strong real-world forecasting ability, as stock markets incorporate information quickly (efficient markets hypothesis).

CHAPTER 11

Question C7.

data("consump", package = "wooldridge")

i. Test the PIH

consump$gct <- c(NA, diff(log(consump$c)))
consump$gct_lag <- c(NA, head(consump$gct, -1))

# Fit the regression model for gct ~ gct_lag
model1 <- lm(gct ~ gct_lag, data = consump, na.action = na.omit)
summary(model1)
## 
## Call:
## lm(formula = gct ~ gct_lag, data = consump, na.action = na.omit)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.027878 -0.005974 -0.001451  0.007142  0.020227 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 0.011431   0.003778   3.026  0.00478 **
## gct_lag     0.446141   0.156047   2.859  0.00731 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01161 on 33 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1985, Adjusted R-squared:  0.1742 
## F-statistic: 8.174 on 1 and 33 DF,  p-value: 0.00731
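# gct_lag is statistically significant (p-value = 0.007), so lagged consumption
# growth helps predict current consumption growth, which is evidence against
# the PIH.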

ii. Add Additional Variables

consump$gy <- c(NA, diff(log(consump$y)))  # Growth rate of income
consump$gy_lag <- c(NA, head(consump$gy, -1))  # Lagged growth rate of income
consump$i3_lag <- c(NA, head(consump$i3, -1))  # Lagged T-bill rate
consump$inf_lag <- c(NA, head(consump$inf, -1))  # Lagged inflation rate

# Fit the regression model with additional variables
model2 <- lm(gct ~ gct_lag + gy_lag + i3_lag + inf_lag, data = consump, na.action = na.omit)
summary(model2)
## 
## Call:
## lm(formula = gct ~ gct_lag + gy_lag + i3_lag + inf_lag, data = consump, 
##     na.action = na.omit)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0249092 -0.0075869  0.0000855  0.0087233  0.0188618 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.0225940  0.0070892   3.187  0.00335 **
## gct_lag      0.4335853  0.2896533   1.497  0.14486   
## gy_lag      -0.1079101  0.1946373  -0.554  0.58340   
## i3_lag      -0.0007467  0.0011107  -0.672  0.50654   
## inf_lag     -0.0008281  0.0010041  -0.825  0.41606   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01134 on 30 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3038, Adjusted R-squared:  0.211 
## F-statistic: 3.273 on 4 and 30 DF,  p-value: 0.02431

iii. Impact on the p-value of gct_t-1

#After adding gy_t-1, i3_t-1, and inf_t-1, the p-value on gct_lag rises from 0.007 in model1 to 0.145 in model2, so lagged consumption growth is no longer individually significant; the added lagged variables absorb part of its effect.

iv. Joint Significance of All Variables

anova_result <- anova(model1, model2)
anova_result
## Analysis of Variance Table
## 
## Model 1: gct ~ gct_lag
## Model 2: gct ~ gct_lag + gy_lag + i3_lag + inf_lag
##   Res.Df       RSS Df  Sum of Sq      F Pr(>F)
## 1     33 0.0044446                            
## 2     30 0.0038608  3 0.00058384 1.5122 0.2315
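# The three added lagged variables are jointly insignificant (F = 1.51,
# p-value = 0.23), so they contribute little beyond lagged consumption growth.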

Question C12.

i. Find the first-order autocorrelation in gwage232.

library(dplyr) 
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
gwage232_clean <- na.omit(minwage$gwage232)

# Calculate first-order autocorrelation
acf_result <- acf(gwage232_clean, plot = FALSE)
autocorrelation <- acf_result$acf[2] # First lag autocorrelation
gwage232 <- data$gwage232  # Extract the gwage232 column

autocorr_1 <- acf_result$acf[2]  # The first-order autocorrelation
cat("First-order autocorrelation of gwage232:", autocorr_1, "\n")
## First-order autocorrelation of gwage232: -0.03493135

ii. Estimate the dynamic model by OLS.

model_dynamic <- lm(gwage232 ~ lag(gwage232, 1) + gmwage + gcpi, data = minwage)
summary(model_dynamic)
## 
## Call:
## lm(formula = gwage232 ~ lag(gwage232, 1) + gmwage + gcpi, data = minwage)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.044642 -0.004134 -0.001312  0.004482  0.041612 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.0024003  0.0004308   5.572 3.79e-08 ***
## lag(gwage232, 1) -0.0779092  0.0342851  -2.272  0.02341 *  
## gmwage            0.1518459  0.0096485  15.738  < 2e-16 ***
## gcpi              0.2630876  0.0824457   3.191  0.00149 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007889 on 606 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2986, Adjusted R-squared:  0.2951 
## F-statistic: 85.99 on 3 and 606 DF,  p-value: < 2.2e-16

iii. Add lagged growth in employment (gemp232_t-1) to the model.

minwage$gemp232_lag <- c(NA, head(minwage$gemp232, -1)) # Lagged growth in employment
model_with_gemp <- lm(gwage232 ~ lag(gwage232, 1) + gmwage + gcpi + gemp232_lag, data = minwage, na.action = na.omit)
summary(model_with_gemp)
## 
## Call:
## lm(formula = gwage232 ~ lag(gwage232, 1) + gmwage + gcpi + gemp232_lag, 
##     data = minwage, na.action = na.omit)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.043842 -0.004378 -0.001034  0.004321  0.042548 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.002451   0.000426   5.753  1.4e-08 ***
## lag(gwage232, 1) -0.074546   0.033901  -2.199 0.028262 *  
## gmwage            0.152707   0.009540  16.007  < 2e-16 ***
## gcpi              0.252296   0.081544   3.094 0.002066 ** 
## gemp232_lag       0.066131   0.016962   3.899 0.000108 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007798 on 605 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3158, Adjusted R-squared:  0.3112 
## F-statistic:  69.8 on 4 and 605 DF,  p-value: < 2.2e-16

iv. Compare models with and without lagged variables

gmwage_coef_dynamic <- summary(model_dynamic)$coefficients["gmwage", "Estimate"]
gmwage_coef_with_gemp <- summary(model_with_gemp)$coefficients["gmwage", "Estimate"]
cat("Part (iv) Coefficient for gmwage in model_dynamic:", gmwage_coef_dynamic, "\n")
## Part (iv) Coefficient for gmwage in model_dynamic: 0.1518459

v. Regress gmwage on gwage232_t-1 and gemp232_t-1

model_gmwage <- lm(gmwage ~ lag(gwage232, 1) + gemp232_lag, data = minwage, na.action = na.omit)
summary(model_gmwage)
## 
## Call:
## lm(formula = gmwage ~ lag(gwage232, 1) + gemp232_lag, data = minwage, 
##     na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.01914 -0.00500 -0.00379 -0.00287  0.62208 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       0.003433   0.001440   2.384   0.0174 *
## lag(gwage232, 1)  0.203167   0.143140   1.419   0.1563  
## gemp232_lag      -0.041706   0.072110  -0.578   0.5632  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03318 on 607 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.00392,    Adjusted R-squared:  0.0006377 
## F-statistic: 1.194 on 2 and 607 DF,  p-value: 0.3036
rsquared <- summary(model_gmwage)$r.squared
cat("Part (v) R-squared from regression of gmwage:", rsquared, "\n")
## Part (v) R-squared from regression of gmwage: 0.003919682
cat("A higher R-squared suggests that gmwage is explained well by lagged variables, aligning with part (iv).\n")
## A higher R-squared suggests that gmwage is explained well by lagged variables, aligning with part (iv).

CHAPTER 12

Question 11.

data("nyse", package = "wooldridge")

# Create the lagged variable for return
nyse$return_lag1 <- c(NA, head(nyse$return, -1))

# Fit the model (12.47)
model_1247 <- lm(return ~ return_lag1, data = nyse, na.action = na.omit)
summary(model_1247)
## 
## Call:
## lm(formula = return ~ return_lag1, data = nyse, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.261  -1.302   0.098   1.316   8.065 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.17963    0.08074   2.225   0.0264 *
## return_lag1  0.05890    0.03802   1.549   0.1218  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.11 on 687 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.003481,   Adjusted R-squared:  0.00203 
## F-statistic: 2.399 on 1 and 687 DF,  p-value: 0.1218
nyse_clean <- nyse[!is.na(nyse$return_lag1), ]

# Add squared residuals to the cleaned dataset
nyse_clean$squared_resid <- residuals(model_1247)^2

# Calculate average, minimum, and maximum of squared residuals
avg_squared_resid <- mean(nyse_clean$squared_resid)
min_squared_resid <- min(nyse_clean$squared_resid)
max_squared_resid <- max(nyse_clean$squared_resid)

i. Estimate the model in equation (12.47) and report the average, minimum, and maximum of the squared OLS residuals.

cat("Part (i) Average of squared residuals:", avg_squared_resid, "\n")
## Part (i) Average of squared residuals: 4.440839
cat("Minimum of squared residuals:", min_squared_resid, "\n")
## Minimum of squared residuals: 7.35465e-06
cat("Maximum of squared residuals:", max_squared_resid, "\n")
## Maximum of squared residuals: 232.8946

ii. Use the squared OLS residuals to estimate the following model of heteroskedasticity.

nyse_clean$return_lag1_sq <- nyse_clean$return_lag1^2
hetero_model <- lm(squared_resid ~ return_lag1 + return_lag1_sq, data = nyse_clean)
summary(hetero_model)
## 
## Call:
## lm(formula = squared_resid ~ return_lag1 + return_lag1_sq, data = nyse_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.459  -3.011  -1.975   0.676 221.469 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.25734    0.44085   7.389 4.32e-13 ***
## return_lag1    -0.78946    0.19569  -4.034 6.09e-05 ***
## return_lag1_sq  0.29666    0.03552   8.351 3.75e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.66 on 686 degrees of freedom
## Multiple R-squared:  0.1303, Adjusted R-squared:  0.1278 
## F-statistic:  51.4 on 2 and 686 DF,  p-value: < 2.2e-16
cat("Part (ii) Coefficients and diagnostics for the heteroskedasticity model:\n")
## Part (ii) Coefficients and diagnostics for the heteroskedasticity model:
print(summary(hetero_model))
## 
## Call:
## lm(formula = squared_resid ~ return_lag1 + return_lag1_sq, data = nyse_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.459  -3.011  -1.975   0.676 221.469 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.25734    0.44085   7.389 4.32e-13 ***
## return_lag1    -0.78946    0.19569  -4.034 6.09e-05 ***
## return_lag1_sq  0.29666    0.03552   8.351 3.75e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.66 on 686 degrees of freedom
## Multiple R-squared:  0.1303, Adjusted R-squared:  0.1278 
## F-statistic:  51.4 on 2 and 686 DF,  p-value: < 2.2e-16

iii. Sketch the conditional variance as a function of the lagged return.

delta_0 <- coef(hetero_model)[1]
delta_1 <- coef(hetero_model)[2]
delta_2 <- coef(hetero_model)[3]

lagged_returns <- seq(min(nyse$return_lag1, na.rm = TRUE), max(nyse$return_lag1, na.rm = TRUE), length.out = 100)
conditional_variance <- delta_0 + delta_1 * lagged_returns + delta_2 * lagged_returns^2

# Plot the conditional variance
plot(lagged_returns, conditional_variance, type = "l", col = "purple",
     main = "Conditional Variance as a Function of Lagged Return",
     xlab = "Lagged Return", ylab = "Conditional Variance")

min_var_index <- which.min(conditional_variance)
cat("Part (iii) Smallest variance:", conditional_variance[min_var_index], "\n")
## Part (iii) Smallest variance: 2.734254
cat("Lagged return corresponding to smallest variance:", lagged_returns[min_var_index], "\n")
## Lagged return corresponding to smallest variance: 1.245583
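
The grid approximation can be checked against the exact minimizer of the fitted quadratic, -delta_1 / (2 * delta_2), which is about 1.33 given the coefficients above:

# Exact turning point of the estimated conditional variance function
-delta_1 / (2 * delta_2)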

iv. For predicting the dynamic variance, does the model in part ii produce any negative variance estimates?

any_negative_variance <- any(conditional_variance < 0)
cat("Part (iv) Does the model produce any negative variance?", any_negative_variance, "\n")
## Part (iv) Does the model produce any negative variance? FALSE

v. Compare model (ii) to ARCH(1)

arch1_model <- lm(squared_resid ~ lag(squared_resid, 1), data = nyse_clean, na.action = na.omit)
summary(arch1_model)
## 
## Call:
## lm(formula = squared_resid ~ lag(squared_resid, 1), data = nyse_clean, 
##     na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.337  -3.292  -2.157   0.556 223.981 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            2.94743    0.44023   6.695 4.49e-11 ***
## lag(squared_resid, 1)  0.33706    0.03595   9.377  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.76 on 686 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1136, Adjusted R-squared:  0.1123 
## F-statistic: 87.92 on 1 and 686 DF,  p-value: < 2.2e-16
cat("Part (v) R-squared for heteroskedasticity model:", summary(hetero_model)$r.squared, "\n")
## Part (v) R-squared for heteroskedasticity model: 0.1303208
cat("R-squared for ARCH(1) model:", summary(arch1_model)$r.squared, "\n")
## R-squared for ARCH(1) model: 0.1136065

vi. ARCH(2) regression

nyse_clean$squared_resid_lag2 <- c(NA, NA, head(nyse_clean$squared_resid, -2))
arch2_model <- lm(squared_resid ~ return_lag1 + return_lag1_sq + squared_resid_lag2, data = nyse_clean)
summary(arch2_model)
## 
## Call:
## lm(formula = squared_resid ~ return_lag1 + return_lag1_sq + squared_resid_lag2, 
##     data = nyse_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.755  -3.039  -1.958   0.596 221.737 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.16429    0.45510   6.953 8.37e-12 ***
## return_lag1        -0.78460    0.19640  -3.995 7.17e-05 ***
## return_lag1_sq      0.28459    0.03810   7.469 2.47e-13 ***
## squared_resid_lag2  0.03492    0.03827   0.913    0.362    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.67 on 683 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1313, Adjusted R-squared:  0.1274 
## F-statistic:  34.4 on 3 and 683 DF,  p-value: < 2.2e-16
cat("Part (vi) R-squared for ARCH(2) model:", summary(arch2_model)$r.squared, "\n")
## Part (vi) R-squared for ARCH(2) model: 0.1312591