# Install 'wooldridge' only when it is missing, so that re-running the script
# does not trigger a download (or a failure when the repository is
# unreachable) on every execution.
if (!requireNamespace("wooldridge", quietly = TRUE)) {
  install.packages("wooldridge")
}

#Chapter 7 1

# Attach the package, load sleep75, and regress sleep on work time, education,
# a quadratic in age, and the male dummy.
library(wooldridge)
data("sleep75")
model <- lm(
  sleep ~ totwrk + educ + age + agesq + male,
  data = sleep75
)
summary(model)
## 
## Call:
## lm(formula = sleep ~ totwrk + educ + age + agesq + male, data = sleep75)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2378.00  -243.29     6.74   259.24  1350.19 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3840.83197  235.10870  16.336   <2e-16 ***
## totwrk        -0.16342    0.01813  -9.013   <2e-16 ***
## educ         -11.71332    5.86689  -1.997   0.0463 *  
## age           -8.69668   11.20746  -0.776   0.4380    
## agesq          0.12844    0.13390   0.959   0.3378    
## male          87.75243   34.32616   2.556   0.0108 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared:  0.1228, Adjusted R-squared:  0.1165 
## F-statistic: 19.59 on 5 and 700 DF,  p-value: < 2.2e-16
# (i) All other factors being equal, is there evidence that men sleep more than women? How strong is the evidence?
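#The t statistic on male is 87.75/34.33 = 2.56 (p-value = 0.011), so the coefficient is significant at the 5% level but not at the 1% level. Holding the other factors fixed, men are estimated to sleep about 87.75 minutes more than women, and the evidence for this is fairly strong.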
# (ii) Is there a statistically significant tradeoff between working and sleeping? What is the estimated tradeoff?
#t statistic = -0.163/0.018 = -9.01, so the tradeoff is statistically significant.
#When working time increases by one minute, sleeping time decreases by 0.163 minutes; equivalently, one more hour of work reduces sleep by about 0.163*60 = 9.8 minutes.
# (iii) What other regression do you need to run to test the null hypothesis that, holding other factors fixed, age has no effect on sleeping?
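#We need the restricted regression that drops both age and agesq, and then an F test of their joint significance that compares it with the full model.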
# Part (iii): age enters the equation through both age and agesq, so "age has
# no effect" is the joint hypothesis that both coefficients are zero. That is
# tested with an F test comparing the restricted fit (without age and agesq)
# against the unrestricted fit, not by eyeballing the change in R-squared.
data(sleep75)
# Unrestricted fit is re-estimated here so the comparison is self-contained
model_unrestricted <- lm(
  sleep ~ totwrk + educ + age + agesq + male,
  data = sleep75
)
# Restricted fit: age and agesq dropped
model <- lm(sleep ~ totwrk + educ + male, data = sleep75)
# F test of H0: the coefficients on age and agesq are both zero
anova(model, model_unrestricted)
summary(model)
## 
## Call:
## lm(formula = sleep ~ totwrk + educ + male, data = sleep75)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2380.27  -239.15     6.74   257.31  1370.63 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3747.51727   81.00609  46.262  < 2e-16 ***
## totwrk        -0.16734    0.01794  -9.329  < 2e-16 ***
## educ         -13.88479    5.65757  -2.454  0.01436 *  
## male          90.96919   34.27441   2.654  0.00813 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 418 on 702 degrees of freedom
## Multiple R-squared:  0.1193, Adjusted R-squared:  0.1155 
## F-statistic: 31.69 on 3 and 702 DF,  p-value: < 2.2e-16
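#R-squared falls only from 0.1228 to 0.1193 when age and agesq are dropped. The F statistic is ((0.1228-0.1193)/2)/((1-0.1228)/700) = 1.40, well below the 10% critical value of about 2.3, so age and agesq are jointly insignificant: we cannot reject that age has no effect on sleeping time.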

#Chapter 7 3

# Attach the package, load gpa2, and regress the SAT score on class size
# (quadratic), the female and black dummies, and their interaction.
library(wooldridge)
data("gpa2")
model <- lm(
  sat ~ hsize + hsizesq + female + black + I(female * black),
  data = gpa2
)
summary(model)
## 
## Call:
## lm(formula = sat ~ hsize + hsizesq + female + black + I(female * 
##     black), data = gpa2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -570.45  -89.54   -5.24   85.41  479.13 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1028.0972     6.2902 163.445  < 2e-16 ***
## hsize               19.2971     3.8323   5.035 4.97e-07 ***
## hsizesq             -2.1948     0.5272  -4.163 3.20e-05 ***
## female             -45.0915     4.2911 -10.508  < 2e-16 ***
## black             -169.8126    12.7131 -13.357  < 2e-16 ***
## I(female * black)   62.3064    18.1542   3.432 0.000605 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared:  0.08578,    Adjusted R-squared:  0.08468 
## F-statistic: 77.52 on 5 and 4131 DF,  p-value: < 2.2e-16
# (i) Is there strong evidence that hsize^2 should be included in the model? From this equation, what is the optimal high school size?

# The coefficient on hsizesq has a t statistic of -4.16 (p-value 3.2e-05), so there is strong evidence that hsize^2 belongs in the model, even though the overall R-squared is small. For the optimal size, set the derivative of 19.30*hsize - 2.19*hsize^2 to zero: 19.30 - 2*2.19*hsize = 0, so hsize = 19.30/(2*2.19) = 4.4 (in the units in which hsize is measured).

# (ii) Holding hsize fixed, what is the estimated difference in SAT score between nonblack females and non-black males?

# For this comparison black = 0, so the interaction term female*black drops out and the difference is just the coefficient on female: nonblack females score about 45.09 points lower than nonblack males (t = -10.51, highly significant).

# (iii) What is the estimated difference in SAT score between non-black males and black males?

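# With female = 0 the difference is the coefficient on black: black males score about 169.81 points lower than non-black males (t = -13.36).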
# (iv) What is the estimated difference in SAT score between black females and non-black females?

# − 169.81black +62.31female · black = -107.5

#Chapter 7 C1

# Attach the package, load gpa1, and list its column names.
library(wooldridge)
data("gpa1")
colnames(gpa1)
##  [1] "age"      "soph"     "junior"   "senior"   "senior5"  "male"    
##  [7] "campus"   "business" "engineer" "colGPA"   "hsGPA"    "ACT"     
## [13] "job19"    "job20"    "drive"    "bike"     "walk"     "voluntr" 
## [19] "PC"       "greek"    "car"      "siblings" "bgfriend" "clubs"   
## [25] "skipped"  "alcohol"  "gradMI"   "fathcoll" "mothcoll"
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
# (i) Add the variables mothcoll and fathcoll to the equation estimated in (7.6) and report the results in the usual form. What happens to the estimated effect of PC ownership? Is PC still statistically significant?
# Baseline equation: college GPA explained by PC ownership, high-school GPA
# and ACT score. The previous baseline regressed hsGPA on the parents' college
# dummies, so PC never entered the model and update() added terms that were
# already present.
# NOTE(review): baseline taken to be colGPA ~ PC + hsGPA + ACT; confirm
# against equation (7.6) in the text.
model <- lm(colGPA ~ PC + hsGPA + ACT, data = gpa1)

# Add mothcoll and fathcoll to the model
model_updated <- update(model, . ~ . + mothcoll + fathcoll)

# View both fits so the estimate and significance of PC can be compared
summary(model)
summary(model_updated)
# (ii) Test for joint significance of mothcoll and fathcoll in the equation from part (i) and be sure to report the p-value.
# Joint test that the coefficients on mothcoll and fathcoll are both zero.
# coeftest() only reports one t test per coefficient (and ignores `terms`),
# so it cannot answer a joint hypothesis; waldtest() compares the model with
# and without the two dummies and returns a single F statistic and p-value.
library(sandwich)
joint_test <- waldtest(
  model_updated,
  . ~ . - mothcoll - fathcoll,
  # heteroskedasticity-robust (HC1) covariance of the unrestricted model
  vcov = function(fit) vcovHC(fit, type = "HC1"),
  test = "F"
)
joint_test
# p-value of the joint test
joint_test[["Pr(>F)"]][2]

# (iii) Add hsGPA^2 to the model from part (i) and decide whether this generalization is needed.

## Warning in anova.lmlist(object, ...): models with response '"PC"' removed
## because response differs from model 1
## Analysis of Variance Table
## 
## Response: hsGPA
##            Df  Sum Sq  Mean Sq F value Pr(>F)
## mothcoll    1  0.0116 0.011629  0.1121 0.7383
## fathcoll    1  0.0002 0.000219  0.0021 0.9634
## Residuals 138 14.3175 0.103750
library(wooldridge)
data("wage2")

# (i) Estimate the model

## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98069 -0.21996  0.00707  0.24288  1.22822 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.395497   0.113225  47.653  < 2e-16 ***
## educ         0.065431   0.006250  10.468  < 2e-16 ***
## exper        0.014043   0.003185   4.409 1.16e-05 ***
## tenure       0.011747   0.002453   4.789 1.95e-06 ***
## married      0.199417   0.039050   5.107 3.98e-07 ***
## black       -0.188350   0.037667  -5.000 6.84e-07 ***
## south       -0.090904   0.026249  -3.463 0.000558 ***
## urban        0.183912   0.026958   6.822 1.62e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared:  0.2526, Adjusted R-squared:  0.2469 
## F-statistic: 44.75 on 7 and 927 DF,  p-value: < 2.2e-16

# (ii) Add the variables exper^2 and tenure^2 to the equation and show that they are jointly insignificant at even the 20% level.

# Baseline log-wage equation from part (i)
base_model <- lm(
  log(wage) ~ educ + exper + tenure + married + black + south + urban,
  data = wage2
)

# Extend the model with the squares of exper and tenure. Re-adding exper and
# tenure themselves changes nothing (the comparison then has 0 degrees of
# freedom). I() is required because a bare `exper^2` inside a formula is
# formula algebra, not arithmetic.
extended_model <- update(base_model, . ~ . + I(exper^2) + I(tenure^2))

# F test of the joint significance of the two quadratic terms
anova(base_model, extended_model)

# (iii) Extend the original model to allow the return to education to depend on race and test whether the return to education does depend on race.

## 
## Call:
## lm(formula = log(wage) ~ educ * black + exper + tenure + married + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97782 -0.21832  0.00475  0.24136  1.23226 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.374817   0.114703  46.859  < 2e-16 ***
## educ         0.067115   0.006428  10.442  < 2e-16 ***
## black        0.094809   0.255399   0.371 0.710561    
## exper        0.013826   0.003191   4.333 1.63e-05 ***
## tenure       0.011787   0.002453   4.805 1.80e-06 ***
## married      0.198908   0.039047   5.094 4.25e-07 ***
## south       -0.089450   0.026277  -3.404 0.000692 ***
## urban        0.183852   0.026955   6.821 1.63e-11 ***
## educ:black  -0.022624   0.020183  -1.121 0.262603    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared:  0.2536, Adjusted R-squared:  0.2471 
## F-statistic: 39.32 on 8 and 926 DF,  p-value: < 2.2e-16

# (iv) What is the estimated wage differential between married blacks and married nonblacks?

## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98069 -0.21996  0.00707  0.24288  1.22822 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.395497   0.113225  47.653  < 2e-16 ***
## educ         0.065431   0.006250  10.468  < 2e-16 ***
## exper        0.014043   0.003185   4.409 1.16e-05 ***
## tenure       0.011747   0.002453   4.789 1.95e-06 ***
## married      0.199417   0.039050   5.107 3.98e-07 ***
## black       -0.188350   0.037667  -5.000 6.84e-07 ***
## south       -0.090904   0.026249  -3.463 0.000558 ***
## urban        0.183912   0.026958   6.822 1.62e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared:  0.2526, Adjusted R-squared:  0.2469 
## F-statistic: 44.75 on 7 and 927 DF,  p-value: < 2.2e-16
## <NA> 
##   NA

#Chapter 8 1

#Which of the following are consequences of heteroskedasticity?
# (i) The OLS estimators, b^ j, are inconsistent.
# (ii) The usual F statistic no longer has an F distribution.
# (iii) The OLS estimators are no longer BLUE.
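# Only (ii) and (iii) are consequences of heteroskedasticity. Statement (i) is false: heteroskedasticity does not make the OLS estimators biased or inconsistent, because it does not affect the zero conditional mean assumption. It does invalidate the usual standard errors, so the usual t and F statistics no longer have t and F distributions, and OLS is no longer BLUE. It is important to detect and address heteroskedasticity to obtain valid and efficient inference in regression analysis. Common remedies include using heteroskedasticity-robust standard errors or weighted least squares.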

#Chapter 8 5

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmtest)
library(ggplot2)
## 'data.frame':    807 obs. of  10 variables:
##  $ educ    : num  16 16 12 13.5 10 6 12 15 12 12 ...
##  $ cigpric : num  60.5 57.9 57.7 57.9 58.3 ...
##  $ white   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ age     : int  46 40 58 30 17 86 35 48 48 31 ...
##  $ income  : int  20000 30000 30000 20000 20000 6500 20000 30000 20000 20000 ...
##  $ cigs    : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ restaurn: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lincome : num  9.9 10.3 10.3 9.9 9.9 ...
##  $ agesq   : int  2116 1600 3364 900 289 7396 1225 2304 2304 961 ...
##  $ lcigpric: num  4.1 4.06 4.05 4.06 4.07 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
##  [1] "educ"     "cigpric"  "white"    "age"      "income"   "cigs"    
##  [7] "restaurn" "lincome"  "agesq"    "lcigpric"

# (i) Differences between Standard Errors

## (Intercept)        educ       exper      tenure     married       black 
## 0.113225045 0.006250395 0.003185185 0.002452973 0.039050151 0.037666636 
##       south       urban 
## 0.026248508 0.026958329

# (ii) Effect of Education on Smoking Probability

##      educ 
## 0.2617229

# (iii) Age Effect on Smoking Probability

## <NA> 
##   NA
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 23.753, df = 7, p-value = 0.001259

# (iv) Interpretation of Coefficient on ‘restaurn’

## <NA> 
##   NA

# (v) Predicted Probability for Person 206

## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 23.753, df = 7, p-value = 0.001259

#Chapter 8 C13

# Install sandwich and lmtest only when they are missing, so that re-running
# the script does not attempt a download on every execution.
for (pkg in c("sandwich", "lmtest")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(sandwich)
library(lmtest)
# Assuming 'children', 'age', 'age_squared', 'educ', 'electric', 'urban' are columns in the dataset
library(wooldridge)
data("fertil2")
# Fertility equation: number of children on a quadratic in age, education,
# and the electricity and urban dummies.
model <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
# Heteroskedasticity-robust standard errors (vcovHC() defaults to type "HC3")
robust_se <- sqrt(diag(vcovHC(model)))
# Usual (non-robust) OLS standard errors, taken from the same fit
ols_se <- sqrt(diag(vcov(model)))
# Coefficient estimates next to the non-robust and the robust standard
# errors, with every column labelled so the two sets can be compared directly
summary_with_robust_se <- cbind(
  "Estimate" = coef(model),
  "OLS SE" = ols_se,
  "Robust SE" = robust_se
)
summary(model)
## 
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban, 
##     data = fertil2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9012 -0.7136 -0.0039  0.7119  7.4318 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.2225162  0.2401888 -17.580  < 2e-16 ***
## age          0.3409255  0.0165082  20.652  < 2e-16 ***
## I(age^2)    -0.0027412  0.0002718 -10.086  < 2e-16 ***
## educ        -0.0752323  0.0062966 -11.948  < 2e-16 ***
## electric    -0.3100404  0.0690045  -4.493 7.20e-06 ***
## urban       -0.2000339  0.0465062  -4.301 1.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.452 on 4352 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.5734, Adjusted R-squared:  0.5729 
## F-statistic:  1170 on 5 and 4352 DF,  p-value: < 2.2e-16
print(summary_with_robust_se)
# Add the three religion dummies and test their joint significance, with and
# without heteroskedasticity-robust covariance. coeftest() only reports
# per-coefficient t tests, and the dummies were never added to the model, so
# the previous code did not test this hypothesis.
# NOTE(review): assumes the religion dummies in fertil2 are named spirit,
# protest and catholic; confirm with names(fertil2).
relig_vars <- c("spirit", "protest", "catholic")
base_vars <- c("children", "age", "educ", "electric", "urban")
# Both models must use the same rows, so drop incomplete cases up front
fertil2_complete <- fertil2[
  complete.cases(fertil2[, c(base_vars, relig_vars)]),
]
model_base <- lm(
  children ~ age + I(age^2) + educ + electric + urban,
  data = fertil2_complete
)
model_relig <- update(model_base, . ~ . + spirit + protest + catholic)
joint_test_nonrobust <- waldtest(model_base, model_relig, test = "F")
joint_test <- waldtest(model_base, model_relig, vcov = vcovHC, test = "F")
# p-values of the non-robust and the robust joint tests
print(c(
  nonrobust = joint_test_nonrobust[["Pr(>F)"]][2],
  robust = joint_test[["Pr(>F)"]][2]
))
# Obtain fitted values and squared residuals from `model`. The squared
# residuals get their own name so that the stats::residuals() function is not
# shadowed by a variable called `residuals`.
fitted_values <- fitted(model)
squared_residuals <- resid(model)^2

# Special form of the White test: regress the squared residuals on the fitted
# values and their squares. The overall F statistic reported by summary() is
# the joint test of the two regressors; a small p-value indicates
# heteroskedasticity.
hetero_test <- lm(squared_residuals ~ fitted_values + I(fitted_values^2))
summary(hetero_test)

#Chapter 9 1

# Baseline CEO salary equation
original_model <- lm(
  log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten,
  data = ceosal2
)
original_r_squared <- summary(original_model)$r.squared  # R-squared for the original model

# Model with the squared tenure terms. I() is required: inside a formula a
# bare `ceoten^2` is formula algebra and collapses to `ceoten`, which made the
# "extended" model identical to the baseline.
model_with_additional <- lm(
  log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten +
    I(ceoten^2) + I(comten^2),
  data = ceosal2
)
additional_r_squared <- summary(model_with_additional)$r.squared  # R-squared with the quadratics

# R-squared can never fall when regressors are added, so the sign of the
# change is not a test. Functional form misspecification is assessed with the
# F test for the joint significance of the two quadratic terms.
change_in_r_squared <- additional_r_squared - original_r_squared
quadratic_f_test <- anova(original_model, model_with_additional)
quadratic_p_value <- quadratic_f_test[["Pr(>F)"]][2]

cat(sprintf(
  "R-squared rises from %.3f to %.3f; joint F test p-value = %.4f.\n",
  original_r_squared, additional_r_squared, quadratic_p_value
))
if (quadratic_p_value < 0.05) {
  cat("ceoten^2 and comten^2 are jointly significant at the 5% level, which is evidence of functional form misspecification.\n")
} else {
  cat("ceoten^2 and comten^2 are not jointly significant at the 5% level, so there is no strong evidence of functional form misspecification.\n")
}

#Chapter 9 5

library(wooldridge)
data("campus")

# Reported estimates and standard errors for the intercept (b0) and slope (b1)
b0 <- -6.63
se_b0 <- 1.03
b1 <- 1.27
se_b1 <- 0.11
n <- nrow(campus)

# To take these from a fitted regression of log(crime) on log(enroll) instead,
# fit it as `model` and uncomment:
# b0 <- coef(model)[1]
# se_b0 <- sqrt(diag(vcov(model)))[1]
# b1 <- coef(model)[2]
# se_b1 <- sqrt(diag(vcov(model)))[2]

# t statistic for H0: B1 = 1
t_stat <- (b1 - 1) / se_b1

# Sample size minus the two quantities subtracted for this regression
df <- n - 2

# One-sided critical value at alpha = 0.05
critical_value <- qt(0.95, df = df)

# Pick the verdict first, then print it once
test_message <- if (t_stat > critical_value) {
  "Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.\n"
} else {
  "Fail to reject the null hypothesis H0: B1 = 1 at the 5% level.\n"
}
cat(test_message)
## Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.
# Written answer on whether the non-reporting of campus crimes can be treated
# as exogenous sample selection. Each cat() call prints one paragraph; the
# "## ..." line after it is the recorded output of that call.
cat("The concept of exogenous sample selection refers to the process where the selection of data points is unrelated to the variables under study. In the context of Example 4.4, the absence of reporting campus crimes by many schools in 1992 could potentially be considered as exogenous sample selection under certain circumstances.\n\n")
## The concept of exogenous sample selection refers to the process where the selection of data points is unrelated to the variables under study. In the context of Example 4.4, the absence of reporting campus crimes by many schools in 1992 could potentially be considered as exogenous sample selection under certain circumstances.
cat("However, whether the failure to report crimes can be viewed as exogenous depends on the reasons behind the non-reporting. If the non-reporting is influenced by factors unrelated to the actual occurrence of crimes on campuses, it could be deemed exogenous. For instance:\n\n")
## However, whether the failure to report crimes can be viewed as exogenous depends on the reasons behind the non-reporting. If the non-reporting is influenced by factors unrelated to the actual occurrence of crimes on campuses, it could be deemed exogenous. For instance:
cat("1. Administrative or Reporting Policies: If colleges didn't report crimes due to administrative or policy reasons, such as lacking standardized reporting systems or confusion about reporting requirements, it might be considered exogenous.\n\n")
## 1. Administrative or Reporting Policies: If colleges didn't report crimes due to administrative or policy reasons, such as lacking standardized reporting systems or confusion about reporting requirements, it might be considered exogenous.
cat("2. Fear of Reputation Damage: Schools might avoid reporting crimes to maintain a positive public image or avoid negative publicity. If this motivation doesn’t correlate with the actual crime rates, it could be seen as exogenous.\n\n")
## 2. Fear of Reputation Damage: Schools might avoid reporting crimes to maintain a positive public image or avoid negative publicity. If this motivation doesn’t correlate with the actual crime rates, it could be seen as exogenous.
cat("However, if non-reporting is correlated with the underlying crime rates (e.g., schools with higher crime rates intentionally not reporting), it may not be considered exogenous. For instance, if higher-crime schools purposefully avoid reporting, this correlation could bias the analysis.\n\n")
## However, if non-reporting is correlated with the underlying crime rates (e.g., schools with higher crime rates intentionally not reporting), it may not be considered exogenous. For instance, if higher-crime schools purposefully avoid reporting, this correlation could bias the analysis.
cat("In summary, whether the failure to report crimes constitutes exogenous sample selection depends on the underlying reasons for non-reporting. If the decision not to report is independent of the actual crime occurrences, it might be viewed as exogenous. Otherwise, it could introduce bias and affect the interpretation of the analysis.\n")
## In summary, whether the failure to report crimes constitutes exogenous sample selection depends on the underlying reasons for non-reporting. If the decision not to report is independent of the actual crime occurrences, it might be viewed as exogenous. Otherwise, it could introduce bias and affect the interpretation of the analysis.

#Chapter 9 C3

library(wooldridge)
data("jtrain")

# Filter the dataset for the year 1988
jtrain_1988 <- subset(jtrain, year == 1988)

# Simple regression model using data for 1988 with grant as the explanatory variable
model_1988 <- lm(log(scrap) ~ grant, data = jtrain_1988)

# Model with lagged lscrap as an explanatory variable
model_with_lagged_lscrap <- lm(log(scrap) ~ grant + lscrap_1, data = jtrain_1988)

# Test H0: the coefficient on lscrap_1 equals one, against the two-sided
# alternative. The p-value printed by summary() is for H0: coefficient = 0,
# so the t statistic has to be recomputed around the hypothesized value 1.
test_lscrap_1 <- summary(model_with_lagged_lscrap)
lscrap_1_estimate <- test_lscrap_1$coefficients["lscrap_1", "Estimate"]
lscrap_1_se <- test_lscrap_1$coefficients["lscrap_1", "Std. Error"]
t_stat_lscrap_1 <- (lscrap_1_estimate - 1) / lscrap_1_se
p_value_lscrap_1 <- 2 * pt(
  abs(t_stat_lscrap_1),
  df = df.residual(model_with_lagged_lscrap),
  lower.tail = FALSE
)

# Display results
lscrap_1_verdict <- if (p_value_lscrap_1 < 0.05) {
  "Reject H0 at the 5% level"
} else {
  "Fail to reject H0 at the 5% level"
}
cat(sprintf(
  "Test of H0: coefficient on lscrap_1 = 1: t = %.3f, p-value = %.4f. %s\n",
  t_stat_lscrap_1, p_value_lscrap_1, lscrap_1_verdict
))

#Chapter 9 C4

library(wooldridge)
data("infmrt")

# Keep only the 1990 rows (which() drops rows where the comparison is NA)
infmrt_1990 <- infmrt[which(infmrt$year == 1990), ]

# Infant mortality on logged income, physicians and population, plus the DC
# dummy that isolates the District of Columbia observation
model_with_dummy <- lm(
  infmort ~ log(pcinc) + log(physic) + log(popul) + DC,
  data = infmrt_1990
)

# Show the fitted model
summary(model_with_dummy)
## 
## Call:
## lm(formula = infmort ~ log(pcinc) + log(physic) + log(popul) + 
##     DC, data = infmrt_1990)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4964 -0.8076  0.0000  0.9358  2.6077 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  23.9548    12.4195   1.929  0.05994 .  
## log(pcinc)   -0.5669     1.6412  -0.345  0.73135    
## log(physic)  -2.7418     1.1908  -2.303  0.02588 *  
## log(popul)    0.6292     0.1911   3.293  0.00191 ** 
## DC           16.0350     1.7692   9.064 8.43e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.246 on 46 degrees of freedom
## Multiple R-squared:  0.691,  Adjusted R-squared:  0.6641 
## F-statistic: 25.71 on 4 and 46 DF,  p-value: 3.146e-11

#Chapter 10 1

library(wooldridge)
data(intdef)
# Decide if you agree or disagree with each of the following statements and give a brief explanation of your decision:
# (i) Claim: time series observations can be assumed independently distributed
independence_statement <- "Disagree"
independence_explanation <- "Time series data often exhibits autocorrelation, violating independence assumptions."

# (ii) Claim: the OLS estimator in a time series regression is unbiased
# (under the first three Gauss-Markov assumptions for time series).
# NOTE(review): wording of the claim reconstructed from the exercise; confirm.
ols_statement <- "Agree"
ols_explanation <- "Linearity in parameters, no perfect collinearity and strict exogeneity (zero conditional mean) are enough for OLS to be unbiased; homoskedasticity and no serial correlation are only needed for the usual variance formulas."

# (iii) Claim: a trending variable cannot be the dependent variable
trending_statement <- "Disagree"
trending_explanation <- "Trending variables can be used as the dependent variable; include a time trend so that trending regressors do not produce a spurious relationship."

# (iv) Claim: seasonality is not an issue with annual observations
seasonality_statement <- "Agree"
seasonality_explanation <- "Seasonal patterns play out within a year, so annual observations already aggregate over every season."

#Chapter 10 5

#Suppose you have quarterly data on new housing starts, interest rates, and real per capita income. Specify a model for housing starts that accounts for possible trends and seasonality in the variables.

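# With quarterly data, trends are captured by a linear time trend t and seasonality by three quarterly dummies (Q2, Q3, Q4, with the first quarter as the base):
# housing_starts_t = b0 + b1*interest_rates_t + b2*real_income_t + b3*t + d1*Q2_t + d2*Q3_t + d3*Q4_t + u_t
# model_housing_starts <- lm(housing_starts ~ interest_rates + real_income + t + Q2 + Q3 + Q4, data = data)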

#Chapter 10 C1

# In October 1979, the Federal Reserve changed its policy of using finely tuned interest rate adjustments and instead began targeting the money supply. Using the data in INTDEF, define a dummy variable equal to 1 for years after 1979. Include this dummy in equation (10.15) to see if there is a shift in the interest rate equation after 1979. What do you conclude?

# Create a dummy variable for the policy change: 1 for years after 1979
intdef$dummy <- ifelse(intdef$year > 1979, 1, 0)

# Interest rate equation with the dummy added. The dependent variable is the
# three-month T-bill rate (i3), explained by inflation (inf) and the deficit
# (def); the dummy lets the intercept shift after 1979. The previous version
# used inflation as the dependent variable and first differences as
# regressors, which is not the interest rate equation the question asks about.
# NOTE(review): specification taken to be i3 ~ inf + def; confirm against
# equation (10.15) in the text.
model_with_dummy <- lm(i3 ~ inf + def + dummy, data = intdef)
summary(model_with_dummy)

# Conclusion, built from the fitted model instead of hard-coded numbers
dummy_stats <- coef(summary(model_with_dummy))["dummy", ]
dummy_estimate <- dummy_stats[["Estimate"]]
dummy_p_value <- dummy_stats[["Pr(>|t|)"]]

cat("Conclusion regarding the model:\n")
cat(sprintf(
  "- Estimated shift in the interest rate equation after 1979: %.3f (p-value = %.4f).\n",
  dummy_estimate, dummy_p_value
))
if (dummy_p_value < 0.05) {
  cat("- The post-1979 dummy is statistically significant at the 5% level: holding inflation and the deficit fixed, the interest rate equation shifted after the policy change.\n")
} else {
  cat("- The post-1979 dummy is not statistically significant at the 5% level: there is no evidence of a shift in the interest rate equation after the policy change.\n")
}
cat(sprintf(
  "- Adjusted R-squared of the model = %.4f.\n",
  summary(model_with_dummy)$adj.r.squared
))

#Chapter 10 C6

data(fertil3)
# (i): Regress gfr on t and tsq to obtain the residuals (detrended gfr).
# na.exclude keeps the residual vector aligned with the rows of fertil3.
model_t_tsq <- lm(gfr ~ t + tsq, data = fertil3, na.action = na.exclude)
residuals_gft <- resid(model_t_tsq)

# (ii): Regress the detrended series on the explanatory variables of the
# fertility equation, including t and tsq. The previous regressor list
# contained cgfr, gfr_1 and other variables built from gfr itself, which
# reproduces gfr exactly and gave a meaningless perfect fit.
# NOTE(review): regressors taken to be pe, ww2, pill, t and tsq; confirm
# against equation (10.35) in the text.
model_10_35 <- lm(residuals_gft ~ pe + ww2 + pill + t + tsq, data = fertil3)
summary(model_10_35)

# Same regressors with gfr in levels, to compare the two R-squared values
model_10_35_levels <- lm(gfr ~ pe + ww2 + pill + t + tsq, data = fertil3)
c(
  detrended = summary(model_10_35)$r.squared,
  levels = summary(model_10_35_levels)$r.squared
)
# (iii): Re-estimate the fertility equation with a cubic time trend added and
# check whether the extra term is statistically significant. The previous
# version added pe_3 twice and again included cgfr, gfr_1 and other variables
# built from gfr, so it fitted gfr perfectly and tested nothing.
# NOTE(review): the exercise is taken to ask for t^3 as the additional term,
# and the regressors to be pe, ww2, pill, t and tsq; confirm against the text.
# The object name is kept unchanged for compatibility with any later code.
model_with_pe_3 <- lm(
  gfr ~ pe + ww2 + pill + t + tsq + I(t^3),
  data = fertil3
)
summary(model_with_pe_3)