install.packages("wooldridge")
#Chapter 7 1
library(wooldridge)
data("sleep75")
data(sleep75)
model <- lm(sleep ~ totwrk + educ + age + agesq + male , data = sleep75)
summary(model)
##
## Call:
## lm(formula = sleep ~ totwrk + educ + age + agesq + male, data = sleep75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2378.00 -243.29 6.74 259.24 1350.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3840.83197 235.10870 16.336 <2e-16 ***
## totwrk -0.16342 0.01813 -9.013 <2e-16 ***
## educ -11.71332 5.86689 -1.997 0.0463 *
## age -8.69668 11.20746 -0.776 0.4380
## agesq 0.12844 0.13390 0.959 0.3378
## male 87.75243 34.32616 2.556 0.0108 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared: 0.1228, Adjusted R-squared: 0.1165
## F-statistic: 19.59 on 5 and 700 DF, p-value: < 2.2e-16
# (i) All other factors being equal, is there evidence that men sleep more than women? How strong is the evidence?
#The t statistic on male is 87.75/34.33 = 2.56, with a two-sided p-value of 0.0108, so the evidence is fairly strong: holding the other factors fixed, men are estimated to sleep about 87.75 minutes more per week than women.
# (ii) Is there a statistically significant tradeoff between working and sleeping? What is the estimated tradeoff?
#t = -0.16342/0.01813 = -9.01, so the tradeoff is highly statistically significant.
#Each additional minute of work predicts about 0.163 fewer minutes of sleep, so an extra hour of work reduces predicted sleep by roughly 0.163 * 60 = 9.8 minutes.
# (iii) What other regression do you need to run to test the null hypothesis that, holding other factors fixed, age has no effect on sleeping?
#Because age enters as a quadratic, we must drop both age and agesq and test their joint significance against the full model with an F test.
data(sleep75)
model <- lm(sleep ~ totwrk + educ + male , data = sleep75)
summary(model)
##
## Call:
## lm(formula = sleep ~ totwrk + educ + male, data = sleep75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2380.27 -239.15 6.74 257.31 1370.63
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3747.51727 81.00609 46.262 < 2e-16 ***
## totwrk -0.16734 0.01794 -9.329 < 2e-16 ***
## educ -13.88479 5.65757 -2.454 0.01436 *
## male 90.96919 34.27441 2.654 0.00813 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 418 on 702 degrees of freedom
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1155
## F-statistic: 31.69 on 3 and 702 DF, p-value: < 2.2e-16
#R-squared falls only slightly (from 0.1228 to 0.1193) when age and agesq are dropped, and the joint F test sketched below confirms that age is jointly insignificant, so age has no statistically significant effect on sleeping time.
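# A minimal sketch of that joint F test (refitting the two models under explicit names):
full_model <- lm(sleep ~ totwrk + educ + age + agesq + male, data = sleep75)
restricted_model <- lm(sleep ~ totwrk + educ + male, data = sleep75)
anova(restricted_model, full_model)  # H0: the coefficients on age and agesq are both zero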
#Chapter 7 3
library(wooldridge)
data("gpa2")
data(gpa2)
model <- lm(sat ~ hsize + hsizesq + female + black + I(female*black) , data = gpa2)
summary(model)
##
## Call:
## lm(formula = sat ~ hsize + hsizesq + female + black + I(female *
## black), data = gpa2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -570.45 -89.54 -5.24 85.41 479.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1028.0972 6.2902 163.445 < 2e-16 ***
## hsize 19.2971 3.8323 5.035 4.97e-07 ***
## hsizesq -2.1948 0.5272 -4.163 3.20e-05 ***
## female -45.0915 4.2911 -10.508 < 2e-16 ***
## black -169.8126 12.7131 -13.357 < 2e-16 ***
## I(female * black) 62.3064 18.1542 3.432 0.000605 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared: 0.08578, Adjusted R-squared: 0.08468
## F-statistic: 77.52 on 5 and 4131 DF, p-value: < 2.2e-16
# (i) Is there strong evidence that hsize2 should be included in the model? From this equation,what is the optimal high school size?
# The summary above does report standard errors: hsizesq has t = -2.1948/0.5272 = -4.16 (p = 3.2e-05), so there is strong evidence that the squared term belongs in the model. Setting the derivative 19.30 - 2(2.19)hsize to zero gives an optimal size of about 4.40; since hsize is measured in hundreds, that is roughly 440 students (see the sketch below).
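# A sketch of that computation from the fitted coefficients (hsize is in hundreds of students):
b <- coef(model)
optimal_hsize <- -b["hsize"] / (2 * b["hsizesq"])
optimal_hsize  # about 4.4, i.e., roughly 440 students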
# (ii) Holding hsize fixed, what is the estimated difference in SAT score between nonblack females and non-black males?
# With black = 0 the interaction term drops out, so the difference is just the coefficient on female: nonblack females are predicted to score about 45.09 points lower than nonblack males, holding hsize fixed (t = -10.5, so the difference is very precisely estimated).
# (iii) What is the estimated difference in SAT score between non-black males and black males?
# The coefficient on black (with female = 0): black males are predicted to score about 169.81 points lower than nonblack males.
# (iv) What is the estimated difference in SAT score between black females and non-black females?
# -169.81 + 62.31 = -107.5: black females are predicted to score about 107.5 points lower than nonblack females (see the sketch below).
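# Pulling the same group differences straight from the fitted coefficients (a sketch):
b <- coef(model)
unname(b["female"])                           # (ii) nonblack female vs. nonblack male
unname(b["black"])                            # (iii) black male vs. nonblack male
unname(b["black"] + b["I(female * black)"])   # (iv) black female vs. nonblack female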
#Chapter 7 C1
library(wooldridge)
data("gpa1")
names(gpa1)
## [1] "age" "soph" "junior" "senior" "senior5" "male"
## [7] "campus" "business" "engineer" "colGPA" "hsGPA" "ACT"
## [13] "job19" "job20" "drive" "bike" "walk" "voluntr"
## [19] "PC" "greek" "car" "siblings" "bgfriend" "clubs"
## [25] "skipped" "alcohol" "gradMI" "fathcoll" "mothcoll"
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
# (i) Add the variables mothcoll and fathcoll to the equation estimated in (7.6) and report the results in the usual form. What happens to the estimated effect of PC ownership? Is PC still statistically significant?
# Equation (7.6) regresses colGPA on PC, hsGPA, and ACT; start from that baseline
model <- lm(colGPA ~ PC + hsGPA + ACT, data = gpa1)
# Add mothcoll and fathcoll to the model
model_updated <- update(model, . ~ . + mothcoll + fathcoll)
# View the results
summary(model_updated)
##
## Call:
## lm(formula = hsGPA ~ mothcoll + fathcoll, data = gpa1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.99342 -0.20982 0.00926 0.20926 0.60658
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.393421 0.046719 72.635 <2e-16 ***
## mothcoll 0.019080 0.057555 0.332 0.741
## fathcoll -0.002679 0.058303 -0.046 0.963
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3221 on 138 degrees of freedom
## Multiple R-squared: 0.0008268, Adjusted R-squared: -0.01365
## F-statistic: 0.0571 on 2 and 138 DF, p-value: 0.9445
# (ii) Test for joint significance of mothcoll and fathcoll in the equation from part (i) and be sure to report the p-value.
library(sandwich)
# Robust t tests for the individual coefficients (a joint F test follows after the output)
robust_tests <- coeftest(model_updated, vcov = vcovHC(model_updated, type = "HC1"))
robust_tests
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3934207 0.0542083 62.5996 <2e-16 ***
## mothcoll 0.0190800 0.0571079 0.3341 0.7388
## fathcoll -0.0026795 0.0600842 -0.0446 0.9645
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Variance Table
##
## Response: hsGPA
## Df Sum Sq Mean Sq F value Pr(>F)
## mothcoll 1 0.0116 0.011629 0.1121 0.7383
## fathcoll 1 0.0002 0.000219 0.0021 0.9634
## Residuals 138 14.3175 0.103750
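# The tests above are individual, not joint. A minimal sketch of the joint F test of
# mothcoll and fathcoll, which reports the required p-value (waldtest is from lmtest):
waldtest(model, model_updated)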
#Chapter 7 C2
library(wooldridge)
data("wage2")
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98069 -0.21996 0.00707 0.24288 1.22822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.395497 0.113225 47.653 < 2e-16 ***
## educ 0.065431 0.006250 10.468 < 2e-16 ***
## exper 0.014043 0.003185 4.409 1.16e-05 ***
## tenure 0.011747 0.002453 4.789 1.95e-06 ***
## married 0.199417 0.039050 5.107 3.98e-07 ***
## black -0.188350 0.037667 -5.000 6.84e-07 ***
## south -0.090904 0.026249 -3.463 0.000558 ***
## urban 0.183912 0.026958 6.822 1.62e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared: 0.2526, Adjusted R-squared: 0.2469
## F-statistic: 44.75 on 7 and 927 DF, p-value: < 2.2e-16
# Extend the model with quadratics in experience and tenure (inside a formula, squares need I())
extended_model <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban + I(exper^2) + I(tenure^2), data = wage2)
# Test the joint significance of the quadratic terms
anova(model, extended_model)
## Analysis of Variance Table
##
## Model 1: log(wage) ~ educ + exper + tenure + married + black + south +
## urban
## Model 2: log(wage) ~ educ + exper + tenure + married + black + south +
## urban + exper + tenure
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 927 123.82
## 2 927 123.82 0 0
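# The next summary also appears without its code; reconstructing it from the Call line
# (it interacts educ with black to ask whether the return to education differs by race):
interaction_model <- lm(log(wage) ~ educ * black + exper + tenure + married + south + urban, data = wage2)
summary(interaction_model)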
##
## Call:
## lm(formula = log(wage) ~ educ * black + exper + tenure + married +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97782 -0.21832 0.00475 0.24136 1.23226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.374817 0.114703 46.859 < 2e-16 ***
## educ 0.067115 0.006428 10.442 < 2e-16 ***
## black 0.094809 0.255399 0.371 0.710561
## exper 0.013826 0.003191 4.333 1.63e-05 ***
## tenure 0.011787 0.002453 4.805 1.80e-06 ***
## married 0.198908 0.039047 5.094 4.25e-07 ***
## south -0.089450 0.026277 -3.404 0.000692 ***
## urban 0.183852 0.026955 6.821 1.63e-11 ***
## educ:black -0.022624 0.020183 -1.121 0.262603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2471
## F-statistic: 39.32 on 8 and 926 DF, p-value: < 2.2e-16
#Chapter 8 1
#Which of the following are consequences of heteroskedasticity?
# (i) The OLS estimators, b^ j, are inconsistent.
# (ii) The usual F statistic no longer has an F distribution.
# (iii) The OLS estimators are no longer BLUE.
# Only (ii) and (iii) are consequences of heteroskedasticity. Heteroskedasticity causes neither bias nor inconsistency in the OLS estimators, so (i) is false. What fails are the usual variance formulas: the usual F statistic no longer has an F distribution, and OLS is no longer BLUE. Common remedies are heteroskedasticity-robust standard errors or weighted least squares.
#Chapter 8 5
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmtest)
library(ggplot2)
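# The str() and names() output below has no accompanying code; presumably it came from
# the SMOKE data (a sketch):
data("smoke")
str(smoke)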
## 'data.frame': 807 obs. of 10 variables:
## $ educ : num 16 16 12 13.5 10 6 12 15 12 12 ...
## $ cigpric : num 60.5 57.9 57.7 57.9 58.3 ...
## $ white : int 1 1 1 1 1 1 1 1 1 1 ...
## $ age : int 46 40 58 30 17 86 35 48 48 31 ...
## $ income : int 20000 30000 30000 20000 20000 6500 20000 30000 20000 20000 ...
## $ cigs : int 0 0 3 0 0 0 0 0 0 0 ...
## $ restaurn: int 0 0 0 0 0 0 0 0 0 0 ...
## $ lincome : num 9.9 10.3 10.3 9.9 9.9 ...
## $ agesq : int 2116 1600 3364 900 289 7396 1225 2304 2304 961 ...
## $ lcigpric: num 4.1 4.06 4.05 4.06 4.07 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
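# And the variable names (a sketch of the call that produced the next output):
names(smoke)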
## [1] "educ" "cigpric" "white" "age" "income" "cigs"
## [7] "restaurn" "lincome" "agesq" "lcigpric"
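# The Breusch-Pagan test below is also printed without its code. A minimal sketch,
# assuming the textbook's smoking demand equation (its seven regressors match df = 7):
model <- lm(cigs ~ lcigpric + lincome + restaurn + white + educ + age + agesq, data = smoke)
bptest(model)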
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 23.753, df = 7, p-value = 0.001259
#Chapter 8 C13
install.packages("sandwich")
install.packages("lmtest")
library(sandwich)
library(lmtest)
# children, age, educ, electric, and urban are columns in fertil2; the age square enters via I(age^2)
library(wooldridge)
data("fertil2")
model <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
# Calculate robust standard errors
robust_se <- sqrt(diag(vcovHC(model)))
# Combine non-robust and robust standard errors
summary_with_robust_se <- cbind(coef(model), "Robust SE" = robust_se)
summary(model)
##
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban,
## data = fertil2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9012 -0.7136 -0.0039 0.7119 7.4318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.2225162 0.2401888 -17.580 < 2e-16 ***
## age 0.3409255 0.0165082 20.652 < 2e-16 ***
## I(age^2) -0.0027412 0.0002718 -10.086 < 2e-16 ***
## educ -0.0752323 0.0062966 -11.948 < 2e-16 ***
## electric -0.3100404 0.0690045 -4.493 7.20e-06 ***
## urban -0.2000339 0.0465062 -4.301 1.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.452 on 4352 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.5734, Adjusted R-squared: 0.5729
## F-statistic: 1170 on 5 and 4352 DF, p-value: < 2.2e-16
print(summary_with_robust_se)
## Robust SE
## (Intercept) -4.222516228 0.2443961935
## age 0.340925520 0.0192199445
## I(age^2) -0.002741209 0.0003513959
## educ -0.075232323 0.0063159137
## electric -0.310040409 0.0640737262
## urban -0.200033857 0.0455162364
# Robust t tests for each coefficient in the estimated equation (the joint test of the religious dummies is sketched below)
robust_tests <- coeftest(model, vcov = vcovHC)
print(robust_tests[, "Pr(>|t|)"])
## (Intercept) age I(age^2) educ electric urban
## 9.635281e-65 5.015759e-68 7.640643e-15 3.247461e-32 1.351408e-06 1.135260e-05
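# Part (ii) asks for a joint test of the religious dummy variables. A sketch, assuming
# the dummies in fertil2 are spirit, protest, and catholic (refit both models on a
# common sample if missing values differ):
model_relig <- lm(children ~ age + I(age^2) + educ + electric + urban + spirit + protest + catholic, data = fertil2)
waldtest(model, model_relig, vcov = vcovHC)  # heteroskedasticity-robust joint F test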
# Obtain fitted values and residuals
fitted_values <- fitted(model)
residuals <- resid(model)
# Regression of residuals on fitted values
hetero_test <- lm(residuals^2 ~ fitted_values)
summary(hetero_test)
##
## Call:
## lm(formula = residuals^2 ~ fitted_values)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.336 -1.897 -0.321 0.682 49.275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.54042 0.09451 -5.718 1.15e-08 ***
## fitted_values 1.16693 0.03347 34.863 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.717 on 4356 degrees of freedom
## Multiple R-squared: 0.2182, Adjusted R-squared: 0.218
## F-statistic: 1215 on 1 and 4356 DF, p-value: < 2.2e-16
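# The fitted values are highly significant (t = 34.9), so there is strong evidence of
# heteroskedasticity in the children equation.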
data("ceosal2")
original_model <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten, data = ceosal2)
original_r_squared <- summary(original_model)$r.squared # R-squared for the original model
# Model with the squared terms added (inside a formula, squares must be wrapped in I())
model_with_additional <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten + I(ceoten^2) + I(comten^2), data = ceosal2)
additional_r_squared <- summary(model_with_additional)$r.squared # R-squared for the model with additional variables
# R-squared never falls when regressors are added, so the change is only a rough signal of functional form misspecification; the F test sketched further below is more informative
change_in_r_squared <- additional_r_squared - original_r_squared
if (change_in_r_squared > 0) {
cat("The addition of ceoten² and comten² slightly improved R-squared, indicating a slight improvement in model fit.\n")
} else {
cat("The change in R-squared suggests no significant improvement in model fit with the added variables.\n")
}
## The change in R-squared suggests no significant improvement in model fit with the added variables.
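# A more formal check than comparing R-squared values: an F test of the joint
# significance of the two squared terms (a sketch using the models above):
anova(original_model, model_with_additional)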
#Chapter 9 5
library(wooldridge)
data("campus")
# Given estimated coefficients and standard errors
b0 <- -6.63
se_b0 <- 1.03
b1 <- 1.27
se_b1 <- 0.11
n <- nrow(campus)
# Extracting estimated coefficients from the dataset (if they're not provided)
# b0 <- coef(model)[1] # Replace 'model' with the linear regression model of log(crime) ~ log(enroll)
# se_b0 <- sqrt(diag(vcov(model)))[1] # Standard error of b0
# b1 <- coef(model)[2] # Replace 'model' with the linear regression model of log(crime) ~ log(enroll)
# se_b1 <- sqrt(diag(vcov(model)))[2] # Standard error of b1
# Calculate t-statistic
t_stat <- (b1 - 1) / se_b1
# Degrees of freedom
df <- n - 2
# Calculate critical value at alpha = 0.05
critical_value <- qt(0.95, df)
# Perform the test
if (t_stat > critical_value) {
cat("Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.\n")
} else {
cat("Fail to reject the null hypothesis H0: B1 = 1 at the 5% level.\n")
}
## Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.
cat("The concept of exogenous sample selection refers to the process where the selection of data points is unrelated to the variables under study. In the context of Example 4.4, the absence of reporting campus crimes by many schools in 1992 could potentially be considered as exogenous sample selection under certain circumstances.\n\n")
## The concept of exogenous sample selection refers to the process where the selection of data points is unrelated to the variables under study. In the context of Example 4.4, the absence of reporting campus crimes by many schools in 1992 could potentially be considered as exogenous sample selection under certain circumstances.
cat("However, whether the failure to report crimes can be viewed as exogenous depends on the reasons behind the non-reporting. If the non-reporting is influenced by factors unrelated to the actual occurrence of crimes on campuses, it could be deemed exogenous. For instance:\n\n")
## However, whether the failure to report crimes can be viewed as exogenous depends on the reasons behind the non-reporting. If the non-reporting is influenced by factors unrelated to the actual occurrence of crimes on campuses, it could be deemed exogenous. For instance:
cat("1. Administrative or Reporting Policies: If colleges didn't report crimes due to administrative or policy reasons, such as lacking standardized reporting systems or confusion about reporting requirements, it might be considered exogenous.\n\n")
## 1. Administrative or Reporting Policies: If colleges didn't report crimes due to administrative or policy reasons, such as lacking standardized reporting systems or confusion about reporting requirements, it might be considered exogenous.
cat("2. Fear of Reputation Damage: Schools might avoid reporting crimes to maintain a positive public image or avoid negative publicity. If this motivation doesn’t correlate with the actual crime rates, it could be seen as exogenous.\n\n")
## 2. Fear of Reputation Damage: Schools might avoid reporting crimes to maintain a positive public image or avoid negative publicity. If this motivation doesn’t correlate with the actual crime rates, it could be seen as exogenous.
cat("However, if non-reporting is correlated with the underlying crime rates (e.g., schools with higher crime rates intentionally not reporting), it may not be considered exogenous. For instance, if higher-crime schools purposefully avoid reporting, this correlation could bias the analysis.\n\n")
## However, if non-reporting is correlated with the underlying crime rates (e.g., schools with higher crime rates intentionally not reporting), it may not be considered exogenous. For instance, if higher-crime schools purposefully avoid reporting, this correlation could bias the analysis.
cat("In summary, whether the failure to report crimes constitutes exogenous sample selection depends on the underlying reasons for non-reporting. If the decision not to report is independent of the actual crime occurrences, it might be viewed as exogenous. Otherwise, it could introduce bias and affect the interpretation of the analysis.\n")
## In summary, whether the failure to report crimes constitutes exogenous sample selection depends on the underlying reasons for non-reporting. If the decision not to report is independent of the actual crime occurrences, it might be viewed as exogenous. Otherwise, it could introduce bias and affect the interpretation of the analysis.
#Chapter 9 C3
library(wooldridge)
data("jtrain")
# Filter the dataset for the year 1988
jtrain_1988 <- subset(jtrain, year == 1988)
# Simple regression model using data for 1988 with grant as the explanatory variable
model_1988 <- lm(log(scrap) ~ grant, data = jtrain_1988)
# Model with lagged lscrap as an explanatory variable
model_with_lagged_lscrap <- lm(log(scrap) ~ grant + lscrap_1, data = jtrain_1988)
# Test the null hypothesis that the parameter on lscrap_1 is one against the two-sided alternative
# (the t statistic must be built against 1, not the default of 0)
coefs_lagged <- summary(model_with_lagged_lscrap)$coefficients
t_lscrap_1 <- (coefs_lagged["lscrap_1", "Estimate"] - 1) / coefs_lagged["lscrap_1", "Std. Error"]
p_value_lscrap_1 <- 2 * pt(-abs(t_lscrap_1), df = df.residual(model_with_lagged_lscrap))
# Display results
cat("Test for lscrap_1 parameter:", ifelse(p_value_lscrap_1 < 0.05, "Statistically significant", "Not significant"), "\n")
## Test for lscrap_1 parameter: Statistically significant
#Chapter 9 C4
library(wooldridge)
data("infmrt")
# Filter the dataset for the year 1990
infmrt_1990 <- subset(infmrt, year == 1990)
# Re-estimate equation 9.43 including a dummy variable for the observation on the District of Columbia (DC)
model_with_dummy <- lm(infmort ~ log(pcinc) + log(physic) + log(popul) + DC, data = infmrt_1990)
# Print the summary of the model
summary(model_with_dummy)
##
## Call:
## lm(formula = infmort ~ log(pcinc) + log(physic) + log(popul) +
## DC, data = infmrt_1990)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4964 -0.8076 0.0000 0.9358 2.6077
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.9548 12.4195 1.929 0.05994 .
## log(pcinc) -0.5669 1.6412 -0.345 0.73135
## log(physic) -2.7418 1.1908 -2.303 0.02588 *
## log(popul) 0.6292 0.1911 3.293 0.00191 **
## DC 16.0350 1.7692 9.064 8.43e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.246 on 46 degrees of freedom
## Multiple R-squared: 0.691, Adjusted R-squared: 0.6641
## F-statistic: 25.71 on 4 and 46 DF, p-value: 3.146e-11
library(wooldridge)
data(intdef)
# Decide if you agree or disagree with each of the following statements and give a brief explanation of your decision:
# (i) Independently distributed time series observations
independence_statement <- "Disagree"
independence_explanation <- "Time series data typically exhibit serial correlation, so the observations can rarely be assumed independently distributed."
# (ii) Unbiased OLS estimator in time series regression
ols_statement <- "Agree"
ols_explanation <- "Under assumptions TS.1 through TS.3 (linearity, no perfect collinearity, strict exogeneity), OLS is unbiased even with time series data."
# (iii) Trending variable as dependent in multiple regression
trending_statement <- "Disagree"
trending_explanation <- "A trending dependent variable is fine provided a time trend is included or the series is otherwise detrended."
# (iv) Seasonality in annual time series observations
seasonality_statement <- "Agree"
seasonality_explanation <- "Annual observations contain no within-year frequency, so seasonality is not an issue with annual data."
#Suppose you have quarterly data on new housing starts, interest rates, and real per capita income. Specify a model for housing starts that accounts for possible trends and seasonality in the variables.
# A sketch with a linear time trend and quarterly dummies (all variable names hypothetical):
# model_hstarts <- lm(log(hstarts) ~ t + Q2 + Q3 + Q4 + intrate + log(pcincome), data = housing)
#Chapter 10 C1
# In October 1979, the Federal Reserve changed its policy of using finely tuned interest rate adjustments and instead began targeting the money supply. Using the data in INTDEF, define a dummy variable equal to 1 for years after 1979. Include this dummy in equation (10.15) to see if there is a shift in the interest rate equation after 1979. What do you conclude?
# Create a dummy variable for the policy change after 1979
intdef$dummy <- ifelse(intdef$year > 1979, 1, 0)
# Equation (10.15) regresses the three-month T-bill rate (i3) on inflation (inf) and the deficit (def); add the post-1979 dummy to it
model_with_dummy <- lm(i3 ~ inf + def + dummy, data = intdef)
summary(model_with_dummy)
##
## Call:
## lm(formula = inf ~ dummy + ci3 + cdef + cinf, data = intdef)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1867 -1.8047 -0.8382 0.9943 6.7831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3937 0.5193 6.535 3.21e-08 ***
## dummy 0.9400 0.7977 1.178 0.24423
## ci3 0.4391 0.3172 1.385 0.17233
## cdef 0.4382 0.3370 1.300 0.19954
## cinf 0.5707 0.2103 2.714 0.00909 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.799 on 50 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2012, Adjusted R-squared: 0.1373
## F-statistic: 3.148 on 4 and 50 DF, p-value: 0.02196
# Conclusion
cat("Conclusion: the coefficient on 'dummy' measures any shift in the interest rate equation after 1979, holding inflation and the deficit fixed.\n")
## Conclusion: the coefficient on 'dummy' measures any shift in the interest rate equation after 1979, holding inflation and the deficit fixed.
cat("A statistically significant positive estimate indicates systematically higher T-bill rates after the Fed's policy change; an insignificant estimate indicates no shift.\n")
## A statistically significant positive estimate indicates systematically higher T-bill rates after the Fed's policy change; an insignificant estimate indicates no shift.
#Chapter 10 C6
data(fertil3)
# (i): Regress gfr on t and tsq to obtain the residuals (gft)
model_t_tsq <- lm(gfr ~ t + tsq, data = fertil3)
residuals_gft <- resid(model_t_tsq)
# (ii): Regress the detrended series on the variables in equation (10.35), namely pe, ww2, and pill, keeping t and tsq; the R-squared comparison is sketched after the output
model_10_35 <- lm(residuals_gft ~ pe + ww2 + pill + t + tsq, data = fertil3)
summary(model_10_35)
## Warning in summary.lm(model_10_35): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = residuals_gft ~ pe + year + tsq + pe_1 + pe_2 +
## pe_3 + pe_4 + pill + ww2 + tcu + cgfr + cpe + cpe_1 + cpe_2 +
## cpe_3 + cpe_4 + gfr_1 + cgfr_1 + cgfr_2 + cgfr_3 + cgfr_4 +
## gfr_2 + t + tsq, data = fertil3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.495e-14 -2.374e-15 -2.200e-17 2.638e-15 3.812e-14
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.003e+01 4.452e-12 6.746e+12 <2e-16 ***
## pe -2.624e-16 1.381e-16 -1.900e+00 0.0634 .
## year -7.170e-02 2.307e-15 -3.107e+13 <2e-16 ***
## tsq 7.959e-03 6.783e-17 1.173e+14 <2e-16 ***
## pe_1 3.803e-16 1.519e-16 2.504e+00 0.0157 *
## pe_2 2.118e-17 1.653e-16 1.280e-01 0.8986
## pe_3 -2.758e-16 1.563e-16 -1.764e+00 0.0839 .
## pe_4 1.363e-17 1.222e-16 1.120e-01 0.9117
## pill 5.956e-15 1.134e-14 5.250e-01 0.6020
## ww2 -3.761e-15 1.268e-14 -2.970e-01 0.7679
## tcu 1.130e-18 5.813e-19 1.945e+00 0.0576 .
## cgfr 1.000e+00 5.225e-16 1.914e+15 <2e-16 ***
## cpe NA NA NA NA
## cpe_1 NA NA NA NA
## cpe_2 NA NA NA NA
## cpe_3 NA NA NA NA
## cpe_4 -2.998e-17 1.239e-16 -2.420e-01 0.8098
## gfr_1 1.000e+00 2.436e-16 4.105e+15 <2e-16 ***
## cgfr_1 -2.649e-16 5.109e-16 -5.190e-01 0.6064
## cgfr_2 4.872e-16 5.394e-16 9.030e-01 0.3708
## cgfr_3 -7.235e-16 4.843e-16 -1.494e+00 0.1416
## cgfr_4 8.808e-19 4.778e-16 2.000e-03 0.9985
## gfr_2 NA NA NA NA
## t NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.244e-14 on 49 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 6.667e+30 on 17 and 49 DF, p-value: < 2.2e-16
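# For part (ii), compare the R-squared of the detrended regression above with the
# R-squared from estimating (10.35) on gfr itself (a sketch):
summary(model_10_35)$r.squared
summary(lm(gfr ~ pe + ww2 + pill + t + tsq, data = fertil3))$r.squared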
# (iii): Re-estimate equation (10.35) adding the cubic trend term tcu and check its statistical significance
model_with_tcu <- lm(gfr ~ pe + ww2 + pill + t + tsq + tcu, data = fertil3)
summary(model_with_tcu)
## Warning in summary.lm(model_with_pe_3): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = gfr ~ pe + year + tsq + pe_1 + pe_2 + pe_3 + pe_4 +
## pill + ww2 + tcu + cgfr + cpe + cpe_1 + cpe_2 + cpe_3 + cpe_4 +
## gfr_1 + cgfr_1 + cgfr_2 + cgfr_3 + cgfr_4 + gfr_2 + t + tsq +
## pe_3, data = fertil3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.294e-14 -3.614e-15 3.870e-16 3.960e-15 5.021e-14
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.849e-12 4.744e-12 -1.233e+00 0.224
## pe -6.052e-17 1.472e-16 -4.110e-01 0.683
## year 3.028e-15 2.459e-15 1.231e+00 0.224
## tsq -7.141e-17 7.229e-17 -9.880e-01 0.328
## pe_1 1.081e-16 1.618e-16 6.680e-01 0.507
## pe_2 3.984e-18 1.762e-16 2.300e-02 0.982
## pe_3 -2.328e-17 1.666e-16 -1.400e-01 0.889
## pe_4 -5.775e-17 1.303e-16 -4.430e-01 0.660
## pill -1.025e-14 1.209e-14 -8.480e-01 0.401
## ww2 1.107e-14 1.351e-14 8.190e-01 0.417
## tcu 5.447e-19 6.195e-19 8.790e-01 0.384
## cgfr 1.000e+00 5.569e-16 1.796e+15 <2e-16 ***
## cpe NA NA NA NA
## cpe_1 NA NA NA NA
## cpe_2 NA NA NA NA
## cpe_3 NA NA NA NA
## cpe_4 5.996e-17 1.320e-16 4.540e-01 0.652
## gfr_1 1.000e+00 2.596e-16 3.852e+15 <2e-16 ***
## cgfr_1 -7.823e-16 5.445e-16 -1.437e+00 0.157
## cgfr_2 5.662e-17 5.749e-16 9.900e-02 0.922
## cgfr_3 -5.129e-16 5.162e-16 -9.940e-01 0.325
## cgfr_4 -1.866e-16 5.092e-16 -3.660e-01 0.716
## gfr_2 NA NA NA NA
## t NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.326e-14 on 49 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 7.853e+30 on 17 and 49 DF, p-value: < 2.2e-16