install.packages("wooldridge")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(wooldridge)
data("countymurders")
data("countymurders")
countymurders_1996 <- subset(countymurders, year == 1996)
#how many counties had zero murders in 1996?
count_zero_murders <- sum(countymurders_1996$murders == 0)
print(count_zero_murders)
## [1] 1051
#How many counties had at least one execution?
count_at_least_one_execution <- sum(countymurders_1996$execs >= 1)
print(count_at_least_one_execution)
## [1] 31
#What is the largest number execution?
largest_execution_count <- max(countymurders_1996$execs)
print(largest_execution_count)
## [1] 3
data("countymurders")
countymurders_1996 <- subset(countymurders, year == 1996)
data("countymurders_1996")
## Warning in data("countymurders_1996"): data set 'countymurders_1996' not found
model <- lm(murders ~ execs , data = countymurders)
summary(model)
##
## Call:
## lm(formula = murders ~ execs, data = countymurders)
##
## Residuals:
## Min 1Q Median 3Q Max
## -202.23 -6.84 -5.84 -3.84 1937.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.8382 0.2418 28.28 <2e-16 ***
## execs 65.4650 2.1463 30.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.64 on 37347 degrees of freedom
## Multiple R-squared: 0.02431, Adjusted R-squared: 0.02428
## F-statistic: 930.4 on 1 and 37347 DF, p-value: < 2.2e-16
min_executions <- min(countymurders_1996$execs)
min_predicted_murders <- coef(model)[1] + coef(model)[2] * min_executions
cat("Smallest number of murders predicted:", min_predicted_murders, "\n")
## Smallest number of murders predicted: 6.838201
# What is the residual for a county with zero executions and zero murders?
zero_exec_zero_murders_residual <- predict(model, newdata = data.frame(execs = 0), type = "response")
cat("Residual for a county with zero executions and zero murders:", zero_exec_zero_murders_residual, "\n")
## Residual for a county with zero executions and zero murders: 6.838201
#Chapter 3
library(wooldridge)
data("discrim")
mean_prpblck <- mean(discrim$prpblck)
sd_prpblck <- sd(discrim$prpblck)
mean_income <- mean(discrim$income)
sd_income <- sd(discrim$income)
cat("Average prpblck:", mean_prpblck, "\n")
## Average prpblck: NA
cat("Standard deviation of prpblck:", sd_prpblck, "\n")
## Standard deviation of prpblck: NA
cat("Average income:", mean_income, "\n")
## Average income: NA
cat("Standard deviation of income:", sd_income, "\n\n")
## Standard deviation of income: NA
cat("Units of measurement: prpblck is the proportion of the population that is black (in percentage),\n")
## Units of measurement: prpblck is the proportion of the population that is black (in percentage),
cat("and income is the median income in the zip code.\n\n")
## and income is the median income in the zip code.
model <- lm(psoda ~ prpblck + income, data = discrim)
summary(model)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
simple_model <- lm(psoda ~ prpblck, data = discrim)
summary(simple_model)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
log_model <- lm(log(psoda) ~ prpblck + log(income), data = discrim)
summary(log_model)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33563 -0.04695 0.00658 0.04334 0.35413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.79377 0.17943 -4.424 1.25e-05 ***
## prpblck 0.12158 0.02575 4.722 3.24e-06 ***
## log(income) 0.07651 0.01660 4.610 5.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0821 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06809, Adjusted R-squared: 0.06341
## F-statistic: 14.54 on 2 and 398 DF, p-value: 8.039e-07
percentage_change <- exp(coef(log_model)[2] * 0.20) * 100 - 100
cat("Estimated percentage change in psoda for a 20% increase in prpblck:", percentage_change, "\n\n")
## Estimated percentage change in psoda for a 20% increase in prpblck: 2.46141
model_with_prppov <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
summary(model_with_prppov)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08137 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08696, Adjusted R-squared: 0.08006
## F-statistic: 12.6 on 3 and 397 DF, p-value: 6.917e-08
correlation_log_income_prppov <- cor(discrim$lincome, discrim$prppov)
cat("Correlation between lincome and prppov:", correlation_log_income_prppov, "\n\n")
## Correlation between lincome and prppov: NA
cat("The statement 'Because lincome and prppov are so highly correlated, they have no business being in the same regression.'\n")
## The statement 'Because lincome and prppov are so highly correlated, they have no business being in the same regression.'
cat("The high negative correlation between lincome and prppov suggests multicollinearity between these two variables.\n")
## The high negative correlation between lincome and prppov suggests multicollinearity between these two variables.
cat("Multicollinearity can lead to unstable coefficient estimates, making it challenging to interpret the individual effects\n")
## Multicollinearity can lead to unstable coefficient estimates, making it challenging to interpret the individual effects
cat("of the variables. However, the decision to include or exclude variables should be based on the specific research question,\n")
## of the variables. However, the decision to include or exclude variables should be based on the specific research question,
cat("theoretical considerations, and the goals of the analysis.\n")
## theoretical considerations, and the goals of the analysis.
cat("In some cases, including both variables in the regression model might still be justified if they capture different aspects\n")
## In some cases, including both variables in the regression model might still be justified if they capture different aspects
cat("of the relationship with the dependent variable and contribute to a more comprehensive understanding of the phenomenon under study.\n")
## of the relationship with the dependent variable and contribute to a more comprehensive understanding of the phenomenon under study.
#Chapter 4
coeff_log_sales <- 0.321
se_log_sales <- 0.216
coeff_profmarg <- 0.50
se_profmarg <- 0.46
percentage_change <- coeff_log_sales * 10
t_stat_log_sales <- coeff_log_sales / se_log_sales
p_value_log_sales <- 2 * (1 - pt(abs(t_stat_log_sales), df = 29)) # two-tailed test
cat("b) p-value for the test on log(sales) coefficient:", p_value_log_sales, "\n")
## b) p-value for the test on log(sales) coefficient: 0.1480413
cat(" (At 5% level):", ifelse(p_value_log_sales < 0.05, "Reject H0", "Fail to reject H0"), "\n")
## (At 5% level): Fail to reject H0
cat(" (At 10% level):", ifelse(p_value_log_sales < 0.10, "Reject H0", "Fail to reject H0"), "\n")
## (At 10% level): Fail to reject H0
cat("c) Coefficient on profmarg:", coeff_profmarg, "\n")
## c) Coefficient on profmarg: 0.5
t_stat_profmarg <- coeff_profmarg / se_profmarg
p_value_profmarg <- 2 * (1 - pt(abs(t_stat_profmarg), df = 29)) # two-tailed test
cat("d) p-value for the test on profmarg coefficient:", p_value_profmarg, "\n")
## d) p-value for the test on profmarg coefficient: 0.2860082
cat(" (At 5% level):", ifelse(p_value_profmarg < 0.05, "Reject H0", "Fail to reject H0"), "\n")
## (At 5% level): Fail to reject H0
cat(" (At 10% level):", ifelse(p_value_profmarg < 0.10, "Reject H0", "Fail to reject H0"), "\n")
## (At 10% level): Fail to reject H0
#C8
library(wooldridge)
data("k401ksubs")
single_person_households <- subset(k401ksubs, fsize == 1)
num_single_person_households <- nrow(single_person_households)
cat("Number of single-person households:", num_single_person_households, "\n\n")
## Number of single-person households: 2017
model <- lm(nettfa ~ inc + age, data = single_person_households)
summary(model)
##
## Call:
## lm(formula = nettfa ~ inc + age, data = single_person_households)
##
## Residuals:
## Min 1Q Median 3Q Max
## -179.95 -14.16 -3.42 6.03 1113.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.03981 4.08039 -10.548 <2e-16 ***
## inc 0.79932 0.05973 13.382 <2e-16 ***
## age 0.84266 0.09202 9.158 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1185
## F-statistic: 136.5 on 2 and 2014 DF, p-value: < 2.2e-16
cat("Interpretation of slope coefficients:\n")
## Interpretation of slope coefficients:
cat("B1 (inc): The estimated change in nettfa for a one-unit change in inc (annual family income).\n")
## B1 (inc): The estimated change in nettfa for a one-unit change in inc (annual family income).
cat("B2 (age): The estimated change in nettfa for a one-unit change in age.\n")
## B2 (age): The estimated change in nettfa for a one-unit change in age.
cat("There might be surprises depending on the context and expectations of the relationship between variables.\n\n")
## There might be surprises depending on the context and expectations of the relationship between variables.
cat("The intercept (B0) represents the estimated net financial wealth (nettfa) when both inc and age are zero.\n")
## The intercept (B0) represents the estimated net financial wealth (nettfa) when both inc and age are zero.
cat("In this context, it may not have a meaningful interpretation, as having zero income and age is not practically meaningful.\n\n")
## In this context, it may not have a meaningful interpretation, as having zero income and age is not practically meaningful.
test_result <- summary(model)$coefficient[3, "Pr(>|t|)"]
cat("p-value for the test H0: B₂ = 1 against H₁: B₂ < 1:", test_result, "\n")
## p-value for the test H0: B₂ = 1 against H₁: B₂ < 1: 1.265959e-19
cat("At the 1% significance level, we would reject H0 if the p-value is less than 0.01.\n")
## At the 1% significance level, we would reject H0 if the p-value is less than 0.01.
if (test_result < 0.01) {cat("We reject H0; there is evidence that B₂ is less than 1.\n\n")
} else {
cat("We do not reject H0; there is not enough evidence to conclude that B₂ is less than 1.\n\n")
}
## We reject H0; there is evidence that B₂ is less than 1.
simple_model <- lm(nettfa ~ inc, data = single_person_households)
summary(simple_model)
##
## Call:
## lm(formula = nettfa ~ inc, data = single_person_households)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185.12 -12.85 -4.85 1.78 1112.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.5709 2.0607 -5.13 3.18e-07 ***
## inc 0.8207 0.0609 13.48 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.59 on 2015 degrees of freedom
## Multiple R-squared: 0.08267, Adjusted R-squared: 0.08222
## F-statistic: 181.6 on 1 and 2015 DF, p-value: < 2.2e-16
cat("Comparison of the estimated coefficient on inc:\n")
## Comparison of the estimated coefficient on inc:
cat("The estimated coefficient on inc in the simple regression is compared to the estimate in part (ii).\n")
## The estimated coefficient on inc in the simple regression is compared to the estimate in part (ii).
cat("Differences may arise due to the inclusion of age in the multiple regression model, which may affect\n")
## Differences may arise due to the inclusion of age in the multiple regression model, which may affect
cat("the relationship between nettfa and inc. The context and goals of the analysis will determine\n")
## the relationship between nettfa and inc. The context and goals of the analysis will determine
cat("whether the inclusion of age improves the model.\n")
## whether the inclusion of age improves the model.
# Chapter 5
library(wooldridge)
data('econmath')
library(ggplot2)
library(stats)
data <- econmath$score
hist_data <- hist(data, breaks = 30, plot = FALSE)
mu <- mean(data)
sigma <- sd(data)
x <- seq(min(data), max(data), length = 100)
y <- dnorm(x, mean = mu, sd = sigma)
hist_plot <- ggplot() +
geom_histogram(aes(x = data, y = ..density..), bins = 30, fill = "lightyellow", color = "black") +
geom_line(aes(x = x, y = y), color = "purple", size = 1) +
labs(title = "Histogram and Fitted Normal Distribution",
x = "Score",
y = "Density") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
hist_plot <- ggplot() +
geom_histogram(aes(x = data, y = ..density..), bins = 30, fill = "lightyellow", color = "green") +
geom_line(aes(x = x, y = y), color = "orange", size = 1) +
labs(title = "Histogram and Fitted Normal Distribution",
x = "Score",
y = "Density") +
theme_minimal()
print(hist_plot)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

prob_exceed_100 <- 1 - pnorm(100, mean = mu, sd = sigma)
cat("i. Probability that 'score' exceeds 100 using the normal distribution:", prob_exceed_100, "\n")
## i. Probability that 'score' exceeds 100 using the normal distribution: 0.02044288
qqnorm(data)
qqline(data)

shapiro.test(data)
##
## Shapiro-Wilk normality test
##
## data: data
## W = 0.96973, p-value = 2.454e-12
#C1
data("wage1")
wage_data <- wage1
model_level <- lm(wage ~ educ + exper + tenure, data = wage_data)
residuals_level <- residuals(model_level)
summary(model_level)
##
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6068 -1.7747 -0.6279 1.1969 14.6536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.87273 0.72896 -3.941 9.22e-05 ***
## educ 0.59897 0.05128 11.679 < 2e-16 ***
## exper 0.02234 0.01206 1.853 0.0645 .
## tenure 0.16927 0.02164 7.820 2.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared: 0.3064, Adjusted R-squared: 0.3024
## F-statistic: 76.87 on 3 and 522 DF, p-value: < 2.2e-16
model_log_level <- lm(log(wage) ~ educ + exper + tenure, data = wage_data)
residuals_log_level <- residuals(model_log_level)
# Plot histogram of residuals for the log-level model
hist(residuals_log_level, main = "Histogram of Residuals (Log-Level Model)", col = "green", border = "orange")

summary(model_log_level)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05802 -0.29645 -0.03265 0.28788 1.42809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.284360 0.104190 2.729 0.00656 **
## educ 0.092029 0.007330 12.555 < 2e-16 ***
## exper 0.004121 0.001723 2.391 0.01714 *
## tenure 0.022067 0.003094 7.133 3.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared: 0.316, Adjusted R-squared: 0.3121
## F-statistic: 80.39 on 3 and 522 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2)) # Set up a 2x2 grid for Q-Q plots
qqnorm(residuals_level, main = "Q-Q Plot - Level-Level Model")
qqline(residuals_level)
qqnorm(residuals_log_level, main = "Q-Q Plot - Log-Level Model")
qqline(residuals_log_level)

par(mfrow = c(1, 1))
cat("\nSummary Statistics - Level-Level Model:\n")
##
## Summary Statistics - Level-Level Model:
summary(model_level)
##
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6068 -1.7747 -0.6279 1.1969 14.6536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.87273 0.72896 -3.941 9.22e-05 ***
## educ 0.59897 0.05128 11.679 < 2e-16 ***
## exper 0.02234 0.01206 1.853 0.0645 .
## tenure 0.16927 0.02164 7.820 2.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared: 0.3064, Adjusted R-squared: 0.3024
## F-statistic: 76.87 on 3 and 522 DF, p-value: < 2.2e-16
cat("\nSummary Statistics - Log-Level Model:\n")
##
## Summary Statistics - Log-Level Model:
summary(model_log_level)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05802 -0.29645 -0.03265 0.28788 1.42809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.284360 0.104190 2.729 0.00656 **
## educ 0.092029 0.007330 12.555 < 2e-16 ***
## exper 0.004121 0.001723 2.391 0.01714 *
## tenure 0.022067 0.003094 7.133 3.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared: 0.316, Adjusted R-squared: 0.3121
## F-statistic: 80.39 on 3 and 522 DF, p-value: < 2.2e-16
#Chapter 6
library(wooldridge)
data("rdchem")
sales_negative_marginal_effect <- -coef(model)[2] / (2 * coef(model)[3])
cat("The marginal effect of sales on rdintens becomes negative when sales is greater than", sales_negative_marginal_effect, "\n")
## The marginal effect of sales on rdintens becomes negative when sales is greater than -0.4742839
cat("The decision to keep the quadratic term depends on the significance of the coefficient and the context of the analysis.\n")
## The decision to keep the quadratic term depends on the significance of the coefficient and the context of the analysis.
cat("If the quadratic term is statistically significant and improves the model fit, it may be kept.\n\n")
## If the quadratic term is statistically significant and improves the model fit, it may be kept.
rdchem$salesbil <- rdchem$sales / 1000
model_salesbil <- lm(rdintens ~ salesbil + I(salesbil^2), data = rdchem)
summary(model_salesbil)
##
## Call:
## lm(formula = rdintens ~ salesbil + I(salesbil^2), data = rdchem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1418 -1.3630 -0.2257 1.0688 5.5808
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.612512 0.429442 6.084 1.27e-06 ***
## salesbil 0.300571 0.139295 2.158 0.0394 *
## I(salesbil^2) -0.006946 0.003726 -1.864 0.0725 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.788 on 29 degrees of freedom
## Multiple R-squared: 0.1484, Adjusted R-squared: 0.08969
## F-statistic: 2.527 on 2 and 29 DF, p-value: 0.09733
model1 <- lm(rdintens ~ sales + I(sales^2), data = rdchem)
# Second Model
model2 <- lm(rdintens ~ salesbil + I(salesbil^2), data = rdchem)
# Compare Adjusted R-squared
adjusted_r_squared_model1 <- summary(model1)$adj.r.squared
adjusted_r_squared_model2 <- summary(model2)$adj.r.squared
# Compare Coefficients
coefficients_model1 <- coef(model1)
coefficients_model2 <- coef(model2)
# Compare Residuals (optional, for additional diagnostics)
residuals_model1 <- residuals(model1)
residuals_model2 <- residuals(model2)
if (adjusted_r_squared_model1 == adjusted_r_squared_model2) {
preference <- "Both models have the same adjusted R-squared."
} else if (adjusted_r_squared_model1 > adjusted_r_squared_model2) {
preference <- "Model 1 is preferred based on adjusted R-squared."
} else {preference <- "Model 2 is preferred based on adjusted R-squared."
}
# Report Coefficients (you can customize this based on your needs)
report_model1 <- coef(model1)
report_model2 <- coef(model2)
# Print results
cat("Adjusted R-squared - Model 1:", adjusted_r_squared_model1, "\n")
## Adjusted R-squared - Model 1: 0.08969224
cat("Adjusted R-squared - Model 2:", adjusted_r_squared_model2, "\n")
## Adjusted R-squared - Model 2: 0.08969224
cat("Preference:", preference, "\n")
## Preference: Both models have the same adjusted R-squared.
cat("\nCoefficients - Model 1:\n")
##
## Coefficients - Model 1:
print(report_model1)
## (Intercept) sales I(sales^2)
## 2.612512e+00 3.005713e-04 -6.945939e-09
cat("\nCoefficients - Model 2:\n")
##
## Coefficients - Model 2:
print(report_model2)
## (Intercept) salesbil I(salesbil^2)
## 2.612512085 0.300571301 -0.006945939
#10
library(wooldridge)
data("meapsingle")
cat("The first equation is more relevant because it provides a direct estimate of the effect of per-student spending on math test performance.\n")
## The first equation is more relevant because it provides a direct estimate of the effect of per-student spending on math test performance.
cat("In the first equation, the coefficient on lexppp represents the estimated effect of a one-unit increase in log expenditures per student.\n")
## In the first equation, the coefficient on lexppp represents the estimated effect of a one-unit increase in log expenditures per student.
cat("To get the estimated effect of a 10% increase, you can multiply the coefficient by 0.1.\n")
## To get the estimated effect of a 10% increase, you can multiply the coefficient by 0.1.
model1 <- lm(math4 ~ lexppp + free + lmedinc + pctsgle, data = meapsingle)
summary(model1)
##
## Call:
## lm(formula = math4 ~ lexppp + free + lmedinc + pctsgle, data = meapsingle)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.259 -7.422 1.615 7.274 49.524
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.48949 59.23781 0.413 0.6797
## lexppp 9.00648 4.03530 2.232 0.0266 *
## free -0.42164 0.07064 -5.969 9.27e-09 ***
## lmedinc -0.75221 5.35816 -0.140 0.8885
## pctsgle -0.27444 0.16086 -1.706 0.0894 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.59 on 224 degrees of freedom
## Multiple R-squared: 0.4716, Adjusted R-squared: 0.4622
## F-statistic: 49.98 on 4 and 224 DF, p-value: < 2.2e-16
model2 <- lm(math4 ~ lexppp + free + lmedinc + pctsgle + read4, data = meapsingle)
summary(model2)
##
## Call:
## lm(formula = math4 ~ lexppp + free + lmedinc + pctsgle + read4,
## data = meapsingle)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.5690 -4.6729 -0.0349 4.3644 24.8425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 149.37870 41.70293 3.582 0.000419 ***
## lexppp 1.93215 2.82480 0.684 0.494688
## free -0.06004 0.05399 -1.112 0.267297
## lmedinc -10.77595 3.75746 -2.868 0.004529 **
## pctsgle -0.39663 0.11143 -3.559 0.000454 ***
## read4 0.66656 0.04249 15.687 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.012 on 223 degrees of freedom
## Multiple R-squared: 0.7488, Adjusted R-squared: 0.7432
## F-statistic: 132.9 on 5 and 223 DF, p-value: < 2.2e-16
cat("Comparison of Equations:\n")
## Comparison of Equations:
cat("1. Equation without read4:\n")
## 1. Equation without read4:
summary(model1)
##
## Call:
## lm(formula = math4 ~ lexppp + free + lmedinc + pctsgle, data = meapsingle)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.259 -7.422 1.615 7.274 49.524
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.48949 59.23781 0.413 0.6797
## lexppp 9.00648 4.03530 2.232 0.0266 *
## free -0.42164 0.07064 -5.969 9.27e-09 ***
## lmedinc -0.75221 5.35816 -0.140 0.8885
## pctsgle -0.27444 0.16086 -1.706 0.0894 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.59 on 224 degrees of freedom
## Multiple R-squared: 0.4716, Adjusted R-squared: 0.4622
## F-statistic: 49.98 on 4 and 224 DF, p-value: < 2.2e-16
cat("\n2. Equation with read4:\n")
##
## 2. Equation with read4:
summary(model2)
##
## Call:
## lm(formula = math4 ~ lexppp + free + lmedinc + pctsgle + read4,
## data = meapsingle)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.5690 -4.6729 -0.0349 4.3644 24.8425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 149.37870 41.70293 3.582 0.000419 ***
## lexppp 1.93215 2.82480 0.684 0.494688
## free -0.06004 0.05399 -1.112 0.267297
## lmedinc -10.77595 3.75746 -2.868 0.004529 **
## pctsgle -0.39663 0.11143 -3.559 0.000454 ***
## read4 0.66656 0.04249 15.687 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.012 on 223 degrees of freedom
## Multiple R-squared: 0.7488, Adjusted R-squared: 0.7432
## F-statistic: 132.9 on 5 and 223 DF, p-value: < 2.2e-16
# C3
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data("wage2")
# Fit the regression model
model <- lm(log(wage) ~ educ + exper + educ * exper, data = wage2)
#i. Show that the return to another year of education, holding exper fixed, is b1 + b3 * exper
coefs <- coef(model)
return_to_education <- coefs["educ"] + coefs["educ:exper"]
cat("Return to another year of education (holding exper fixed):", return_to_education, "\n")
## Return to another year of education (holding exper fixed): 0.04725277
summary(model)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + educ * exper, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.88558 -0.24553 0.03558 0.26171 1.28836
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.949455 0.240826 24.704 <2e-16 ***
## educ 0.044050 0.017391 2.533 0.0115 *
## exper -0.021496 0.019978 -1.076 0.2822
## educ:exper 0.003203 0.001529 2.095 0.0365 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3923 on 931 degrees of freedom
## Multiple R-squared: 0.1349, Adjusted R-squared: 0.1321
## F-statistic: 48.41 on 3 and 931 DF, p-value: < 2.2e-16
#iv. Return to Education When exper = 10
# Extract coefficients
B_educ <- coef(model)["educ"]
B_exper <- coef(model)["exper"]
# Calculate the return to education when exper = 10
theta <- B_educ + 10 * B_exper
# Get standard error for the estimate
se_theta <- summary(model)$coefficients["exper", "Std. Error"]
# Calculate the 95% confidence interval
confidence_interval <- c(theta - 1.96 * se_theta, theta + 1.96 * se_theta)
cat("Theta:", theta, "\n")
## Theta: -0.1709096
cat("95% Confidence Interval:", confidence_interval, "\n")
## 95% Confidence Interval: -0.210067 -0.1317521
#C12
install.packages("wooldridge")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(wooldridge)
data("k401ksubs")
subsetted_data <- k401ksubs[k401ksubs$fsize == 1, ]
youngest_age <- min(k401ksubs$age)
count_youngest_age <- sum(k401ksubs$age == youngest_age)
cat("Youngest age:", youngest_age, "\n")
## Youngest age: 25
cat("Number of people at the youngest age:", count_youngest_age, "\n")
## Number of people at the youngest age: 211
model <- lm(nettfa ~ inc + age + agesq , data = k401ksubs)
summary(model)
##
## Call:
## lm(formula = nettfa ~ inc + age + agesq, data = k401ksubs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -504.93 -18.61 -3.08 9.96 1464.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.680388 10.080986 0.464 0.642
## inc 0.978252 0.025489 38.379 < 2e-16 ***
## age -2.231489 0.489712 -4.557 5.26e-06 ***
## agesq 0.037722 0.005621 6.710 2.05e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 58.18 on 9271 degrees of freedom
## Multiple R-squared: 0.1731, Adjusted R-squared: 0.1728
## F-statistic: 646.8 on 3 and 9271 DF, p-value: < 2.2e-16