## Chapter 2
## C9
library(wooldridge)
data(countymurders)
county_murders_1996 <- subset(countymurders, year == 1996)
# Count counties with zero murders
zero_murders <- sum(county_murders_1996$murders == 0)
# Count counties with at least one execution
at_least_one_execution <- sum(county_murders_1996$execs > 0)
# Find the largest number of executions
max_executions <- max(county_murders_1996$execs)
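# Report the counts computed above (an added display step; output not shown
# since these lines were not run in the original document)
cat("Counties with zero murders:", zero_murders, "\n")
cat("Counties with at least one execution:", at_least_one_execution, "\n")
cat("Largest number of executions:", max_executions, "\n")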
# Run the OLS regression
model <- lm(murders ~ execs, data = county_murders_1996)
summary(model)
##
## Call:
## lm(formula = murders ~ execs, data = county_murders_1996)
##
## Residuals:
## Min 1Q Median 3Q Max
## -149.12 -5.46 -4.46 -2.46 1338.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.4572 0.8348 6.537 7.79e-11 ***
## execs 58.5555 5.8333 10.038 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38.89 on 2195 degrees of freedom
## Multiple R-squared: 0.04389, Adjusted R-squared: 0.04346
## F-statistic: 100.8 on 1 and 2195 DF, p-value: < 2.2e-16
# Smallest predicted number of murders: since the slope on execs is positive,
# the minimum fitted value occurs at execs = 0 (i.e., the intercept)
min_predicted_murders <- predict(model, newdata = data.frame(execs = 0))
# Calculate residual for a county with zero executions and zero murders
residual_zero <- 0 - min_predicted_murders
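# Display the prediction and residual (an added display step; output not shown)
cat("Smallest predicted murders (at execs = 0):", min_predicted_murders, "\n")
cat("Residual for a county with zero executions and zero murders:", residual_zero, "\n")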
# A simple regression of murders on execs cannot credibly measure the deterrent
# effect of capital punishment: executions are not randomly assigned across
# counties, so omitted variables (population, poverty, urbanization) bias the
# estimate, and reverse causality is likely because high-murder counties carry
# out more executions. The large positive coefficient on execs illustrates this.
## Chapter 3
## 5
# Assuming each student spends a total of 168 hours per week across study, sleep, work, and leisure.
set.seed(123) # For reproducibility
# Create a dataset with 100 students
n <- 100
study_hours <- runif(n, 10, 40) # Random study hours between 10 and 40
sleep_hours <- runif(n, 40, 60) # Random sleep hours between 40 and 60
work_hours <- runif(n, 10, 50) # Random work hours between 10 and 50
# Calculate leisure hours to ensure the sum is 168 hours
leisure_hours <- 168 - (study_hours + sleep_hours + work_hours)
# Generate random GPA values based on the hours, adding some noise
GPA <- 2.5 + 0.01 * study_hours - 0.005 * sleep_hours + 0.003 * work_hours + rnorm(n, 0, 0.2)
# Combine into a data frame
data <- data.frame(GPA, study_hours, sleep_hours, work_hours, leisure_hours)
# Rename columns for easier referencing
names(data) <- c("GPA", "study", "sleep", "work", "leisure")
# Display the first few rows of the dataset to understand the structure
head(data)
## GPA study sleep work leisure
## 1 2.642469 18.62733 51.99978 19.54904 77.82385
## 2 2.902501 33.64915 46.65647 48.49436 39.20002
## 3 2.642436 22.26931 49.77226 34.05463 61.90380
## 4 2.459586 36.49052 59.08948 30.60119 41.81881
## 5 2.688268 38.21402 49.65805 26.10293 54.02500
## 6 2.404182 11.36669 57.80700 45.20986 53.61644
# Part (iii): Drop "leisure" from the model to avoid perfect collinearity
# (study + sleep + work + leisure always equals 168, so all four cannot be
# included; see the added check after the interpretation below).
# This lets us estimate the effect of study, sleep, and work relative to leisure.
model <- lm(GPA ~ study + sleep + work, data = data)
# Summary of the model to see the coefficients and significance
summary(model)
##
## Call:
## lm(formula = GPA ~ study + sleep + work, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30905 -0.12573 -0.02513 0.11220 0.58041
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.140964 0.205409 10.423 < 2e-16 ***
## study 0.009698 0.002203 4.403 2.77e-05 ***
## sleep 0.001941 0.003581 0.542 0.5891
## work 0.003700 0.001602 2.309 0.0231 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1861 on 96 degrees of freedom
## Multiple R-squared: 0.1952, Adjusted R-squared: 0.17
## F-statistic: 7.76 on 3 and 96 DF, p-value: 0.0001079
# Interpretation:
# With leisure dropped, each coefficient measures the effect on GPA of one more
# hour of that activity in place of an hour of leisure, since the four
# categories are constrained to sum to 168 hours.
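# Added check: why leisure had to be dropped. Because the four activity
# variables always sum to 168, including all of them makes the design matrix
# singular; lm() flags the aliased variable and returns NA for its coefficient.
full_model <- lm(GPA ~ study + sleep + work + leisure, data = data)
coef(full_model) # the coefficient on leisure is NA (perfect collinearity)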
## 10
# Set seed for reproducibility
set.seed(123)
# Create mock data
n <- 100 # Number of observations
# Generate x1, x2, and x3 based on different correlation scenarios
# x1, x2, x3 represent attendance, GPA, and test scores, respectively.
# For demonstration purposes, let's create the variables with different levels of correlation
# (i) x1 highly correlated with x2 and x3, and x2 and x3 have large partial effects on y
x1_i <- rnorm(n, mean = 75, sd = 10)
x2_i <- x1_i + rnorm(n, mean = 0, sd = 2) # x2 highly correlated with x1
x3_i <- x1_i + rnorm(n, mean = 0, sd = 2) # x3 highly correlated with x1
y_i <- 0.5 * x1_i + 0.6 * x2_i + 0.4 * x3_i + rnorm(n, mean = 0, sd = 5) # Large partial effects
# (ii) x1 almost uncorrelated with x2 and x3, but x2 and x3 highly correlated
x1_ii <- rnorm(n, mean = 75, sd = 10)
x2_ii <- rnorm(n, mean = 70, sd = 10)
x3_ii <- x2_ii + rnorm(n, mean = 0, sd = 2) # x3 highly correlated with x2
y_ii <- 0.5 * x1_ii + 0.6 * x2_ii + 0.4 * x3_ii + rnorm(n, mean = 0, sd = 5)
# (iii) x1 highly correlated with x2 and x3, but x2 and x3 have small partial effects on y
x1_iii <- rnorm(n, mean = 75, sd = 10)
x2_iii <- x1_iii + rnorm(n, mean = 0, sd = 2)
x3_iii <- x1_iii + rnorm(n, mean = 0, sd = 2)
y_iii <- 0.5 * x1_iii + 0.05 * x2_iii + 0.03 * x3_iii + rnorm(n, mean = 0, sd = 5) # Small partial effects
# (iv) x1 almost uncorrelated with x2 and x3, x2 and x3 have large partial effects
x1_iv <- rnorm(n, mean = 75, sd = 10)
x2_iv <- rnorm(n, mean = 70, sd = 10)
x3_iv <- x2_iv + rnorm(n, mean = 0, sd = 2) # x3 highly correlated with x2
y_iv <- 0.5 * x1_iv + 0.6 * x2_iv + 0.4 * x3_iv + rnorm(n, mean = 0, sd = 5) # Large partial effects
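# Added sanity check (output not shown): verify the designed correlation
# structure before running the regressions
round(cor(cbind(x1_i, x2_i, x3_i)), 2)    # case (i): all pairwise correlations high
round(cor(cbind(x1_ii, x2_ii, x3_ii)), 2) # case (ii): only cor(x2, x3) high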
# Function to run simple and multiple regression and get summaries
run_regression_analysis <- function(y, x1, x2, x3) {
  # Simple regression of y on x1 only
  simple_model <- lm(y ~ x1)
  cat("\nSimple Regression of y on x1 only:\n")
  print(summary(simple_model))
  # Multiple regression of y on x1, x2, and x3
  multiple_model <- lm(y ~ x1 + x2 + x3)
  cat("\nMultiple Regression of y on x1, x2, and x3:\n")
  print(summary(multiple_model))
}
# (i) Run regression for case (i): x1 highly correlated with x2 and x3, large partial effects
cat("Case (i): x1 highly correlated with x2 and x3, large partial effects on y\n")
## Case (i): x1 highly correlated with x2 and x3, large partial effects on y
run_regression_analysis(y_i, x1_i, x2_i, x3_i)
##
## Simple Regression of y on x1 only:
##
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.274 -3.169 -0.114 3.404 12.470
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.98413 4.55419 0.655 0.514
## x1 1.45787 0.05957 24.471 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.411 on 98 degrees of freedom
## Multiple R-squared: 0.8594, Adjusted R-squared: 0.8579
## F-statistic: 598.9 on 1 and 98 DF, p-value: < 2.2e-16
##
##
## Multiple Regression of y on x1, x2, and x3:
##
## Call:
## lm(formula = y ~ x1 + x2 + x3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.4569 -3.2696 0.2832 3.3516 12.6605
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.9828 4.4736 0.443 0.6586
## x1 0.5002 0.3842 1.302 0.1961
## x2 0.7155 0.2736 2.615 0.0104 *
## x3 0.2565 0.2806 0.914 0.3628
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.258 on 96 degrees of freedom
## Multiple R-squared: 0.8699, Adjusted R-squared: 0.8659
## F-statistic: 214 on 3 and 96 DF, p-value: < 2.2e-16
# (ii) Run regression for case (ii): x1 uncorrelated with x2 and x3, x2 and x3 highly correlated
cat("\nCase (ii): x1 uncorrelated with x2 and x3, x2 and x3 highly correlated\n")
##
## Case (ii): x1 uncorrelated with x2 and x3, x2 and x3 highly correlated
run_regression_analysis(y_ii, x1_ii, x2_ii, x3_ii)
##
## Simple Regression of y on x1 only:
##
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.150 -6.007 0.597 6.713 39.452
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.5774 8.6041 7.041 2.65e-10 ***
## x1 0.6237 0.1122 5.559 2.34e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.04 on 98 degrees of freedom
## Multiple R-squared: 0.2398, Adjusted R-squared: 0.232
## F-statistic: 30.91 on 1 and 98 DF, p-value: 2.338e-07
##
##
## Multiple Regression of y on x1, x2, and x3:
##
## Call:
## lm(formula = y ~ x1 + x2 + x3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.2006 -2.4961 0.2769 3.3389 14.0425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.19395 4.94876 0.443 0.6585
## x1 0.41460 0.05229 7.929 4.01e-12 ***
## x2 0.65100 0.24923 2.612 0.0104 *
## x3 0.41851 0.24664 1.697 0.0930 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.036 on 96 degrees of freedom
## Multiple R-squared: 0.8451, Adjusted R-squared: 0.8403
## F-statistic: 174.6 on 3 and 96 DF, p-value: < 2.2e-16
# (iii) Run regression for case (iii): x1 highly correlated with x2 and x3, small partial effects
cat("\nCase (iii): x1 highly correlated with x2 and x3, small partial effects on y\n")
##
## Case (iii): x1 highly correlated with x2 and x3, small partial effects on y
run_regression_analysis(y_iii, x1_iii, x2_iii, x3_iii)
##
## Simple Regression of y on x1 only:
##
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.8194 -3.6396 -0.1729 2.6429 16.1017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.51429 3.66613 0.14 0.889
## x1 0.57203 0.04783 11.96 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.005 on 98 degrees of freedom
## Multiple R-squared: 0.5934, Adjusted R-squared: 0.5893
## F-statistic: 143.1 on 1 and 98 DF, p-value: < 2.2e-16
##
##
## Multiple Regression of y on x1, x2, and x3:
##
## Call:
## lm(formula = y ~ x1 + x2 + x3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.6351 -3.4572 -0.3161 2.9107 16.6733
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3943 3.7063 0.106 0.9155
## x1 0.6738 0.3214 2.097 0.0386 *
## x2 -0.2207 0.2517 -0.877 0.3827
## x3 0.1200 0.2471 0.486 0.6283
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.034 on 96 degrees of freedom
## Multiple R-squared: 0.5972, Adjusted R-squared: 0.5846
## F-statistic: 47.44 on 3 and 96 DF, p-value: < 2.2e-16
# (iv) Run regression for case (iv): x1 uncorrelated with x2 and x3, x2 and x3 large partial effects
cat("\nCase (iv): x1 uncorrelated with x2 and x3, x2 and x3 large partial effects on y\n")
##
## Case (iv): x1 uncorrelated with x2 and x3, x2 and x3 large partial effects on y
run_regression_analysis(y_iv, x1_iv, x2_iv, x3_iv)
##
## Simple Regression of y on x1 only:
##
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.949 -10.341 -0.202 8.593 32.713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.5087 9.2668 8.148 1.22e-12 ***
## x1 0.4219 0.1227 3.439 0.000859 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.33 on 98 degrees of freedom
## Multiple R-squared: 0.1077, Adjusted R-squared: 0.09857
## F-statistic: 11.83 on 1 and 98 DF, p-value: 0.0008593
##
##
## Multiple Regression of y on x1, x2, and x3:
##
## Call:
## lm(formula = y ~ x1 + x2 + x3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.1807 -3.1910 0.4197 3.0325 11.3661
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3962 4.8176 0.705 0.48254
## x1 0.4394 0.0490 8.968 2.46e-14 ***
## x2 0.7774 0.2377 3.270 0.00149 **
## x3 0.2421 0.2309 1.048 0.29712
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.47 on 96 degrees of freedom
## Multiple R-squared: 0.8639, Adjusted R-squared: 0.8596
## F-statistic: 203 on 3 and 96 DF, p-value: < 2.2e-16
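# The key comparison across the four cases is how correlation among the
# regressors inflates the standard error of the x1 coefficient in the multiple
# regression. A compact way to collect them (an added sketch; output not shown):
se_x1 <- sapply(
  list(i   = lm(y_i ~ x1_i + x2_i + x3_i),
       ii  = lm(y_ii ~ x1_ii + x2_ii + x3_ii),
       iii = lm(y_iii ~ x1_iii + x2_iii + x3_iii),
       iv  = lm(y_iv ~ x1_iv + x2_iv + x3_iv)),
  function(m) summary(m)$coefficients[2, "Std. Error"]
)
se_x1 # large in cases (i) and (iii), small in (ii) and (iv)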
## C8
# Load the DISCRIM dataset
data("discrim")
# Part (i): Find the average values of prpblck and income and their standard deviations
mean_prpblck <- mean(discrim$prpblck, na.rm = TRUE)
sd_prpblck <- sd(discrim$prpblck, na.rm = TRUE)
mean_income <- mean(discrim$income, na.rm = TRUE)
sd_income <- sd(discrim$income, na.rm = TRUE)
cat("Average proportion of black residents (prpblck):", mean_prpblck, "\n")
## Average proportion of black residents (prpblck): 0.1134864
cat("Standard deviation of prpblck:", sd_prpblck, "\n")
## Standard deviation of prpblck: 0.1824165
cat("Average median income (income):", mean_income, "\n")
## Average median income (income): 47053.78
cat("Standard deviation of income:", sd_income, "\n")
## Standard deviation of income: 13179.29
# Part (ii): Estimate the model psoda = β0 + β1*prpblck + β2*income + u
model1 <- lm(psoda ~ prpblck + income, data = discrim)
summary(model1)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
# Interpretation of the coefficient on prpblck
cat("Coefficient on prpblck:", coef(model1)["prpblck"], "\n")
## Coefficient on prpblck: 0.1149882
cat("Interpretation: This coefficient represents the impact of a 1-unit increase in the proportion of black residents on the price of soda.\n")
## Interpretation: This coefficient represents the impact of a 1-unit increase in the proportion of black residents on the price of soda.
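# On a more natural scale (an added calculation; output not shown): a 0.10
# increase in prpblck, i.e., 10 percentage points, raises the predicted price
# by about a cent
cat("Effect of a 0.10 increase in prpblck:", coef(model1)["prpblck"] * 0.10, "dollars\n")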
# Part (iii): Simple regression of psoda on prpblck
model2 <- lm(psoda ~ prpblck, data = discrim)
summary(model2)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
# Compare the estimate of prpblck from model1 and model2
cat("Coefficient on prpblck in model with income:", coef(model1)["prpblck"], "\n")
## Coefficient on prpblck in model with income: 0.1149882
cat("Coefficient on prpblck in simple regression:", coef(model2)["prpblck"], "\n")
## Coefficient on prpblck in simple regression: 0.06492687
# Part (iv): Log-log model to estimate constant elasticity
model3 <- lm(log(psoda) ~ prpblck + log(income), data = discrim)
summary(model3)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33563 -0.04695 0.00658 0.04334 0.35413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.79377 0.17943 -4.424 1.25e-05 ***
## prpblck 0.12158 0.02575 4.722 3.24e-06 ***
## log(income) 0.07651 0.01660 4.610 5.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0821 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06809, Adjusted R-squared: 0.06341
## F-statistic: 14.54 on 2 and 398 DF, p-value: 8.039e-07
# Calculate the estimated percentage change in psoda for a 20 percentage point increase in prpblck
percent_change <- coef(model3)["prpblck"] * 0.2 * 100
cat("Estimated percentage change in psoda for a 20 percentage point increase in prpblck:", percent_change, "%\n")
## Estimated percentage change in psoda for a 20 percentage point increase in prpblck: 2.431605 %
# Part (v): Add prppov to the regression in part (iv)
model4 <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
summary(model4)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08137 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08696, Adjusted R-squared: 0.08006
## F-statistic: 12.6 on 3 and 397 DF, p-value: 6.917e-08
# Observe what happens to β_hat on prpblck
cat("Coefficient on prpblck in model with prppov:", coef(model4)["prpblck"], "\n")
## Coefficient on prpblck in model with prppov: 0.07280726
# Part (vi): Find the correlation between log(income) and prppov
correlation <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")
cat("Correlation between log(income) and prppov:", correlation, "\n")
## Correlation between log(income) and prppov: -0.838467
# Part (vii): Evaluate the statement about log(income) and prppov correlation
cat("Since log(income) and prppov are highly correlated, including both in the same regression could lead to multicollinearity issues, making it difficult to interpret the individual effects of these variables.\n")
## Since log(income) and prppov are highly correlated, including both in the same regression could lead to multicollinearity issues, making it difficult to interpret the individual effects of these variables.
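# An added check of this concern: the variance inflation factor for prppov,
# computed by hand from the auxiliary regression of prppov on the other regressors
aux_r2 <- summary(lm(prppov ~ prpblck + log(income), data = discrim))$r.squared
cat("VIF for prppov:", 1 / (1 - aux_r2), "\n")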
## Chapter 4
## 3
# Load the rdchem dataset
data("rdchem", package = "wooldridge")
# Run the regression model
model <- lm(rdintens ~ log(sales) + profmarg, data = rdchem)
summary(model)
##
## Call:
## lm(formula = rdintens ~ log(sales) + profmarg, data = rdchem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3016 -1.2707 -0.6895 0.8785 6.0369
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.47225 1.67606 0.282 0.780
## log(sales) 0.32135 0.21557 1.491 0.147
## profmarg 0.05004 0.04578 1.093 0.283
##
## Residual standard error: 1.839 on 29 degrees of freedom
## Multiple R-squared: 0.09847, Adjusted R-squared: 0.0363
## F-statistic: 1.584 on 2 and 29 DF, p-value: 0.2224
# (i) Estimated change in rdintens for a 10% increase in sales:
# Δrdintens = β1 * Δlog(sales), and a 10% increase in sales means
# Δlog(sales) = log(1.1) ≈ 0.10, so the change is roughly β1 / 10
# (rdintens is already measured in percentage points)
coef_log_sales <- coef(model)["log(sales)"]
change_rdintens <- coef_log_sales / 10
cat("Estimated change in rdintens for a 10% increase in sales:", change_rdintens, "percentage points\n")
## Estimated change in rdintens for a 10% increase in sales: 0.03213484 percentage points
# (ii) Test the significance of log(sales)
p_value_log_sales <- summary(model)$coefficients["log(sales)", "Pr(>|t|)"]
cat("P-value for log(sales):", p_value_log_sales, "\n")
## P-value for log(sales): 0.1468382
# (iii) Interpret the coefficient on profmarg
coef_profmarg <- coef(model)["profmarg"]
cat("Coefficient on profmarg:", coef_profmarg, "\n")
## Coefficient on profmarg: 0.0500367
# (iv) Check if profmarg is statistically significant
p_value_profmarg <- summary(model)$coefficients["profmarg", "Pr(>|t|)"]
cat("P-value for profmarg:", p_value_profmarg, "\n")
## P-value for profmarg: 0.2833658
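# Neither regressor is individually significant at the 5% level; 95% confidence
# intervals (an added check, output not shown) make this concrete, as both
# intervals contain zero
confint(model, level = 0.95)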
## C8
# Load the k401ksubs dataset and filter for single-person households
data("k401ksubs", package = "wooldridge")
single_person <- subset(k401ksubs, fsize == 1)
# (i) Count the number of single-person households
num_single_person <- nrow(single_person)
cat("Number of single-person households:", num_single_person, "\n")
## Number of single-person households: 2017
# (ii) Estimate the model nettfa = β0 + β1 * inc + β2 * age + u
model <- lm(nettfa ~ inc + age, data = single_person)
summary(model)
##
## Call:
## lm(formula = nettfa ~ inc + age, data = single_person)
##
## Residuals:
## Min 1Q Median 3Q Max
## -179.95 -14.16 -3.42 6.03 1113.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.03981 4.08039 -10.548 <2e-16 ***
## inc 0.79932 0.05973 13.382 <2e-16 ***
## age 0.84266 0.09202 9.158 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1185
## F-statistic: 136.5 on 2 and 2014 DF, p-value: < 2.2e-16
# (iii) Interpret the intercept
intercept <- coef(model)["(Intercept)"]
cat("Intercept (β0):", intercept, "\n")
## Intercept (β0): -43.03981
# The intercept is the predicted nettfa (in $1000s) for a person with zero
# income and age zero, a point far outside the sample, so it has no useful
# interpretation on its own.
# (iv) Test H0: β2 = 1 against H1: β2 < 1
beta2 <- coef(model)["age"]
se_beta2 <- summary(model)$coefficients["age", "Std. Error"]
t_stat <- (beta2 - 1) / se_beta2
p_value <- pt(t_stat, df = model$df.residual) # One-sided test
cat("t-statistic:", t_stat, "\n")
## t-statistic: -1.709944
cat("p-value for H0: β2 = 1 against H1: β2 < 1:", p_value, "\n")
## p-value for H0: β2 = 1 against H1: β2 < 1: 0.04371514
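# Compare the t statistic with the 5% one-sided critical value (an added check;
# output not shown)
crit_val <- qt(0.05, df = model$df.residual)
cat("5% one-sided critical value:", crit_val, "\n") # about -1.65, so t = -1.71 rejects H0 at the 5% level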
# (v) Simple regression of nettfa on inc
simple_model <- lm(nettfa ~ inc, data = single_person)
summary(simple_model)
##
## Call:
## lm(formula = nettfa ~ inc, data = single_person)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185.12 -12.85 -4.85 1.78 1112.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.5709 2.0607 -5.13 3.18e-07 ***
## inc 0.8207 0.0609 13.48 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.59 on 2015 degrees of freedom
## Multiple R-squared: 0.08267, Adjusted R-squared: 0.08222
## F-statistic: 181.6 on 1 and 2015 DF, p-value: < 2.2e-16
# Compare coefficients on inc
coef_simple_inc <- coef(simple_model)["inc"]
coef_full_inc <- coef(model)["inc"]
cat("Coefficient on inc in simple model:", coef_simple_inc, "\n")
## Coefficient on inc in simple model: 0.8206815
cat("Coefficient on inc in full model:", coef_full_inc, "\n")
## Coefficient on inc in full model: 0.7993167
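# The coefficient on inc barely changes when age is added, which suggests inc
# and age are only weakly correlated in this sample; a direct check (added,
# output not shown):
cat("Correlation between inc and age:", cor(single_person$inc, single_person$age), "\n")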
## Chapter 5
# Load the econmath dataset and compute summary statistics
data("econmath", package = "wooldridge")
summary(econmath)
## age work study econhs
## Min. :18.00 Min. : 0.000 Min. : 0.00 Min. :0.0000
## 1st Qu.:19.00 1st Qu.: 0.000 1st Qu.: 8.50 1st Qu.:0.0000
## Median :19.00 Median : 8.000 Median :12.00 Median :0.0000
## Mean :19.41 Mean : 8.626 Mean :13.92 Mean :0.3703
## 3rd Qu.:20.00 3rd Qu.:15.000 3rd Qu.:18.00 3rd Qu.:1.0000
## Max. :29.00 Max. :37.500 Max. :50.00 Max. :1.0000
##
## colgpa hsgpa acteng actmth
## Min. :0.875 Min. :2.355 Min. :12.00 Min. :12.00
## 1st Qu.:2.446 1st Qu.:3.110 1st Qu.:20.00 1st Qu.:20.00
## Median :2.813 Median :3.321 Median :23.00 Median :23.00
## Mean :2.815 Mean :3.342 Mean :22.59 Mean :23.21
## 3rd Qu.:3.207 3rd Qu.:3.589 3rd Qu.:25.00 3rd Qu.:26.00
## Max. :4.000 Max. :4.260 Max. :34.00 Max. :36.00
## NA's :42 NA's :42
## act mathscr male calculus
## Min. :13.00 Min. : 0.000 Min. :0.0 Min. :0.0000
## 1st Qu.:21.00 1st Qu.: 7.000 1st Qu.:0.0 1st Qu.:0.0000
## Median :23.00 Median : 8.000 Median :0.5 Median :1.0000
## Mean :23.12 Mean : 7.875 Mean :0.5 Mean :0.6764
## 3rd Qu.:25.00 3rd Qu.: 9.000 3rd Qu.:1.0 3rd Qu.:1.0000
## Max. :33.00 Max. :10.000 Max. :1.0 Max. :1.0000
## NA's :42
## attexc attgood fathcoll mothcoll
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :1.0000 Median :1.0000
## Mean :0.2967 Mean :0.5864 Mean :0.5245 Mean :0.6285
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## score
## Min. :19.53
## 1st Qu.:64.06
## Median :74.22
## Mean :72.60
## 3rd Qu.:82.79
## Max. :98.44
##
# Histogram of score
hist(econmath$score, breaks = 30, freq = FALSE)
curve(dnorm(x, mean = mean(econmath$score), sd = sd(econmath$score)), add = TRUE)
# Q1(i)
# Under a normal distribution with the sample mean and standard deviation,
# P(score > 100) is small but strictly positive. Since score cannot exceed 100
# by construction, this contradicts the normality assumption: the normal
# distribution has unbounded support, while score is bounded above at 100.
# Q1(ii)
# The histogram is left-skewed (the mean, 72.60, lies below the median, 74.22),
# with a longer left tail than a normal density would imply, so the normal
# approximation fits poorly in the left tail.
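# A quick numeric check of Q1(i) under the fitted normal (added; output not shown):
# P(score > 100) is strictly positive even though score cannot exceed 100
m <- mean(econmath$score)
s <- sd(econmath$score)
1 - pnorm(100, mean = m, sd = s)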
data("wage1")
# Estimate the model
model <- lm(wage ~ educ + exper + tenure, data = wage1)
summary(model)
##
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6068 -1.7747 -0.6279 1.1969 14.6536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.87273 0.72896 -3.941 9.22e-05 ***
## educ 0.59897 0.05128 11.679 < 2e-16 ***
## exper 0.02234 0.01206 1.853 0.0645 .
## tenure 0.16927 0.02164 7.820 2.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared: 0.3064, Adjusted R-squared: 0.3024
## F-statistic: 76.87 on 3 and 522 DF, p-value: < 2.2e-16
# Extract residuals
residuals <- model$residuals
# Plot histogram of residuals
hist(residuals, main = "Histogram of Residuals", xlab = "Residuals", col = "lightblue", border = "black")
# Estimate model with log(wage)
log_model <- lm(log(wage) ~ educ + exper + tenure, data = wage1)
summary(log_model)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05802 -0.29645 -0.03265 0.28788 1.42809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.284360 0.104190 2.729 0.00656 **
## educ 0.092029 0.007330 12.555 < 2e-16 ***
## exper 0.004121 0.001723 2.391 0.01714 *
## tenure 0.022067 0.003094 7.133 3.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared: 0.316, Adjusted R-squared: 0.3121
## F-statistic: 80.39 on 3 and 522 DF, p-value: < 2.2e-16
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Breusch-Pagan test for original model
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 43.096, df = 3, p-value = 2.349e-09
# Breusch-Pagan test for log-transformed model
bptest(log_model)
##
## studentized Breusch-Pagan test
##
## data: log_model
## BP = 10.761, df = 3, p-value = 0.01309
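# Both Breusch-Pagan tests reject homoskedasticity at the 5% level, but the
# evidence is far weaker for the log(wage) model (p = 0.013 vs. p = 2.3e-09),
# so the log transformation substantially reduces heteroskedasticity.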