# Chapter 2 - C9
library(wooldridge)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
data("countymurders")
murders_1996 <- countymurders[countymurders$year == 1996, ]
# 1. Counties with zero murders and counties with executions
zero_murders <- sum(murders_1996$murders == 0)
cat("Number of counties with zero murders:", zero_murders, "\n")
## Number of counties with zero murders: 1051
counties_with_executions <- sum(murders_1996$execs > 0)
cat("Number of counties with at least one execution:", counties_with_executions, "\n")
## Number of counties with at least one execution: 31
max_executions <- max(murders_1996$execs)
cat("Largest number of executions:", max_executions, "\n")
## Largest number of executions: 3
# 2. OLS regression of murders on executions
model <- lm(murders ~ execs, data = murders_1996)
summary_stats <- summary(model)
print(summary_stats)
##
## Call:
## lm(formula = murders ~ execs, data = murders_1996)
##
## Residuals:
## Min 1Q Median 3Q Max
## -149.12 -5.46 -4.46 -2.46 1338.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.4572 0.8348 6.537 7.79e-11 ***
## execs 58.5555 5.8333 10.038 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38.89 on 2195 degrees of freedom
## Multiple R-squared: 0.04389, Adjusted R-squared: 0.04346
## F-statistic: 100.8 on 1 and 2195 DF, p-value: < 2.2e-16
n <- nobs(model)
r_squared <- summary_stats$r.squared
adj_r_squared <- summary_stats$adj.r.squared
cat("\nRegression Results:")
##
## Regression Results:
cat("\nSample size:", n)
##
## Sample size: 2197
cat("\nR-squared:", round(r_squared, 4))
##
## R-squared: 0.0439
cat("\nAdjusted R-squared:", round(adj_r_squared, 4))
##
## Adjusted R-squared: 0.0435
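# A 95% confidence interval for the execs slope complements the t statistic
# reported above; a minimal sketch using base R's confint() on the fitted model:
ci_execs <- confint(model, "execs", level = 0.95)
print(ci_execs)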
# 3. Visualization and Interpretation
ggplot(murders_1996, aes(x = execs, y = murders)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(title = "Relationship between Executions and Murders",
       x = "Number of Executions",
       y = "Number of Murders") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

coef_summary <- coef(summary(model))
beta1 <- coef_summary["execs", "Estimate"]
se_beta1 <- coef_summary["execs", "Std. Error"]
t_stat <- coef_summary["execs", "t value"]
p_value <- coef_summary["execs", "Pr(>|t|)"]
cat("\nSlope Coefficient Analysis:")
##
## Slope Coefficient Analysis:
cat("\nβ1 (slope):", round(beta1, 4))
##
## β1 (slope): 58.5555
cat("\nStandard Error:", round(se_beta1, 4))
##
## Standard Error: 5.8333
cat("\nt-statistic:", round(t_stat, 4))
##
## t-statistic: 10.0382
cat("\np-value:", format.pval(p_value, digits = 4))
##
## p-value: < 2.2e-16
# 4. Predictions and Residuals
new_data <- data.frame(execs = 0)
predicted_murders <- predict(model, newdata = new_data)
actual_murders <- 0
residual <- actual_murders - predicted_murders[1]
cat("\n\nPredictions for County with Zero Executions:")
##
##
## Predictions for County with Zero Executions:
cat("\nPredicted murders:", round(predicted_murders[1], 4))
##
## Predicted murders: 5.4572
cat("\nResidual (assuming zero actual murders):", round(residual, 4))
##
## Residual (assuming zero actual murders): -5.4572
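# predict() can also attach interval estimates to this point prediction; a
# minimal sketch of a 95% prediction interval for a county with zero
# executions, assuming the classical error assumptions hold:
predict(model, newdata = new_data, interval = "prediction", level = 0.95)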
# 5. Diagnostic Plots for Regression Assumptions
par(mfrow = c(2, 2))
plot(model)

# ncvTest() is car's score test for non-constant variance, a Breusch-Pagan-type
# test against the fitted values
bp_test <- car::ncvTest(model)
cat("\n\nDiagnostic Tests:")
##
##
## Diagnostic Tests:
cat("\nBreusch-Pagan test p-value:", format.pval(bp_test$p, digits = 4))
##
## Breusch-Pagan test p-value: < 2.2e-16
sw_test <- shapiro.test(residuals(model))
cat("\nShapiro-Wilk test p-value:", format.pval(sw_test$p, digits = 4))
##
## Shapiro-Wilk test p-value: < 2.2e-16
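# Both diagnostic tests reject strongly, so the classical standard errors
# above are suspect. A minimal sketch of heteroskedasticity-robust (HC1)
# standard errors, assuming the sandwich and lmtest packages are installed
# (they are loaded for Chapter 4 - C8 below):
lmtest::coeftest(model, vcov = sandwich::vcovHC(model, type = "HC1"))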
# Chapter 3 - 5
set.seed(123)
n <- 100 # number of students
# Simulate weekly time allocations that sum exactly to 168 hours
generate_time <- function(n) {
  times <- matrix(0, nrow = n, ncol = 4)
  colnames(times) <- c("study", "sleep", "work", "leisure")
  for (i in 1:n) {
    props <- runif(4)
    props <- props / sum(props) * 168 # rescale so the four uses sum to 168 hours
    times[i, ] <- props
  }
  return(as.data.frame(times))
}
student_data <- generate_time(n)
student_data$GPA <- with(student_data,
                         2 + 0.02*study - 0.01*sleep + 0.005*work - 0.015*leisure + rnorm(n, 0, 0.5))
print("Sum of hours for first few students:")
## [1] "Sum of hours for first few students:"
head(rowSums(student_data[,1:4]))
## [1] 168 168 168 168 168 168
# try() is defensive here: lm() does not fail under perfect collinearity, it
# simply drops one of the regressors
model_original <- try(lm(GPA ~ study + sleep + work + leisure, data = student_data))
print("\nOriginal model results:")
## [1] "\nOriginal model results:"
print(summary(model_original))
##
## Call:
## lm(formula = GPA ~ study + sleep + work + leisure, data = student_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00634 -0.32642 -0.08686 0.34959 0.99598
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.017485 0.278461 -0.063 0.950
## study 0.032280 0.002747 11.752 < 2e-16 ***
## sleep 0.002054 0.002502 0.821 0.414
## work 0.015001 0.002544 5.896 5.55e-08 ***
## leisure NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4727 on 96 degrees of freedom
## Multiple R-squared: 0.6641, Adjusted R-squared: 0.6536
## F-statistic: 63.26 on 3 and 96 DF, p-value: < 2.2e-16
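# leisure is dropped ("1 not defined because of singularities") because
# study + sleep + work + leisure = 168 for every student, an exact linear
# dependency with the intercept. A minimal sketch that displays the aliased
# coefficient explicitly, using base R's alias():
alias(model_original)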
student_data$study_prop <- student_data$study/168
student_data$sleep_prop <- student_data$sleep/168
student_data$work_prop <- student_data$work/168
model_reformed <- lm(GPA ~ study_prop + sleep_prop + work_prop, data = student_data)
print("\nReformed model results:")
## [1] "\nReformed model results:"
print(summary(model_reformed))
##
## Call:
## lm(formula = GPA ~ study_prop + sleep_prop + work_prop, data = student_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00634 -0.32642 -0.08686 0.34959 0.99598
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.01749 0.27846 -0.063 0.950
## study_prop 5.42308 0.46148 11.752 < 2e-16 ***
## sleep_prop 0.34506 0.42025 0.821 0.414
## work_prop 2.52018 0.42745 5.896 5.55e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4727 on 96 degrees of freedom
## Multiple R-squared: 0.6641, Adjusted R-squared: 0.6536
## F-statistic: 63.26 on 3 and 96 DF, p-value: < 2.2e-16
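# The rescaling is exact: each proportion coefficient is 168 times the
# corresponding hours coefficient (e.g., 0.032280 * 168 ≈ 5.42308 for study).
# A minimal sketch verifying this from the two fitted models:
coef(model_original)["study"] * 168
coef(model_reformed)["study_prop"]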
# Chapter 3 - 10
set.seed(123)
generate_correlated_data <- function(n, cor_matrix) {
  library(MASS) # attached on first call, hence the masking messages further below
  mu <- rep(0, ncol(cor_matrix))
  data <- mvrnorm(n, mu, cor_matrix)
  return(data)
}
# Function to analyze regression results
analyze_regression <- function(data) {
  # Simple regression
  simple_reg <- lm(y ~ x1, data = data.frame(data))
  # Multiple regression
  multiple_reg <- lm(y ~ x1 + x2 + x3, data = data.frame(data))
  # Extract coefficients and standard errors
  beta1_simple <- coef(simple_reg)[2]
  beta1_multiple <- coef(multiple_reg)[2]
  se_simple <- summary(simple_reg)$coefficients[2, 2]
  se_multiple <- summary(multiple_reg)$coefficients[2, 2]
  return(list(
    beta1_simple = beta1_simple,
    beta1_multiple = beta1_multiple,
    se_simple = se_simple,
    se_multiple = se_multiple
  ))
}
# Sample size
n <- 1000
# Scenario (i): x1 highly correlated with x2, x3; large partial effects
cor_matrix_1 <- matrix(c(
  1.0, 0.8, 0.8, 0.6,
  0.8, 1.0, 0.7, 0.7,
  0.8, 0.7, 1.0, 0.7,
  0.6, 0.7, 0.7, 1.0
), nrow = 4)
data_1 <- generate_correlated_data(n, cor_matrix_1)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:wooldridge':
##
## cement
colnames(data_1) <- c("x1", "x2", "x3", "y")
results_1 <- analyze_regression(data_1)
# Scenario (ii): x1 uncorrelated with x2, x3; x2 and x3 highly correlated
cor_matrix_2 <- matrix(c(
  1.0, 0.1, 0.1, 0.3,
  0.1, 1.0, 0.8, 0.6,
  0.1, 0.8, 1.0, 0.6,
  0.3, 0.6, 0.6, 1.0
), nrow = 4)
data_2 <- generate_correlated_data(n, cor_matrix_2)
colnames(data_2) <- c("x1", "x2", "x3", "y")
results_2 <- analyze_regression(data_2)
# Scenario (iii): x1 highly correlated with x2, x3; small partial effects
cor_matrix_3 <- matrix(c(
  1.0, 0.8, 0.8, 0.3,
  0.8, 1.0, 0.7, 0.2,
  0.8, 0.7, 1.0, 0.2,
  0.3, 0.2, 0.2, 1.0
), nrow = 4)
data_3 <- generate_correlated_data(n, cor_matrix_3)
colnames(data_3) <- c("x1", "x2", "x3", "y")
results_3 <- analyze_regression(data_3)
# Scenario (iv): x1 uncorrelated with x2, x3; large partial effects
cor_matrix_4 <- matrix(c(
  1.0, 0.1, 0.1, 0.6,
  0.1, 1.0, 0.8, 0.7,
  0.1, 0.8, 1.0, 0.7,
  0.6, 0.7, 0.7, 1.0
), nrow = 4)
data_4 <- generate_correlated_data(n, cor_matrix_4)
colnames(data_4) <- c("x1", "x2", "x3", "y")
results_4 <- analyze_regression(data_4)
# Print results for all scenarios
print_scenario_results <- function(scenario, results) {
  cat(sprintf("\nScenario %s:\n", scenario))
  cat(sprintf("Simple regression β1: %.3f (SE: %.3f)\n",
              results$beta1_simple, results$se_simple))
  cat(sprintf("Multiple regression β1: %.3f (SE: %.3f)\n",
              results$beta1_multiple, results$se_multiple))
}
print_scenario_results("(i)", results_1)
##
## Scenario (i):
## Simple regression β1: 0.644 (SE: 0.027)
## Multiple regression β1: -0.253 (SE: 0.042)
print_scenario_results("(ii)", results_2)
##
## Scenario (ii):
## Simple regression β1: 0.286 (SE: 0.031)
## Multiple regression β1: 0.223 (SE: 0.024)
print_scenario_results("(iii)", results_3)
##
## Scenario (iii):
## Simple regression β1: 0.323 (SE: 0.029)
## Multiple regression β1: 0.488 (SE: 0.060)
print_scenario_results("(iv)", results_4)
##
## Scenario (iv):
## Simple regression β1: 0.612 (SE: 0.025)
## Multiple regression β1: 0.533 (SE: 0.013)
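# Each scenario above reflects a single draw, so the SE comparisons are noisy.
# A minimal Monte Carlo sketch (replicate_ses is a hypothetical helper reusing
# the functions above) that averages the two standard errors of β1 over
# repeated samples for one scenario:
replicate_ses <- function(reps, cor_matrix) {
  ses <- replicate(reps, {
    d <- generate_correlated_data(n, cor_matrix)
    colnames(d) <- c("x1", "x2", "x3", "y")
    r <- analyze_regression(d)
    c(simple = r$se_simple, multiple = r$se_multiple)
  })
  rowMeans(ses)
}
replicate_ses(100, cor_matrix_1)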
# Chapter 3 - C8
# 1: Calculate the average values and standard deviations of prpblck and income
library(wooldridge)
library(dplyr)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
data("discrim")
summary_stats <- discrim %>%
  summarise(
    avg_prpblck = mean(prpblck, na.rm = TRUE),
    sd_prpblck = sd(prpblck, na.rm = TRUE),
    avg_income = mean(income, na.rm = TRUE),
    sd_income = sd(income, na.rm = TRUE)
  )
print(summary_stats)
## avg_prpblck sd_prpblck avg_income sd_income
## 1 0.1134864 0.1824165 47053.78 13179.29
# 2: Estimate psoda = β₀ + β₁*prpblck + β₂*income + u by OLS and interpret the coefficient on prpblck
model1 <- lm(psoda ~ prpblck + income, data = discrim)
summary(model1)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
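# Holding income fixed, a 0.10 increase in the proportion black is associated
# with a soda price roughly 1.15 cents higher; a minimal sketch of that
# arithmetic from the fitted model:
coef(model1)["prpblck"] * 0.10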
# 3: Compare with a simple regression of psoda on prpblck only
model_simple <- lm(psoda ~ prpblck, data = discrim)
summary(model_simple)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
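# The simple-regression estimate (0.065) is well below the multiple-regression
# one (0.115), consistent with omitted-variable bias: income has a positive
# effect and is negatively related to prpblck. A minimal sketch of the usual
# decomposition beta_simple = beta_prpblck + beta_income * delta, where delta
# comes from regressing income on prpblck over the same complete cases:
discrim_cc <- subset(discrim, !is.na(psoda) & !is.na(prpblck) & !is.na(income))
delta <- coef(lm(income ~ prpblck, data = discrim_cc))["prpblck"]
coef(model1)["prpblck"] + coef(model1)["income"] * delta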
# 4: Estimate the model with log(income), so the income effect is per proportional change in income
model_log <- lm(psoda ~ prpblck + log(income), data = discrim)
summary(model_log)
##
## Call:
## lm(formula = psoda ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29484 -0.05085 0.00346 0.04283 0.44069
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.18553 0.18800 0.987 0.324
## prpblck 0.12583 0.02697 4.665 4.23e-06 ***
## log(income) 0.07882 0.01739 4.533 7.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08602 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06628, Adjusted R-squared: 0.06159
## F-statistic: 14.13 on 2 and 398 DF, p-value: 1.184e-06
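# In this level-log form, a 10% rise in income is associated with a price
# change of about coef(log(income)) * log(1.10) dollars; a minimal sketch:
coef(model_log)["log(income)"] * log(1.10)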
# 5: Add prppov to the regression and observe the change in the prpblck coefficient
model_with_prppov <- lm(psoda ~ prpblck + log(income) + prppov, data = discrim)
summary(model_with_prppov)
##
## Call:
## lm(formula = psoda ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28083 -0.05006 0.00305 0.04247 0.44286
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.51208 0.30777 -1.664 0.09693 .
## prpblck 0.07501 0.03214 2.334 0.02011 *
## log(income) 0.14180 0.02804 5.058 6.5e-07 ***
## prppov 0.39629 0.13915 2.848 0.00463 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08526 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08497, Adjusted R-squared: 0.07806
## F-statistic: 12.29 on 3 and 397 DF, p-value: 1.053e-07
# 6: Find the correlation between log(income) and prppov
correlation <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")
print(correlation)
## [1] -0.838467
# 7: Check multicollinearity between log(income) and prppov using VIF
vif(model_with_prppov)
## prpblck log(income) prppov
## 1.922160 3.518721 4.915220
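# Each VIF equals 1 / (1 - R²_j), where R²_j comes from regressing regressor j
# on the other regressors. A minimal sketch reproducing the prppov VIF by hand
# from the model matrix (so the sample matches exactly):
X <- model.matrix(model_with_prppov)
aux <- lm(X[, "prppov"] ~ X[, "prpblck"] + X[, "log(income)"])
1 / (1 - summary(aux)$r.squared)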
# Chapter 4 - 3
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ MASS::select() masks dplyr::select()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
n <- 32
# Simulate a dataset consistent with the textbook equation
# rdintens = 0.472 + 0.321*log(sales) + 0.050*profmarg + u
# (no seed is set here, so the estimates below will vary across runs)
sales <- runif(n, 10, 100)
log_sales <- log(sales)
profmarg <- runif(n, 0, 20)
rdintens <- 0.472 + 0.321 * log_sales + 0.050 * profmarg + rnorm(n, 0, 1)
# Create a data frame
data <- data.frame(rdintens = rdintens, sales = sales, log_sales = log_sales, profmarg = profmarg)
# Run the regression model
model <- lm(rdintens ~ log_sales + profmarg, data = data)
# Output model summary to check coefficients and significance
summary(model)
##
## Call:
## lm(formula = rdintens ~ log_sales + profmarg, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.08745 -0.42009 0.04894 0.65782 1.88213
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.69781 1.06735 1.591 0.123
## log_sales 0.03833 0.27801 0.138 0.891
## profmarg 0.01837 0.03164 0.581 0.566
##
## Residual standard error: 1.092 on 29 degrees of freedom
## Multiple R-squared: 0.01261, Adjusted R-squared: -0.05549
## F-statistic: 0.1851 on 2 and 29 DF, p-value: 0.832
# (i) Change in rdintens (in percentage points) when sales increases by 10%,
# using the textbook slope of 0.321 in this level-log specification
estimated_effect <- 0.321 * log(1.10)
estimated_effect
## [1] 0.03059457
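# For the simulated fit above, the practical follow-up is whether log_sales is
# statistically significant; a minimal sketch pulling its t statistic and
# p-value from the coefficient table:
coef(summary(model))["log_sales", c("t value", "Pr(>|t|)")]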
# Chapter 4 - C8
library(wooldridge)
library(dplyr)
library(sandwich)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
data("k401ksubs")
# First filter for single-person households
singles <- subset(k401ksubs, fsize == 1)
# (i) Count number of single-person households
n_singles <- nrow(singles)
cat("Number of single-person households:", n_singles, "\n\n")
## Number of single-person households: 2017
# (ii) Run multiple regression
model_full <- lm(nettfa ~ inc + age, data = singles)
summary(model_full)
##
## Call:
## lm(formula = nettfa ~ inc + age, data = singles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -179.95 -14.16 -3.42 6.03 1113.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.03981 4.08039 -10.548 <2e-16 ***
## inc 0.79932 0.05973 13.382 <2e-16 ***
## age 0.84266 0.09202 9.158 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1185
## F-statistic: 136.5 on 2 and 2014 DF, p-value: < 2.2e-16
# Calculate robust standard errors
robust_se <- coeftest(model_full, vcov = vcovHC(model_full, type = "HC1"))
print(robust_se)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.03981 5.52433 -7.7910 1.057e-14 ***
## inc 0.79932 0.10078 7.9313 3.562e-15 ***
## age 0.84266 0.11937 7.0589 2.300e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iv) Test H0: β2 = 1 against H1: β2 < 1
# Extract coefficient and standard error for age
age_coef <- coef(model_full)["age"]
age_se <- sqrt(vcovHC(model_full, type = "HC1")["age", "age"])
# Calculate t-statistic for H0: β2 = 1, using the robust standard error
t_stat <- (age_coef - 1) / age_se
# One-sided p-value with n - 3 degrees of freedom
p_value <- pt(t_stat, df = nrow(singles) - 3)
cat("\nTest H0: β2 = 1 against H1: β2 < 1")
##
## Test H0: β2 = 1 against H1: β2 < 1
cat("\nt-statistic:", t_stat)
##
## t-statistic: -1.318064
cat("\np-value:", p_value, "\n\n")
##
## p-value: 0.09381599
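# A minimal sketch of the 5% decision rule: compare the t statistic to the
# one-sided critical value; here t = -1.32 exceeds roughly -1.65, so H0: β2 = 1
# is not rejected at the 5% level (consistent with the p-value of 0.094):
qt(0.05, df = nrow(singles) - 3)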
# (v) Simple regression of nettfa on inc
model_simple <- lm(nettfa ~ inc, data = singles)
summary(model_simple)
##
## Call:
## lm(formula = nettfa ~ inc, data = singles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185.12 -12.85 -4.85 1.78 1112.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.5709 2.0607 -5.13 3.18e-07 ***
## inc 0.8207 0.0609 13.48 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.59 on 2015 degrees of freedom
## Multiple R-squared: 0.08267, Adjusted R-squared: 0.08222
## F-statistic: 181.6 on 1 and 2015 DF, p-value: < 2.2e-16
# Compare coefficients
cat("Coefficient on inc in multiple regression:", coef(model_full)["inc"], "\n")
## Coefficient on inc in multiple regression: 0.7993167
cat("Coefficient on inc in simple regression:", coef(model_simple)["inc"], "\n")
## Coefficient on inc in simple regression: 0.8206815
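# The two inc coefficients are close, which suggests inc and age are only
# weakly correlated among these households, so omitting age moves the estimate
# little; a minimal check of that sample correlation:
cor(singles$inc, singles$age, use = "complete.obs")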
# Chapter 5 - 5
set.seed(123)
n <- 1000
mean_score <- 77
sd_score <- 15
scores <- rnorm(n, mean = mean_score, sd = sd_score)
scores <- pmax(pmin(scores, 100), 20)
hist(scores, breaks = 30, probability = TRUE,
     main = "Distribution of Course Scores",
     xlab = "Course Score (in percentage form)",
     ylab = "Proportion in cell",
     xlim = c(20, 100),
     ylim = c(0, 0.1))
curve(dnorm(x, mean = mean(scores), sd = sd(scores)),
      add = TRUE, col = "blue")
prob_exceed_100 <- 1 - pnorm(100, mean = mean(scores), sd = sd(scores))
cat("Probability of score exceeding 100:", prob_exceed_100, "\n")
## Probability of score exceeding 100: 0.0480284
prob_below_40 <- pnorm(40, mean = mean(scores), sd = sd(scores))
cat("Probability of score below 40:", prob_below_40, "\n")
## Probability of score below 40: 0.00420035
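# Scores are capped at 100 by construction, so the positive normal probability
# above 100 is the point of the exercise: a normal distribution cannot hold
# exactly for a bounded percentage score. A minimal sketch contrasting it with
# the empirical proportion (which is exactly zero):
mean(scores > 100)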
# Chapter 5 - C1
library(wooldridge)
library(ggplot2)
data("wage1")
# (i) Estimate the model with wage as the dependent variable
model1 <- lm(wage ~ educ + exper + tenure, data = wage1)
summary(model1)
##
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6068 -1.7747 -0.6279 1.1969 14.6536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.87273 0.72896 -3.941 9.22e-05 ***
## educ 0.59897 0.05128 11.679 < 2e-16 ***
## exper 0.02234 0.01206 1.853 0.0645 .
## tenure 0.16927 0.02164 7.820 2.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared: 0.3064, Adjusted R-squared: 0.3024
## F-statistic: 76.87 on 3 and 522 DF, p-value: < 2.2e-16
residuals1 <- residuals(model1)
ggplot(data.frame(residuals1), aes(x = residuals1)) +
  geom_histogram(bins = 20, fill = "blue", color = "black") +
  labs(title = "Histogram of Residuals for Wage Model", x = "Residuals", y = "Frequency")
# (ii) Estimate the model with log(wage) as the dependent variable
model2 <- lm(log(wage) ~ educ + exper + tenure, data = wage1)
summary(model2)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05802 -0.29645 -0.03265 0.28788 1.42809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.284360 0.104190 2.729 0.00656 **
## educ 0.092029 0.007330 12.555 < 2e-16 ***
## exper 0.004121 0.001723 2.391 0.01714 *
## tenure 0.022067 0.003094 7.133 3.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared: 0.316, Adjusted R-squared: 0.3121
## F-statistic: 80.39 on 3 and 522 DF, p-value: < 2.2e-16
residuals2 <- residuals(model2)
ggplot(data.frame(residuals2), aes(x = residuals2)) +
  geom_histogram(bins = 20, fill = "green", color = "black") +
  labs(title = "Histogram of Residuals for Log(Wage) Model", x = "Residuals", y = "Frequency")

