Chapter 2 - C9

Chapter 3 - 5

Chapter 3 - 10

Chapter 3 - C8

Chapter 4 - 3

Chapter 4 - C8

Chapter 5 - 5

Chapter 5 - C1

Chapter 2 - C9

library(wooldridge) 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2) 
data("countymurders")
murders_1996 <- countymurders[countymurders$year == 1996, ]
i. Counties with zero murders and counties with executions
zero_murders <- sum(murders_1996$murders == 0)
cat("Number of counties with zero murders:", zero_murders, "\n")
## Number of counties with zero murders: 1051
counties_with_executions <- sum(murders_1996$execs > 0)
cat("Number of counties with at least one execution:", counties_with_executions, "\n")
## Number of counties with at least one execution: 31
max_executions <- max(murders_1996$execs)
cat("Largest number of executions:", max_executions, "\n")
## Largest number of executions: 3
ii. OLS Regression
model <- lm(murders ~ execs, data = murders_1996)
summary_stats <- summary(model)
print(summary_stats)
## 
## Call:
## lm(formula = murders ~ execs, data = murders_1996)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -149.12   -5.46   -4.46   -2.46 1338.99 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.4572     0.8348   6.537 7.79e-11 ***
## execs        58.5555     5.8333  10.038  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38.89 on 2195 degrees of freedom
## Multiple R-squared:  0.04389,    Adjusted R-squared:  0.04346 
## F-statistic: 100.8 on 1 and 2195 DF,  p-value: < 2.2e-16
n <- nobs(model)
r_squared <- summary_stats$r.squared
adj_r_squared <- summary_stats$adj.r.squared

cat("\nRegression Results:")
## 
## Regression Results:
cat("\nSample size:", n)
## 
## Sample size: 2197
cat("\nR-squared:", round(r_squared, 4))
## 
## R-squared: 0.0439
cat("\nAdjusted R-squared:", round(adj_r_squared, 4))
## 
## Adjusted R-squared: 0.0435

iii. Visualization and Interpretation

ggplot(murders_1996, aes(x = execs, y = murders)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(title = "Relationship between Executions and Murders",
       x = "Number of Executions",
       y = "Number of Murders") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Extract and format coefficient information
coef_summary <- coef(summary(model))
beta1 <- coef_summary["execs", "Estimate"]
se_beta1 <- coef_summary["execs", "Std. Error"]
t_stat <- coef_summary["execs", "t value"]
p_value <- coef_summary["execs", "Pr(>|t|)"]

cat("\nSlope Coefficient Analysis:")
## 
## Slope Coefficient Analysis:
cat("\nβ1 (slope):", round(beta1, 4))
## 
## β1 (slope): 58.5555
cat("\nStandard Error:", round(se_beta1, 4))
## 
## Standard Error: 5.8333
cat("\nt-statistic:", round(t_stat, 4))
## 
## t-statistic: 10.0382
cat("\np-value:", format.pval(p_value, digits = 4))
## 
## p-value: < 2.2e-16

iv. Predictions and Residuals

new_data <- data.frame(execs = 0)
predicted_murders <- predict(model, newdata = new_data)
actual_murders <- 0
residual <- actual_murders - predicted_murders[1]
cat("\n\nPredictions for County with Zero Executions:")
## 
## 
## Predictions for County with Zero Executions:
cat("\nPredicted murders:", round(predicted_murders[1], 4))
## 
## Predicted murders: 5.4572
cat("\nResidual (assuming zero actual murders):", round(residual, 4))
## 
## Residual (assuming zero actual murders): -5.4572

v. Diagnostic Plots for Regression Assumptions

par(mfrow = c(2, 2))
plot(model)

# Additional tests for regression assumptions
# Breusch-Pagan-type score test for non-constant error variance (car::ncvTest)
bp_test <- car::ncvTest(model)
cat("\n\nDiagnostic Tests:")
## 
## 
## Diagnostic Tests:
cat("\nBreusch-Pagan test p-value:", format.pval(bp_test$p, digits = 4))
## 
## Breusch-Pagan test p-value: < 2.2e-16
# Shapiro-Wilk test for normality of residuals
sw_test <- shapiro.test(residuals(model))
cat("\nShapiro-Wilk test p-value:", format.pval(sw_test$p, digits = 4))
## 
## Shapiro-Wilk test p-value: < 2.2e-16
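
The car::ncvTest() call above is a score test for non-constant error variance; the studentized Breusch-Pagan test from the lmtest package is a closely related alternative. A minimal sketch (output not rerun here):

# Studentized Breusch-Pagan test as an alternative heteroskedasticity check
library(lmtest)
bptest(model)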

Chapter 3 - 5

# Create example data to illustrate the problem
set.seed(123)
n <- 100  # number of students

# Generate time allocations ensuring they sum to 168
generate_time <- function(n) {
  # Initialize matrices
  times <- matrix(0, nrow = n, ncol = 4)
  colnames(times) <- c("study", "sleep", "work", "leisure")
  
  for(i in 1:n) {
    # Generate random proportions that sum to 168
    props <- runif(4)
    props <- props/sum(props) * 168
    times[i,] <- props
  }
  return(as.data.frame(times))
}

# Generate data
student_data <- generate_time(n)
student_data$GPA <- with(student_data, 
                        2 + 0.02*study - 0.01*sleep + 0.005*work - 0.015*leisure + rnorm(n, 0, 0.5))

# Demonstrate perfect multicollinearity
print("Sum of hours for first few students:")
## [1] "Sum of hours for first few students:"
head(rowSums(student_data[,1:4]))
## [1] 168 168 168 168 168 168
# Try to fit the original model
model_original <- try(lm(GPA ~ study + sleep + work + leisure, data = student_data))
print("\nOriginal model results:")
## [1] "\nOriginal model results:"
print(summary(model_original))
## 
## Call:
## lm(formula = GPA ~ study + sleep + work + leisure, data = student_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.00634 -0.32642 -0.08686  0.34959  0.99598 
## 
## Coefficients: (1 not defined because of singularities)
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.017485   0.278461  -0.063    0.950    
## study        0.032280   0.002747  11.752  < 2e-16 ***
## sleep        0.002054   0.002502   0.821    0.414    
## work         0.015001   0.002544   5.896 5.55e-08 ***
## leisure            NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4727 on 96 degrees of freedom
## Multiple R-squared:  0.6641, Adjusted R-squared:  0.6536 
## F-statistic: 63.26 on 3 and 96 DF,  p-value: < 2.2e-16
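
A quick way to see exactly which linear dependence made lm() drop leisure is alias(), which reports the complete aliasing structure of the fitted model (a minimal check on the model_original object above):

# Show the exact linear dependence behind the singularity:
# leisure is a linear combination of the intercept, study, sleep, and work
alias(model_original)
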
# Reformulated model using proportions of time
student_data$study_prop <- student_data$study/168
student_data$sleep_prop <- student_data$sleep/168
student_data$work_prop <- student_data$work/168
# Note: leisure proportion is omitted as base category

# Fit the reformulated model
model_reformed <- lm(GPA ~ study_prop + sleep_prop + work_prop, data = student_data)
print("\nReformed model results:")
## [1] "\nReformed model results:"
print(summary(model_reformed))
## 
## Call:
## lm(formula = GPA ~ study_prop + sleep_prop + work_prop, data = student_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.00634 -0.32642 -0.08686  0.34959  0.99598 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.01749    0.27846  -0.063    0.950    
## study_prop   5.42308    0.46148  11.752  < 2e-16 ***
## sleep_prop   0.34506    0.42025   0.821    0.414    
## work_prop    2.52018    0.42745   5.896 5.55e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4727 on 96 degrees of freedom
## Multiple R-squared:  0.6641, Adjusted R-squared:  0.6536 
## F-statistic: 63.26 on 3 and 96 DF,  p-value: < 2.2e-16
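
The proportion-based slopes are simply the hour-based slopes rescaled by 168 (since study_prop = study/168, and so on), and each is interpreted relative to the omitted leisure share: it measures the effect of shifting time into that activity and out of leisure, holding the other activities fixed. A quick consistency check using the two fitted models above:

# Dividing the proportion-based slopes by 168 recovers the hour-based slopes
coef(model_reformed)[-1] / 168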

Chapter 3 - 10

set.seed(123)

generate_correlated_data <- function(n, cor_matrix) {
  library(MASS)
  mu <- rep(0, ncol(cor_matrix))
  data <- mvrnorm(n, mu, cor_matrix)
  return(data)
}

# Function to analyze regression results
analyze_regression <- function(data) {
  # Simple regression
  simple_reg <- lm(y ~ x1, data = data.frame(data))
  
  # Multiple regression
  multiple_reg <- lm(y ~ x1 + x2 + x3, data = data.frame(data))
  
  # Extract coefficients and standard errors
  beta1_simple <- coef(simple_reg)[2]
  beta1_multiple <- coef(multiple_reg)[2]
  se_simple <- summary(simple_reg)$coefficients[2,2]
  se_multiple <- summary(multiple_reg)$coefficients[2,2]
  
  return(list(
    beta1_simple = beta1_simple,
    beta1_multiple = beta1_multiple,
    se_simple = se_simple,
    se_multiple = se_multiple
  ))
}

# Sample size
n <- 1000

# Scenario (i): x1 highly correlated with x2, x3; large partial effects
cor_matrix_1 <- matrix(c(
  1.0, 0.8, 0.8, 0.6,
  0.8, 1.0, 0.7, 0.7,
  0.8, 0.7, 1.0, 0.7,
  0.6, 0.7, 0.7, 1.0
), nrow=4)
data_1 <- generate_correlated_data(n, cor_matrix_1)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:wooldridge':
## 
##     cement
colnames(data_1) <- c("x1", "x2", "x3", "y")
results_1 <- analyze_regression(data_1)

# Scenario (ii): x1 uncorrelated with x2, x3; x2 and x3 highly correlated
cor_matrix_2 <- matrix(c(
  1.0, 0.1, 0.1, 0.3,
  0.1, 1.0, 0.8, 0.6,
  0.1, 0.8, 1.0, 0.6,
  0.3, 0.6, 0.6, 1.0
), nrow=4)
data_2 <- generate_correlated_data(n, cor_matrix_2)
colnames(data_2) <- c("x1", "x2", "x3", "y")
results_2 <- analyze_regression(data_2)

# Scenario (iii): x1 highly correlated with x2, x3; small partial effects
cor_matrix_3 <- matrix(c(
  1.0, 0.8, 0.8, 0.3,
  0.8, 1.0, 0.7, 0.2,
  0.8, 0.7, 1.0, 0.2,
  0.3, 0.2, 0.2, 1.0
), nrow=4)
data_3 <- generate_correlated_data(n, cor_matrix_3)
colnames(data_3) <- c("x1", "x2", "x3", "y")
results_3 <- analyze_regression(data_3)

# Scenario (iv): x1 uncorrelated with x2, x3; large partial effects
cor_matrix_4 <- matrix(c(
  1.0, 0.1, 0.1, 0.6,
  0.1, 1.0, 0.8, 0.7,
  0.1, 0.8, 1.0, 0.7,
  0.6, 0.7, 0.7, 1.0
), nrow=4)
data_4 <- generate_correlated_data(n, cor_matrix_4)
colnames(data_4) <- c("x1", "x2", "x3", "y")
results_4 <- analyze_regression(data_4)

# Print results for all scenarios
print_scenario_results <- function(scenario, results) {
  cat(sprintf("\nScenario %s:\n", scenario))
  cat(sprintf("Simple regression β1: %.3f (SE: %.3f)\n", 
              results$beta1_simple, results$se_simple))
  cat(sprintf("Multiple regression β1: %.3f (SE: %.3f)\n", 
              results$beta1_multiple, results$se_multiple))
}

print_scenario_results("(i)", results_1)
## 
## Scenario (i):
## Simple regression β1: 0.644 (SE: 0.027)
## Multiple regression β1: -0.253 (SE: 0.042)
print_scenario_results("(ii)", results_2)
## 
## Scenario (ii):
## Simple regression β1: 0.286 (SE: 0.031)
## Multiple regression β1: 0.223 (SE: 0.024)
print_scenario_results("(iii)", results_3)
## 
## Scenario (iii):
## Simple regression β1: 0.323 (SE: 0.029)
## Multiple regression β1: 0.488 (SE: 0.060)
print_scenario_results("(iv)", results_4)
## 
## Scenario (iv):
## Simple regression β1: 0.612 (SE: 0.025)
## Multiple regression β1: 0.533 (SE: 0.013)
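
The pattern in the standard errors follows from Var(β̂₁) = σ²/[SST₁(1 − R₁²)], where R₁² is the R-squared from regressing x1 on x2 and x3: high collinearity inflates the multiple-regression standard error, while strong partial effects shrink σ². A small sketch that computes R₁² in each scenario, using the data objects generated above:

# R-squared from regressing x1 on the other regressors in each scenario;
# larger values inflate the multiple-regression SE of beta1
r2_x1 <- function(d) summary(lm(x1 ~ x2 + x3, data = data.frame(d)))$r.squared
sapply(list(scenario_i = data_1, scenario_ii = data_2,
            scenario_iii = data_3, scenario_iv = data_4), r2_x1)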

Chapter 3 - C8

library(wooldridge)
library(dplyr)
library(car)  
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode

Part (i): Calculate the average values and standard deviations of prpblck and income

data("discrim")
summary_stats <- discrim %>%
  summarise(
    avg_prpblck = mean(prpblck, na.rm = TRUE),
    sd_prpblck = sd(prpblck, na.rm = TRUE),
    avg_income = mean(income, na.rm = TRUE),
    sd_income = sd(income, na.rm = TRUE)
  )

print(summary_stats)
##   avg_prpblck sd_prpblck avg_income sd_income
## 1   0.1134864  0.1824165   47053.78  13179.29

Part (ii): Estimate psoda = β₀ + β₁*prpblck + β₂*income + u by OLS and interpret the coefficient on prpblck

# Run the regression model
model1 <- lm(psoda ~ prpblck + income, data = discrim)
summary(model1)
## 
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29401 -0.05242  0.00333  0.04231  0.44322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.563e-01  1.899e-02  50.354  < 2e-16 ***
## prpblck     1.150e-01  2.600e-02   4.423 1.26e-05 ***
## income      1.603e-06  3.618e-07   4.430 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08611 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06422,    Adjusted R-squared:  0.05952 
## F-statistic: 13.66 on 2 and 398 DF,  p-value: 1.835e-06
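
To make the prpblck estimate concrete, a small back-of-envelope calculation using the coefficient reported above (psoda is measured in dollars):

# Implied difference in predicted soda price when prpblck rises by 0.10,
# holding income fixed: roughly 0.10 * 0.115 = $0.0115, about one cent
0.10 * coef(model1)["prpblck"]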

Part (iii): Compare with a simple regression of psoda on prpblck only

# Simple regression model of psoda on prpblck
model_simple <- lm(psoda ~ prpblck, data = discrim)
summary(model_simple)
## 
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30884 -0.05963  0.01135  0.03206  0.44840 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.03740    0.00519  199.87  < 2e-16 ***
## prpblck      0.06493    0.02396    2.71  0.00702 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0881 on 399 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.01808,    Adjusted R-squared:  0.01561 
## F-statistic: 7.345 on 1 and 399 DF,  p-value: 0.007015

Part (iv): Estimate the model with log-transformed income (income enters in logs; psoda remains in levels)

# Log-transformed model
model_log <- lm(psoda ~ prpblck + log(income), data = discrim)
summary(model_log)
## 
## Call:
## lm(formula = psoda ~ prpblck + log(income), data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29484 -0.05085  0.00346  0.04283  0.44069 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.18553    0.18800   0.987    0.324    
## prpblck      0.12583    0.02697   4.665 4.23e-06 ***
## log(income)  0.07882    0.01739   4.533 7.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08602 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06628,    Adjusted R-squared:  0.06159 
## F-statistic: 14.13 on 2 and 398 DF,  p-value: 1.184e-06

In the model above, psoda is in levels (dollars), so multiplying the coefficient on prpblck by 0.20 gives the predicted change in the soda price in dollars, not a percentage. A specification with log(psoda), which does yield an approximate percentage interpretation, is sketched below.
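
A minimal sketch of that log(psoda) specification (it is not estimated above, so no output is shown; model_logpsoda is just an illustrative name):

# Put psoda in logs so the prpblck effect reads as an approximate percentage change
model_logpsoda <- lm(log(psoda) ~ prpblck + log(income), data = discrim)
# Approximate % change in psoda when prpblck rises by 0.20
100 * 0.20 * coef(model_logpsoda)["prpblck"]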

Part (v): Add prppov to the regression and observe the change in the coefficient on prpblck

# Add prppov to the model
model_with_prppov <- lm(psoda ~ prpblck + log(income) + prppov, data = discrim)
summary(model_with_prppov)
## 
## Call:
## lm(formula = psoda ~ prpblck + log(income) + prppov, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28083 -0.05006  0.00305  0.04247  0.44286 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.51208    0.30777  -1.664  0.09693 .  
## prpblck      0.07501    0.03214   2.334  0.02011 *  
## log(income)  0.14180    0.02804   5.058  6.5e-07 ***
## prppov       0.39629    0.13915   2.848  0.00463 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08526 on 397 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.08497,    Adjusted R-squared:  0.07806 
## F-statistic: 12.29 on 3 and 397 DF,  p-value: 1.053e-07

Part (vi): Find the correlation between log(income) and prppov

# Correlation between log(income) and prppov
correlation <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")
print(correlation)
## [1] -0.838467

Part (vii): Check multicollinearity between log(income) and prppov using VIF

# Check for multicollinearity using VIF
vif(model_with_prppov)
##     prpblck log(income)      prppov 
##    1.922160    3.518721    4.915220

Chapter 4 - 3

# Simulate a dataset of 32 firms patterned on the textbook equation
# (no random seed is set, so the output below reflects one particular draw)
n <- 32
sales <- runif(n, 10, 100)
log_sales <- log(sales)
profmarg <- runif(n, 0, 20)
rdintens <- 0.472 + 0.321 * log_sales + 0.050 * profmarg + rnorm(n, 0, 1)

# Create a data frame
data <- data.frame(rdintens = rdintens, sales = sales, log_sales = log_sales, profmarg = profmarg)

# Run the regression model
model <- lm(rdintens ~ log_sales + profmarg, data = data)

# Output model summary to check coefficients and significance
summary(model)
## 
## Call:
## lm(formula = rdintens ~ log_sales + profmarg, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.08745 -0.42009  0.04894  0.65782  1.88213 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  1.69781    1.06735   1.591    0.123
## log_sales    0.03833    0.27801   0.138    0.891
## profmarg     0.01837    0.03164   0.581    0.566
## 
## Residual standard error: 1.092 on 29 degrees of freedom
## Multiple R-squared:  0.01261,    Adjusted R-squared:  -0.05549 
## F-statistic: 0.1851 on 2 and 29 DF,  p-value: 0.832
# (i) Estimated change in rdintens (in percentage points) when sales increases by 10%,
# using the textbook slope of 0.321 on log(sales)
estimated_effect <- 0.321 * log(1.10)
estimated_effect
## [1] 0.03059457

Part (i): Estimation of the Regression Model

Using the textbook slope of 0.321 on log(sales), a 10% increase in sales raises rdintens by about 0.321 × log(1.10) ≈ 0.03 percentage points, as computed in the code above.

Part (ii): Hypothesis Test for log(sales)

We fail to reject the null hypothesis for log(sales) at both the 5% and 10% significance levels, meaning log(sales) does not have a statistically significant effect on rdintens.
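
As a quick check against critical values (a sketch using the 29 residual degrees of freedom reported in the summary above):

# Compare the log_sales t statistic with two-sided critical values at 5% and 10%
t_log_sales <- summary(model)$coefficients["log_sales", "t value"]
c(t_stat = t_log_sales, crit_5pct = qt(0.975, df = 29), crit_10pct = qt(0.95, df = 29))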

Part (iii): Interpretation of profmarg Coefficient

The coefficient on profmarg measures the estimated change in rdintens, in percentage points, for a one-percentage-point increase in the profit margin, holding sales fixed; it appears as the profmarg estimate in the output from summary(model).

Part (iv): Statistical Significance of profmarg

We fail to reject the null hypothesis for profmarg at both the 5% and 10% significance levels, meaning profmarg does not have a statistically significant effect on rdintens.

Chapter 4 - C8

# Load required libraries
library(wooldridge)
library(dplyr)
library(sandwich)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Load the dataset (it's "k401ksubs" in the wooldridge package)
data("k401ksubs")

# First filter for single-person households
singles <- subset(k401ksubs, fsize == 1)

# (i) Count number of single-person households
n_singles <- nrow(singles)
cat("Number of single-person households:", n_singles, "\n\n")
## Number of single-person households: 2017
# (ii) Run multiple regression
model_full <- lm(nettfa ~ inc + age, data = singles)
summary(model_full)
## 
## Call:
## lm(formula = nettfa ~ inc + age, data = singles)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -179.95  -14.16   -3.42    6.03 1113.94 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -43.03981    4.08039 -10.548   <2e-16 ***
## inc           0.79932    0.05973  13.382   <2e-16 ***
## age           0.84266    0.09202   9.158   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared:  0.1193, Adjusted R-squared:  0.1185 
## F-statistic: 136.5 on 2 and 2014 DF,  p-value: < 2.2e-16
# Calculate robust standard errors
library(sandwich)
library(lmtest)
robust_se <- coeftest(model_full, vcov = vcovHC(model_full, type = "HC1"))
print(robust_se)
## 
## t test of coefficients:
## 
##              Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) -43.03981    5.52433 -7.7910 1.057e-14 ***
## inc           0.79932    0.10078  7.9313 3.562e-15 ***
## age           0.84266    0.11937  7.0589 2.300e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iv) Test H0: β2 = 1 against H1: β2 < 1
# Extract coefficient and standard error for age
age_coef <- coef(model_full)["age"]
age_se <- sqrt(vcovHC(model_full, type = "HC1")["age", "age"])

# Calculate t-statistic for H0: β2 = 1
t_stat <- (age_coef - 1) / age_se
# Calculate p-value for one-sided test
p_value <- pt(t_stat, df = nrow(singles) - 3)
cat("\nTest H0: β2 = 1 against H1: β2 < 1")
## 
## Test H0: β2 = 1 against H1: β2 < 1
cat("\nt-statistic:", t_stat)
## 
## t-statistic: -1.318064
cat("\np-value:", p_value, "\n\n")
## 
## p-value: 0.09381599
# (v) Simple regression of nettfa on inc
model_simple <- lm(nettfa ~ inc, data = singles)
summary(model_simple)
## 
## Call:
## lm(formula = nettfa ~ inc, data = singles)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -185.12  -12.85   -4.85    1.78 1112.66 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -10.5709     2.0607   -5.13 3.18e-07 ***
## inc           0.8207     0.0609   13.48  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.59 on 2015 degrees of freedom
## Multiple R-squared:  0.08267,    Adjusted R-squared:  0.08222 
## F-statistic: 181.6 on 1 and 2015 DF,  p-value: < 2.2e-16
# Compare coefficients
cat("Coefficient on inc in multiple regression:", coef(model_full)["inc"], "\n")
## Coefficient on inc in multiple regression: 0.7993167
cat("Coefficient on inc in simple regression:", coef(model_simple)["inc"], "\n")
## Coefficient on inc in simple regression: 0.8206815
  1. Number of single-person households: there are 2,017 households with fsize = 1 in the dataset.

  2. OLS regression results: the fitted equation is nettfa = -43.04 + 0.799 inc + 0.843 age. Holding age fixed, a $1,000 increase in annual income is associated with roughly $799 more in net financial wealth; holding income fixed, each additional year of age is associated with roughly $843 more. Both signs make economic sense, since higher income and more years to save both aid wealth accumulation.

  3. Intercept interpretation: the intercept (-43.04 thousand) is the predicted net financial wealth for a single person with zero income and age zero. No one in the sample is close to those values, so the intercept has no direct practical meaning; it simply anchors the regression plane.

  4. Testing H₀: β₂ = 1 against H₁: β₂ < 1: using the heteroskedasticity-robust standard error, t = (0.843 - 1)/0.119 ≈ -1.32 with a one-sided p-value of about 0.094, so we fail to reject H₀ at the 5% level but would reject it at the 10% level.

  5. Simple vs. multiple regression comparison: the coefficient on inc barely changes (0.821 in the simple regression versus 0.799 in the multiple regression), which indicates that inc and age are only weakly correlated in this sample, so omitting age causes little bias in the income coefficient (a quick correlation check follows below).
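
A minimal check of the weak inc-age correlation mentioned in item 5, using the singles subset defined above:

# Sample correlation between income and age among single-person households;
# a small value explains why the inc coefficient changes little across the two models
cor(singles$inc, singles$age, use = "complete.obs")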

Chapter 5 - 5

set.seed(123) 
n <- 1000  
mean_score <- 77  
sd_score <- 15 
scores <- rnorm(n, mean = mean_score, sd = sd_score)
scores <- pmax(pmin(scores, 100), 20)
hist(scores, breaks = 30, probability = TRUE, 
     main = "Distribution of Course Scores",
     xlab = "Course Score (in percentage form)",
     ylab = "Proportion in cell",
     xlim = c(20, 100),
     ylim = c(0, 0.1))

curve(dnorm(x, mean = mean(scores), sd = sd(scores)), 
      add = TRUE, col = "blue")

prob_exceed_100 <- 1 - pnorm(100, mean = mean(scores), sd = sd(scores))
cat("Probability of score exceeding 100:", prob_exceed_100, "\n")
## Probability of score exceeding 100: 0.0480284
prob_below_40 <- pnorm(40, mean = mean(scores), sd = sd(scores))
cat("Probability of score below 40:", prob_below_40, "\n")
## Probability of score below 40: 0.00420035

No, the probability would not be zero. Using the normal distribution, there would be a small positive probability of scores exceeding 100 (as shown by our calculation). This contradicts the assumption of a normal distribution for score because:

  1. Course scores are bounded at 100% - no student can get more than 100%

  2. The normal distribution is unbounded and continues infinitely in both directions

  3. This is a fundamental mismatch between the theoretical normal distribution and the actual constraints of percentage scores

Looking at the left tail of the histogram:

  1. The actual data shows very few scores below 40%

  2. The histogram bars in the left tail (20-40 range) are shorter than what the normal curve predicts

  3. The normal distribution doesn’t fit well in the left tail because:

-   It overestimates the probability of very low scores

-   The actual distribution appears to be truncated or bounded at the lower end

-   This suggests there might be a minimum passing score or grade threshold

-   The real data shows a slight positive skew (longer right tail) compared to the normal distribution

This mismatch in both tails (scores <40 and >100) shows that while the normal distribution is a reasonable approximation for the middle range of scores, it doesn’t capture the bounded nature of percentage scores well at the extremes.
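
A quick numerical comparison of the left tail in the simulated data against the fitted normal (a sketch using the scores object above):

# Empirical share of simulated scores below 40 versus the normal approximation
c(empirical = mean(scores < 40),
  normal_fit = pnorm(40, mean(scores), sd(scores)))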

Chapter 5 - C1

library(wooldridge) 
library(ggplot2)
data("wage1")

# (i) Estimate the model with wage as the dependent variable
model1 <- lm(wage ~ educ + exper + tenure, data = wage1)
summary(model1)
## 
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6068 -1.7747 -0.6279  1.1969 14.6536 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.87273    0.72896  -3.941 9.22e-05 ***
## educ         0.59897    0.05128  11.679  < 2e-16 ***
## exper        0.02234    0.01206   1.853   0.0645 .  
## tenure       0.16927    0.02164   7.820 2.93e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared:  0.3064, Adjusted R-squared:  0.3024 
## F-statistic: 76.87 on 3 and 522 DF,  p-value: < 2.2e-16
residuals1 <- residuals(model1)
ggplot(data.frame(residuals1), aes(x = residuals1)) +
  geom_histogram(bins = 20, fill = "blue", color = "black") +
  labs(title = "Histogram of Residuals for Wage Model", x = "Residuals", y = "Frequency")

# (ii) Estimate the model with log(wage) as the dependent variable
model2 <- lm(log(wage) ~ educ + exper + tenure, data = wage1)
summary(model2)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.05802 -0.29645 -0.03265  0.28788  1.42809 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.284360   0.104190   2.729  0.00656 ** 
## educ        0.092029   0.007330  12.555  < 2e-16 ***
## exper       0.004121   0.001723   2.391  0.01714 *  
## tenure      0.022067   0.003094   7.133 3.29e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared:  0.316,  Adjusted R-squared:  0.3121 
## F-statistic: 80.39 on 3 and 522 DF,  p-value: < 2.2e-16
residuals2 <- residuals(model2)
ggplot(data.frame(residuals2), aes(x = residuals2)) +
  geom_histogram(bins = 20, fill = "green", color = "black") +
  labs(title = "Histogram of Residuals for Log(Wage) Model", x = "Residuals", y = "Frequency")