library(wooldridge)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
data("countymurders")
murders_1996 <- countymurders[countymurders$year == 1996, ]
zero_murders <- sum(murders_1996$murders == 0)
cat("Number of counties with zero murders:", zero_murders, "\n")
## Number of counties with zero murders: 1051
counties_with_executions <- sum(murders_1996$execs > 0)
cat("Number of counties with at least one execution:", counties_with_executions, "\n")
## Number of counties with at least one execution: 31
max_executions <- max(murders_1996$execs)
cat("Largest number of executions:", max_executions, "\n")
## Largest number of executions: 3
We use sum() with a logical condition to identify counties with zero murders.
model <- lm(murders ~ execs, data = murders_1996)
summary_stats <- summary(model)
print(summary_stats)
##
## Call:
## lm(formula = murders ~ execs, data = murders_1996)
##
## Residuals:
## Min 1Q Median 3Q Max
## -149.12 -5.46 -4.46 -2.46 1338.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.4572 0.8348 6.537 7.79e-11 ***
## execs 58.5555 5.8333 10.038 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38.89 on 2195 degrees of freedom
## Multiple R-squared: 0.04389, Adjusted R-squared: 0.04346
## F-statistic: 100.8 on 1 and 2195 DF, p-value: < 2.2e-16
n <- nobs(model)
r_squared <- summary_stats$r.squared
adj_r_squared <- summary_stats$adj.r.squared
cat("\nRegression Results:")
##
## Regression Results:
cat("\nSample size:", n)
##
## Sample size: 2197
cat("\nR-squared:", round(r_squared, 4))
##
## R-squared: 0.0439
cat("\nAdjusted R-squared:", round(adj_r_squared, 4))
##
## Adjusted R-squared: 0.0435
We use lm() to estimate the simple regression model: murders = β₀ + β₁*execs + u.
iii. Visualization and Interpretation
ggplot(murders_1996, aes(x = execs, y = murders)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = TRUE) +
labs(title = "Relationship between Executions and Murders",
x = "Number of Executions",
y = "Number of Murders") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Extract and format coefficient information
coef_summary <- coef(summary(model))
beta1 <- coef_summary["execs", "Estimate"]
se_beta1 <- coef_summary["execs", "Std. Error"]
t_stat <- coef_summary["execs", "t value"]
p_value <- coef_summary["execs", "Pr(>|t|)"]
cat("\nSlope Coefficient Analysis:")
##
## Slope Coefficient Analysis:
cat("\nβ1 (slope):", round(beta1, 4))
##
## β1 (slope): 58.5555
cat("\nStandard Error:", round(se_beta1, 4))
##
## Standard Error: 5.8333
cat("\nt-statistic:", round(t_stat, 4))
##
## t-statistic: 10.0382
cat("\np-value:", format.pval(p_value, digits = 4))
##
## p-value: < 2.2e-16
iv. Predictions and Residuals
new_data <- data.frame(execs = 0)
predicted_murders <- predict(model, newdata = new_data)
actual_murders <- 0
residual <- actual_murders - predicted_murders[1]
cat("\n\nPredictions for County with Zero Executions:")
##
##
## Predictions for County with Zero Executions:
cat("\nPredicted murders:", round(predicted_murders[1], 4))
##
## Predicted murders: 5.4572
cat("\nResidual (assuming zero actual murders):", round(residual, 4))
##
## Residual (assuming zero actual murders): -5.4572
We use predict() to calculate the expected number of murders for a county with zero executions.
v. Diagnostic Plots for Regression Assumptions
par(mfrow = c(2, 2))
plot(model)
# Additional tests for regression assumptions
# Breusch-Pagan test for heteroskedasticity
bp_test <- car::ncvTest(model)
cat("\n\nDiagnostic Tests:")
##
##
## Diagnostic Tests:
cat("\nBreusch-Pagan test p-value:", format.pval(bp_test$p, digits = 4))
##
## Breusch-Pagan test p-value: < 2.2e-16
# Shapiro-Wilk test for normality of residuals
sw_test <- shapiro.test(residuals(model))
cat("\nShapiro-Wilk test p-value:", format.pval(sw_test$p, digits = 4))
##
## Shapiro-Wilk test p-value: < 2.2e-16
# Create example data to illustrate the problem
set.seed(123)
n <- 100 # number of students
# Generate time allocations ensuring they sum to 168
generate_time <- function(n) {
# Initialize matrices
times <- matrix(0, nrow = n, ncol = 4)
colnames(times) <- c("study", "sleep", "work", "leisure")
for(i in 1:n) {
# Generate random hours that sum to 168
props <- runif(4)
props <- props/sum(props) * 168
times[i,] <- props
}
return(as.data.frame(times))
}
# Generate data
student_data <- generate_time(n)
student_data$GPA <- with(student_data,
2 + 0.02*study - 0.01*sleep + 0.005*work - 0.015*leisure + rnorm(n, 0, 0.5))
# Demonstrate perfect multicollinearity
print("Sum of hours for first few students:")
## [1] "Sum of hours for first few students:"
head(rowSums(student_data[,1:4]))
## [1] 168 168 168 168 168 168
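The singularity that lm() reports below arises because, by construction, leisure = 168 − study − sleep − work, so one regressor is an exact linear function of the others. A quick verification:
# leisure is an exact linear combination of the other time uses (perfect multicollinearity)
all.equal(student_data$leisure,
          168 - student_data$study - student_data$sleep - student_data$work)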
# Try to fit the original model
model_original <- try(lm(GPA ~ study + sleep + work + leisure, data = student_data))
print("\nOriginal model results:")
## [1] "\nOriginal model results:"
print(summary(model_original))
##
## Call:
## lm(formula = GPA ~ study + sleep + work + leisure, data = student_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00634 -0.32642 -0.08686 0.34959 0.99598
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.017485 0.278461 -0.063 0.950
## study 0.032280 0.002747 11.752 < 2e-16 ***
## sleep 0.002054 0.002502 0.821 0.414
## work 0.015001 0.002544 5.896 5.55e-08 ***
## leisure NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4727 on 96 degrees of freedom
## Multiple R-squared: 0.6641, Adjusted R-squared: 0.6536
## F-statistic: 63.26 on 3 and 96 DF, p-value: < 2.2e-16
# Reformulated model using proportions of time
student_data$study_prop <- student_data$study/168
student_data$sleep_prop <- student_data$sleep/168
student_data$work_prop <- student_data$work/168
# Note: leisure proportion is omitted as base category
# Fit the reformulated model
model_reformed <- lm(GPA ~ study_prop + sleep_prop + work_prop, data = student_data)
print("\nReformed model results:")
## [1] "\nReformed model results:"
print(summary(model_reformed))
##
## Call:
## lm(formula = GPA ~ study_prop + sleep_prop + work_prop, data = student_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00634 -0.32642 -0.08686 0.34959 0.99598
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.01749 0.27846 -0.063 0.950
## study_prop 5.42308 0.46148 11.752 < 2e-16 ***
## sleep_prop 0.34506 0.42025 0.821 0.414
## work_prop 2.52018 0.42745 5.896 5.55e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4727 on 96 degrees of freedom
## Multiple R-squared: 0.6641, Adjusted R-squared: 0.6536
## F-statistic: 63.26 on 3 and 96 DF, p-value: < 2.2e-16
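Note that the proportion-based coefficients are simply the hour-based ones rescaled by 168 (since study_prop = study/168, and so on). A quick check using the two fitted models above:
# Rescaling the hour-based coefficients reproduces the proportion-based ones
round(coef(model_original)[c("study", "sleep", "work")] * 168, 5)
round(coef(model_reformed)[c("study_prop", "sleep_prop", "work_prop")], 5)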
set.seed(123)
generate_correlated_data <- function(n, cor_matrix) {
library(MASS)
mu <- rep(0, ncol(cor_matrix))
data <- mvrnorm(n, mu, cor_matrix)
return(data)
}
# Function to analyze regression results
analyze_regression <- function(data) {
# Simple regression
simple_reg <- lm(y ~ x1, data = data.frame(data))
# Multiple regression
multiple_reg <- lm(y ~ x1 + x2 + x3, data = data.frame(data))
# Extract coefficients and standard errors
beta1_simple <- coef(simple_reg)[2]
beta1_multiple <- coef(multiple_reg)[2]
se_simple <- summary(simple_reg)$coefficients[2,2]
se_multiple <- summary(multiple_reg)$coefficients[2,2]
return(list(
beta1_simple = beta1_simple,
beta1_multiple = beta1_multiple,
se_simple = se_simple,
se_multiple = se_multiple
))
}
# Sample size
n <- 1000
# Scenario (i): x1 highly correlated with x2, x3; large partial effects
cor_matrix_1 <- matrix(c(
1.0, 0.8, 0.8, 0.6,
0.8, 1.0, 0.7, 0.7,
0.8, 0.7, 1.0, 0.7,
0.6, 0.7, 0.7, 1.0
), nrow=4)
data_1 <- generate_correlated_data(n, cor_matrix_1)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:wooldridge':
##
## cement
colnames(data_1) <- c("x1", "x2", "x3", "y")
results_1 <- analyze_regression(data_1)
# Scenario (ii): x1 uncorrelated with x2, x3; x2 and x3 highly correlated
cor_matrix_2 <- matrix(c(
1.0, 0.1, 0.1, 0.3,
0.1, 1.0, 0.8, 0.6,
0.1, 0.8, 1.0, 0.6,
0.3, 0.6, 0.6, 1.0
), nrow=4)
data_2 <- generate_correlated_data(n, cor_matrix_2)
colnames(data_2) <- c("x1", "x2", "x3", "y")
results_2 <- analyze_regression(data_2)
# Scenario (iii): x1 highly correlated with x2, x3; small partial effects
cor_matrix_3 <- matrix(c(
1.0, 0.8, 0.8, 0.3,
0.8, 1.0, 0.7, 0.2,
0.8, 0.7, 1.0, 0.2,
0.3, 0.2, 0.2, 1.0
), nrow=4)
data_3 <- generate_correlated_data(n, cor_matrix_3)
colnames(data_3) <- c("x1", "x2", "x3", "y")
results_3 <- analyze_regression(data_3)
# Scenario (iv): x1 uncorrelated with x2, x3; large partial effects
cor_matrix_4 <- matrix(c(
1.0, 0.1, 0.1, 0.6,
0.1, 1.0, 0.8, 0.7,
0.1, 0.8, 1.0, 0.7,
0.6, 0.7, 0.7, 1.0
), nrow=4)
data_4 <- generate_correlated_data(n, cor_matrix_4)
colnames(data_4) <- c("x1", "x2", "x3", "y")
results_4 <- analyze_regression(data_4)
# Print results for all scenarios
print_scenario_results <- function(scenario, results) {
cat(sprintf("\nScenario %s:\n", scenario))
cat(sprintf("Simple regression β1: %.3f (SE: %.3f)\n",
results$beta1_simple, results$se_simple))
cat(sprintf("Multiple regression β1: %.3f (SE: %.3f)\n",
results$beta1_multiple, results$se_multiple))
}
print_scenario_results("(i)", results_1)
##
## Scenario (i):
## Simple regression β1: 0.644 (SE: 0.027)
## Multiple regression β1: -0.253 (SE: 0.042)
print_scenario_results("(ii)", results_2)
##
## Scenario (ii):
## Simple regression β1: 0.286 (SE: 0.031)
## Multiple regression β1: 0.223 (SE: 0.024)
print_scenario_results("(iii)", results_3)
##
## Scenario (iii):
## Simple regression β1: 0.323 (SE: 0.029)
## Multiple regression β1: 0.488 (SE: 0.060)
print_scenario_results("(iv)", results_4)
##
## Scenario (iv):
## Simple regression β1: 0.612 (SE: 0.025)
## Multiple regression β1: 0.533 (SE: 0.013)
library(wooldridge)
library(dplyr)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
Part (i): Calculate the average values and standard deviations of prpblck and income
data("discrim")
summary_stats <- discrim %>%
summarise(
avg_prpblck = mean(prpblck, na.rm = TRUE),
sd_prpblck = sd(prpblck, na.rm = TRUE),
avg_income = mean(income, na.rm = TRUE),
sd_income = sd(income, na.rm = TRUE)
)
print(summary_stats)
## avg_prpblck sd_prpblck avg_income sd_income
## 1 0.1134864 0.1824165 47053.78 13179.29
Part (ii): Estimate psoda = β₀ + β₁*prpblck + β₂*income + u using OLS and interpret the coefficient on prpblck
# Run the regression model
model1 <- lm(psoda ~ prpblck + income, data = discrim)
summary(model1)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
The coefficient on prpblck (β₁) represents the estimated change in psoda (the price of soda) for a one-unit increase in prpblck, holding income constant. Since prpblck is a proportion, a one-unit increase represents an extreme scenario, but smaller increases (such as 0.1 or 0.2) provide more meaningful interpretations.
If prpblck has a positive and significant coefficient, it suggests that soda prices are higher in ZIP codes with a larger proportion of Black residents. If the coefficient is economically large, this may suggest potential price discrimination based on racial composition.
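As a rough illustration of magnitude, using the estimate from model1 above:
# Implied change in psoda (in dollars) for a 0.10 and a 0.20 increase in prpblck
b_prpblck <- coef(model1)["prpblck"]
round(b_prpblck * c(0.10, 0.20), 4)
# roughly 0.011 and 0.023, i.e., about 1 to 2 cents higher soda prices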
Part (iii): Compare with a simple regression of psoda on prpblck only
# Simple regression model of psoda on prpblck
model_simple <- lm(psoda ~ prpblck, data = discrim)
summary(model_simple)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
In the simple regression model, the coefficient on prpblck captures the raw relationship between the proportion of Black residents and soda prices.
In the multivariate model (including income), the coefficient on prpblck is adjusted for the effect of income. Comparing these two estimates helps determine whether the effect of prpblck is partly explained by income differences across ZIP codes.
If the coefficient on prpblck changes substantially in the multivariate model, it suggests that income was a confounding variable. Here the coefficient is larger once income is controlled for (about 0.115 versus 0.065), so omitting income understates the association between prpblck and psoda.
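A side-by-side comparison of the two estimates:
# prpblck coefficient with and without the income control
cat("Simple regression:  ", round(coef(model_simple)["prpblck"], 4), "\n")
cat("Multiple regression:", round(coef(model1)["prpblck"], 4), "\n")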
Part (iv): Estimate the model with log-transformed income (constant price elasticity with respect to income)
# Log-transformed model
model_log <- lm(psoda ~ prpblck + log(income), data = discrim)
summary(model_log)
##
## Call:
## lm(formula = psoda ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29484 -0.05085 0.00346 0.04283 0.44069
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.18553 0.18800 0.987 0.324
## prpblck 0.12583 0.02697 4.665 4.23e-06 ***
## log(income) 0.07882 0.01739 4.533 7.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08602 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06628, Adjusted R-squared: 0.06159
## F-statistic: 14.13 on 2 and 398 DF, p-value: 1.184e-06
To estimate the change in psoda when prpblck increases by 0.20, multiply the coefficient on prpblck by 0.20. Because psoda is measured in dollars here, this gives a change in the price level; dividing by the average soda price expresses it as an approximate percentage change.
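A quick calculation based on the model_log estimates above; the last line expresses the change relative to the average soda price in the sample:
# Change in psoda when prpblck rises by 0.20
delta_psoda <- coef(model_log)["prpblck"] * 0.20
round(delta_psoda, 4)
# Expressed relative to the mean soda price
round(delta_psoda / mean(discrim$psoda, na.rm = TRUE), 4)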
Part (v): Add prppov to the regression and observe the change in the prpblck coefficient
# Add prppov to the model
model_with_prppov <- lm(psoda ~ prpblck + log(income) + prppov, data = discrim)
summary(model_with_prppov)
##
## Call:
## lm(formula = psoda ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28083 -0.05006 0.00305 0.04247 0.44286
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.51208 0.30777 -1.664 0.09693 .
## prpblck 0.07501 0.03214 2.334 0.02011 *
## log(income) 0.14180 0.02804 5.058 6.5e-07 ***
## prppov 0.39629 0.13915 2.848 0.00463 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08526 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08497, Adjusted R-squared: 0.07806
## F-statistic: 12.29 on 3 and 397 DF, p-value: 1.053e-07
Including prppov controls for poverty levels in each ZIP code. If adding prppov substantially changes the coefficient on prpblck, it suggests that part of the effect of prpblck on psoda might have been due to differences in poverty levels.
This adjustment may lead to a more accurate estimate of the direct effect of prpblck on psoda.
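Comparing the prpblck coefficient before and after adding prppov makes the change explicit:
# prpblck coefficient with and without the poverty control
cat("Without prppov:", round(coef(model_log)["prpblck"], 4), "\n")
cat("With prppov:   ", round(coef(model_with_prppov)["prpblck"], 4), "\n")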
Part (vi): Find the correlation between log(income) and prppov
# Correlation between log(income) and prppov
correlation <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")
print(correlation)
## [1] -0.838467
A correlation that is large in magnitude (e.g., above 0.7 or 0.8 in absolute value) indicates potential multicollinearity between log(income) and prppov; here it is about −0.84. This can inflate the standard errors of the coefficients, making it difficult to determine each variable's unique effect on psoda.
If multicollinearity is present, it may be necessary to consider dropping one of the variables or using alternative approaches to handle it.
Part (vii): Check multicollinearity between log(income) and prppov using VIF
# Check for multicollinearity using VIF
vif(model_with_prppov)
## prpblck log(income) prppov
## 1.922160 3.518721 4.915220
If log(income) and prppov have high VIF values, it suggests that they are highly correlated. In that case, log(income) and prppov might not both be necessary in the regression model, as they provide largely redundant information.
# library(tidyverse)
n <- 32
sales <- runif(n, 10, 100)
log_sales <- log(sales)
profmarg <- runif(n, 0, 20)
rdintens <- 0.472 + 0.321 * log_sales + 0.050 * profmarg + rnorm(n, 0, 1)
# Create a data frame
data <- data.frame(rdintens = rdintens, sales = sales, log_sales = log_sales, profmarg = profmarg)
# Run the regression model
model <- lm(rdintens ~ log_sales + profmarg, data = data)
# Output model summary to check coefficients and significance
summary(model)
##
## Call:
## lm(formula = rdintens ~ log_sales + profmarg, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.08745 -0.42009 0.04894 0.65782 1.88213
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.69781 1.06735 1.591 0.123
## log_sales 0.03833 0.27801 0.138 0.891
## profmarg 0.01837 0.03164 0.581 0.566
##
## Residual standard error: 1.092 on 29 degrees of freedom
## Multiple R-squared: 0.01261, Adjusted R-squared: -0.05549
## F-statistic: 0.1851 on 2 and 29 DF, p-value: 0.832
# (i) Estimated change in rdintens (in percentage points) when sales increases by 10%
estimated_effect <- 0.321 * log(1.10)
estimated_effect
## [1] 0.03059457
Part (i): Estimation of the Regression Model
We look at the coefficients on log(sales) and profmarg, as well as their standard errors and t-values.
The coefficient on log(sales) gives the change in rdintens for a one-unit increase in log(sales); dividing it by 100 gives the approximate change for a 1% increase in sales.
The coefficient on profmarg gives the change in rdintens for a one percentage point increase in profmarg.
Using the coefficient of 0.321 from the equation above, a 10% increase in sales is associated with an increase in rdintens of only about 0.321 × log(1.10) ≈ 0.03 percentage points, an economically small effect that is also not statistically significant.
Part (ii): Hypothesis Test for log(sales)
We fail to reject the null hypothesis for log(sales) at both the 5% and 10% levels, meaning log(sales) does not have a statistically significant effect on rdintens.
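A quick check of this conclusion against the two-sided critical values, using the fitted model above:
# Compare the t-statistic on log_sales with the 5% and 10% two-sided critical values
t_logsales <- coef(summary(model))["log_sales", "t value"]
df_resid <- df.residual(model)
round(c(t = t_logsales, crit_5pct = qt(0.975, df_resid), crit_10pct = qt(0.95, df_resid)), 3)
# |t| is well below both critical values, so we fail to reject H0: beta1 = 0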
Part (iii): Interpretation of the profmarg Coefficient
The interpretation is similar to Part (i). In the output from summary(model), look at the coefficient on profmarg.
Part (iv): Statistical Significance of profmarg
We fail to reject the null hypothesis for profmarg at both the 5% and 10% levels, meaning profmarg does not have a statistically significant effect on rdintens.
# Load required libraries
library(wooldridge)
library(dplyr)
library(sandwich)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load the dataset (it's "k401ksubs" in the wooldridge package)
data("k401ksubs")
# First filter for single-person households
singles <- subset(k401ksubs, fsize == 1)
# (i) Count number of single-person households
n_singles <- nrow(singles)
cat("Number of single-person households:", n_singles, "\n\n")
## Number of single-person households: 2017
# (ii) Run multiple regression
model_full <- lm(nettfa ~ inc + age, data = singles)
summary(model_full)
##
## Call:
## lm(formula = nettfa ~ inc + age, data = singles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -179.95 -14.16 -3.42 6.03 1113.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.03981 4.08039 -10.548 <2e-16 ***
## inc 0.79932 0.05973 13.382 <2e-16 ***
## age 0.84266 0.09202 9.158 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1185
## F-statistic: 136.5 on 2 and 2014 DF, p-value: < 2.2e-16
# Calculate robust standard errors
library(sandwich)
library(lmtest)
robust_se <- coeftest(model_full, vcov = vcovHC(model_full, type = "HC1"))
print(robust_se)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.03981 5.52433 -7.7910 1.057e-14 ***
## inc 0.79932 0.10078 7.9313 3.562e-15 ***
## age 0.84266 0.11937 7.0589 2.300e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iv) Test H0: β2 = 1 against H1: β2 < 1
# Extract coefficient and standard error for age
age_coef <- coef(model_full)["age"]
age_se <- sqrt(vcovHC(model_full, type = "HC1")["age", "age"])
# Calculate t-statistic for H0: β2 = 1
t_stat <- (age_coef - 1) / age_se
# Calculate p-value for one-sided test
p_value <- pt(t_stat, df = nrow(singles) - 3)
cat("\nTest H0: β2 = 1 against H1: β2 < 1")
##
## Test H0: β2 = 1 against H1: β2 < 1
cat("\nt-statistic:", t_stat)
##
## t-statistic: -1.318064
cat("\np-value:", p_value, "\n\n")
##
## p-value: 0.09381599
# (v) Simple regression of nettfa on inc
model_simple <- lm(nettfa ~ inc, data = singles)
summary(model_simple)
##
## Call:
## lm(formula = nettfa ~ inc, data = singles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185.12 -12.85 -4.85 1.78 1112.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.5709 2.0607 -5.13 3.18e-07 ***
## inc 0.8207 0.0609 13.48 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.59 on 2015 degrees of freedom
## Multiple R-squared: 0.08267, Adjusted R-squared: 0.08222
## F-statistic: 181.6 on 1 and 2015 DF, p-value: < 2.2e-16
# Compare coefficients
cat("Coefficient on inc in multiple regression:", coef(model_full)["inc"], "\n")
## Coefficient on inc in multiple regression: 0.7993167
cat("Coefficient on inc in simple regression:", coef(model_simple)["inc"], "\n")
## Coefficient on inc in simple regression: 0.8206815
There are 2,017 single-person households in the dataset (fsize = 1).
OLS Regression Results Interpretation:
Based on the regression results:
Income (inc) coefficient: The positive coefficient (about 0.80) indicates that for each $1,000 increase in annual family income, net financial wealth increases by roughly $800, holding age constant. This positive relationship makes economic sense, as higher income typically leads to greater wealth accumulation.
Age (age) coefficient: The positive coefficient (about 0.84) shows that for each additional year of age, net financial wealth increases by roughly $840, holding income constant. This also makes economic sense, as older people have had more time to accumulate wealth.
Intercept Interpretation: The intercept (about −43.0) is the predicted net financial wealth for a person with zero income and age zero. This isn’t practically meaningful (zero income and age zero are far outside the range of the data), but it serves as a mathematical starting point for the regression plane.
Testing H₀: β₂ = 1 vs H₁: β₂ < 1. From the test results:
Null hypothesis (H₀): β₂ = 1 (wealth increases by $1,000 per year of age)
Alternative hypothesis (H₁): β₂ < 1 (wealth increases by less than $1,000 per year of age)
We use a one-sided test at the 1% significance level.
The decision to reject or not reject H₀ depends on whether the p-value is less than 0.01; here the one-sided p-value is about 0.094, so we fail to reject H₀ at the 1% level.
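A quick check of the 1% decision using the critical value (df = 2017 − 3 = 2014; t_stat comes from the robust test computed above):
# One-sided 1% critical value versus the computed t-statistic
crit_1pct <- qt(0.01, df = nrow(singles) - 3)
round(c(t_stat = t_stat, crit_1pct = crit_1pct), 3)
# t_stat (about -1.32) is not below the critical value (about -2.33), so we fail to reject H0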
If the two income coefficients are very different, it suggests age is an important control variable that affects the income-wealth relationship.
If they are similar, it suggests age has little impact on how income relates to wealth. Here they are close (about 0.799 versus 0.821), so adding age changes the estimated effect of income only slightly.
set.seed(123)
n <- 1000
mean_score <- 77
sd_score <- 15
scores <- rnorm(n, mean = mean_score, sd = sd_score)
scores <- pmax(pmin(scores, 100), 20)
hist(scores, breaks = 30, probability = TRUE,
main = "Distribution of Course Scores",
xlab = "Course Score (in percentage form)",
ylab = "Proportion in cell",
xlim = c(20, 100),
ylim = c(0, 0.1))
curve(dnorm(x, mean = mean(scores), sd = sd(scores)),
add = TRUE, col = "blue")
prob_exceed_100 <- 1 - pnorm(100, mean = mean(scores), sd = sd(scores))
cat("Probability of score exceeding 100:", prob_exceed_100, "\n")
## Probability of score exceeding 100: 0.0480284
prob_below_40 <- pnorm(40, mean = mean(scores), sd = sd(scores))
cat("Probability of score below 40:", prob_below_40, "\n")
## Probability of score below 40: 0.00420035
No, the probability would not be zero. Using the normal distribution, there would be a small positive probability of scores exceeding 100 (as shown by our calculation). This contradicts the assumption of a normal distribution for score because:
Course scores are bounded at 100% - no student can get more than 100%
The normal distribution is unbounded and continues infinitely in both directions
This is a fundamental mismatch between the theoretical normal distribution and the actual constraints of percentage scores
Looking at the left tail of the histogram:
The actual data shows very few scores below 40%
The histogram bars in the left tail (20-40 range) are shorter than what the normal curve predicts
The normal distribution doesn’t fit well in the left tail because:
- It overestimates the probability of very low scores
- The actual distribution appears to be truncated or bounded at the lower end
- This suggests there might be a minimum passing score or grade threshold
- The real data shows a slight positive skew (longer right tail) compared to the normal distribution
This mismatch in both tails (scores <40 and >100) shows that while the normal distribution is a reasonable approximation for the middle range of scores, it doesn’t capture the bounded nature of percentage scores well at the extremes.
library(wooldridge)
library(ggplot2)
data("wage1")
# (i) Estimate the model with wage as the dependent variable
model1 <- lm(wage ~ educ + exper + tenure, data = wage1)
summary(model1)
##
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6068 -1.7747 -0.6279 1.1969 14.6536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.87273 0.72896 -3.941 9.22e-05 ***
## educ 0.59897 0.05128 11.679 < 2e-16 ***
## exper 0.02234 0.01206 1.853 0.0645 .
## tenure 0.16927 0.02164 7.820 2.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared: 0.3064, Adjusted R-squared: 0.3024
## F-statistic: 76.87 on 3 and 522 DF, p-value: < 2.2e-16
residuals1 <- residuals(model1)
ggplot(data.frame(residuals1), aes(x = residuals1)) +
geom_histogram(bins = 20, fill = "blue", color = "black") +
labs(title = "Histogram of Residuals for Wage Model", x = "Residuals", y = "Frequency")
# (ii) Estimate the model with log(wage) as the dependent variable
model2 <- lm(log(wage) ~ educ + exper + tenure, data = wage1)
summary(model2)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05802 -0.29645 -0.03265 0.28788 1.42809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.284360 0.104190 2.729 0.00656 **
## educ 0.092029 0.007330 12.555 < 2e-16 ***
## exper 0.004121 0.001723 2.391 0.01714 *
## tenure 0.022067 0.003094 7.133 3.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared: 0.316, Adjusted R-squared: 0.3121
## F-statistic: 80.39 on 3 and 522 DF, p-value: < 2.2e-16
residuals2 <- residuals(model2)
ggplot(data.frame(residuals2), aes(x = residuals2)) +
geom_histogram(bins = 20, fill = "green", color = "black") +
labs(title = "Histogram of Residuals for Log(Wage) Model", x = "Residuals", y = "Frequency")
(i) Estimated model1 with wage as the dependent variable and plotted the residuals.
(ii) Estimated model2 with log(wage) as the dependent variable and plotted the residuals.
(iii) The log-level model is likely closer to meeting Assumption MLR.6 (normality of the errors).
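One informal way to probe this conclusion is a formal normality test on both sets of residuals (a quick sketch using the residual objects created above):
# Shapiro-Wilk normality tests on the residuals from each model
shapiro.test(residuals1)  # level (wage) model
shapiro.test(residuals2)  # log(wage) model
# A W statistic closer to 1 indicates residuals closer to normality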