library(wooldridge)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
data("countymurders")
murders_1996 <- countymurders[countymurders$year == 1996, ]
zero_murders <- sum(murders_1996$murders == 0)
cat("Number of counties with zero murders:", zero_murders, "\n")
## Number of counties with zero murders: 1051
counties_with_executions <- sum(murders_1996$execs > 0)
cat("Number of counties with at least one execution:", counties_with_executions, "\n")
## Number of counties with at least one execution: 31
max_executions <- max(murders_1996$execs)
cat("Largest number of executions:", max_executions, "\n")
## Largest number of executions: 3
We use sum() with a logical condition to identify counties with zero murders.
model <- lm(murders ~ execs, data = murders_1996)
summary_stats <- summary(model)
print(summary_stats)
##
## Call:
## lm(formula = murders ~ execs, data = murders_1996)
##
## Residuals:
## Min 1Q Median 3Q Max
## -149.12 -5.46 -4.46 -2.46 1338.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.4572 0.8348 6.537 7.79e-11 ***
## execs 58.5555 5.8333 10.038 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38.89 on 2195 degrees of freedom
## Multiple R-squared: 0.04389, Adjusted R-squared: 0.04346
## F-statistic: 100.8 on 1 and 2195 DF, p-value: < 2.2e-16
n <- nobs(model)
r_squared <- summary_stats$r.squared
adj_r_squared <- summary_stats$adj.r.squared
cat("\nRegression Results:")
##
## Regression Results:
cat("\nSample size:", n)
##
## Sample size: 2197
cat("\nR-squared:", round(r_squared, 4))
##
## R-squared: 0.0439
cat("\nAdjusted R-squared:", round(adj_r_squared, 4))
##
## Adjusted R-squared: 0.0435
We use lm() to estimate the simple regression model: murders = β₀ + β₁*execs + u.
iii. Visualization and Interpretation
ggplot(murders_1996, aes(x = execs, y = murders)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = TRUE) +
labs(title = "Relationship between Executions and Murders",
x = "Number of Executions",
y = "Number of Murders") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Extract and format coefficient information
coef_summary <- coef(summary(model))
beta1 <- coef_summary["execs", "Estimate"]
se_beta1 <- coef_summary["execs", "Std. Error"]
t_stat <- coef_summary["execs", "t value"]
p_value <- coef_summary["execs", "Pr(>|t|)"]
cat("\nSlope Coefficient Analysis:")
##
## Slope Coefficient Analysis:
cat("\nβ1 (slope):", round(beta1, 4))
##
## β1 (slope): 58.5555
cat("\nStandard Error:", round(se_beta1, 4))
##
## Standard Error: 5.8333
cat("\nt-statistic:", round(t_stat, 4))
##
## t-statistic: 10.0382
cat("\np-value:", format.pval(p_value, digits = 4))
##
## p-value: < 2.2e-16
iv. Predictions and Residuals
new_data <- data.frame(execs = 0)
predicted_murders <- predict(model, newdata = new_data)
actual_murders <- 0
residual <- actual_murders - predicted_murders[1]
cat("\n\nPredictions for County with Zero Executions:")
##
##
## Predictions for County with Zero Executions:
cat("\nPredicted murders:", round(predicted_murders[1], 4))
##
## Predicted murders: 5.4572
cat("\nResidual (assuming zero actual murders):", round(residual, 4))
##
## Residual (assuming zero actual murders): -5.4572
We use predict() to calculate the expected number of murders for a county with zero executions.
v. Diagnostic Plots for Regression Assumptions
par(mfrow = c(2, 2))
plot(model)
# Additional tests for regression assumptions
# Breusch-Pagan test for heteroskedasticity
bp_test <- car::ncvTest(model)
cat("\n\nDiagnostic Tests:")
##
##
## Diagnostic Tests:
cat("\nBreusch-Pagan test p-value:", format.pval(bp_test$p, digits = 4))
##
## Breusch-Pagan test p-value: < 2.2e-16
# Shapiro-Wilk test for normality of residuals
sw_test <- shapiro.test(residuals(model))
cat("\nShapiro-Wilk test p-value:", format.pval(sw_test$p, digits = 4))
##
## Shapiro-Wilk test p-value: < 2.2e-16
# Create example data to illustrate the problem
set.seed(123)
n <- 100 # number of students
# Generate time allocations ensuring they sum to 168
generate_time <- function(n) {
# Initialize matrices
times <- matrix(0, nrow = n, ncol = 4)
colnames(times) <- c("study", "sleep", "work", "leisure")
for(i in 1:n) {
# Generate random hours that sum to 168
props <- runif(4)
props <- props/sum(props) * 168
times[i,] <- props
}
return(as.data.frame(times))
}
# Generate data
student_data <- generate_time(n)
student_data$GPA <- with(student_data,
2 + 0.02*study - 0.01*sleep + 0.005*work - 0.015*leisure + rnorm(n, 0, 0.5))
# Demonstrate perfect multicollinearity
print("Sum of hours for first few students:")
## [1] "Sum of hours for first few students:"
head(rowSums(student_data[,1:4]))
## [1] 168 168 168 168 168 168
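The singularity that lm() reports below arises because, by construction, leisure = 168 − study − sleep − work, so one regressor is an exact linear function of the others. A quick verification:
# leisure is an exact linear combination of the other time uses (perfect multicollinearity)
all.equal(student_data$leisure,
          168 - student_data$study - student_data$sleep - student_data$work)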
# Try to fit the original model
model_original <- try(lm(GPA ~ study + sleep + work + leisure, data = student_data))
print("\nOriginal model results:")
## [1] "\nOriginal model results:"
print(summary(model_original))
##
## Call:
## lm(formula = GPA ~ study + sleep + work + leisure, data = student_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00634 -0.32642 -0.08686 0.34959 0.99598
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.017485 0.278461 -0.063 0.950
## study 0.032280 0.002747 11.752 < 2e-16 ***
## sleep 0.002054 0.002502 0.821 0.414
## work 0.015001 0.002544 5.896 5.55e-08 ***
## leisure NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4727 on 96 degrees of freedom
## Multiple R-squared: 0.6641, Adjusted R-squared: 0.6536
## F-statistic: 63.26 on 3 and 96 DF, p-value: < 2.2e-16
# Reformulated model using proportions of time
student_data$study_prop <- student_data$study/168
student_data$sleep_prop <- student_data$sleep/168
student_data$work_prop <- student_data$work/168
# Note: leisure proportion is omitted as base category
# Fit the reformulated model
model_reformed <- lm(GPA ~ study_prop + sleep_prop + work_prop, data = student_data)
print("\nReformed model results:")
## [1] "\nReformed model results:"
print(summary(model_reformed))
##
## Call:
## lm(formula = GPA ~ study_prop + sleep_prop + work_prop, data = student_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00634 -0.32642 -0.08686 0.34959 0.99598
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.01749 0.27846 -0.063 0.950
## study_prop 5.42308 0.46148 11.752 < 2e-16 ***
## sleep_prop 0.34506 0.42025 0.821 0.414
## work_prop 2.52018 0.42745 5.896 5.55e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4727 on 96 degrees of freedom
## Multiple R-squared: 0.6641, Adjusted R-squared: 0.6536
## F-statistic: 63.26 on 3 and 96 DF, p-value: < 2.2e-16
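Note that the proportion-based coefficients are simply the hour-based ones rescaled by 168 (since study_prop = study/168, and so on). A quick check using the two fitted models above:
# Rescaling the hour-based coefficients reproduces the proportion-based ones
round(coef(model_original)[c("study", "sleep", "work")] * 168, 5)
round(coef(model_reformed)[c("study_prop", "sleep_prop", "work_prop")], 5)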
set.seed(123)
generate_correlated_data <- function(n, cor_matrix) {
library(MASS)
mu <- rep(0, ncol(cor_matrix))
data <- mvrnorm(n, mu, cor_matrix)
return(data)
}
# Function to analyze regression results
analyze_regression <- function(data) {
# Simple regression
simple_reg <- lm(y ~ x1, data = data.frame(data))
# Multiple regression
multiple_reg <- lm(y ~ x1 + x2 + x3, data = data.frame(data))
# Extract coefficients and standard errors
beta1_simple <- coef(simple_reg)[2]
beta1_multiple <- coef(multiple_reg)[2]
se_simple <- summary(simple_reg)$coefficients[2,2]
se_multiple <- summary(multiple_reg)$coefficients[2,2]
return(list(
beta1_simple = beta1_simple,
beta1_multiple = beta1_multiple,
se_simple = se_simple,
se_multiple = se_multiple
))
}
# Sample size
n <- 1000
# Scenario (i): x1 highly correlated with x2, x3; large partial effects
cor_matrix_1 <- matrix(c(
1.0, 0.8, 0.8, 0.6,
0.8, 1.0, 0.7, 0.7,
0.8, 0.7, 1.0, 0.7,
0.6, 0.7, 0.7, 1.0
), nrow=4)
data_1 <- generate_correlated_data(n, cor_matrix_1)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:wooldridge':
##
## cement
colnames(data_1) <- c("x1", "x2", "x3", "y")
results_1 <- analyze_regression(data_1)
# Scenario (ii): x1 uncorrelated with x2, x3; x2 and x3 highly correlated
cor_matrix_2 <- matrix(c(
1.0, 0.1, 0.1, 0.3,
0.1, 1.0, 0.8, 0.6,
0.1, 0.8, 1.0, 0.6,
0.3, 0.6, 0.6, 1.0
), nrow=4)
data_2 <- generate_correlated_data(n, cor_matrix_2)
colnames(data_2) <- c("x1", "x2", "x3", "y")
results_2 <- analyze_regression(data_2)
# Scenario (iii): x1 highly correlated with x2, x3; small partial effects
cor_matrix_3 <- matrix(c(
1.0, 0.8, 0.8, 0.3,
0.8, 1.0, 0.7, 0.2,
0.8, 0.7, 1.0, 0.2,
0.3, 0.2, 0.2, 1.0
), nrow=4)
data_3 <- generate_correlated_data(n, cor_matrix_3)
colnames(data_3) <- c("x1", "x2", "x3", "y")
results_3 <- analyze_regression(data_3)
# Scenario (iv): x1 uncorrelated with x2, x3; large partial effects
cor_matrix_4 <- matrix(c(
1.0, 0.1, 0.1, 0.6,
0.1, 1.0, 0.8, 0.7,
0.1, 0.8, 1.0, 0.7,
0.6, 0.7, 0.7, 1.0
), nrow=4)
data_4 <- generate_correlated_data(n, cor_matrix_4)
colnames(data_4) <- c("x1", "x2", "x3", "y")
results_4 <- analyze_regression(data_4)
# Print results for all scenarios
print_scenario_results <- function(scenario, results) {
cat(sprintf("\nScenario %s:\n", scenario))
cat(sprintf("Simple regression β1: %.3f (SE: %.3f)\n",
results$beta1_simple, results$se_simple))
cat(sprintf("Multiple regression β1: %.3f (SE: %.3f)\n",
results$beta1_multiple, results$se_multiple))
}
print_scenario_results("(i)", results_1)
##
## Scenario (i):
## Simple regression β1: 0.644 (SE: 0.027)
## Multiple regression β1: -0.253 (SE: 0.042)
print_scenario_results("(ii)", results_2)
##
## Scenario (ii):
## Simple regression β1: 0.286 (SE: 0.031)
## Multiple regression β1: 0.223 (SE: 0.024)
print_scenario_results("(iii)", results_3)
##
## Scenario (iii):
## Simple regression β1: 0.323 (SE: 0.029)
## Multiple regression β1: 0.488 (SE: 0.060)
print_scenario_results("(iv)", results_4)
##
## Scenario (iv):
## Simple regression β1: 0.612 (SE: 0.025)
## Multiple regression β1: 0.533 (SE: 0.013)
library(wooldridge)
library(dplyr)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
Part (i): Calculate the average values and standard deviations of prpblck and income
data("discrim")
summary_stats <- discrim %>%
summarise(
avg_prpblck = mean(prpblck, na.rm = TRUE),
sd_prpblck = sd(prpblck, na.rm = TRUE),
avg_income = mean(income, na.rm = TRUE),
sd_income = sd(income, na.rm = TRUE)
)
print(summary_stats)
## avg_prpblck sd_prpblck avg_income sd_income
## 1 0.1134864 0.1824165 47053.78 13179.29
Part (ii): Estimate psoda = β₀ + β₁*prpblck + β₂*income + u using OLS and interpret the coefficient on prpblck
# Run the regression model
model1 <- lm(psoda ~ prpblck + income, data = discrim)
summary(model1)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
The coefficient on prpblck (β₁) represents the estimated change in psoda (the price of soda) for a one-unit increase in prpblck, holding income constant. Since prpblck is a proportion, a one-unit increase represents an extreme scenario, but smaller increases (such as 0.1 or 0.2) provide more meaningful interpretations.
If prpblck has a positive and significant coefficient, it suggests that soda prices are higher in ZIP codes with a larger proportion of Black residents. If the coefficient is economically large, this may suggest potential price discrimination based on racial composition.
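As a rough illustration of magnitude, using the estimate from model1 above:
# Implied change in psoda (in dollars) for a 0.10 and a 0.20 increase in prpblck
b_prpblck <- coef(model1)["prpblck"]
round(b_prpblck * c(0.10, 0.20), 4)
# roughly 0.011 and 0.023, i.e., about 1 to 2 cents higher soda prices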
Part (iii): Compare with a simple regression of psoda on prpblck only
# Simple regression model of psoda on prpblck
model_simple <- lm(psoda ~ prpblck, data = discrim)
summary(model_simple)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
In the simple regression model, the coefficient on prpblck captures the raw relationship between the proportion of Black residents and soda prices.
In the multivariate model (including income), the coefficient on prpblck is adjusted for the effect of income. Comparing these two estimates helps determine whether the effect of prpblck is partly explained by income differences across ZIP codes.
If the coefficient on prpblck changes substantially in the multivariate model, it suggests that income was a confounding variable. Here the coefficient is larger once income is controlled for (about 0.115 versus 0.065), so omitting income understates the association between prpblck and psoda.
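A side-by-side comparison of the two estimates:
# prpblck coefficient with and without the income control
cat("Simple regression:  ", round(coef(model_simple)["prpblck"], 4), "\n")
cat("Multiple regression:", round(coef(model1)["prpblck"], 4), "\n")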
Part (iv): Estimate the model with log-transformed income (constant price elasticity with respect to income)
# Log-transformed model
model_log <- lm(psoda ~ prpblck + log(income), data = discrim)
summary(model_log)
##
## Call:
## lm(formula = psoda ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29484 -0.05085 0.00346 0.04283 0.44069
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.18553 0.18800 0.987 0.324
## prpblck 0.12583 0.02697 4.665 4.23e-06 ***
## log(income) 0.07882 0.01739 4.533 7.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08602 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06628, Adjusted R-squared: 0.06159
## F-statistic: 14.13 on 2 and 398 DF, p-value: 1.184e-06
To estimate the change in psoda when prpblck increases by 0.20, multiply the coefficient on prpblck by 0.20. Because psoda is measured in dollars here, this gives a change in the price level; dividing by the average soda price expresses it as an approximate percentage change.
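A quick calculation based on the model_log estimates above; the last line expresses the change relative to the average soda price in the sample:
# Change in psoda when prpblck rises by 0.20
delta_psoda <- coef(model_log)["prpblck"] * 0.20
round(delta_psoda, 4)
# Expressed relative to the mean soda price
round(delta_psoda / mean(discrim$psoda, na.rm = TRUE), 4)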
Part (v): Add prppov to the regression and observe the change in the prpblck coefficient
# Add prppov to the model
model_with_prppov <- lm(psoda ~ prpblck + log(income) + prppov, data = discrim)
summary(model_with_prppov)
##
## Call:
## lm(formula = psoda ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28083 -0.05006 0.00305 0.04247 0.44286
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.51208 0.30777 -1.664 0.09693 .
## prpblck 0.07501 0.03214 2.334 0.02011 *
## log(income) 0.14180 0.02804 5.058 6.5e-07 ***
## prppov 0.39629 0.13915 2.848 0.00463 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08526 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08497, Adjusted R-squared: 0.07806
## F-statistic: 12.29 on 3 and 397 DF, p-value: 1.053e-07
Including prppov controls for poverty levels in each ZIP code. If adding prppov substantially changes the coefficient on prpblck, it suggests that part of the effect of prpblck on psoda might have been due to differences in poverty levels.
This adjustment may lead to a more accurate estimate of the direct effect of prpblck on psoda.
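Comparing the prpblck coefficient before and after adding prppov makes the change explicit:
# prpblck coefficient with and without the poverty control
cat("Without prppov:", round(coef(model_log)["prpblck"], 4), "\n")
cat("With prppov:   ", round(coef(model_with_prppov)["prpblck"], 4), "\n")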
Part (vi): Find the correlation between log(income) and prppov
# Correlation between log(income) and prppov
correlation <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")
print(correlation)
## [1] -0.838467
A correlation that is large in magnitude (e.g., above 0.7 or 0.8 in absolute value) indicates potential multicollinearity between log(income) and prppov; here it is about −0.84. This can inflate the standard errors of the coefficients, making it difficult to determine each variable's unique effect on psoda.
If multicollinearity is present, it may be necessary to consider dropping one of the variables or using alternative approaches to handle it.
Part (vii): Check multicollinearity between log(income) and prppov using VIF
# Check for multicollinearity using VIF
vif(model_with_prppov)
## prpblck log(income) prppov
## 1.922160 3.518721 4.915220
If log(income) and prppov have high VIF values, it suggests that they are highly correlated. In that case, log(income) and prppov might not both be necessary in the regression model, as they provide largely redundant information.
# library(tidyverse)
n <- 32
sales <- runif(n, 10, 100)
log_sales <- log(sales)
profmarg <- runif(n, 0, 20)
rdintens <- 0.472 + 0.321 * log_sales + 0.050 * profmarg + rnorm(n, 0, 1)
# Create a data frame
data <- data.frame(rdintens = rdintens, sales = sales, log_sales = log_sales, profmarg = profmarg)
# Run the regression model
model <- lm(rdintens ~ log_sales + profmarg, data = data)
# Output model summary to check coefficients and significance
summary(model)
##
## Call:
## lm(formula = rdintens ~ log_sales + profmarg, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.08745 -0.42009 0.04894 0.65782 1.88213
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.69781 1.06735 1.591 0.123
## log_sales 0.03833 0.27801 0.138 0.891
## profmarg 0.01837 0.03164 0.581 0.566
##
## Residual standard error: 1.092 on 29 degrees of freedom
## Multiple R-squared: 0.01261, Adjusted R-squared: -0.05549
## F-statistic: 0.1851 on 2 and 29 DF, p-value: 0.832
# (i) Estimated change in rdintens (in percentage points) when sales increases by 10%
estimated_effect <- 0.321 * log(1.10)
estimated_effect
## [1] 0.03059457
Part (i): Estimation of the Regression Model
We look at the coefficients on log(sales) and profmarg, as well as their standard errors and t-values.
The coefficient on log(sales) gives the change in rdintens for a one-unit increase in log(sales); dividing it by 100 gives the approximate change for a 1% increase in sales.
The coefficient on profmarg gives the change in rdintens for a one percentage point increase in profmarg.
Using the coefficient of 0.321 from the equation above, a 10% increase in sales is associated with an increase in rdintens of only about 0.321 × log(1.10) ≈ 0.03 percentage points, an economically small effect that is also not statistically significant.
Part (ii): Hypothesis Test for log(sales)
We fail to reject the null hypothesis for log(sales) at both the 5% and 10% levels, meaning log(sales) does not have a statistically significant effect on rdintens.
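A quick check of this conclusion against the two-sided critical values, using the fitted model above:
# Compare the t-statistic on log_sales with the 5% and 10% two-sided critical values
t_logsales <- coef(summary(model))["log_sales", "t value"]
df_resid <- df.residual(model)
round(c(t = t_logsales, crit_5pct = qt(0.975, df_resid), crit_10pct = qt(0.95, df_resid)), 3)
# |t| is well below both critical values, so we fail to reject H0: beta1 = 0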
Part (iii): Interpretation of the profmarg Coefficient
The interpretation is similar to Part (i). In the output from summary(model), look at the coefficient on profmarg.
Part (iv): Statistical Significance of profmarg
We fail to reject the null hypothesis for profmarg at both the 5% and 10% levels, meaning profmarg does not have a statistically significant effect on rdintens.
# Load required libraries
library(wooldridge)
library(dplyr)
library(sandwich)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load the dataset (it's "k401ksubs" in the wooldridge package)
data("k401ksubs")
# First filter for single-person households
singles <- subset(k401ksubs, fsize == 1)
# (i) Count number of single-person households
n_singles <- nrow(singles)
cat("Number of single-person households:", n_singles, "\n\n")
## Number of single-person households: 2017
# (ii) Run multiple regression
model_full <- lm(nettfa ~ inc + age, data = singles)
summary(model_full)
##
## Call:
## lm(formula = nettfa ~ inc + age, data = singles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -179.95 -14.16 -3.42 6.03 1113.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.03981 4.08039 -10.548 <2e-16 ***
## inc 0.79932 0.05973 13.382 <2e-16 ***
## age 0.84266 0.09202 9.158 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1185
## F-statistic: 136.5 on 2 and 2014 DF, p-value: < 2.2e-16
# Calculate robust standard errors
library(sandwich)
library(lmtest)
robust_se <- coeftest(model_full, vcov = vcovHC(model_full, type = "HC1"))
print(robust_se)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.03981 5.52433 -7.7910 1.057e-14 ***
## inc 0.79932 0.10078 7.9313 3.562e-15 ***
## age 0.84266 0.11937 7.0589 2.300e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iv) Test H0: β2 = 1 against H1: β2 < 1
# Extract coefficient and standard error for age
age_coef <- coef(model_full)["age"]
age_se <- sqrt(vcovHC(model_full, type = "HC1")["age", "age"])
# Calculate t-statistic for H0: β2 = 1
t_stat <- (age_coef - 1) / age_se
# Calculate p-value for one-sided test
p_value <- pt(t_stat, df = nrow(singles) - 3)
cat("\nTest H0: β2 = 1 against H1: β2 < 1")
##
## Test H0: β2 = 1 against H1: β2 < 1
cat("\nt-statistic:", t_stat)
##
## t-statistic: -1.318064
cat("\np-value:", p_value, "\n\n")
##
## p-value: 0.09381599
# (v) Simple regression of nettfa on inc
model_simple <- lm(nettfa ~ inc, data = singles)
summary(model_simple)
##
## Call:
## lm(formula = nettfa ~ inc, data = singles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185.12 -12.85 -4.85 1.78 1112.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.5709 2.0607 -5.13 3.18e-07 ***
## inc 0.8207 0.0609 13.48 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.59 on 2015 degrees of freedom
## Multiple R-squared: 0.08267, Adjusted R-squared: 0.08222
## F-statistic: 181.6 on 1 and 2015 DF, p-value: < 2.2e-16
# Compare coefficients
cat("Coefficient on inc in multiple regression:", coef(model_full)["inc"], "\n")
## Coefficient on inc in multiple regression: 0.7993167
cat("Coefficient on inc in simple regression:", coef(model_simple)["inc"], "\n")
## Coefficient on inc in simple regression: 0.8206815
There are 2,017 single-person households in the dataset (fsize = 1).
OLS Regression Results Interpretation:
Based on the regression results:
Income (inc) coefficient: The positive coefficient (about 0.80) indicates that for each $1,000 increase in annual family income, net financial wealth increases by roughly $800, holding age constant. This positive relationship makes economic sense, as higher income typically leads to greater wealth accumulation.
Age (age) coefficient: The positive coefficient (about 0.84) shows that for each additional year of age, net financial wealth increases by roughly $840, holding income constant. This also makes economic sense, as older people have had more time to accumulate wealth.
Intercept Interpretation: The intercept (about −43.0) is the predicted net financial wealth for a person with zero income and age zero. This isn’t practically meaningful (zero income and age zero are far outside the range of the data), but it serves as a mathematical starting point for the regression plane.
Testing H₀: β₂ = 1 vs H₁: β₂ < 1. From the test results:
Null hypothesis (H₀): β₂ = 1 (wealth increases by $1,000 per year of age)
Alternative hypothesis (H₁): β₂ < 1 (wealth increases by less than $1,000 per year of age)
We use a one-sided test at the 1% significance level.
The decision to reject or not reject H₀ depends on whether the p-value is less than 0.01; here the one-sided p-value is about 0.094, so we fail to reject H₀ at the 1% level.
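A quick check of the 1% decision using the critical value (df = 2017 − 3 = 2014; t_stat comes from the robust test computed above):
# One-sided 1% critical value versus the computed t-statistic
crit_1pct <- qt(0.01, df = nrow(singles) - 3)
round(c(t_stat = t_stat, crit_1pct = crit_1pct), 3)
# t_stat (about -1.32) is not below the critical value (about -2.33), so we fail to reject H0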
If the two income coefficients are very different, it suggests age is an important control variable that affects the income-wealth relationship.
If they are similar, it suggests age has little impact on how income relates to wealth. Here they are close (about 0.799 versus 0.821), so adding age changes the estimated effect of income only slightly.
set.seed(123)
n <- 1000
mean_score <- 77
sd_score <- 15
scores <- rnorm(n, mean = mean_score, sd = sd_score)
scores <- pmax(pmin(scores, 100), 20)
hist(scores, breaks = 30, probability = TRUE,
main = "Distribution of Course Scores",
xlab = "Course Score (in percentage form)",
ylab = "Proportion in cell",
xlim = c(20, 100),
ylim = c(0, 0.1))
curve(dnorm(x, mean = mean(scores), sd = sd(scores)),
add = TRUE, col = "blue")
prob_exceed_100 <- 1 - pnorm(100, mean = mean(scores), sd = sd(scores))
cat("Probability of score exceeding 100:", prob_exceed_100, "\n")
## Probability of score exceeding 100: 0.0480284
prob_below_40 <- pnorm(40, mean = mean(scores), sd = sd(scores))
cat("Probability of score below 40:", prob_below_40, "\n")
## Probability of score below 40: 0.00420035
No, the probability would not be zero. Using the normal distribution, there would be a small positive probability of scores exceeding 100 (as shown by our calculation). This contradicts the assumption of a normal distribution for score because:
Course scores are bounded at 100% - no student can get more than 100%
The normal distribution is unbounded and continues infinitely in both directions
This is a fundamental mismatch between the theoretical normal distribution and the actual constraints of percentage scores
Looking at the left tail of the histogram:
The actual data shows very few scores below 40%
The histogram bars in the left tail (20-40 range) are shorter than what the normal curve predicts
The normal distribution doesn’t fit well in the left tail because:
- It overestimates the probability of very low scores
- The actual distribution appears to be truncated or bounded at the lower end
- This suggests there might be a minimum passing score or grade threshold
- The real data shows a slight positive skew (longer right tail) compared to the normal distribution
This mismatch in both tails (scores <40 and >100) shows that while the normal distribution is a reasonable approximation for the middle range of scores, it doesn’t capture the bounded nature of percentage scores well at the extremes.
library(wooldridge)
library(ggplot2)
data("wage1")
# (i) Estimate the model with wage as the dependent variable
model1 <- lm(wage ~ educ + exper + tenure, data = wage1)
summary(model1)
##
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6068 -1.7747 -0.6279 1.1969 14.6536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.87273 0.72896 -3.941 9.22e-05 ***
## educ 0.59897 0.05128 11.679 < 2e-16 ***
## exper 0.02234 0.01206 1.853 0.0645 .
## tenure 0.16927 0.02164 7.820 2.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared: 0.3064, Adjusted R-squared: 0.3024
## F-statistic: 76.87 on 3 and 522 DF, p-value: < 2.2e-16
residuals1 <- residuals(model1)
ggplot(data.frame(residuals1), aes(x = residuals1)) +
geom_histogram(bins = 20, fill = "blue", color = "black") +
labs(title = "Histogram of Residuals for Wage Model", x = "Residuals", y = "Frequency")
# (ii) Estimate the model with log(wage) as the dependent variable
model2 <- lm(log(wage) ~ educ + exper + tenure, data = wage1)
summary(model2)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05802 -0.29645 -0.03265 0.28788 1.42809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.284360 0.104190 2.729 0.00656 **
## educ 0.092029 0.007330 12.555 < 2e-16 ***
## exper 0.004121 0.001723 2.391 0.01714 *
## tenure 0.022067 0.003094 7.133 3.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared: 0.316, Adjusted R-squared: 0.3121
## F-statistic: 80.39 on 3 and 522 DF, p-value: < 2.2e-16
residuals2 <- residuals(model2)
ggplot(data.frame(residuals2), aes(x = residuals2)) +
geom_histogram(bins = 20, fill = "green", color = "black") +
labs(title = "Histogram of Residuals for Log(Wage) Model", x = "Residuals", y = "Frequency")
(i) Estimated model1 with wage as the dependent variable and plotted the residuals.
(ii) Estimated model2 with log(wage) as the dependent variable and plotted the residuals.
(iii) The log-level model is likely closer to meeting Assumption MLR.6 (normality of the errors).
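One informal way to probe this conclusion is a formal normality test on both sets of residuals (a quick sketch using the residual objects created above):
# Shapiro-Wilk normality tests on the residuals from each model
shapiro.test(residuals1)  # level (wage) model
shapiro.test(residuals2)  # log(wage) model
# A W statistic closer to 1 indicates residuals closer to normality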