# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
# Set a random seed for reproducibility
set.seed(123)
# Simulate a dataset
n <- 200 # Number of observations
data <- data.frame(
psoda = rnorm(n, mean = 3, sd = 1), # Dependent variable (soda price)
prpblck = runif(n, 0, 1), # Proportion of black population
income = rnorm(n, mean = 50000, sd = 15000), # Household income
prppov = runif(n, 0, 1), # Proportion of population in poverty
hseval = rnorm(n, mean = 250000, sd = 50000) # House value
)
# Transform variables as necessary
data$log_psoda <- log(data$psoda)
data$log_income <- log(data$income)
data$log_hseval <- log(data$hseval)
# (i) OLS regression estimation
model1 <- lm(log_psoda ~ prpblck + log_income + prppov, data = data)
summary_model1 <- summary(model1)
print(summary_model1)
##
## Call:
## lm(formula = log_psoda ~ prpblck + log_income + prppov, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.48625 -0.16349 0.01558 0.22644 0.80421
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.765563 0.715994 2.466 0.0145 *
## prpblck 0.003534 0.083651 0.042 0.9663
## log_income -0.070898 0.066537 -1.066 0.2879
## prppov 0.078383 0.090051 0.870 0.3851
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3439 on 196 degrees of freedom
## Multiple R-squared: 0.008958, Adjusted R-squared: -0.006211
## F-statistic: 0.5905 on 3 and 196 DF, p-value: 0.6219
# Extract p-value for prpblck
p_value_prpblck <- coef(summary_model1)["prpblck", "Pr(>|t|)"]
cat("P-value for prpblck:", p_value_prpblck, "\n")
## P-value for prpblck: 0.9663458
is_significant_5 <- p_value_prpblck < 0.05
is_significant_1 <- p_value_prpblck < 0.01
cat("Is prpblck significant at 5% level?", is_significant_5, "\n")
## Is prpblck significant at 5% level? FALSE
cat("Is prpblck significant at 1% level?", is_significant_1, "\n")
## Is prpblck significant at 1% level? FALSE
# (ii) Correlation between log_income and prppov
correlation <- cor(data$log_income, data$prppov)
cat("Correlation between log(income) and prppov:", correlation, "\n")
## Correlation between log(income) and prppov: 0.07223969
# Check significance of each variable
p_value_income <- coef(summary_model1)["log_income", "Pr(>|t|)"]
p_value_prppov <- coef(summary_model1)["prppov", "Pr(>|t|)"]
cat("P-value for log(income):", p_value_income, "\n")
## P-value for log(income): 0.2879427
cat("P-value for prppov:", p_value_prppov, "\n")
## P-value for prppov: 0.3851307
# (iii) Adding log_hseval to the regression
model2 <- lm(log_psoda ~ prpblck + log_income + prppov + log_hseval, data = data)
summary_model2 <- summary(model2)
print(summary_model2)
##
## Call:
## lm(formula = log_psoda ~ prpblck + log_income + prppov + log_hseval,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.49260 -0.18292 0.00895 0.22815 0.76679
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.01789 1.63068 1.851 0.0657 .
## prpblck 0.00839 0.08390 0.100 0.9205
## log_income -0.07662 0.06692 -1.145 0.2536
## prppov 0.07271 0.09036 0.805 0.4220
## log_hseval -0.09598 0.11227 -0.855 0.3936
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3441 on 195 degrees of freedom
## Multiple R-squared: 0.01266, Adjusted R-squared: -0.007594
## F-statistic: 0.625 on 4 and 195 DF, p-value: 0.6452
# Interpret the coefficient for log_hseval
coef_log_hseval <- coef(summary_model2)["log_hseval", "Estimate"]
p_value_log_hseval <- coef(summary_model2)["log_hseval", "Pr(>|t|)"]
cat("Coefficient for log(hseval):", coef_log_hseval, "\n")
## Coefficient for log(hseval): -0.09598047
cat("P-value for log(hseval):", p_value_log_hseval, "\n")
## P-value for log(hseval): 0.3936415
# (iv) Analyze significance of log_income and prppov in model2
p_value_income_model2 <- coef(summary_model2)["log_income", "Pr(>|t|)"]
p_value_prppov_model2 <- coef(summary_model2)["prppov", "Pr(>|t|)"]
cat("P-value for log(income) in model2:", p_value_income_model2, "\n")
## P-value for log(income) in model2: 0.253617
cat("P-value for prppov in model2:", p_value_prppov_model2, "\n")
## P-value for prppov in model2: 0.4219846
# Joint significance of log_income and prppov
anova_result <- anova(model1, model2)
print(anova_result)
## Analysis of Variance Table
##
## Model 1: log_psoda ~ prpblck + log_income + prppov
## Model 2: log_psoda ~ prpblck + log_income + prppov + log_hseval
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 196 23.180
## 2 195 23.093 1 0.086557 0.7309 0.3936
# (v) Report the most reliable model
if (anova_result$`Pr(>F)`[2] < 0.05) {
cat("The model with log(hseval) is jointly significant.\n")
} else {
cat("The model without log(hseval) is preferred.\n")
}
## The model without log(hseval) is preferred.