# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
# Set a random seed for reproducibility
set.seed(123)

# Simulate a dataset
n <- 200  # Number of observations
data <- data.frame(
  psoda = rnorm(n, mean = 3, sd = 1),     # Dependent variable (soda price)
  prpblck = runif(n, 0, 1),                # Proportion of black population
  income = rnorm(n, mean = 50000, sd = 15000),  # Household income
  prppov = runif(n, 0, 1),                 # Proportion of population in poverty
  hseval = rnorm(n, mean = 250000, sd = 50000)  # House value
)

# Transform variables as necessary
data$log_psoda <- log(data$psoda)
data$log_income <- log(data$income)
data$log_hseval <- log(data$hseval)

# (i) OLS regression estimation
model1 <- lm(log_psoda ~ prpblck + log_income + prppov, data = data)
summary_model1 <- summary(model1)
print(summary_model1)
## 
## Call:
## lm(formula = log_psoda ~ prpblck + log_income + prppov, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.48625 -0.16349  0.01558  0.22644  0.80421 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  1.765563   0.715994   2.466   0.0145 *
## prpblck      0.003534   0.083651   0.042   0.9663  
## log_income  -0.070898   0.066537  -1.066   0.2879  
## prppov       0.078383   0.090051   0.870   0.3851  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3439 on 196 degrees of freedom
## Multiple R-squared:  0.008958,   Adjusted R-squared:  -0.006211 
## F-statistic: 0.5905 on 3 and 196 DF,  p-value: 0.6219
# Extract p-value for prpblck
p_value_prpblck <- coef(summary_model1)["prpblck", "Pr(>|t|)"]
cat("P-value for prpblck:", p_value_prpblck, "\n")
## P-value for prpblck: 0.9663458
is_significant_5 <- p_value_prpblck < 0.05
is_significant_1 <- p_value_prpblck < 0.01
cat("Is prpblck significant at 5% level?", is_significant_5, "\n")
## Is prpblck significant at 5% level? FALSE
cat("Is prpblck significant at 1% level?", is_significant_1, "\n")
## Is prpblck significant at 1% level? FALSE
# (ii) Correlation between log_income and prppov
correlation <- cor(data$log_income, data$prppov)
cat("Correlation between log(income) and prppov:", correlation, "\n")
## Correlation between log(income) and prppov: 0.07223969
# Check significance of each variable
p_value_income <- coef(summary_model1)["log_income", "Pr(>|t|)"]
p_value_prppov <- coef(summary_model1)["prppov", "Pr(>|t|)"]
cat("P-value for log(income):", p_value_income, "\n")
## P-value for log(income): 0.2879427
cat("P-value for prppov:", p_value_prppov, "\n")
## P-value for prppov: 0.3851307
# (iii) Adding log_hseval to the regression
model2 <- lm(log_psoda ~ prpblck + log_income + prppov + log_hseval, data = data)
summary_model2 <- summary(model2)
print(summary_model2)
## 
## Call:
## lm(formula = log_psoda ~ prpblck + log_income + prppov + log_hseval, 
##     data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.49260 -0.18292  0.00895  0.22815  0.76679 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  3.01789    1.63068   1.851   0.0657 .
## prpblck      0.00839    0.08390   0.100   0.9205  
## log_income  -0.07662    0.06692  -1.145   0.2536  
## prppov       0.07271    0.09036   0.805   0.4220  
## log_hseval  -0.09598    0.11227  -0.855   0.3936  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3441 on 195 degrees of freedom
## Multiple R-squared:  0.01266,    Adjusted R-squared:  -0.007594 
## F-statistic: 0.625 on 4 and 195 DF,  p-value: 0.6452
# Interpret the coefficient for log_hseval
coef_log_hseval <- coef(summary_model2)["log_hseval", "Estimate"]
p_value_log_hseval <- coef(summary_model2)["log_hseval", "Pr(>|t|)"]
cat("Coefficient for log(hseval):", coef_log_hseval, "\n")
## Coefficient for log(hseval): -0.09598047
cat("P-value for log(hseval):", p_value_log_hseval, "\n")
## P-value for log(hseval): 0.3936415
# (iv) Analyze significance of log_income and prppov in model2
p_value_income_model2 <- coef(summary_model2)["log_income", "Pr(>|t|)"]
p_value_prppov_model2 <- coef(summary_model2)["prppov", "Pr(>|t|)"]
cat("P-value for log(income) in model2:", p_value_income_model2, "\n")
## P-value for log(income) in model2: 0.253617
cat("P-value for prppov in model2:", p_value_prppov_model2, "\n")
## P-value for prppov in model2: 0.4219846
# Joint significance of log_income and prppov
anova_result <- anova(model1, model2)
print(anova_result)
## Analysis of Variance Table
## 
## Model 1: log_psoda ~ prpblck + log_income + prppov
## Model 2: log_psoda ~ prpblck + log_income + prppov + log_hseval
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    196 23.180                           
## 2    195 23.093  1  0.086557 0.7309 0.3936
# (v) Report the most reliable model
if (anova_result$`Pr(>F)`[2] < 0.05) {
  cat("The model with log(hseval) is jointly significant.\n")
} else {
  cat("The model without log(hseval) is preferred.\n")
}
## The model without log(hseval) is preferred.