# Load necessary libraries
library(wooldridge)  # For the DISCRIM dataset
library(tidyverse)   # For data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)       # For tidying up model results
library(car)         # For additional regression diagnostics
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
# Pull the DISCRIM data frame straight out of the wooldridge namespace
data <- wooldridge::discrim

# Compact preview: one line per column with its type and first values
dplyr::glimpse(data)
## Rows: 410
## Columns: 37
## $ psoda    <dbl> 1.12, 1.06, 1.06, 1.12, 1.12, 1.06, 1.17, 1.17, 1.18, 1.17, 1…
## $ pfries   <dbl> 1.06, 0.91, 0.91, 1.02, NA, 0.95, 0.95, 1.02, 1.02, 1.12, 1.0…
## $ pentree  <dbl> 1.02, 0.95, 0.98, 1.06, 0.49, 1.01, 0.95, 1.06, 1.06, 1.02, 2…
## $ wagest   <dbl> 4.25, 4.75, 4.25, 5.00, 5.00, 4.25, 4.65, 4.50, NA, 4.25, 4.7…
## $ nmgrs    <dbl> 3, 3, 3, 4, 3, 4, 3, 3, 4, 3, 3, 4, 3, 4, 2, 4, 4, 3, 5, 3, 3…
## $ nregs    <int> 5, 3, 5, 5, 3, 4, 2, 5, 4, 5, 5, 2, 5, NA, 2, 4, 5, 2, 4, 5, …
## $ hrsopen  <dbl> 16.0, 16.5, 18.0, 16.0, 16.0, 15.0, 16.0, 17.0, 17.0, 18.0, 1…
## $ emp      <dbl> 27.5, 21.5, 30.0, 27.5, 5.0, 17.5, 22.5, 18.5, 17.0, 27.0, 11…
## $ psoda2   <dbl> 1.11, 1.05, 1.05, 1.15, 1.04, 1.05, 1.05, 1.11, 1.10, 1.11, 1…
## $ pfries2  <dbl> 1.11, 0.89, 0.94, 1.05, 1.01, 0.94, 0.94, 1.06, 1.01, 1.12, 1…
## $ pentree2 <dbl> 1.05, 0.95, 0.98, 1.05, 0.58, 1.00, 0.94, 1.05, 0.99, 1.05, 2…
## $ wagest2  <dbl> 5.05, 5.05, 5.05, 5.05, 5.05, 5.05, 5.05, 5.05, 5.05, 5.05, 5…
## $ nmgrs2   <dbl> 5, 4, 4, 4, 3, 3, 3, 3, 4, 6, 5, 4, 2, 0, 2, 5, 4, 3, 4, 4, 3…
## $ nregs2   <int> 5, 3, 5, 5, 3, 4, 2, 5, 4, 5, 5, 2, 5, NA, 2, 4, 5, 2, 4, 5, …
## $ hrsopen2 <dbl> 15.0, 17.5, 17.5, 16.0, 16.0, 15.0, 16.0, 16.0, 18.0, 17.0, 1…
## $ emp2     <dbl> 27.0, 24.5, 25.0, NA, 12.0, 28.0, 18.5, 17.0, 34.0, 22.0, 32.…
## $ compown  <int> 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ chain    <int> 3, 1, 1, 3, 1, 1, 1, 3, 1, 3, 2, 4, 2, 1, 1, 1, 1, 4, 1, 3, 1…
## $ density  <dbl> 4030, 4030, 11400, 8345, 720, 4424, 2678, 6405, 18388, 18388,…
## $ crmrte   <dbl> 0.0528866, 0.0528866, 0.0360003, 0.0484232, 0.0615890, 0.0334…
## $ state    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ prpblck  <dbl> 0.1711542, 0.1711542, 0.0473602, 0.0528394, 0.0344800, 0.0591…
## $ prppov   <dbl> 0.0365789, 0.0365789, 0.0879072, 0.0591227, 0.0254145, 0.0835…
## $ prpncar  <dbl> 0.0788428, 0.0788428, 0.2694298, 0.1366903, 0.0738020, 0.1151…
## $ hseval   <dbl> 148300, 148300, 169200, 171600, 249100, 148000, 212700, 19880…
## $ nstores  <int> 3, 3, 3, 3, 1, 2, 1, 1, 5, 5, 5, 5, 5, 1, 2, 2, 2, 1, 1, 3, 3…
## $ income   <dbl> 44534, 44534, 41164, 50366, 72287, 44515, 62056, 53655, 31314…
## $ county   <int> 18, 18, 12, 10, 10, 18, 10, 24, 10, 10, 10, 10, 10, 2, 2, 2, …
## $ lpsoda   <dbl> 0.11332869, 0.05826885, 0.05826885, 0.11332869, 0.11332869, 0…
## $ lpfries  <dbl> 0.058268853, -0.094310649, -0.094310649, 0.019802609, NA, -0.…
## $ lhseval  <dbl> 11.90699, 11.90699, 12.03884, 12.05292, 12.42561, 11.90497, 1…
## $ lincome  <dbl> 10.70401, 10.70401, 10.62532, 10.82707, 11.18840, 10.70358, 1…
## $ ldensity <dbl> 8.301521, 8.301521, 9.341369, 9.029418, 6.579251, 8.394799, 7…
## $ NJ       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ BK       <int> 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1…
## $ KFC      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ RR       <int> 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
# Part (i) - OLS fit of the baseline price equation
# log(psoda) = β0 + β1*prpblck + β2*log(income) + β3*prppov + u
# The formula is kept inline because summary() echoes the call as written.
model1 <- lm(
  formula = log(psoda) ~ prpblck + log(income) + prppov,
  data = data
)
print(summary(model1))
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32218 -0.04648  0.00651  0.04272  0.35622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.46333    0.29371  -4.982  9.4e-07 ***
## prpblck      0.07281    0.03068   2.373   0.0181 *  
## log(income)  0.13696    0.02676   5.119  4.8e-07 ***
## prppov       0.38036    0.13279   2.864   0.0044 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08137 on 397 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.08696,    Adjusted R-squared:  0.08006 
## F-statistic:  12.6 on 3 and 397 DF,  p-value: 6.917e-08
# Is β1 statistically significant at the 5% and 1% levels?
# Yes at 5%, no at 1%: the p-value of prpblck in the summary above is 0.0181,
# which lies between 0.01 and 0.05.

# Part (ii) - Correlation between log(income) and prppov, and its statistical
# significance
# cor() returns NA as soon as either input contains a missing value, so the
# calculation is restricted to complete pairs. cor.test() below drops
# incomplete pairs on its own, so both calls describe the same observations.
correlation <- cor(log(data$income), data$prppov, use = "complete.obs")
cat("Correlation between log(income) and prppov: ", correlation, "\n")
## Correlation between log(income) and prppov:  -0.838467
# Test H0: true correlation = 0 (Pearson product-moment test)
cor_test <- cor.test(log(data$income), data$prppov)
print(cor_test)
## 
##  Pearson's product-moment correlation
## 
## data:  log(data$income) and data$prppov
## t = -31.04, df = 407, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8650980 -0.8071224
## sample estimates:
##       cor 
## -0.838467
# Part (iii) - Re-estimate the price equation with log(hseval) as an
# additional regressor
# The formula is kept inline because summary() echoes the call as written.
model2 <- lm(
  formula = log(psoda) ~ prpblck + log(income) + prppov + log(hseval),
  data = data
)
print(summary(model2))
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov + log(hseval), 
##     data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30652 -0.04380  0.00701  0.04332  0.35272 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.84151    0.29243  -2.878 0.004224 ** 
## prpblck      0.09755    0.02926   3.334 0.000937 ***
## log(income) -0.05299    0.03753  -1.412 0.158706    
## prppov       0.05212    0.13450   0.388 0.698571    
## log(hseval)  0.12131    0.01768   6.860 2.67e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07702 on 396 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.1839, Adjusted R-squared:  0.1757 
## F-statistic: 22.31 on 4 and 396 DF,  p-value: < 2.2e-16
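# Interpretation: the coefficient on log(hseval) is an elasticity. The estimate
# of 0.12131 means a 1% higher hseval is associated with roughly a 0.12% higher
# psoda, holding the other regressors fixed. Its p-value (2.67e-11) makes it
# statistically significant at any conventional level.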

# Part (iv) - How does the statistical significance of log(income) and prppov
# change once log(hseval) is in the model?
# Coefficient tables for both fits, one row per term
tidy_model1 <- broom::tidy(model1)
tidy_model2 <- broom::tidy(model2)

cat("P-values from model 1: \n")
## P-values from model 1:
print(dplyr::select(tidy_model1, term, p.value))
## # A tibble: 4 × 2
##   term            p.value
##   <chr>             <dbl>
## 1 (Intercept) 0.000000940
## 2 prpblck     0.0181     
## 3 log(income) 0.000000480
## 4 prppov      0.00440
cat("P-values from model 2: \n")
## P-values from model 2:
print(dplyr::select(tidy_model2, term, p.value))
## # A tibble: 5 × 2
##   term         p.value
##   <chr>          <dbl>
## 1 (Intercept) 4.22e- 3
## 2 prpblck     9.37e- 4
## 3 log(income) 1.59e- 1
## 4 prppov      6.99e- 1
## 5 log(hseval) 2.67e-11
# Nested-model F test for adding log(hseval): model1 is model2 without
# log(hseval), so this comparison tests H0: coefficient on log(hseval) = 0.
# It is NOT a joint test of log(income) and prppov, which are in both models.
anova_test <- anova(model1, model2)
print(anova_test)
## Analysis of Variance Table
## 
## Model 1: log(psoda) ~ prpblck + log(income) + prppov
## Model 2: log(psoda) ~ prpblck + log(income) + prppov + log(hseval)
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    397 2.6284                                  
## 2    396 2.3493  1   0.27915 47.054 2.668e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Joint significance of log(income) and prppov in model2
# H0: both coefficients are zero. The restriction is imposed on the already
# fitted model2, so the F test uses model2's own estimation sample.
joint_test <- car::linearHypothesis(
  model2,
  c("log(income) = 0", "prppov = 0")
)
print(joint_test)
# Part (v) - Decide which model is more reliable
# Goodness of fit (R-squared) is reported first, then the information
# criteria (AIC, BIC), each for model 1 followed by model 2.

# Print a label and a value followed by a newline, using cat()'s default
# single-space separator between the pieces.
report_stat <- function(label, value) {
  cat(label, value, "\n")
}

report_stat("R-squared of model 1: ", summary(model1)[["r.squared"]])
## R-squared of model 1:  0.08696154
report_stat("R-squared of model 2: ", summary(model2)[["r.squared"]])
## R-squared of model 2:  0.1839299
# Information criteria as a further basis for comparing the two models
report_stat("AIC of model 1: ", AIC(model1))
## AIC of model 1:  -868.0719
report_stat("AIC of model 2: ", AIC(model2))
## AIC of model 2:  -911.0953
report_stat("BIC of model 1: ", BIC(model1))
## BIC of model 1:  -848.1021
report_stat("BIC of model 2: ", BIC(model2))
## BIC of model 2:  -887.1315

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Per-column descriptive statistics for the built-in cars data set
print(summary(cars))
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.