This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.
# Install the wooldridge package if it's not installed
if (!require(wooldridge)) {
install.packages("wooldridge")
}
## Loading required package: wooldridge
# Load the wooldridge package
library(wooldridge)
# Load the DISCRIM dataset
data("discrim")
# (i): Find the average values and standard deviations of prpblck and income
mean_prpblck <- mean(discrim$prpblck, na.rm = TRUE)
sd_prpblck <- sd(discrim$prpblck, na.rm = TRUE)
mean_income <- mean(discrim$income, na.rm = TRUE)
sd_income <- sd(discrim$income, na.rm = TRUE)
# Display the results for (i)
cat("Part (i):\n")
## Part (i):
cat("Mean of prpblck:", mean_prpblck, " (11.35% average proportion)\n")
## Mean of prpblck: 0.1134864 (11.35% average proportion)
cat("Standard deviation of prpblck:", sd_prpblck, "\n")
## Standard deviation of prpblck: 0.1824165
cat("Mean of income:", mean_income, " (average median household income in dollars)\n")
## Mean of income: 47053.78 (average median household income in dollars)
cat("Standard deviation of income:", sd_income, "\n")
## Standard deviation of income: 13179.29
# (ii): Estimate the model by OLS and report the results
model1 <- lm(psoda ~ prpblck + income, data = discrim)
# Display the regression results for (ii)
cat("\nPart (ii) - OLS Regression Results:\n")
##
## Part (ii) - OLS Regression Results:
summary_model1 <- summary(model1)
print(summary_model1)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
cat("\nInterpretation:\n")
##
## Interpretation:
cat("Intercept:", summary_model1$coefficients[1, 1], "is the predicted price of soda when prpblck and income are zero.\n")
## Intercept: 0.9563196 is the predicted price of soda when prpblck and income are zero.
cat("Coefficient for prpblck:", summary_model1$coefficients[2, 1],
"suggests that a one-unit increase in the proportion of black population increases the price by approximately 0.115 dollars.\n")
## Coefficient for prpblck: 0.1149882 suggests that a one-unit increase in the proportion of black population increases the price by approximately 0.115 dollars.
cat("Coefficient for income:", summary_model1$coefficients[3, 1],
"indicates a minor increase in the soda price for every dollar increase in income.\n")
## Coefficient for income: 1.602674e-06 indicates a minor increase in the soda price for every dollar increase in income.
cat("R-squared:", summary_model1$r.squared,
"shows that 6.42% of the variation in soda prices is explained by the model.\n")
## R-squared: 0.06422039 shows that 6.42% of the variation in soda prices is explained by the model.
# (iii): Compare with the simple regression estimate from psoda on prpblck
model2 <- lm(psoda ~ prpblck, data = discrim)
# Display the regression results for (iii)
cat("\nPart (iii) - Simple Regression Results:\n")
##
## Part (iii) - Simple Regression Results:
summary_model2 <- summary(model2)
print(summary_model2)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
cat("\nComparison:\n")
##
## Comparison:
cat("The coefficient for prpblck in the simple regression is", summary_model2$coefficients[2, 1],
", which is smaller than in the multiple regression (0.1150).\n")
## The coefficient for prpblck in the simple regression is 0.06492687 , which is smaller than in the multiple regression (0.1150).
cat("Controlling for income increases the estimated effect of prpblck on soda price.\n")
## Controlling for income increases the estimated effect of prpblck on soda price.
# (iv): Estimate the model with a log transformation
model3 <- lm(log(psoda) ~ prpblck + log(income), data = discrim)
# Display the regression results for (iv)
cat("\nPart (iv) - Log-Linear Regression Results:\n")
##
## Part (iv) - Log-Linear Regression Results:
summary_model3 <- summary(model3)
print(summary_model3)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33563 -0.04695 0.00658 0.04334 0.35413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.79377 0.17943 -4.424 1.25e-05 ***
## prpblck 0.12158 0.02575 4.722 3.24e-06 ***
## log(income) 0.07651 0.01660 4.610 5.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0821 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06809, Adjusted R-squared: 0.06341
## F-statistic: 14.54 on 2 and 398 DF, p-value: 8.039e-07
# Calculate the percentage change in psoda for a 0.20 increase in prpblck
percent_change <- coef(model3)["prpblck"] * 0.20 * 100
cat("\nEstimated percentage change in psoda for a 0.20 increase in prpblck:", percent_change, "%\n")
##
## Estimated percentage change in psoda for a 0.20 increase in prpblck: 2.431605 %
cat("\nInterpretation:\n")
##
## Interpretation:
cat("The estimated percentage change for a 0.20 increase in prpblck is", percent_change, "%.\n")
## The estimated percentage change for a 0.20 increase in prpblck is 2.431605 %.
cat("This indicates a 2.43% increase in soda price with a 20-percentage point increase in prpblck.\n")
## This indicates a 2.43% increase in soda price with a 20-percentage point increase in prpblck.
# (v): Add the variable prppov to the regression
model4 <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
# Display the regression results for (v)
cat("\nPart (v) - Regression with prppov Included:\n")
##
## Part (v) - Regression with prppov Included:
summary_model4 <- summary(model4)
print(summary_model4)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08137 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08696, Adjusted R-squared: 0.08006
## F-statistic: 12.6 on 3 and 397 DF, p-value: 6.917e-08
cat("\nChange in Coefficient:\n")
##
## Change in Coefficient:
cat("The coefficient for prpblck decreases to", summary_model4$coefficients[2, 1],
"when prppov is included, indicating that some of the prpblck effect is captured by prppov.\n")
## The coefficient for prpblck decreases to 0.07280726 when prppov is included, indicating that some of the prpblck effect is captured by prppov.
# (vi): Find the correlation between log(income) and prppov
correlation <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")
# Display the correlation for (vi)
cat("\nPart (vi) - Correlation between log(income) and prppov:", correlation, "\n")
##
## Part (vi) - Correlation between log(income) and prppov: -0.838467
cat("\nInterpretation:\n")
##
## Interpretation:
cat("The correlation of", correlation, "indicates a strong negative relationship, suggesting that higher incomes are associated with lower poverty levels.\n")
## The correlation of -0.838467 indicates a strong negative relationship, suggesting that higher incomes are associated with lower poverty levels.
# (vii): Evaluate multicollinearity issue
cat("\nPart (vii) - Evaluation of multicollinearity:\n")
##
## Part (vii) - Evaluation of multicollinearity:
cat("The correlation between log(income) and prppov is", correlation,
". High correlation may indicate multicollinearity, leading to unstable coefficient estimates.\n")
## The correlation between log(income) and prppov is -0.838467 . High correlation may indicate multicollinearity, leading to unstable coefficient estimates.
cat("Despite this, including both variables could still provide useful insights depending on the research context.\n")
## Despite this, including both variables could still provide useful insights depending on the research context.