R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.

# Install the wooldridge package if it's not installed
if (!require(wooldridge)) {
  install.packages("wooldridge")
}
## Loading required package: wooldridge
# Load the wooldridge package
library(wooldridge)

# Load the DISCRIM dataset
data("discrim")

# (i): Find the average values and standard deviations of prpblck and income
mean_prpblck <- mean(discrim$prpblck, na.rm = TRUE)
sd_prpblck <- sd(discrim$prpblck, na.rm = TRUE)

mean_income <- mean(discrim$income, na.rm = TRUE)
sd_income <- sd(discrim$income, na.rm = TRUE)

# Display the results for (i)
cat("Part (i):\n")
## Part (i):
cat("Mean of prpblck:", mean_prpblck, " (11.35% average proportion)\n")
## Mean of prpblck: 0.1134864  (11.35% average proportion)
cat("Standard deviation of prpblck:", sd_prpblck, "\n")
## Standard deviation of prpblck: 0.1824165
cat("Mean of income:", mean_income, " (average median household income in dollars)\n")
## Mean of income: 47053.78  (average median household income in dollars)
cat("Standard deviation of income:", sd_income, "\n")
## Standard deviation of income: 13179.29
# (ii): Estimate the model by OLS and report the results
model1 <- lm(psoda ~ prpblck + income, data = discrim)

# Display the regression results for (ii)
cat("\nPart (ii) - OLS Regression Results:\n")
## 
## Part (ii) - OLS Regression Results:
summary_model1 <- summary(model1)
print(summary_model1)
## 
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29401 -0.05242  0.00333  0.04231  0.44322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.563e-01  1.899e-02  50.354  < 2e-16 ***
## prpblck     1.150e-01  2.600e-02   4.423 1.26e-05 ***
## income      1.603e-06  3.618e-07   4.430 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08611 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06422,    Adjusted R-squared:  0.05952 
## F-statistic: 13.66 on 2 and 398 DF,  p-value: 1.835e-06
cat("\nInterpretation:\n")
## 
## Interpretation:
cat("Intercept:", summary_model1$coefficients[1, 1], "is the predicted price of soda when prpblck and income are zero.\n")
## Intercept: 0.9563196 is the predicted price of soda when prpblck and income are zero.
cat("Coefficient for prpblck:", summary_model1$coefficients[2, 1], 
    "suggests that a one-unit increase in the proportion of black population increases the price by approximately 0.115 dollars.\n")
## Coefficient for prpblck: 0.1149882 suggests that a one-unit increase in the proportion of black population increases the price by approximately 0.115 dollars.
cat("Coefficient for income:", summary_model1$coefficients[3, 1], 
    "indicates a minor increase in the soda price for every dollar increase in income.\n")
## Coefficient for income: 1.602674e-06 indicates a minor increase in the soda price for every dollar increase in income.
cat("R-squared:", summary_model1$r.squared, 
    "shows that 6.42% of the variation in soda prices is explained by the model.\n")
## R-squared: 0.06422039 shows that 6.42% of the variation in soda prices is explained by the model.
# (iii): Compare with the simple regression estimate from psoda on prpblck
model2 <- lm(psoda ~ prpblck, data = discrim)

# Display the regression results for (iii)
cat("\nPart (iii) - Simple Regression Results:\n")
## 
## Part (iii) - Simple Regression Results:
summary_model2 <- summary(model2)
print(summary_model2)
## 
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30884 -0.05963  0.01135  0.03206  0.44840 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.03740    0.00519  199.87  < 2e-16 ***
## prpblck      0.06493    0.02396    2.71  0.00702 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0881 on 399 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.01808,    Adjusted R-squared:  0.01561 
## F-statistic: 7.345 on 1 and 399 DF,  p-value: 0.007015
cat("\nComparison:\n")
## 
## Comparison:
cat("The coefficient for prpblck in the simple regression is", summary_model2$coefficients[2, 1], 
    ", which is smaller than in the multiple regression (0.1150).\n")
## The coefficient for prpblck in the simple regression is 0.06492687 , which is smaller than in the multiple regression (0.1150).
cat("Controlling for income increases the estimated effect of prpblck on soda price.\n")
## Controlling for income increases the estimated effect of prpblck on soda price.
# (iv): Estimate the model with a log transformation
model3 <- lm(log(psoda) ~ prpblck + log(income), data = discrim)

# Display the regression results for (iv)
cat("\nPart (iv) - Log-Linear Regression Results:\n")
## 
## Part (iv) - Log-Linear Regression Results:
summary_model3 <- summary(model3)
print(summary_model3)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33563 -0.04695  0.00658  0.04334  0.35413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.79377    0.17943  -4.424 1.25e-05 ***
## prpblck      0.12158    0.02575   4.722 3.24e-06 ***
## log(income)  0.07651    0.01660   4.610 5.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0821 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06809,    Adjusted R-squared:  0.06341 
## F-statistic: 14.54 on 2 and 398 DF,  p-value: 8.039e-07
# Calculate the percentage change in psoda for a 0.20 increase in prpblck
percent_change <- coef(model3)["prpblck"] * 0.20 * 100
cat("\nEstimated percentage change in psoda for a 0.20 increase in prpblck:", percent_change, "%\n")
## 
## Estimated percentage change in psoda for a 0.20 increase in prpblck: 2.431605 %
cat("\nInterpretation:\n")
## 
## Interpretation:
cat("The estimated percentage change for a 0.20 increase in prpblck is", percent_change, "%.\n")
## The estimated percentage change for a 0.20 increase in prpblck is 2.431605 %.
cat("This indicates a 2.43% increase in soda price with a 20-percentage point increase in prpblck.\n")
## This indicates a 2.43% increase in soda price with a 20-percentage point increase in prpblck.
# (v): Add the variable prppov to the regression
model4 <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)

# Display the regression results for (v)
cat("\nPart (v) - Regression with prppov Included:\n")
## 
## Part (v) - Regression with prppov Included:
summary_model4 <- summary(model4)
print(summary_model4)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32218 -0.04648  0.00651  0.04272  0.35622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.46333    0.29371  -4.982  9.4e-07 ***
## prpblck      0.07281    0.03068   2.373   0.0181 *  
## log(income)  0.13696    0.02676   5.119  4.8e-07 ***
## prppov       0.38036    0.13279   2.864   0.0044 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08137 on 397 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.08696,    Adjusted R-squared:  0.08006 
## F-statistic:  12.6 on 3 and 397 DF,  p-value: 6.917e-08
cat("\nChange in Coefficient:\n")
## 
## Change in Coefficient:
cat("The coefficient for prpblck decreases to", summary_model4$coefficients[2, 1], 
    "when prppov is included, indicating that some of the prpblck effect is captured by prppov.\n")
## The coefficient for prpblck decreases to 0.07280726 when prppov is included, indicating that some of the prpblck effect is captured by prppov.
# (vi): Find the correlation between log(income) and prppov
correlation <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")

# Display the correlation for (vi)
cat("\nPart (vi) - Correlation between log(income) and prppov:", correlation, "\n")
## 
## Part (vi) - Correlation between log(income) and prppov: -0.838467
cat("\nInterpretation:\n")
## 
## Interpretation:
cat("The correlation of", correlation, "indicates a strong negative relationship, suggesting that higher incomes are associated with lower poverty levels.\n")
## The correlation of -0.838467 indicates a strong negative relationship, suggesting that higher incomes are associated with lower poverty levels.
# (vii): Evaluate multicollinearity issue
cat("\nPart (vii) - Evaluation of multicollinearity:\n")
## 
## Part (vii) - Evaluation of multicollinearity:
cat("The correlation between log(income) and prppov is", correlation, 
    ". High correlation may indicate multicollinearity, leading to unstable coefficient estimates.\n")
## The correlation between log(income) and prppov is -0.838467 . High correlation may indicate multicollinearity, leading to unstable coefficient estimates.
cat("Despite this, including both variables could still provide useful insights depending on the research context.\n")
## Despite this, including both variables could still provide useful insights depending on the research context.