# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# CHAPTER 2 ----------------------------------------------------
# Simulated sample data (replace with the actual COUNTYMURDERS dataset if available)
set.seed(42)
COUNTYMURDERS <- tibble(
  murders = rpois(100, lambda = 5),
  execs = sample(0:3, 100, replace = TRUE)
)
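# If the actual Wooldridge data are available, they could be loaded instead of the
# simulated tibble above. A minimal sketch, assuming the `wooldridge` package is
# installed and its `countymurders` data contain `year`, `murders`, and `execs`:
# library(wooldridge)
# data("countymurders")
# COUNTYMURDERS <- countymurders %>%
#   filter(year == 1996) %>%
#   select(murders, execs)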
# Analysis for Chapter 2
# (i) Summary statistics and counts
zero_murders <- sum(COUNTYMURDERS$murders == 0)
one_or_more_executions <- sum(COUNTYMURDERS$execs > 0)
max_executions <- max(COUNTYMURDERS$execs)
cat("Counties with zero murders in 1996:", zero_murders, "\n")
## Counties with zero murders in 1996: 3
cat("Counties with at least one execution:", one_or_more_executions, "\n")
## Counties with at least one execution: 73
cat("Largest number of executions:", max_executions, "\n\n")
## Largest number of executions: 3
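# The same counts can also be obtained in one dplyr pipeline (a sketch; the
# results should match the figures reported above):
COUNTYMURDERS %>%
  summarise(zero_murders = sum(murders == 0),
            one_or_more_execs = sum(execs > 0),
            max_execs = max(execs))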
# (ii) OLS estimation
model_ch2 <- lm(murders ~ execs, data = COUNTYMURDERS)
summary(model_ch2)
##
## Call:
## lm(formula = murders ~ execs, data = COUNTYMURDERS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3617 -1.9858 -0.1111 1.1702 5.6383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.9858 0.3899 12.788 <2e-16 ***
## execs 0.1253 0.2183 0.574 0.567
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.448 on 98 degrees of freedom
## Multiple R-squared: 0.00335, Adjusted R-squared: -0.00682
## F-statistic: 0.3294 on 1 and 98 DF, p-value: 0.5673
beta_0 <- coef(model_ch2)[1]
beta_1 <- coef(model_ch2)[2]
cat("\nIntercept (β0):", beta_0, "\n")
##
## Intercept (β0): 4.985849
cat("Slope (β1):", beta_1, "\n")
## Slope (β1): 0.1252882
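# Optional check (sketch): a 95% confidence interval for the slope on execs,
# which makes the imprecision of the estimate explicit.
confint(model_ch2, parm = "execs", level = 0.95)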
# (iii) Interpretation of slope
cat("\nInterpretation:", ifelse(beta_1 < 0, "The negative slope suggests a potential deterrent effect.\n", "The positive slope suggests an increase in murders with executions.\n"))
##
## Interpretation: The positive slope suggests an increase in murders with executions.
# (iv) Predicted murders and residuals
predicted_murders_zero_exec <- beta_0
residual_zero_exec_zero_murders <- 0 - predicted_murders_zero_exec
cat("\nPredicted murders for zero executions:", predicted_murders_zero_exec, "\n")
##
## Predicted murders for zero executions: 4.985849
cat("Residual for zero executions and zero murders:", residual_zero_exec_zero_murders, "\n")
## Residual for zero executions and zero murders: -4.985849
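# Equivalent prediction via predict() (a sketch; should reproduce beta_0 above):
predict(model_ch2, newdata = tibble(execs = 0))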
# (v) Explanation of limitations
cat("\nExplanation: Simple regression does not account for confounding variables.\n\n")
##
## Explanation: Simple regression does not account for confounding variables.
# Visualization for Chapter 2
ggplot(COUNTYMURDERS, aes(x = execs, y = murders)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Murders vs Executions",
       x = "Number of Executions",
       y = "Number of Murders") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# CHAPTER 3 ----------------------------------------------------
# Sample dataset for Chapter 3
set.seed(42) # Ensure reproducibility
student_data <- tibble(
  study = rnorm(100, mean = 5, sd = 2),
  sleep = rnorm(100, mean = 7, sd = 1.5),
  work = rnorm(100, mean = 3, sd = 1),
  leisure = rnorm(100, mean = 2, sd = 0.5),
  GPA = rnorm(100, mean = 3, sd = 0.5)
)
# (i) Correlation analysis
correlation_matrix <- cor(student_data)
print(correlation_matrix)
## study sleep work leisure GPA
## study 1.00000000 0.031279837 -0.14477734 0.074237314 0.08427619
## sleep 0.03127984 1.000000000 0.07122699 0.008644817 -0.12072259
## work -0.14477734 0.071226992 1.00000000 -0.045979669 0.05933732
## leisure 0.07423731 0.008644817 -0.04597967 1.000000000 0.18501354
## GPA 0.08427619 -0.120722592 0.05933732 0.185013536 1.00000000
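# Optional sketch: a formal test of one pairwise correlation, e.g. study time vs GPA.
cor.test(student_data$study, student_data$GPA)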
# (ii) Multiple regression
model_ch3 <- lm(GPA ~ study + sleep + work + leisure, data = student_data)
summary(model_ch3)
##
## Call:
## lm(formula = GPA ~ study + sleep + work + leisure, data = student_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.96045 -0.30569 -0.00674 0.22892 1.71887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.60570 0.39535 6.591 2.43e-09 ***
## study 0.02149 0.02466 0.871 0.3858
## sleep -0.04943 0.03748 -1.319 0.1904
## work 0.04506 0.05052 0.892 0.3747
## leisure 0.21392 0.11604 1.844 0.0684 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5041 on 95 degrees of freedom
## Multiple R-squared: 0.0626, Adjusted R-squared: 0.02313
## F-statistic: 1.586 on 4 and 95 DF, p-value: 0.1844
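# Sketch: 95% confidence intervals for the Chapter 3 coefficients, as a
# complement to the t statistics reported above.
confint(model_ch3, level = 0.95)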
# Visualization for Chapter 3
ggplot(student_data, aes(x = study, y = GPA)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "GPA vs Study Hours",
       x = "Study Hours",
       y = "GPA") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# CHAPTER 4 ----------------------------------------------------
# Sample dataset for Chapter 4
set.seed(42)
DISCRIM <- tibble(
  psoda = rnorm(100, mean = 1.5, sd = 0.2),
  prpblck = runif(100, min = 0, max = 1),
  income = runif(100, min = 20000, max = 80000),
  prppov = runif(100, min = 0, max = 0.5)
)
# (i) Descriptive statistics
summary(DISCRIM)
## psoda prpblck income prppov
## Min. :0.9014 Min. :0.01755 Min. :20143 Min. :0.0007169
## 1st Qu.:1.3767 1st Qu.:0.18688 1st Qu.:36803 1st Qu.:0.0936920
## Median :1.5180 Median :0.41258 Median :48924 Median :0.2195957
## Mean :1.5065 Mean :0.44077 Mean :50193 Mean :0.2297092
## 3rd Qu.:1.6323 3rd Qu.:0.60696 3rd Qu.:65157 3rd Qu.:0.3458391
## Max. :1.9573 Max. :0.99655 Max. :79448 Max. :0.4901393
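# Sketch: pairwise correlations among the DISCRIM variables, rounded for readability.
round(cor(DISCRIM), 2)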
# (ii) Exploratory Data Analysis (EDA)
ggplot(DISCRIM, aes(x = income, y = psoda, color = prpblck)) +
  geom_point() +
  labs(title = "Soda Price vs Income by Proportion Black",
       x = "Income",
       y = "Soda Price (psoda)",
       color = "Proportion Black") +
  theme_minimal()

# (iii) Logistic regression
DISCRIM <- DISCRIM %>%
  mutate(binary_outcome = ifelse(psoda > 1.5, 1, 0)) # Example binary outcome: price above $1.50
logistic_model_ch4 <- glm(binary_outcome ~ prpblck + income + prppov, data = DISCRIM, family = binomial)
summary(logistic_model_ch4)
##
## Call:
## glm(formula = binary_outcome ~ prpblck + income + prppov, family = binomial,
## data = DISCRIM)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.229e-01 7.652e-01 -0.422 0.673
## prpblck 4.706e-02 7.128e-01 0.066 0.947
## income 1.888e-06 1.152e-05 0.164 0.870
## prppov 1.611e+00 1.407e+00 1.145 0.252
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 137.99 on 99 degrees of freedom
## Residual deviance: 136.65 on 96 degrees of freedom
## AIC: 144.65
##
## Number of Fisher Scoring iterations: 4
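# Sketch: the logit coefficients are easier to read as odds ratios, and
# predict(type = "response") gives fitted probabilities of a price above $1.50.
exp(coef(logistic_model_ch4))
head(predict(logistic_model_ch4, type = "response"))
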
# CHAPTER 5 ----------------------------------------------------
# Load or simulate the WAGE1 dataset
# WAGE1 <- read.csv("path_to_your_data/WAGE1.csv") # Load your actual data here
# Example: Generate a sample dataset for WAGE1 (for illustration)
set.seed(42)
WAGE1 <- tibble(
  wage = rnorm(100, mean = 50, sd = 10),
  educ = rnorm(100, mean = 12, sd = 2),
  exper = rnorm(100, mean = 5, sd = 3),
  tenure = rnorm(100, mean = 3, sd = 1)
)
# (i) Estimate the equation
model_wage <- lm(wage ~ educ + exper + tenure, data = WAGE1)
summary(model_wage)
##
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = WAGE1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.2778 -5.4011 0.6288 6.7314 26.1132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.5622 7.9215 6.004 3.42e-08 ***
## educ 0.2361 0.5812 0.406 0.685
## exper -0.4936 0.3448 -1.431 0.156
## tenure 0.7991 1.1977 0.667 0.506
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.43 on 96 degrees of freedom
## Multiple R-squared: 0.02721, Adjusted R-squared: -0.00319
## F-statistic: 0.8951 on 3 and 96 DF, p-value: 0.4467
# Save residuals
WAGE1$residuals <- resid(model_wage)
# Histogram of residuals
ggplot(WAGE1, aes(x = residuals)) +
  geom_histogram(bins = 30, aes(y = after_stat(density)), fill = "blue", alpha = 0.7) +
  labs(title = "Histogram of Residuals from Wage Model",
       x = "Residuals",
       y = "Density") +
  theme_minimal()

# (ii) Repeat with log(wage)
model_log_wage <- lm(log(wage) ~ educ + exper + tenure, data = WAGE1)
summary(model_log_wage)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = WAGE1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.91269 -0.09138 0.03114 0.15417 0.46941
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.820515 0.178317 21.425 <2e-16 ***
## educ 0.004766 0.013083 0.364 0.716
## exper -0.009684 0.007762 -1.248 0.215
## tenure 0.021455 0.026962 0.796 0.428
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2348 on 96 degrees of freedom
## Multiple R-squared: 0.02407, Adjusted R-squared: -0.00643
## F-statistic: 0.7891 on 3 and 96 DF, p-value: 0.5029
# Save residuals for log(wage)
WAGE1$residuals_log <- resid(model_log_wage)
# Histogram of residuals for log(wage)
ggplot(WAGE1, aes(x = residuals_log)) +
  geom_histogram(bins = 30, aes(y = after_stat(density)), fill = "green", alpha = 0.7) +
  labs(title = "Histogram of Residuals from Log(Wage) Model",
       x = "Residuals",
       y = "Density") +
  theme_minimal()

# (iii) Assessing Assumption MLR.6
# Residuals vs Fitted for Wage Model
ggplot(WAGE1, aes(x = fitted(model_wage), y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residuals vs Fitted Values (Wage Model)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal()

# Residuals vs Fitted for Log(Wage) Model
ggplot(WAGE1, aes(x = fitted(model_log_wage), y = residuals_log)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residuals vs Fitted Values (Log(Wage) Model)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal()
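# Sketch: MLR.6 concerns normality of the error term, so normal Q-Q plots and a
# Shapiro-Wilk test on the saved residuals complement the residual plots above.
qqnorm(WAGE1$residuals); qqline(WAGE1$residuals)
qqnorm(WAGE1$residuals_log); qqline(WAGE1$residuals_log)
shapiro.test(WAGE1$residuals)
shapiro.test(WAGE1$residuals_log)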

# Conclusion and final thoughts
cat("\nAnalysis complete across all chapters. Considerations for further research include exploring confounding variables and potential interactions in the models.\n")
##
## Analysis complete across all chapters. Considerations for further research include exploring confounding variables and potential interactions in the models.