Midterm

# Load necessary libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# CHAPTER 2 ----------------------------------------------------

# Sample data or load actual dataset for Chapter 2
set.seed(42)
COUNTYMURDERS <- tibble(
  murders = rpois(100, lambda = 5),
  execs = sample(0:3, 100, replace = TRUE)
)

# Analysis for Chapter 2
# (i) Summary statistics and counts
zero_murders <- sum(COUNTYMURDERS$murders == 0)
one_or_more_executions <- sum(COUNTYMURDERS$execs > 0)
max_executions <- max(COUNTYMURDERS$execs)

cat("Counties with zero murders in 1996:", zero_murders, "\n")

## Counties with zero murders in 1996: 3

cat("Counties with at least one execution:", one_or_more_executions, "\n")

## Counties with at least one execution: 73

cat("Largest number of executions:", max_executions, "\n\n")

## Largest number of executions: 3

# (ii) OLS estimation
model_ch2 <- lm(murders ~ execs, data = COUNTYMURDERS)
summary(model_ch2)

## 
## Call:
## lm(formula = murders ~ execs, data = COUNTYMURDERS)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3617 -1.9858 -0.1111  1.1702  5.6383 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.9858     0.3899  12.788   <2e-16 ***
## execs         0.1253     0.2183   0.574    0.567    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.448 on 98 degrees of freedom
## Multiple R-squared:  0.00335,    Adjusted R-squared:  -0.00682 
## F-statistic: 0.3294 on 1 and 98 DF,  p-value: 0.5673

beta_0 <- coef(model_ch2)[1]
beta_1 <- coef(model_ch2)[2]

cat("\nIntercept (β0):", beta_0, "\n")

## 
## Intercept (β0): 4.985849

cat("Slope (β1):", beta_1, "\n")

## Slope (β1): 0.1252882

# (iii) Interpretation of slope
cat("\nInterpretation:", ifelse(beta_1 < 0, "The negative slope suggests a potential deterrent effect.\n", "The positive slope suggests an increase in murders with executions.\n"))

## 
## Interpretation: The positive slope suggests an increase in murders with executions.

# (iv) Predicted murders and residuals
predicted_murders_zero_exec <- beta_0
residual_zero_exec_zero_murders <- 0 - predicted_murders_zero_exec

cat("\nPredicted murders for zero executions:", predicted_murders_zero_exec, "\n")

## 
## Predicted murders for zero executions: 4.985849

cat("Residual for zero executions and zero murders:", residual_zero_exec_zero_murders, "\n")

## Residual for zero executions and zero murders: -4.985849

# (v) Explanation of limitations
cat("\nExplanation: Simple regression does not account for confounding variables.\n\n")

## 
## Explanation: Simple regression does not account for confounding variables.

# Visualization for Chapter 2
ggplot(COUNTYMURDERS, aes(x = execs, y = murders)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Murders vs Executions",
       x = "Number of Executions",
       y = "Number of Murders") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# CHAPTER 3 ----------------------------------------------------

# Sample dataset for Chapter 3
set.seed(42)  # Ensure reproducibility
student_data <- tibble(
  study = rnorm(100, mean = 5, sd = 2),
  sleep = rnorm(100, mean = 7, sd = 1.5),
  work = rnorm(100, mean = 3, sd = 1),
  leisure = rnorm(100, mean = 2, sd = 0.5),
  GPA = rnorm(100, mean = 3, sd = 0.5)
)

# (i) Correlation analysis
correlation_matrix <- cor(student_data)
print(correlation_matrix)

##               study        sleep        work      leisure         GPA
## study    1.00000000  0.031279837 -0.14477734  0.074237314  0.08427619
## sleep    0.03127984  1.000000000  0.07122699  0.008644817 -0.12072259
## work    -0.14477734  0.071226992  1.00000000 -0.045979669  0.05933732
## leisure  0.07423731  0.008644817 -0.04597967  1.000000000  0.18501354
## GPA      0.08427619 -0.120722592  0.05933732  0.185013536  1.00000000

# (ii) Multiple regression
model_ch3 <- lm(GPA ~ study + sleep + work + leisure, data = student_data)
summary(model_ch3)

## 
## Call:
## lm(formula = GPA ~ study + sleep + work + leisure, data = student_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.96045 -0.30569 -0.00674  0.22892  1.71887 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.60570    0.39535   6.591 2.43e-09 ***
## study        0.02149    0.02466   0.871   0.3858    
## sleep       -0.04943    0.03748  -1.319   0.1904    
## work         0.04506    0.05052   0.892   0.3747    
## leisure      0.21392    0.11604   1.844   0.0684 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5041 on 95 degrees of freedom
## Multiple R-squared:  0.0626, Adjusted R-squared:  0.02313 
## F-statistic: 1.586 on 4 and 95 DF,  p-value: 0.1844

# Visualization for Chapter 3
ggplot(student_data, aes(x = study, y = GPA)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "GPA vs Study Hours",
       x = "Study Hours",
       y = "GPA") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# CHAPTER 4 ----------------------------------------------------

# Sample dataset for Chapter 4
set.seed(42)
DISCRIM <- tibble(
  psoda = rnorm(100, mean = 1.5, sd = 0.2),
  prpblck = runif(100, min = 0, max = 1),
  income = runif(100, min = 20000, max = 80000),
  prppov = runif(100, min = 0, max = 0.5)
)

# (i) Descriptive statistics
summary(DISCRIM)

##      psoda           prpblck            income          prppov         
##  Min.   :0.9014   Min.   :0.01755   Min.   :20143   Min.   :0.0007169  
##  1st Qu.:1.3767   1st Qu.:0.18688   1st Qu.:36803   1st Qu.:0.0936920  
##  Median :1.5180   Median :0.41258   Median :48924   Median :0.2195957  
##  Mean   :1.5065   Mean   :0.44077   Mean   :50193   Mean   :0.2297092  
##  3rd Qu.:1.6323   3rd Qu.:0.60696   3rd Qu.:65157   3rd Qu.:0.3458391  
##  Max.   :1.9573   Max.   :0.99655   Max.   :79448   Max.   :0.4901393

# (ii) Exploratory Data Analysis (EDA)
ggplot(DISCRIM, aes(x = income, y = psoda, color = as.factor(prpblck))) +
  geom_point() +
  labs(title = "Soda Consumption vs Income by Race Proportion",
       x = "Income",
       y = "Soda Consumption",
       color = "Proportion Black") +
  theme_minimal()

# (iii) Logistic regression
DISCRIM <- DISCRIM %>%
  mutate(binary_outcome = ifelse(psoda > 1.5, 1, 0))  # Example binary outcome

logistic_model_ch4 <- glm(binary_outcome ~ prpblck + income + prppov, data = DISCRIM, family = binomial)
summary(logistic_model_ch4)

## 
## Call:
## glm(formula = binary_outcome ~ prpblck + income + prppov, family = binomial, 
##     data = DISCRIM)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.229e-01  7.652e-01  -0.422    0.673
## prpblck      4.706e-02  7.128e-01   0.066    0.947
## income       1.888e-06  1.152e-05   0.164    0.870
## prppov       1.611e+00  1.407e+00   1.145    0.252
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 137.99  on 99  degrees of freedom
## Residual deviance: 136.65  on 96  degrees of freedom
## AIC: 144.65
## 
## Number of Fisher Scoring iterations: 4

# CHAPTER 5 ----------------------------------------------------

# Load or create WAGEl dataset
# WAGEl <- read.csv("path_to_your_data/WAGEl.csv")  # Load your actual data here

# Example: Generate a sample dataset for WAGEl (for illustration)
set.seed(42)
WAGEl <- tibble(
  wage = rnorm(100, mean = 50, sd = 10),
  educ = rnorm(100, mean = 12, sd = 2),
  exper = rnorm(100, mean = 5, sd = 3),
  tenure = rnorm(100, mean = 3, sd = 1)
)

# (i) Estimate the equation
model_wage <- lm(wage ~ educ + exper + tenure, data = WAGEl)
summary(model_wage)

## 
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = WAGEl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -31.2778  -5.4011   0.6288   6.7314  26.1132 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  47.5622     7.9215   6.004 3.42e-08 ***
## educ          0.2361     0.5812   0.406    0.685    
## exper        -0.4936     0.3448  -1.431    0.156    
## tenure        0.7991     1.1977   0.667    0.506    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.43 on 96 degrees of freedom
## Multiple R-squared:  0.02721,    Adjusted R-squared:  -0.00319 
## F-statistic: 0.8951 on 3 and 96 DF,  p-value: 0.4467

# Save residuals
WAGEl$residuals <- resid(model_wage)

# Histogram of residuals
ggplot(WAGEl, aes(x = residuals)) +
  geom_histogram(bins = 30, aes(y = ..density..), fill = "blue", alpha = 0.7) +
  labs(title = "Histogram of Residuals from Wage Model",
       x = "Residuals",
       y = "Density") +
  theme_minimal()

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# (ii) Repeat with log(wage)
model_log_wage <- lm(log(wage) ~ educ + exper + tenure, data = WAGEl)
summary(model_log_wage)

## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = WAGEl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.91269 -0.09138  0.03114  0.15417  0.46941 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.820515   0.178317  21.425   <2e-16 ***
## educ         0.004766   0.013083   0.364    0.716    
## exper       -0.009684   0.007762  -1.248    0.215    
## tenure       0.021455   0.026962   0.796    0.428    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2348 on 96 degrees of freedom
## Multiple R-squared:  0.02407,    Adjusted R-squared:  -0.00643 
## F-statistic: 0.7891 on 3 and 96 DF,  p-value: 0.5029

# Save residuals for log(wage)
WAGEl$residuals_log <- resid(model_log_wage)

# Histogram of residuals for log(wage)
ggplot(WAGEl, aes(x = residuals_log)) +
  geom_histogram(bins = 30, aes(y = ..density..), fill = "green", alpha = 0.7) +
  labs(title = "Histogram of Residuals from Log(Wage) Model",
       x = "Residuals",
       y = "Density") +
  theme_minimal()

# (iii) Assessing Assumption MLR.6
# Residuals vs Fitted for Wage Model
ggplot(WAGEl, aes(x = fitted(model_wage), y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residuals vs Fitted Values (Wage Model)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal()

# Residuals vs Fitted for Log(Wage) Model
ggplot(WAGEl, aes(x = fitted(model_log_wage), y = residuals_log)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residuals vs Fitted Values (Log(Wage) Model)",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal()

# Conclusion and final thoughts
cat("\nAnalysis complete across all chapters. Considerations for further research include exploring confounding variables and potential interactions in the models.\n")

## 
## Analysis complete across all chapters. Considerations for further research include exploring confounding variables and potential interactions in the models.

Midterm

2024-10-31