library(wooldridge)
data <- wooldridge::countymurders
data_1996 <- data[data$year == 1996, ]
#Part (i)
# Number of counties with zero murders
zero_murders <- sum(data_1996$murders == 0)
# Number of counties with at least one execution
one_or_more_executions <- sum(data_1996$execs > 0)
# Maximum number of executions
max_executions <- max(data_1996$execs)
# Display results
cat("Counties with zero murders:", zero_murders, "\n")
## Counties with zero murders: 1051
cat("Counties with at least one execution:", one_or_more_executions, "\n")
## Counties with at least one execution: 31
cat("Largest number of executions:", max_executions, "\n")
## Largest number of executions: 3
#Part (ii),(iii)
# Run OLS regression
model <- lm(murders ~ execs, data = data_1996)
# Display summary of the model
summary(model)
##
## Call:
## lm(formula = murders ~ execs, data = data_1996)
##
## Residuals:
## Min 1Q Median 3Q Max
## -149.12 -5.46 -4.46 -2.46 1338.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.4572 0.8348 6.537 7.79e-11 ***
## execs 58.5555 5.8333 10.038 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38.89 on 2195 degrees of freedom
## Multiple R-squared: 0.04389, Adjusted R-squared: 0.04346
## F-statistic: 100.8 on 1 and 2195 DF, p-value: < 2.2e-16
#Part (iv)
# Predicted number of murders when executions = 0
predicted_murders <- predict(model, newdata = data.frame(execs = 0))
# Residual for a county with zero executions and zero murders
residual <- 0 - predicted_murders
# Display results
cat("Smallest predicted number of murders:", predicted_murders, "\n")
## Smallest predicted number of murders: 5.457241
cat("Residual for a county with zero executions and zero murders:", residual, "\n")
## Residual for a county with zero executions and zero murders: -5.457241
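# A quick consistency check (a sketch, not required by the exercise): because the slope on
# execs is positive and execs is never negative, the smallest fitted value is the intercept,
# i.e., the prediction for a county with zero executions (about 5.457).
min(fitted(model))          # should equal predicted_murders
sum(data_1996$execs == 0)   # number of counties at that minimum prediction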
#Part (v)
# A simple regression is unlikely to capture the true deterrent effect of capital punishment:
#  - Omitted variables: factors such as economic conditions, population density, and law
#    enforcement strength affect murder rates but are excluded, causing omitted variable bias.
#  - Endogeneity: the number of executions may itself respond to the murder rate, so there is
#    potential reverse causation.
#  - Causal inference: a simple regression establishes association, not causation, so the
#    estimated relationship does not show that executions reduce murders.
# The small simulation below illustrates the omitted-variable point.
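# A minimal simulation of the omitted-variable problem (hypothetical numbers, not the county
# data): assume a true deterrent effect of -0.5 murders per execution, but let executions occur
# mainly where an unobserved crime environment already pushes murders up. The simple-regression
# slope then tends to come out positive even though the true causal effect is negative.
set.seed(42)
n_sim <- 2000
crime_env <- rnorm(n_sim)                                              # unobserved crime environment
execs_sim <- pmax(0, round(0.5 * crime_env + rnorm(n_sim, sd = 0.5)))  # more executions where crime is high
murders_sim <- pmax(0, 5 + 3 * crime_env - 0.5 * execs_sim + rnorm(n_sim, sd = 2))
coef(lm(murders_sim ~ execs_sim))["execs_sim"]                         # typically positive: upward bias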
# Next exercise: GPA and the allocation of a student's 168 weekly hours (study, sleep, work, leisure).
# First let's create some sample data to illustrate the perfect-collinearity problem
set.seed(123)
n <- 50 # number of students
# Generate random hours that sum to 168 for each student
generate_hours <- function(n) {
  # Initialize matrix to store hours
  hours <- matrix(0, nrow = n, ncol = 4)
  colnames(hours) <- c("study", "sleep", "work", "leisure")
  for (i in 1:n) {
    # Generate random proportions that sum to 1
    props <- runif(4)
    props <- props / sum(props)
    # Multiply by 168 to get hours
    hours[i, ] <- props * 168
  }
  return(as.data.frame(hours))
}
# Generate data
data <- generate_hours(n)
# Add some realistic GPA values
data$GPA <- 2 + 0.02*data$study - 0.01*data$work + rnorm(n, 0, 0.2)
# Original model
model1 <- lm(GPA ~ study + sleep + work + leisure, data=data)
summary(model1)
##
## Call:
## lm(formula = GPA ~ study + sleep + work + leisure, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.35235 -0.11484 -0.00878 0.12340 0.46459
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.9105138 0.1894867 10.083 3.13e-13 ***
## study 0.0213723 0.0016891 12.653 < 2e-16 ***
## sleep 0.0000505 0.0016660 0.030 0.976
## work -0.0106181 0.0018337 -5.791 5.96e-07 ***
## leisure NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2008 on 46 degrees of freedom
## Multiple R-squared: 0.8987, Adjusted R-squared: 0.8921
## F-statistic: 136 on 3 and 46 DF, p-value: < 2.2e-16
# This model violates MLR.3 (no perfect collinearity) because:
print("The sum of all activities equals 168, creating perfect multicollinearity:")
## [1] "The sum of all activities equals 168, creating perfect multicollinearity:"
print("leisure = 168 - study - sleep - work")
## [1] "leisure = 168 - study - sleep - work"
# Reformulated model: rescale hours to proportions of the 168-hour week and drop one category
# (leisure). Dropping a category is what removes the perfect collinearity; the remaining
# coefficients are then interpreted relative to time spent on leisure.
data_prop <- data
data_prop[, 1:4] <- data_prop[, 1:4] / 168
model2 <- lm(GPA ~ study + sleep + work, data = data_prop)
summary(model2)
##
## Call:
## lm(formula = GPA ~ study + sleep + work, data = data_prop)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.35235 -0.11484 -0.00878 0.12340 0.46459
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.910514 0.189487 10.083 3.13e-13 ***
## study 3.590555 0.283760 12.653 < 2e-16 ***
## sleep 0.008485 0.279882 0.030 0.976
## work -1.783835 0.308056 -5.791 5.96e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2008 on 46 degrees of freedom
## Multiple R-squared: 0.8987, Adjusted R-squared: 0.8921
## F-statistic: 136 on 3 and 46 DF, p-value: < 2.2e-16
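# A small consistency check on the two fits (a sketch, not part of the exercise): rescaling
# hours to proportions of a 168-hour week multiplies each slope by 168 while leaving the fit,
# t statistics, and R-squared unchanged, so the two summaries above tell the same story.
coef(model1)[c("study", "sleep", "work")] * 168
coef(model2)[c("study", "sleep", "work")]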
# Create visualization to show relationships
plot(data$study, data$GPA,
xlab="Study Hours",
ylab="GPA",
main="Relationship between Study Hours and GPA")
abline(lm(GPA ~ study, data=data), col="red")
# Problem 10
# Set seed for reproducibility
set.seed(123)
n <- 1000 # Sample size
#===============================================================================
# (i) If x₁ is highly correlated with x₂ and x₃ in the sample, and x₂ and x₃
# have large partial effects on y, would you expect β̃₁ and β̂₁ to be similar
# or very different?
#===============================================================================
# Generate correlated predictors
x1_i <- rnorm(n)
x2_i <- 0.8*x1_i + rnorm(n, sd=0.6) # High correlation with x1
x3_i <- 0.7*x1_i + rnorm(n, sd=0.7) # High correlation with x1
# Generate y with large partial effects
y_i <- 2*x1_i + 3*x2_i + 4*x3_i + rnorm(n, sd=1)
# Run both regressions
simple_reg_i <- lm(y_i ~ x1_i)
multiple_reg_i <- lm(y_i ~ x1_i + x2_i + x3_i)
# Display results
cat("\nResults for (i):")
##
## Results for (i):
cat("\nSimple regression coefficient (β̃₁):", round(coef(simple_reg_i)[2], 3))
##
## Simple regression coefficient (β̃₁): 7.302
cat("\nMultiple regression coefficient (β̂₁):", round(coef(multiple_reg_i)[2], 3))
##
## Multiple regression coefficient (β̂₁): 1.958
cat("\nCorrelations: cor(x₁,x₂) =", round(cor(x1_i, x2_i), 3),
", cor(x₁,x₃) =", round(cor(x1_i, x3_i), 3))
##
## Correlations: cor(x₁,x₂) = 0.814 , cor(x₁,x₃) = 0.705
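# An algebra check of the (i) result (a sketch, not part of the assignment): with x1 as the
# only included regressor, the simple-regression slope equals the multiple-regression slope
# plus each excluded coefficient times the slope from regressing that excluded variable on x1.
delta2 <- coef(lm(x2_i ~ x1_i))[2]
delta3 <- coef(lm(x3_i ~ x1_i))[2]
coef(multiple_reg_i)[2] + coef(multiple_reg_i)[3] * delta2 + coef(multiple_reg_i)[4] * delta3
coef(simple_reg_i)[2]   # should match the value above (about 7.30)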
#===============================================================================
# (ii) If x₁ is almost uncorrelated with x₂ and x₃, but x₂ and x₃ are highly
# correlated, will β̃₁ and β̂₁ tend to be similar or very different?
#===============================================================================
# Generate uncorrelated x1 and correlated x2, x3
x1_ii <- rnorm(n)
x2_ii <- rnorm(n)
x3_ii <- 0.9*x2_ii + rnorm(n, sd=0.4) # High correlation with x2
# Generate y
y_ii <- 2*x1_ii + 3*x2_ii + 4*x3_ii + rnorm(n, sd=1)
# Run both regressions
simple_reg_ii <- lm(y_ii ~ x1_ii)
multiple_reg_ii <- lm(y_ii ~ x1_ii + x2_ii + x3_ii)
# Display results
cat("\n\nResults for (ii):")
##
##
## Results for (ii):
cat("\nSimple regression coefficient (β̃₁):", round(coef(simple_reg_ii)[2], 3))
##
## Simple regression coefficient (β̃₁): 1.998
cat("\nMultiple regression coefficient (β̂₁):", round(coef(multiple_reg_ii)[2], 3))
##
## Multiple regression coefficient (β̂₁): 1.923
cat("\nCorrelations: cor(x₁,x₂) =", round(cor(x1_ii, x2_ii), 3),
", cor(x₂,x₃) =", round(cor(x2_ii, x3_ii), 3))
##
## Correlations: cor(x₁,x₂) = 0.014 , cor(x₂,x₃) = 0.912
#===============================================================================
# (iii) If x₁ is highly correlated with x₂ and x₃, and x₂ and x₃ have small
# partial effects on y, would you expect se(β̃₁) or se(β̂₁) to be smaller?
#===============================================================================
# Generate correlated predictors
x1_iii <- rnorm(n)
x2_iii <- 0.8*x1_iii + rnorm(n, sd=0.6) # High correlation with x1
x3_iii <- 0.7*x1_iii + rnorm(n, sd=0.7) # High correlation with x1
# Generate y with small partial effects for x2, x3
y_iii <- 2*x1_iii + 0.1*x2_iii + 0.1*x3_iii + rnorm(n, sd=1)
# Run both regressions
simple_reg_iii <- lm(y_iii ~ x1_iii)
multiple_reg_iii <- lm(y_iii ~ x1_iii + x2_iii + x3_iii)
# Display results
cat("\n\nResults for (iii):")
##
##
## Results for (iii):
cat("\nSimple regression SE (se(β̃₁)):", round(summary(simple_reg_iii)$coefficients[2,2], 3))
##
## Simple regression SE (se(β̃₁)): 0.03
cat("\nMultiple regression SE (se(β̂₁)):", round(summary(multiple_reg_iii)$coefficients[2,2], 3))
##
## Multiple regression SE (se(β̂₁)): 0.06
cat("\nCorrelations: cor(x₁,x₂) =", round(cor(x1_iii, x2_iii), 3),
", cor(x₁,x₃) =", round(cor(x1_iii, x3_iii), 3))
##
## Correlations: cor(x₁,x₂) = 0.816 , cor(x₁,x₃) = 0.727
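# A rough look at why se(beta_hat_1) is the larger one here (a sketch): the multiple-regression
# variance of beta_hat_1 is scaled up by 1/(1 - R1^2), where R1^2 comes from regressing x1 on
# the other regressors (the variance inflation factor). The two models' residual variances also
# differ, so this is an approximation rather than an exact decomposition.
r2_x1 <- summary(lm(x1_iii ~ x2_iii + x3_iii))$r.squared
vif_x1 <- 1 / (1 - r2_x1)
cat("\nVIF for x1:", round(vif_x1, 2),
    "-> SE inflated by a factor of about", round(sqrt(vif_x1), 2))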
#===============================================================================
# (iv) If x₁ is almost uncorrelated with x₂ and x₃, x₂ and x₃ have large
# partial effects on y, and x₂ and x₃ are highly correlated, would you expect
# se(β̃₁) or se(β̂₁) to be smaller?
#===============================================================================
# Generate uncorrelated x1 and correlated x2, x3
x1_iv <- rnorm(n)
x2_iv <- rnorm(n)
x3_iv <- 0.9*x2_iv + rnorm(n, sd=0.4) # High correlation with x2
# Generate y with large partial effects
y_iv <- 2*x1_iv + 3*x2_iv + 4*x3_iv + rnorm(n, sd=1)
# Run both regressions
simple_reg_iv <- lm(y_iv ~ x1_iv)
multiple_reg_iv <- lm(y_iv ~ x1_iv + x2_iv + x3_iv)
# Display results
cat("\n\nResults for (iv):")
##
##
## Results for (iv):
cat("\nSimple regression SE (se(β̃₁)):", round(summary(simple_reg_iv)$coefficients[2,2], 3))
##
## Simple regression SE (se(β̃₁)): 0.216
cat("\nMultiple regression SE (se(β̂₁)):", round(summary(multiple_reg_iv)$coefficients[2,2], 3))
##
## Multiple regression SE (se(β̂₁)): 0.031
cat("\nCorrelations: cor(x₁,x₂) =", round(cor(x1_iv, x2_iv), 3),
", cor(x₂,x₃) =", round(cor(x2_iv, x3_iv), 3))
##
## Correlations: cor(x₁,x₂) = -0.017 , cor(x₂,x₃) = 0.914
# (i) Find the average values and standard deviations of prpblck and income.
data("discrim")  # discrim data set from the wooldridge package
# Mean and standard deviation of prpblck and income
mean_prpblck <- mean(discrim$prpblck, na.rm = TRUE)
sd_prpblck <- sd(discrim$prpblck, na.rm = TRUE)
mean_income <- mean(discrim$income, na.rm = TRUE)
sd_income <- sd(discrim$income, na.rm = TRUE)
cat("Average prpblck: ", mean_prpblck, ", SD: ", sd_prpblck, "\n")
## Average prpblck: 0.1134864 , SD: 0.1824165
cat("Average income: ", mean_income, ", SD: ", sd_income, "\n")
## Average income: 47053.78 , SD: 13179.29
# (ii) Estimate the OLS model psoda = β₀ + β₁prpblck + β₂income + u.
# OLS regression of psoda on prpblck and income
model_ols <- lm(psoda ~ prpblck + income, data = discrim)
summary(model_ols)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
# Interpret the coefficient on prpblck: holding income fixed, moving from an all-white to an
# all-black ZIP code (prpblck from 0 to 1) is associated with a soda price about $0.115
# (11.5 cents) higher.
coef_prpblck <- coef(model_ols)["prpblck"]
cat("Coefficient on prpblck: ", coef_prpblck, "\n")
## Coefficient on prpblck: 0.1149882
# (iii) Compare the estimate from part (ii) with the simple regression psoda ~ prpblck.
# Simple regression of psoda on prpblck
model_simple <- lm(psoda ~ prpblck, data = discrim)
summary(model_simple)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
# Compare the prpblck coefficients from both models: the simple-regression estimate (0.065) is
# noticeably smaller than the multiple-regression estimate (0.115). By the omitted variable
# bias logic, income raises psoda and is negatively correlated with prpblck, so leaving income
# out biases the prpblck coefficient toward zero.
cat("Coefficient from simple regression: ", coef(model_simple)["prpblck"], "\n")
## Coefficient from simple regression: 0.06492687
cat("Coefficient from multiple regression: ", coef(model_ols)["prpblck"], "\n")
## Coefficient from multiple regression: 0.1149882
# (iv) Estimate the log-linear model log(psoda) = β₀ + β₁prpblck + β₂log(income) + u.
# Log-linear regression of log(psoda) on prpblck and log(income)
model_log <- lm(log(psoda) ~ prpblck + log(income), data = discrim)
summary(model_log)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33563 -0.04695 0.00658 0.04334 0.35413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.79377 0.17943 -4.424 1.25e-05 ***
## prpblck 0.12158 0.02575 4.722 3.24e-06 ***
## log(income) 0.07651 0.01660 4.610 5.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0821 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06809, Adjusted R-squared: 0.06341
## F-statistic: 14.54 on 2 and 398 DF, p-value: 8.039e-07
# Coefficient on prpblck from log-linear model
beta_prpblck_log <- coef(model_log)["prpblck"]
# Calculate the percentage change for a 0.20 increase in prpblck
percentage_change_psoda <- beta_prpblck_log * 0.20 * 100
cat("Estimated percentage change in psoda: ", percentage_change_psoda, "%\n")
## Estimated percentage change in psoda: 2.431605 %
# (v) Add the variable prppov to the regression model.
# Log-linear regression with prppov added
model_log_prppov <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
summary(model_log_prppov)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08137 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08696, Adjusted R-squared: 0.08006
## F-statistic: 12.6 on 3 and 397 DF, p-value: 6.917e-08
# Check the new coefficient of prpblck: it falls from about 0.122 to 0.073 once prppov is
# controlled for, because prppov is correlated with both prpblck and the soda price.
cat("New coefficient of prpblck: ", coef(model_log_prppov)["prpblck"], "\n")
## New coefficient of prpblck: 0.07280726
# (vi) Find the correlation between log(income) and prppov.
# Correlation between log(income) and prppov
correlation_log_income_prppov <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")
cat("Correlation between log(income) and prppov: ", correlation_log_income_prppov, "\n")
## Correlation between log(income) and prppov: -0.838467
# (vii) Evaluate the statement about multicollinearity.
# The correlation between log(income) and prppov is about -0.84, so the two variables are
# highly (though not perfectly) correlated. This multicollinearity inflates the standard errors
# of both coefficients and makes the individual estimates less precise, so the statement has
# some merit; whether to keep both regressors depends on whether each captures a distinct
# effect (income level versus poverty) that we want to control for. A rough by-hand VIF check
# is sketched below.
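# Variance inflation factors computed by hand from auxiliary regressions (a sketch; the
# subsample below reproduces the estimation sample used in the regressions above):
sub <- na.omit(discrim[, c("psoda", "prpblck", "income", "prppov")])
vif_prpblck <- 1 / (1 - summary(lm(prpblck ~ log(income) + prppov, data = sub))$r.squared)
vif_lincome <- 1 / (1 - summary(lm(log(income) ~ prpblck + prppov, data = sub))$r.squared)
vif_prppov  <- 1 / (1 - summary(lm(prppov ~ prpblck + log(income), data = sub))$r.squared)
round(c(prpblck = vif_prpblck, log_income = vif_lincome, prppov = vif_prppov), 2)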
# Simulate a data set of n = 32 firms consistent with the given estimated equation
# rdintens = 0.472 + 0.321*log(sales) + 0.050*profmarg
n <- 32
sales <- runif(n, 10, 100)
log_sales <- log(sales)
profmarg <- runif(n, 0, 20)
rdintens <- 0.472 + 0.321 * log_sales + 0.050 * profmarg + rnorm(n, 0, 1)
# Create a data frame
data <- data.frame(rdintens = rdintens, sales = sales, log_sales = log_sales, profmarg = profmarg)
# Run the regression model
model <- lm(rdintens ~ log_sales + profmarg, data = data)
# Output model summary to check coefficients and significance
summary(model)
##
## Call:
## lm(formula = rdintens ~ log_sales + profmarg, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.08745 -0.42009 0.04894 0.65782 1.88213
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.69781 1.06735 1.591 0.123
## log_sales 0.03833 0.27801 0.138 0.891
## profmarg 0.01837 0.03164 0.581 0.566
##
## Residual standard error: 1.092 on 29 degrees of freedom
## Multiple R-squared: 0.01261, Adjusted R-squared: -0.05549
## F-statistic: 0.1851 on 2 and 29 DF, p-value: 0.832
# (i) Estimated change in rdintens (in percentage points) when sales increases by 10%:
# in a level-log model the exact change is beta1 * log(1.10)
estimated_effect <- 0.321 * log(1.10)
estimated_effect
## [1] 0.03059457
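# For comparison (a sketch, not part of the original answer): the usual approximation
# beta1 * (%Δsales) / 100 gives nearly the same number for a 10% increase in sales.
approx_effect <- 0.321 * 10 / 100
c(exact = estimated_effect, approx = approx_effect)   # about 0.031 vs 0.032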
#(i) Filter Data for Single-Person Households
data("k401ksubs")
# Filter dataset for single-person households (fsize == 1)
single_person_data <- subset(k401ksubs, fsize == 1)
num_single_person <- nrow(single_person_data)
num_single_person
## [1] 2017
model <- lm(nettfa ~ inc + age, data = single_person_data)
summary(model)
##
## Call:
## lm(formula = nettfa ~ inc + age, data = single_person_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -179.95 -14.16 -3.42 6.03 1113.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.03981 4.08039 -10.548 <2e-16 ***
## inc 0.79932 0.05973 13.382 <2e-16 ***
## age 0.84266 0.09202 9.158 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1185
## F-statistic: 136.5 on 2 and 2014 DF, p-value: < 2.2e-16
beta2 <- coef(model)["age"]
se_beta2 <- summary(model)$coefficients["age", "Std. Error"]
# t-statistic for H0: beta_age = 1 against the one-sided alternative H1: beta_age < 1
t_stat <- (beta2 - 1) / se_beta2
# One-sided (lower-tail) p-value using the residual degrees of freedom
p_value <- pt(t_stat, df = df.residual(model))
t_stat
## age
## -1.709944
p_value
## age
## 0.04371514
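# The decision, spelled out (a short sketch): compare the t statistic to one-sided critical
# values. With roughly 2014 degrees of freedom the 5% and 1% critical values are about -1.65
# and -2.33, so H0: beta_age = 1 is rejected at the 5% level but not at the 1% level.
qt(c(0.05, 0.01), df = df.residual(model))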
simple_model <- lm(nettfa ~ inc, data = single_person_data)
summary(simple_model)
##
## Call:
## lm(formula = nettfa ~ inc, data = single_person_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185.12 -12.85 -4.85 1.78 1112.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.5709 2.0607 -5.13 3.18e-07 ***
## inc 0.8207 0.0609 13.48 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.59 on 2015 degrees of freedom
## Multiple R-squared: 0.08267, Adjusted R-squared: 0.08222
## F-statistic: 181.6 on 1 and 2015 DF, p-value: < 2.2e-16
# Comparison: the estimated coefficient on inc is 0.821 in the simple regression versus 0.799
# in the multiple regression, so the two are very similar. This suggests that age is not a
# strong confounder here (age and inc appear to be only weakly related in this sample), so
# omitting age causes little omitted variable bias in the estimated effect of inc on nettfa.
data("econmath")
# Load ggplot2 for visualization
library(ggplot2)
# Create histogram with normal distribution overlay
ggplot(econmath, aes(x = score)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "skyblue", color = "black", alpha = 0.7) +
stat_function(fun = dnorm,
args = list(mean = mean(econmath$score, na.rm = TRUE),
sd = sd(econmath$score, na.rm = TRUE)),
color = "blue", size = 1) +
labs(title = "Histogram of Course Scores with Normal Distribution Fit",
x = "Course Score (in percentage form)",
y = "Density") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# (i) Calculate the probability that score > 100 under the fitted normal distribution:
p_over_100 <- 1 - pnorm(100, mean = mean(econmath$score, na.rm = TRUE), sd = sd(econmath$score, na.rm = TRUE))
print(p_over_100)
## [1] 0.02044288
# Left tail: the histogram puts more mass at low scores (roughly 20-50) than the fitted normal
# curve predicts; the empirical distribution has a longer, thicker left tail.
# Fit in the left tail: the normal distribution therefore underestimates the frequency of low
# scores, indicating that score is left-skewed relative to the normal. This, together with the
# roughly 2% probability the fitted normal assigns to scores above 100, suggests the normal
# distribution is not a good model for score, especially in the tails.
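# A quick numerical version of the left-tail claim (a sketch; the 50-point cutoff is an
# arbitrary illustration, not part of the assignment): compare the observed share of scores
# below 50 with the share the fitted normal distribution would predict.
mu <- mean(econmath$score, na.rm = TRUE)
sigma <- sd(econmath$score, na.rm = TRUE)
empirical_left <- mean(econmath$score < 50, na.rm = TRUE)  # observed share below 50
normal_left <- pnorm(50, mean = mu, sd = sigma)            # share implied by the fitted normal
cat("Share of scores below 50 - empirical:", round(empirical_left, 4),
    "vs fitted normal:", round(normal_left, 4), "\n")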
#(i) Estimate the equation:
data("wage1")
# Level-level model
model1 <- lm(wage ~ educ + exper + tenure, data = wage1)
# View model summary
summary(model1)
##
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6068 -1.7747 -0.6279 1.1969 14.6536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.87273 0.72896 -3.941 9.22e-05 ***
## educ 0.59897 0.05128 11.679 < 2e-16 ***
## exper 0.02234 0.01206 1.853 0.0645 .
## tenure 0.16927 0.02164 7.820 2.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared: 0.3064, Adjusted R-squared: 0.3024
## F-statistic: 76.87 on 3 and 522 DF, p-value: < 2.2e-16
residuals1 <- residuals(model1)
# Plot histogram of residuals
hist(residuals1, breaks = 30, main = "Histogram of Residuals (Level-Level Model)",
xlab = "Residuals", col = "skyblue", border = "black")
#(ii) Estimate the model with log(wage) as the dependent variable:
# Log-level model
model2 <- lm(log(wage) ~ educ + exper + tenure, data = wage1)
# View model summary
summary(model2)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05802 -0.29645 -0.03265 0.28788 1.42809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.284360 0.104190 2.729 0.00656 **
## educ 0.092029 0.007330 12.555 < 2e-16 ***
## exper 0.004121 0.001723 2.391 0.01714 *
## tenure 0.022067 0.003094 7.133 3.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared: 0.316, Adjusted R-squared: 0.3121
## F-statistic: 80.39 on 3 and 522 DF, p-value: < 2.2e-16
# Save the Residuals and Plot a Histogram
residuals2 <- residuals(model2)
hist(residuals2, breaks = 30, main = "Histogram of Residuals (Log-Level Model)",
xlab = "Residuals", col = "lightgreen", border = "black")
#(iii) Interpretation: MLR.6 (normality of the errors) is more plausible for the log-level
# model. The level-model residuals are strongly right-skewed (min -7.61, max 14.65), while the
# log-model residuals are roughly symmetric (min -2.06, max 1.43). Shapiro-Wilk tests on the
# two residual series formalize the comparison:
shapiro.test(residuals1) # For level-level model
##
## Shapiro-Wilk normality test
##
## data: residuals1
## W = 0.89317, p-value < 2.2e-16
shapiro.test(residuals2) # For log-level model
##
## Shapiro-Wilk normality test
##
## data: residuals2
## W = 0.98946, p-value = 0.000787
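# A visual complement to the Shapiro-Wilk tests (a sketch, not required by the exercise).
# Both tests reject exact normality, but W is much closer to 1 for the log model (0.989 vs
# 0.893), and side-by-side normal Q-Q plots should show the log-level residuals tracking the
# reference line far more closely.
par(mfrow = c(1, 2))
qqnorm(residuals1, main = "Level-Level Residuals"); qqline(residuals1, col = "red")
qqnorm(residuals2, main = "Log-Level Residuals");   qqline(residuals2, col = "red")
par(mfrow = c(1, 1))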