library(wooldridge)
data <- wooldridge::countymurders
data_1996 <- data[data$year == 1996, ]
#Part (i)
# Number of counties with zero murders
zero_murders <- sum(data_1996$murders == 0)
# Number of counties with at least one execution
one_or_more_executions <- sum(data_1996$execs > 0)
# Maximum number of executions
max_executions <- max(data_1996$execs)
# Display results
cat("Counties with zero murders:", zero_murders, "\n")
## Counties with zero murders: 1051
cat("Counties with at least one execution:", one_or_more_executions, "\n")
## Counties with at least one execution: 31
cat("Largest number of executions:", max_executions, "\n")
## Largest number of executions: 3
#Part (ii),(iii)
# Run OLS regression
model <- lm(murders ~ execs, data = data_1996)
# Display summary of the model
summary(model)
##
## Call:
## lm(formula = murders ~ execs, data = data_1996)
##
## Residuals:
## Min 1Q Median 3Q Max
## -149.12 -5.46 -4.46 -2.46 1338.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.4572 0.8348 6.537 7.79e-11 ***
## execs 58.5555 5.8333 10.038 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38.89 on 2195 degrees of freedom
## Multiple R-squared: 0.04389, Adjusted R-squared: 0.04346
## F-statistic: 100.8 on 1 and 2195 DF, p-value: < 2.2e-16
#Part (iv)
# Predicted number of murders when executions = 0
predicted_murders <- predict(model, newdata = data.frame(execs = 0))
# Residual for a county with zero executions and zero murders
residual <- 0 - predicted_murders
# Display results
cat("Smallest predicted number of murders:", predicted_murders, "\n")
## Smallest predicted number of murders: 5.457241
cat("Residual for a county with zero executions and zero murders:", residual, "\n")
## Residual for a county with zero executions and zero murders: -5.457241
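# A quick consistency check (a sketch, not required by the exercise): because the slope on
# execs is positive and execs is never negative, the smallest fitted value is the intercept,
# i.e., the prediction for a county with zero executions (about 5.457).
min(fitted(model))          # should equal predicted_murders
sum(data_1996$execs == 0)   # number of counties at that minimum prediction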
#Part (v)
# A simple regression is unlikely to capture the true deterrent effect of capital punishment:
#  - Omitted variables: factors such as economic conditions, population density, and law
#    enforcement strength affect murder rates but are excluded, causing omitted variable bias.
#  - Endogeneity: the number of executions may itself respond to the murder rate, so there is
#    potential reverse causation.
#  - Causal inference: a simple regression establishes association, not causation, so the
#    estimated relationship does not show that executions reduce murders.
# The small simulation below illustrates the omitted-variable point.
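# A minimal simulation of the omitted-variable problem (hypothetical numbers, not the county
# data): assume a true deterrent effect of -0.5 murders per execution, but let executions occur
# mainly where an unobserved crime environment already pushes murders up. The simple-regression
# slope then tends to come out positive even though the true causal effect is negative.
set.seed(42)
n_sim <- 2000
crime_env <- rnorm(n_sim)                                              # unobserved crime environment
execs_sim <- pmax(0, round(0.5 * crime_env + rnorm(n_sim, sd = 0.5)))  # more executions where crime is high
murders_sim <- pmax(0, 5 + 3 * crime_env - 0.5 * execs_sim + rnorm(n_sim, sd = 2))
coef(lm(murders_sim ~ execs_sim))["execs_sim"]                         # typically positive: upward bias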
# Next exercise: GPA and the allocation of a student's 168 weekly hours (study, sleep, work, leisure).
# First let's create some sample data to illustrate the perfect-collinearity problem
set.seed(123)
n <- 50 # number of students
# Generate random hours that sum to 168 for each student
generate_hours <- function(n) {
  # Initialize matrix to store hours
  hours <- matrix(0, nrow = n, ncol = 4)
  colnames(hours) <- c("study", "sleep", "work", "leisure")
  for (i in 1:n) {
    # Generate random proportions that sum to 1
    props <- runif(4)
    props <- props / sum(props)
    # Multiply by 168 to get hours
    hours[i, ] <- props * 168
  }
  return(as.data.frame(hours))
}
# Generate data
data <- generate_hours(n)
# Add some realistic GPA values
data$GPA <- 2 + 0.02*data$study - 0.01*data$work + rnorm(n, 0, 0.2)
# Original model
model1 <- lm(GPA ~ study + sleep + work + leisure, data=data)
summary(model1)
##
## Call:
## lm(formula = GPA ~ study + sleep + work + leisure, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.35235 -0.11484 -0.00878 0.12340 0.46459
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.9105138 0.1894867 10.083 3.13e-13 ***
## study 0.0213723 0.0016891 12.653 < 2e-16 ***
## sleep 0.0000505 0.0016660 0.030 0.976
## work -0.0106181 0.0018337 -5.791 5.96e-07 ***
## leisure NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2008 on 46 degrees of freedom
## Multiple R-squared: 0.8987, Adjusted R-squared: 0.8921
## F-statistic: 136 on 3 and 46 DF, p-value: < 2.2e-16
# This model violates MLR.3 (no perfect collinearity) because:
print("The sum of all activities equals 168, creating perfect multicollinearity:")
## [1] "The sum of all activities equals 168, creating perfect multicollinearity:"
print("leisure = 168 - study - sleep - work")
## [1] "leisure = 168 - study - sleep - work"
# Reformulated model: rescale hours to proportions of the 168-hour week and drop one category
# (leisure). Dropping a category is what removes the perfect collinearity; the remaining
# coefficients are then interpreted relative to time spent on leisure.
data_prop <- data
data_prop[, 1:4] <- data_prop[, 1:4] / 168
model2 <- lm(GPA ~ study + sleep + work, data = data_prop)
summary(model2)
##
## Call:
## lm(formula = GPA ~ study + sleep + work, data = data_prop)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.35235 -0.11484 -0.00878 0.12340 0.46459
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.910514 0.189487 10.083 3.13e-13 ***
## study 3.590555 0.283760 12.653 < 2e-16 ***
## sleep 0.008485 0.279882 0.030 0.976
## work -1.783835 0.308056 -5.791 5.96e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2008 on 46 degrees of freedom
## Multiple R-squared: 0.8987, Adjusted R-squared: 0.8921
## F-statistic: 136 on 3 and 46 DF, p-value: < 2.2e-16
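# A small consistency check on the two fits (a sketch, not part of the exercise): rescaling
# hours to proportions of a 168-hour week multiplies each slope by 168 while leaving the fit,
# t statistics, and R-squared unchanged, so the two summaries above tell the same story.
coef(model1)[c("study", "sleep", "work")] * 168
coef(model2)[c("study", "sleep", "work")]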
# Create visualization to show relationships
plot(data$study, data$GPA,
xlab="Study Hours",
ylab="GPA",
main="Relationship between Study Hours and GPA")
abline(lm(GPA ~ study, data=data), col="red")
# Problem 10
# Set seed for reproducibility
set.seed(123)
n <- 1000 # Sample size
#===============================================================================
# (i) If x₁ is highly correlated with x₂ and x₃ in the sample, and x₂ and x₃
# have large partial effects on y, would you expect β̃₁ and β̂₁ to be similar
# or very different?
#===============================================================================
# Generate correlated predictors
x1_i <- rnorm(n)
x2_i <- 0.8*x1_i + rnorm(n, sd=0.6) # High correlation with x1
x3_i <- 0.7*x1_i + rnorm(n, sd=0.7) # High correlation with x1
# Generate y with large partial effects
y_i <- 2*x1_i + 3*x2_i + 4*x3_i + rnorm(n, sd=1)
# Run both regressions
simple_reg_i <- lm(y_i ~ x1_i)
multiple_reg_i <- lm(y_i ~ x1_i + x2_i + x3_i)
# Display results
cat("\nResults for (i):")
##
## Results for (i):
cat("\nSimple regression coefficient (β̃₁):", round(coef(simple_reg_i)[2], 3))
##
## Simple regression coefficient (β̃₁): 7.302
cat("\nMultiple regression coefficient (β̂₁):", round(coef(multiple_reg_i)[2], 3))
##
## Multiple regression coefficient (β̂₁): 1.958
cat("\nCorrelations: cor(x₁,x₂) =", round(cor(x1_i, x2_i), 3),
", cor(x₁,x₃) =", round(cor(x1_i, x3_i), 3))
##
## Correlations: cor(x₁,x₂) = 0.814 , cor(x₁,x₃) = 0.705
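# An algebra check of the (i) result (a sketch, not part of the assignment): with x1 as the
# only included regressor, the simple-regression slope equals the multiple-regression slope
# plus each excluded coefficient times the slope from regressing that excluded variable on x1.
delta2 <- coef(lm(x2_i ~ x1_i))[2]
delta3 <- coef(lm(x3_i ~ x1_i))[2]
coef(multiple_reg_i)[2] + coef(multiple_reg_i)[3] * delta2 + coef(multiple_reg_i)[4] * delta3
coef(simple_reg_i)[2]   # should match the value above (about 7.30)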
#===============================================================================
# (ii) If x₁ is almost uncorrelated with x₂ and x₃, but x₂ and x₃ are highly
# correlated, will β̃₁ and β̂₁ tend to be similar or very different?
#===============================================================================
# Generate uncorrelated x1 and correlated x2, x3
x1_ii <- rnorm(n)
x2_ii <- rnorm(n)
x3_ii <- 0.9*x2_ii + rnorm(n, sd=0.4) # High correlation with x2
# Generate y
y_ii <- 2*x1_ii + 3*x2_ii + 4*x3_ii + rnorm(n, sd=1)
# Run both regressions
simple_reg_ii <- lm(y_ii ~ x1_ii)
multiple_reg_ii <- lm(y_ii ~ x1_ii + x2_ii + x3_ii)
# Display results
cat("\n\nResults for (ii):")
##
##
## Results for (ii):
cat("\nSimple regression coefficient (β̃₁):", round(coef(simple_reg_ii)[2], 3))
##
## Simple regression coefficient (β̃₁): 1.998
cat("\nMultiple regression coefficient (β̂₁):", round(coef(multiple_reg_ii)[2], 3))
##
## Multiple regression coefficient (β̂₁): 1.923
cat("\nCorrelations: cor(x₁,x₂) =", round(cor(x1_ii, x2_ii), 3),
", cor(x₂,x₃) =", round(cor(x2_ii, x3_ii), 3))
##
## Correlations: cor(x₁,x₂) = 0.014 , cor(x₂,x₃) = 0.912
#===============================================================================
# (iii) If x₁ is highly correlated with x₂ and x₃, and x₂ and x₃ have small
# partial effects on y, would you expect se(β̃₁) or se(β̂₁) to be smaller?
#===============================================================================
# Generate correlated predictors
x1_iii <- rnorm(n)
x2_iii <- 0.8*x1_iii + rnorm(n, sd=0.6) # High correlation with x1
x3_iii <- 0.7*x1_iii + rnorm(n, sd=0.7) # High correlation with x1
# Generate y with small partial effects for x2, x3
y_iii <- 2*x1_iii + 0.1*x2_iii + 0.1*x3_iii + rnorm(n, sd=1)
# Run both regressions
simple_reg_iii <- lm(y_iii ~ x1_iii)
multiple_reg_iii <- lm(y_iii ~ x1_iii + x2_iii + x3_iii)
# Display results
cat("\n\nResults for (iii):")
##
##
## Results for (iii):
cat("\nSimple regression SE (se(β̃₁)):", round(summary(simple_reg_iii)$coefficients[2,2], 3))
##
## Simple regression SE (se(β̃₁)): 0.03
cat("\nMultiple regression SE (se(β̂₁)):", round(summary(multiple_reg_iii)$coefficients[2,2], 3))
##
## Multiple regression SE (se(β̂₁)): 0.06
cat("\nCorrelations: cor(x₁,x₂) =", round(cor(x1_iii, x2_iii), 3),
", cor(x₁,x₃) =", round(cor(x1_iii, x3_iii), 3))
##
## Correlations: cor(x₁,x₂) = 0.816 , cor(x₁,x₃) = 0.727
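# A rough look at why se(beta_hat_1) is the larger one here (a sketch): the multiple-regression
# variance of beta_hat_1 is scaled up by 1/(1 - R1^2), where R1^2 comes from regressing x1 on
# the other regressors (the variance inflation factor). The two models' residual variances also
# differ, so this is an approximation rather than an exact decomposition.
r2_x1 <- summary(lm(x1_iii ~ x2_iii + x3_iii))$r.squared
vif_x1 <- 1 / (1 - r2_x1)
cat("\nVIF for x1:", round(vif_x1, 2),
    "-> SE inflated by a factor of about", round(sqrt(vif_x1), 2))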
#===============================================================================
# (iv) If x₁ is almost uncorrelated with x₂ and x₃, x₂ and x₃ have large
# partial effects on y, and x₂ and x₃ are highly correlated, would you expect
# se(β̃₁) or se(β̂₁) to be smaller?
#===============================================================================
# Generate uncorrelated x1 and correlated x2, x3
x1_iv <- rnorm(n)
x2_iv <- rnorm(n)
x3_iv <- 0.9*x2_iv + rnorm(n, sd=0.4) # High correlation with x2
# Generate y with large partial effects
y_iv <- 2*x1_iv + 3*x2_iv + 4*x3_iv + rnorm(n, sd=1)
# Run both regressions
simple_reg_iv <- lm(y_iv ~ x1_iv)
multiple_reg_iv <- lm(y_iv ~ x1_iv + x2_iv + x3_iv)
# Display results
cat("\n\nResults for (iv):")
##
##
## Results for (iv):
cat("\nSimple regression SE (se(β̃₁)):", round(summary(simple_reg_iv)$coefficients[2,2], 3))
##
## Simple regression SE (se(β̃₁)): 0.216
cat("\nMultiple regression SE (se(β̂₁)):", round(summary(multiple_reg_iv)$coefficients[2,2], 3))
##
## Multiple regression SE (se(β̂₁)): 0.031
cat("\nCorrelations: cor(x₁,x₂) =", round(cor(x1_iv, x2_iv), 3),
", cor(x₂,x₃) =", round(cor(x2_iv, x3_iv), 3))
##
## Correlations: cor(x₁,x₂) = -0.017 , cor(x₂,x₃) = 0.914
# (i) Find the average values and standard deviations of prpblck and income.
data("discrim")  # discrim data set from the wooldridge package
# Mean and standard deviation of prpblck and income
mean_prpblck <- mean(discrim$prpblck, na.rm = TRUE)
sd_prpblck <- sd(discrim$prpblck, na.rm = TRUE)
mean_income <- mean(discrim$income, na.rm = TRUE)
sd_income <- sd(discrim$income, na.rm = TRUE)
cat("Average prpblck: ", mean_prpblck, ", SD: ", sd_prpblck, "\n")
## Average prpblck: 0.1134864 , SD: 0.1824165
cat("Average income: ", mean_income, ", SD: ", sd_income, "\n")
## Average income: 47053.78 , SD: 13179.29
# (ii) Estimate the OLS model psoda = β₀ + β₁prpblck + β₂income + u.
# OLS regression of psoda on prpblck and income
model_ols <- lm(psoda ~ prpblck + income, data = discrim)
summary(model_ols)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
# Interpret the coefficient on prpblck: holding income fixed, moving from an all-white to an
# all-black ZIP code (prpblck from 0 to 1) is associated with a soda price about $0.115
# (11.5 cents) higher.
coef_prpblck <- coef(model_ols)["prpblck"]
cat("Coefficient on prpblck: ", coef_prpblck, "\n")
## Coefficient on prpblck: 0.1149882
# (iii) Compare the estimate from part (ii) with the simple regression psoda ~ prpblck.
# Simple regression of psoda on prpblck
model_simple <- lm(psoda ~ prpblck, data = discrim)
summary(model_simple)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
# Compare the prpblck coefficients from both models: the simple-regression estimate (0.065) is
# noticeably smaller than the multiple-regression estimate (0.115). By the omitted variable
# bias logic, income raises psoda and is negatively correlated with prpblck, so leaving income
# out biases the prpblck coefficient toward zero.
cat("Coefficient from simple regression: ", coef(model_simple)["prpblck"], "\n")
## Coefficient from simple regression: 0.06492687
cat("Coefficient from multiple regression: ", coef(model_ols)["prpblck"], "\n")
## Coefficient from multiple regression: 0.1149882
# (iv) Estimate the log-linear model log(psoda) = β₀ + β₁prpblck + β₂log(income) + u.
# Log-linear regression of log(psoda) on prpblck and log(income)
model_log <- lm(log(psoda) ~ prpblck + log(income), data = discrim)
summary(model_log)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33563 -0.04695 0.00658 0.04334 0.35413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.79377 0.17943 -4.424 1.25e-05 ***
## prpblck 0.12158 0.02575 4.722 3.24e-06 ***
## log(income) 0.07651 0.01660 4.610 5.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0821 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06809, Adjusted R-squared: 0.06341
## F-statistic: 14.54 on 2 and 398 DF, p-value: 8.039e-07
# Coefficient on prpblck from log-linear model
beta_prpblck_log <- coef(model_log)["prpblck"]
# Calculate the percentage change for a 0.20 increase in prpblck
percentage_change_psoda <- beta_prpblck_log * 0.20 * 100
cat("Estimated percentage change in psoda: ", percentage_change_psoda, "%\n")
## Estimated percentage change in psoda: 2.431605 %
# (v) Add the variable prppov to the regression model.
# Log-linear regression with prppov added
model_log_prppov <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
summary(model_log_prppov)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08137 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08696, Adjusted R-squared: 0.08006
## F-statistic: 12.6 on 3 and 397 DF, p-value: 6.917e-08
# Check the new coefficient of prpblck: it falls from about 0.122 to 0.073 once prppov is
# controlled for, because prppov is correlated with both prpblck and the soda price.
cat("New coefficient of prpblck: ", coef(model_log_prppov)["prpblck"], "\n")
## New coefficient of prpblck: 0.07280726
# (vi) Find the correlation between log(income) and prppov.
# Correlation between log(income) and prppov
correlation_log_income_prppov <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")
cat("Correlation between log(income) and prppov: ", correlation_log_income_prppov, "\n")
## Correlation between log(income) and prppov: -0.838467
# (vii) Evaluate the statement about multicollinearity.
# The correlation between log(income) and prppov is about -0.84, so the two variables are
# highly (though not perfectly) correlated. This multicollinearity inflates the standard errors
# of both coefficients and makes the individual estimates less precise, so the statement has
# some merit; whether to keep both regressors depends on whether each captures a distinct
# effect (income level versus poverty) that we want to control for. A rough by-hand VIF check
# is sketched below.
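# Variance inflation factors computed by hand from auxiliary regressions (a sketch; the
# subsample below reproduces the estimation sample used in the regressions above):
sub <- na.omit(discrim[, c("psoda", "prpblck", "income", "prppov")])
vif_prpblck <- 1 / (1 - summary(lm(prpblck ~ log(income) + prppov, data = sub))$r.squared)
vif_lincome <- 1 / (1 - summary(lm(log(income) ~ prpblck + prppov, data = sub))$r.squared)
vif_prppov  <- 1 / (1 - summary(lm(prppov ~ prpblck + log(income), data = sub))$r.squared)
round(c(prpblck = vif_prpblck, log_income = vif_lincome, prppov = vif_prppov), 2)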
# Simulate a data set of n = 32 firms consistent with the given estimated equation
# rdintens = 0.472 + 0.321*log(sales) + 0.050*profmarg
n <- 32
sales <- runif(n, 10, 100)
log_sales <- log(sales)
profmarg <- runif(n, 0, 20)
rdintens <- 0.472 + 0.321 * log_sales + 0.050 * profmarg + rnorm(n, 0, 1)
# Create a data frame
data <- data.frame(rdintens = rdintens, sales = sales, log_sales = log_sales, profmarg = profmarg)
# Run the regression model
model <- lm(rdintens ~ log_sales + profmarg, data = data)
# Output model summary to check coefficients and significance
summary(model)
##
## Call:
## lm(formula = rdintens ~ log_sales + profmarg, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.08745 -0.42009 0.04894 0.65782 1.88213
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.69781 1.06735 1.591 0.123
## log_sales 0.03833 0.27801 0.138 0.891
## profmarg 0.01837 0.03164 0.581 0.566
##
## Residual standard error: 1.092 on 29 degrees of freedom
## Multiple R-squared: 0.01261, Adjusted R-squared: -0.05549
## F-statistic: 0.1851 on 2 and 29 DF, p-value: 0.832
# (i) Estimated change in rdintens (in percentage points) when sales increases by 10%:
# in a level-log model the exact change is beta1 * log(1.10)
estimated_effect <- 0.321 * log(1.10)
estimated_effect
## [1] 0.03059457
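# For comparison (a sketch, not part of the original answer): the usual approximation
# beta1 * (%Δsales) / 100 gives nearly the same number for a 10% increase in sales.
approx_effect <- 0.321 * 10 / 100
c(exact = estimated_effect, approx = approx_effect)   # about 0.031 vs 0.032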
#(i) Filter Data for Single-Person Households
data("k401ksubs")
# Filter dataset for single-person households (fsize == 1)
single_person_data <- subset(k401ksubs, fsize == 1)
num_single_person <- nrow(single_person_data)
num_single_person
## [1] 2017
model <- lm(nettfa ~ inc + age, data = single_person_data)
summary(model)
##
## Call:
## lm(formula = nettfa ~ inc + age, data = single_person_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -179.95 -14.16 -3.42 6.03 1113.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.03981 4.08039 -10.548 <2e-16 ***
## inc 0.79932 0.05973 13.382 <2e-16 ***
## age 0.84266 0.09202 9.158 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1185
## F-statistic: 136.5 on 2 and 2014 DF, p-value: < 2.2e-16
beta2 <- coef(model)["age"]
se_beta2 <- summary(model)$coefficients["age", "Std. Error"]
# t-statistic for H0: beta_age = 1 against the one-sided alternative H1: beta_age < 1
t_stat <- (beta2 - 1) / se_beta2
# One-sided (lower-tail) p-value using the residual degrees of freedom
p_value <- pt(t_stat, df = df.residual(model))
t_stat
## age
## -1.709944
p_value
## age
## 0.04371514
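# The decision, spelled out (a short sketch): compare the t statistic to one-sided critical
# values. With roughly 2014 degrees of freedom the 5% and 1% critical values are about -1.65
# and -2.33, so H0: beta_age = 1 is rejected at the 5% level but not at the 1% level.
qt(c(0.05, 0.01), df = df.residual(model))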
simple_model <- lm(nettfa ~ inc, data = single_person_data)
summary(simple_model)
##
## Call:
## lm(formula = nettfa ~ inc, data = single_person_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185.12 -12.85 -4.85 1.78 1112.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.5709 2.0607 -5.13 3.18e-07 ***
## inc 0.8207 0.0609 13.48 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.59 on 2015 degrees of freedom
## Multiple R-squared: 0.08267, Adjusted R-squared: 0.08222
## F-statistic: 181.6 on 1 and 2015 DF, p-value: < 2.2e-16
# Comparison: the estimated coefficient on inc is 0.821 in the simple regression versus 0.799
# in the multiple regression, so the two are very similar. This suggests that age is not a
# strong confounder here (age and inc appear to be only weakly related in this sample), so
# omitting age causes little omitted variable bias in the estimated effect of inc on nettfa.
data("econmath")
# Load ggplot2 for visualization
library(ggplot2)
# Create histogram with normal distribution overlay
ggplot(econmath, aes(x = score)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "skyblue", color = "black", alpha = 0.7) +
stat_function(fun = dnorm,
args = list(mean = mean(econmath$score, na.rm = TRUE),
sd = sd(econmath$score, na.rm = TRUE)),
color = "blue", size = 1) +
labs(title = "Histogram of Course Scores with Normal Distribution Fit",
x = "Course Score (in percentage form)",
y = "Density") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# (i) Calculate the probability that score > 100 under the fitted normal distribution:
p_over_100 <- 1 - pnorm(100, mean = mean(econmath$score, na.rm = TRUE), sd = sd(econmath$score, na.rm = TRUE))
print(p_over_100)
## [1] 0.02044288
# Left tail: the histogram puts more mass at low scores (roughly 20-50) than the fitted normal
# curve predicts; the empirical distribution has a longer, thicker left tail.
# Fit in the left tail: the normal distribution therefore underestimates the frequency of low
# scores, indicating that score is left-skewed relative to the normal. This, together with the
# roughly 2% probability the fitted normal assigns to scores above 100, suggests the normal
# distribution is not a good model for score, especially in the tails.
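# A quick numerical version of the left-tail claim (a sketch; the 50-point cutoff is an
# arbitrary illustration, not part of the assignment): compare the observed share of scores
# below 50 with the share the fitted normal distribution would predict.
mu <- mean(econmath$score, na.rm = TRUE)
sigma <- sd(econmath$score, na.rm = TRUE)
empirical_left <- mean(econmath$score < 50, na.rm = TRUE)  # observed share below 50
normal_left <- pnorm(50, mean = mu, sd = sigma)            # share implied by the fitted normal
cat("Share of scores below 50 - empirical:", round(empirical_left, 4),
    "vs fitted normal:", round(normal_left, 4), "\n")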
#(i) Estimate the equation:
data("wage1")
# Level-level model
model1 <- lm(wage ~ educ + exper + tenure, data = wage1)
# View model summary
summary(model1)
##
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6068 -1.7747 -0.6279 1.1969 14.6536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.87273 0.72896 -3.941 9.22e-05 ***
## educ 0.59897 0.05128 11.679 < 2e-16 ***
## exper 0.02234 0.01206 1.853 0.0645 .
## tenure 0.16927 0.02164 7.820 2.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared: 0.3064, Adjusted R-squared: 0.3024
## F-statistic: 76.87 on 3 and 522 DF, p-value: < 2.2e-16
residuals1 <- residuals(model1)
# Plot histogram of residuals
hist(residuals1, breaks = 30, main = "Histogram of Residuals (Level-Level Model)",
xlab = "Residuals", col = "skyblue", border = "black")
#(ii) Estimate the model with log(wage) as the dependent variable:
# Log-level model
model2 <- lm(log(wage) ~ educ + exper + tenure, data = wage1)
# View model summary
summary(model2)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05802 -0.29645 -0.03265 0.28788 1.42809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.284360 0.104190 2.729 0.00656 **
## educ 0.092029 0.007330 12.555 < 2e-16 ***
## exper 0.004121 0.001723 2.391 0.01714 *
## tenure 0.022067 0.003094 7.133 3.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared: 0.316, Adjusted R-squared: 0.3121
## F-statistic: 80.39 on 3 and 522 DF, p-value: < 2.2e-16
# Save the Residuals and Plot a Histogram
residuals2 <- residuals(model2)
hist(residuals2, breaks = 30, main = "Histogram of Residuals (Log-Level Model)",
xlab = "Residuals", col = "lightgreen", border = "black")
#(iii) Interpretation: MLR.6 (normality of the errors) is more plausible for the log-level
# model. The level-model residuals are strongly right-skewed (min -7.61, max 14.65), while the
# log-model residuals are roughly symmetric (min -2.06, max 1.43). Shapiro-Wilk tests on the
# two residual series formalize the comparison:
shapiro.test(residuals1) # For level-level model
##
## Shapiro-Wilk normality test
##
## data: residuals1
## W = 0.89317, p-value < 2.2e-16
shapiro.test(residuals2) # For log-level model
##
## Shapiro-Wilk normality test
##
## data: residuals2
## W = 0.98946, p-value = 0.000787
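# A visual complement to the Shapiro-Wilk tests (a sketch, not required by the exercise).
# Both tests reject exact normality, but W is much closer to 1 for the log model (0.989 vs
# 0.893), and side-by-side normal Q-Q plots should show the log-level residuals tracking the
# reference line far more closely.
par(mfrow = c(1, 2))
qqnorm(residuals1, main = "Level-Level Residuals"); qqline(residuals1, col = "red")
qqnorm(residuals2, main = "Log-Level Residuals");   qqline(residuals2, col = "red")
par(mfrow = c(1, 1))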