# Chapter 2 - C9
library(wooldridge)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
data("countymurders")
murders_1996 <- countymurders[countymurders$year == 1996, ]
# 1. Counties with zero murders and counties with executions
zero_murders <- sum(murders_1996$murders == 0)
cat("Number of counties with zero murders:", zero_murders, "\n")
## Number of counties with zero murders: 1051
counties_with_executions <- sum(murders_1996$execs > 0)
cat("Number of counties with at least one execution:", counties_with_executions, "\n")
## Number of counties with at least one execution: 31
max_executions <- max(murders_1996$execs)
cat("Largest number of executions:", max_executions, "\n")
## Largest number of executions: 3
# 2. OLS regression of murders on executions
model <- lm(murders ~ execs, data = murders_1996)
summary_stats <- summary(model)
print(summary_stats)
##
## Call:
## lm(formula = murders ~ execs, data = murders_1996)
##
## Residuals:
## Min 1Q Median 3Q Max
## -149.12 -5.46 -4.46 -2.46 1338.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.4572 0.8348 6.537 7.79e-11 ***
## execs 58.5555 5.8333 10.038 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38.89 on 2195 degrees of freedom
## Multiple R-squared: 0.04389, Adjusted R-squared: 0.04346
## F-statistic: 100.8 on 1 and 2195 DF, p-value: < 2.2e-16
n <- nobs(model)
r_squared <- summary_stats$r.squared
adj_r_squared <- summary_stats$adj.r.squared
cat("\nRegression Results:")
##
## Regression Results:
cat("\nSample size:", n)
##
## Sample size: 2197
cat("\nR-squared:", round(r_squared, 4))
##
## R-squared: 0.0439
cat("\nAdjusted R-squared:", round(adj_r_squared, 4))
##
## Adjusted R-squared: 0.0435
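# A 95% confidence interval for the execs slope complements the t statistic
# reported above; a minimal sketch using base R's confint() on the fitted model:
ci_execs <- confint(model, "execs", level = 0.95)
print(ci_execs)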
# 3. Visualization and Interpretation
ggplot(murders_1996, aes(x = execs, y = murders)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(title = "Relationship between Executions and Murders",
       x = "Number of Executions",
       y = "Number of Murders") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

coef_summary <- coef(summary(model))
beta1 <- coef_summary["execs", "Estimate"]
se_beta1 <- coef_summary["execs", "Std. Error"]
t_stat <- coef_summary["execs", "t value"]
p_value <- coef_summary["execs", "Pr(>|t|)"]
cat("\nSlope Coefficient Analysis:")
##
## Slope Coefficient Analysis:
cat("\nβ1 (slope):", round(beta1, 4))
##
## β1 (slope): 58.5555
cat("\nStandard Error:", round(se_beta1, 4))
##
## Standard Error: 5.8333
cat("\nt-statistic:", round(t_stat, 4))
##
## t-statistic: 10.0382
cat("\np-value:", format.pval(p_value, digits = 4))
##
## p-value: < 2.2e-16
# 4. Predictions and Residuals
new_data <- data.frame(execs = 0)
predicted_murders <- predict(model, newdata = new_data)
actual_murders <- 0
residual <- actual_murders - predicted_murders[1]
cat("\n\nPredictions for County with Zero Executions:")
##
##
## Predictions for County with Zero Executions:
cat("\nPredicted murders:", round(predicted_murders[1], 4))
##
## Predicted murders: 5.4572
cat("\nResidual (assuming zero actual murders):", round(residual, 4))
##
## Residual (assuming zero actual murders): -5.4572
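# predict() can also attach interval estimates to this point prediction; a
# minimal sketch of a 95% prediction interval for a county with zero
# executions, assuming the classical error assumptions hold:
predict(model, newdata = new_data, interval = "prediction", level = 0.95)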
# 5. Diagnostic Plots for Regression Assumptions
par(mfrow = c(2, 2))
plot(model)

# ncvTest() is car's score test for non-constant variance, a Breusch-Pagan-type
# test against the fitted values
bp_test <- car::ncvTest(model)
cat("\n\nDiagnostic Tests:")
##
##
## Diagnostic Tests:
cat("\nBreusch-Pagan test p-value:", format.pval(bp_test$p, digits = 4))
##
## Breusch-Pagan test p-value: < 2.2e-16
sw_test <- shapiro.test(residuals(model))
cat("\nShapiro-Wilk test p-value:", format.pval(sw_test$p, digits = 4))
##
## Shapiro-Wilk test p-value: < 2.2e-16
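# Both diagnostic tests reject strongly, so the classical standard errors
# above are suspect. A minimal sketch of heteroskedasticity-robust (HC1)
# standard errors, assuming the sandwich and lmtest packages are installed
# (they are loaded for Chapter 4 - C8 below):
lmtest::coeftest(model, vcov = sandwich::vcovHC(model, type = "HC1"))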
# Chapter 3 - 5
set.seed(123)
n <- 100 # number of students
# Simulate weekly time allocations that sum exactly to 168 hours
generate_time <- function(n) {
  times <- matrix(0, nrow = n, ncol = 4)
  colnames(times) <- c("study", "sleep", "work", "leisure")
  for (i in 1:n) {
    props <- runif(4)
    props <- props / sum(props) * 168 # rescale so the four uses sum to 168 hours
    times[i, ] <- props
  }
  return(as.data.frame(times))
}
student_data <- generate_time(n)
student_data$GPA <- with(student_data,
                         2 + 0.02*study - 0.01*sleep + 0.005*work - 0.015*leisure + rnorm(n, 0, 0.5))
print("Sum of hours for first few students:")
## [1] "Sum of hours for first few students:"
head(rowSums(student_data[,1:4]))
## [1] 168 168 168 168 168 168
# try() is defensive here: lm() does not fail under perfect collinearity, it
# simply drops one of the regressors
model_original <- try(lm(GPA ~ study + sleep + work + leisure, data = student_data))
print("\nOriginal model results:")
## [1] "\nOriginal model results:"
print(summary(model_original))
##
## Call:
## lm(formula = GPA ~ study + sleep + work + leisure, data = student_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00634 -0.32642 -0.08686 0.34959 0.99598
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.017485 0.278461 -0.063 0.950
## study 0.032280 0.002747 11.752 < 2e-16 ***
## sleep 0.002054 0.002502 0.821 0.414
## work 0.015001 0.002544 5.896 5.55e-08 ***
## leisure NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4727 on 96 degrees of freedom
## Multiple R-squared: 0.6641, Adjusted R-squared: 0.6536
## F-statistic: 63.26 on 3 and 96 DF, p-value: < 2.2e-16
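# leisure is dropped ("1 not defined because of singularities") because
# study + sleep + work + leisure = 168 for every student, an exact linear
# dependency with the intercept. A minimal sketch that displays the aliased
# coefficient explicitly, using base R's alias():
alias(model_original)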
student_data$study_prop <- student_data$study/168
student_data$sleep_prop <- student_data$sleep/168
student_data$work_prop <- student_data$work/168
model_reformed <- lm(GPA ~ study_prop + sleep_prop + work_prop, data = student_data)
print("\nReformed model results:")
## [1] "\nReformed model results:"
print(summary(model_reformed))
##
## Call:
## lm(formula = GPA ~ study_prop + sleep_prop + work_prop, data = student_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00634 -0.32642 -0.08686 0.34959 0.99598
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.01749 0.27846 -0.063 0.950
## study_prop 5.42308 0.46148 11.752 < 2e-16 ***
## sleep_prop 0.34506 0.42025 0.821 0.414
## work_prop 2.52018 0.42745 5.896 5.55e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4727 on 96 degrees of freedom
## Multiple R-squared: 0.6641, Adjusted R-squared: 0.6536
## F-statistic: 63.26 on 3 and 96 DF, p-value: < 2.2e-16
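# The rescaling is exact: each proportion coefficient is 168 times the
# corresponding hours coefficient (e.g., 0.032280 * 168 ≈ 5.42308 for study).
# A minimal sketch verifying this from the two fitted models:
coef(model_original)["study"] * 168
coef(model_reformed)["study_prop"]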
# Chapter 3 - 10
set.seed(123)
generate_correlated_data <- function(n, cor_matrix) {
  library(MASS) # attached on first call, hence the masking messages further below
  mu <- rep(0, ncol(cor_matrix))
  data <- mvrnorm(n, mu, cor_matrix)
  return(data)
}
# Function to analyze regression results
analyze_regression <- function(data) {
  # Simple regression
  simple_reg <- lm(y ~ x1, data = data.frame(data))
  # Multiple regression
  multiple_reg <- lm(y ~ x1 + x2 + x3, data = data.frame(data))
  # Extract coefficients and standard errors
  beta1_simple <- coef(simple_reg)[2]
  beta1_multiple <- coef(multiple_reg)[2]
  se_simple <- summary(simple_reg)$coefficients[2, 2]
  se_multiple <- summary(multiple_reg)$coefficients[2, 2]
  return(list(
    beta1_simple = beta1_simple,
    beta1_multiple = beta1_multiple,
    se_simple = se_simple,
    se_multiple = se_multiple
  ))
}
# Sample size
n <- 1000
# Scenario (i): x1 highly correlated with x2, x3; large partial effects
cor_matrix_1 <- matrix(c(
  1.0, 0.8, 0.8, 0.6,
  0.8, 1.0, 0.7, 0.7,
  0.8, 0.7, 1.0, 0.7,
  0.6, 0.7, 0.7, 1.0
), nrow = 4)
data_1 <- generate_correlated_data(n, cor_matrix_1)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:wooldridge':
##
## cement
colnames(data_1) <- c("x1", "x2", "x3", "y")
results_1 <- analyze_regression(data_1)
# Scenario (ii): x1 uncorrelated with x2, x3; x2 and x3 highly correlated
cor_matrix_2 <- matrix(c(
  1.0, 0.1, 0.1, 0.3,
  0.1, 1.0, 0.8, 0.6,
  0.1, 0.8, 1.0, 0.6,
  0.3, 0.6, 0.6, 1.0
), nrow = 4)
data_2 <- generate_correlated_data(n, cor_matrix_2)
colnames(data_2) <- c("x1", "x2", "x3", "y")
results_2 <- analyze_regression(data_2)
# Scenario (iii): x1 highly correlated with x2, x3; small partial effects
cor_matrix_3 <- matrix(c(
  1.0, 0.8, 0.8, 0.3,
  0.8, 1.0, 0.7, 0.2,
  0.8, 0.7, 1.0, 0.2,
  0.3, 0.2, 0.2, 1.0
), nrow = 4)
data_3 <- generate_correlated_data(n, cor_matrix_3)
colnames(data_3) <- c("x1", "x2", "x3", "y")
results_3 <- analyze_regression(data_3)
# Scenario (iv): x1 uncorrelated with x2, x3; large partial effects
cor_matrix_4 <- matrix(c(
  1.0, 0.1, 0.1, 0.6,
  0.1, 1.0, 0.8, 0.7,
  0.1, 0.8, 1.0, 0.7,
  0.6, 0.7, 0.7, 1.0
), nrow = 4)
data_4 <- generate_correlated_data(n, cor_matrix_4)
colnames(data_4) <- c("x1", "x2", "x3", "y")
results_4 <- analyze_regression(data_4)
# Print results for all scenarios
print_scenario_results <- function(scenario, results) {
  cat(sprintf("\nScenario %s:\n", scenario))
  cat(sprintf("Simple regression β1: %.3f (SE: %.3f)\n",
              results$beta1_simple, results$se_simple))
  cat(sprintf("Multiple regression β1: %.3f (SE: %.3f)\n",
              results$beta1_multiple, results$se_multiple))
}
print_scenario_results("(i)", results_1)
##
## Scenario (i):
## Simple regression β1: 0.644 (SE: 0.027)
## Multiple regression β1: -0.253 (SE: 0.042)
print_scenario_results("(ii)", results_2)
##
## Scenario (ii):
## Simple regression β1: 0.286 (SE: 0.031)
## Multiple regression β1: 0.223 (SE: 0.024)
print_scenario_results("(iii)", results_3)
##
## Scenario (iii):
## Simple regression β1: 0.323 (SE: 0.029)
## Multiple regression β1: 0.488 (SE: 0.060)
print_scenario_results("(iv)", results_4)
##
## Scenario (iv):
## Simple regression β1: 0.612 (SE: 0.025)
## Multiple regression β1: 0.533 (SE: 0.013)
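# Each scenario above reflects a single draw, so the SE comparisons are noisy.
# A minimal Monte Carlo sketch (replicate_ses is a hypothetical helper reusing
# the functions above) that averages the two standard errors of β1 over
# repeated samples for one scenario:
replicate_ses <- function(reps, cor_matrix) {
  ses <- replicate(reps, {
    d <- generate_correlated_data(n, cor_matrix)
    colnames(d) <- c("x1", "x2", "x3", "y")
    r <- analyze_regression(d)
    c(simple = r$se_simple, multiple = r$se_multiple)
  })
  rowMeans(ses)
}
replicate_ses(100, cor_matrix_1)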
# Chapter 3 - C8
# 1: Calculate the average values and standard deviations of prpblck and income
library(wooldridge)
library(dplyr)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
data("discrim")
summary_stats <- discrim %>%
  summarise(
    avg_prpblck = mean(prpblck, na.rm = TRUE),
    sd_prpblck = sd(prpblck, na.rm = TRUE),
    avg_income = mean(income, na.rm = TRUE),
    sd_income = sd(income, na.rm = TRUE)
  )
print(summary_stats)
## avg_prpblck sd_prpblck avg_income sd_income
## 1 0.1134864 0.1824165 47053.78 13179.29
# 2: Estimate psoda = β₀ + β₁*prpblck + β₂*income + u by OLS and interpret the coefficient on prpblck
model1 <- lm(psoda ~ prpblck + income, data = discrim)
summary(model1)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
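# Holding income fixed, a 0.10 increase in the proportion black is associated
# with a soda price roughly 1.15 cents higher; a minimal sketch of that
# arithmetic from the fitted model:
coef(model1)["prpblck"] * 0.10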
# 3: Compare with a simple regression of psoda on prpblck only
model_simple <- lm(psoda ~ prpblck, data = discrim)
summary(model_simple)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
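# The simple-regression estimate (0.065) is well below the multiple-regression
# one (0.115), consistent with omitted-variable bias: income has a positive
# effect and is negatively related to prpblck. A minimal sketch of the usual
# decomposition beta_simple = beta_prpblck + beta_income * delta, where delta
# comes from regressing income on prpblck over the same complete cases:
discrim_cc <- subset(discrim, !is.na(psoda) & !is.na(prpblck) & !is.na(income))
delta <- coef(lm(income ~ prpblck, data = discrim_cc))["prpblck"]
coef(model1)["prpblck"] + coef(model1)["income"] * delta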
# 4: Estimate the model with log(income), so the income effect is per proportional change in income
model_log <- lm(psoda ~ prpblck + log(income), data = discrim)
summary(model_log)
##
## Call:
## lm(formula = psoda ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29484 -0.05085 0.00346 0.04283 0.44069
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.18553 0.18800 0.987 0.324
## prpblck 0.12583 0.02697 4.665 4.23e-06 ***
## log(income) 0.07882 0.01739 4.533 7.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08602 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06628, Adjusted R-squared: 0.06159
## F-statistic: 14.13 on 2 and 398 DF, p-value: 1.184e-06
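# In this level-log form, a 10% rise in income is associated with a price
# change of about coef(log(income)) * log(1.10) dollars; a minimal sketch:
coef(model_log)["log(income)"] * log(1.10)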
# 5: Add prppov to the regression and observe the change in the prpblck coefficient
model_with_prppov <- lm(psoda ~ prpblck + log(income) + prppov, data = discrim)
summary(model_with_prppov)
##
## Call:
## lm(formula = psoda ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28083 -0.05006 0.00305 0.04247 0.44286
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.51208 0.30777 -1.664 0.09693 .
## prpblck 0.07501 0.03214 2.334 0.02011 *
## log(income) 0.14180 0.02804 5.058 6.5e-07 ***
## prppov 0.39629 0.13915 2.848 0.00463 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08526 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08497, Adjusted R-squared: 0.07806
## F-statistic: 12.29 on 3 and 397 DF, p-value: 1.053e-07
# 6: Find the correlation between log(income) and prppov
correlation <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")
print(correlation)
## [1] -0.838467
# 7: Check multicollinearity between log(income) and prppov using VIF
vif(model_with_prppov)
## prpblck log(income) prppov
## 1.922160 3.518721 4.915220
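# Each VIF equals 1 / (1 - R²_j), where R²_j comes from regressing regressor j
# on the other regressors. A minimal sketch reproducing the prppov VIF by hand
# from the model matrix (so the sample matches exactly):
X <- model.matrix(model_with_prppov)
aux <- lm(X[, "prppov"] ~ X[, "prpblck"] + X[, "log(income)"])
1 / (1 - summary(aux)$r.squared)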
# Chapter 4 - 3
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ MASS::select() masks dplyr::select()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
n <- 32
# Simulate a dataset consistent with the textbook equation
# rdintens = 0.472 + 0.321*log(sales) + 0.050*profmarg + u
# (no seed is set here, so the estimates below will vary across runs)
sales <- runif(n, 10, 100)
log_sales <- log(sales)
profmarg <- runif(n, 0, 20)
rdintens <- 0.472 + 0.321 * log_sales + 0.050 * profmarg + rnorm(n, 0, 1)
# Create a data frame
data <- data.frame(rdintens = rdintens, sales = sales, log_sales = log_sales, profmarg = profmarg)
# Run the regression model
model <- lm(rdintens ~ log_sales + profmarg, data = data)
# Output model summary to check coefficients and significance
summary(model)
##
## Call:
## lm(formula = rdintens ~ log_sales + profmarg, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.08745 -0.42009 0.04894 0.65782 1.88213
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.69781 1.06735 1.591 0.123
## log_sales 0.03833 0.27801 0.138 0.891
## profmarg 0.01837 0.03164 0.581 0.566
##
## Residual standard error: 1.092 on 29 degrees of freedom
## Multiple R-squared: 0.01261, Adjusted R-squared: -0.05549
## F-statistic: 0.1851 on 2 and 29 DF, p-value: 0.832
# (i) Change in rdintens (in percentage points) when sales increases by 10%,
# using the textbook slope of 0.321 in this level-log specification
estimated_effect <- 0.321 * log(1.10)
estimated_effect
## [1] 0.03059457
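# For the simulated fit above, the practical follow-up is whether log_sales is
# statistically significant; a minimal sketch pulling its t statistic and
# p-value from the coefficient table:
coef(summary(model))["log_sales", c("t value", "Pr(>|t|)")]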
# Chapter 4 - C8
library(wooldridge)
library(dplyr)
library(sandwich)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
data("k401ksubs")
# First filter for single-person households
singles <- subset(k401ksubs, fsize == 1)
# (i) Count number of single-person households
n_singles <- nrow(singles)
cat("Number of single-person households:", n_singles, "\n\n")
## Number of single-person households: 2017
# (ii) Run multiple regression
model_full <- lm(nettfa ~ inc + age, data = singles)
summary(model_full)
##
## Call:
## lm(formula = nettfa ~ inc + age, data = singles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -179.95 -14.16 -3.42 6.03 1113.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.03981 4.08039 -10.548 <2e-16 ***
## inc 0.79932 0.05973 13.382 <2e-16 ***
## age 0.84266 0.09202 9.158 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1185
## F-statistic: 136.5 on 2 and 2014 DF, p-value: < 2.2e-16
# Calculate robust standard errors
robust_se <- coeftest(model_full, vcov = vcovHC(model_full, type = "HC1"))
print(robust_se)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.03981 5.52433 -7.7910 1.057e-14 ***
## inc 0.79932 0.10078 7.9313 3.562e-15 ***
## age 0.84266 0.11937 7.0589 2.300e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iv) Test H0: β2 = 1 against H1: β2 < 1
# Extract coefficient and standard error for age
age_coef <- coef(model_full)["age"]
age_se <- sqrt(vcovHC(model_full, type = "HC1")["age", "age"])
# Calculate t-statistic for H0: β2 = 1, using the robust standard error
t_stat <- (age_coef - 1) / age_se
# One-sided p-value with n - 3 degrees of freedom
p_value <- pt(t_stat, df = nrow(singles) - 3)
cat("\nTest H0: β2 = 1 against H1: β2 < 1")
##
## Test H0: β2 = 1 against H1: β2 < 1
cat("\nt-statistic:", t_stat)
##
## t-statistic: -1.318064
cat("\np-value:", p_value, "\n\n")
##
## p-value: 0.09381599
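# A minimal sketch of the 5% decision rule: compare the t statistic to the
# one-sided critical value; here t = -1.32 exceeds roughly -1.65, so H0: β2 = 1
# is not rejected at the 5% level (consistent with the p-value of 0.094):
qt(0.05, df = nrow(singles) - 3)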
# (v) Simple regression of nettfa on inc
model_simple <- lm(nettfa ~ inc, data = singles)
summary(model_simple)
##
## Call:
## lm(formula = nettfa ~ inc, data = singles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185.12 -12.85 -4.85 1.78 1112.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.5709 2.0607 -5.13 3.18e-07 ***
## inc 0.8207 0.0609 13.48 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.59 on 2015 degrees of freedom
## Multiple R-squared: 0.08267, Adjusted R-squared: 0.08222
## F-statistic: 181.6 on 1 and 2015 DF, p-value: < 2.2e-16
# Compare coefficients
cat("Coefficient on inc in multiple regression:", coef(model_full)["inc"], "\n")
## Coefficient on inc in multiple regression: 0.7993167
cat("Coefficient on inc in simple regression:", coef(model_simple)["inc"], "\n")
## Coefficient on inc in simple regression: 0.8206815
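# The two inc coefficients are close, which suggests inc and age are only
# weakly correlated among these households, so omitting age moves the estimate
# little; a minimal check of that sample correlation:
cor(singles$inc, singles$age, use = "complete.obs")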
# Chapter 5 - 5
set.seed(123)
n <- 1000
mean_score <- 77
sd_score <- 15
scores <- rnorm(n, mean = mean_score, sd = sd_score)
scores <- pmax(pmin(scores, 100), 20)
hist(scores, breaks = 30, probability = TRUE,
     main = "Distribution of Course Scores",
     xlab = "Course Score (in percentage form)",
     ylab = "Proportion in cell",
     xlim = c(20, 100),
     ylim = c(0, 0.1))
curve(dnorm(x, mean = mean(scores), sd = sd(scores)),
      add = TRUE, col = "blue")
prob_exceed_100 <- 1 - pnorm(100, mean = mean(scores), sd = sd(scores))
cat("Probability of score exceeding 100:", prob_exceed_100, "\n")
## Probability of score exceeding 100: 0.0480284
prob_below_40 <- pnorm(40, mean = mean(scores), sd = sd(scores))
cat("Probability of score below 40:", prob_below_40, "\n")
## Probability of score below 40: 0.00420035
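# Scores are capped at 100 by construction, so the positive normal probability
# above 100 is the point of the exercise: a normal distribution cannot hold
# exactly for a bounded percentage score. A minimal sketch contrasting it with
# the empirical proportion (which is exactly zero):
mean(scores > 100)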
# Chapter 5 - C1
library(wooldridge)
library(ggplot2)
data("wage1")
# (i) Estimate the model with wage as the dependent variable
model1 <- lm(wage ~ educ + exper + tenure, data = wage1)
summary(model1)
##
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6068 -1.7747 -0.6279 1.1969 14.6536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.87273 0.72896 -3.941 9.22e-05 ***
## educ 0.59897 0.05128 11.679 < 2e-16 ***
## exper 0.02234 0.01206 1.853 0.0645 .
## tenure 0.16927 0.02164 7.820 2.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared: 0.3064, Adjusted R-squared: 0.3024
## F-statistic: 76.87 on 3 and 522 DF, p-value: < 2.2e-16
residuals1 <- residuals(model1)
ggplot(data.frame(residuals1), aes(x = residuals1)) +
  geom_histogram(bins = 20, fill = "blue", color = "black") +
  labs(title = "Histogram of Residuals for Wage Model", x = "Residuals", y = "Frequency")
# (ii) Estimate the model with log(wage) as the dependent variable
model2 <- lm(log(wage) ~ educ + exper + tenure, data = wage1)
summary(model2)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05802 -0.29645 -0.03265 0.28788 1.42809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.284360 0.104190 2.729 0.00656 **
## educ 0.092029 0.007330 12.555 < 2e-16 ***
## exper 0.004121 0.001723 2.391 0.01714 *
## tenure 0.022067 0.003094 7.133 3.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared: 0.316, Adjusted R-squared: 0.3121
## F-statistic: 80.39 on 3 and 522 DF, p-value: < 2.2e-16
residuals2 <- residuals(model2)
ggplot(data.frame(residuals2), aes(x = residuals2)) +
  geom_histogram(bins = 20, fill = "green", color = "black") +
  labs(title = "Histogram of Residuals for Log(Wage) Model", x = "Residuals", y = "Frequency")

