## Chapter 2
## C9
library(wooldridge)
data(countymurders)
county_murders_1996 <- subset(countymurders, year == 1996)
# Count counties with zero murders
zero_murders <- sum(county_murders_1996$murders == 0)
# Count counties with at least one execution
at_least_one_execution <- sum(county_murders_1996$execs > 0)
# Find the largest number of executions
max_executions <- max(county_murders_1996$execs)
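# Report the counts computed above (an added display step; output not shown
# since these lines were not run in the original document)
cat("Counties with zero murders:", zero_murders, "\n")
cat("Counties with at least one execution:", at_least_one_execution, "\n")
cat("Largest number of executions:", max_executions, "\n")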
# Run the OLS regression
model <- lm(murders ~ execs, data = county_murders_1996)
summary(model)
##
## Call:
## lm(formula = murders ~ execs, data = county_murders_1996)
##
## Residuals:
## Min 1Q Median 3Q Max
## -149.12 -5.46 -4.46 -2.46 1338.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.4572 0.8348 6.537 7.79e-11 ***
## execs 58.5555 5.8333 10.038 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38.89 on 2195 degrees of freedom
## Multiple R-squared: 0.04389, Adjusted R-squared: 0.04346
## F-statistic: 100.8 on 1 and 2195 DF, p-value: < 2.2e-16
# Smallest predicted number of murders: since the slope on execs is positive,
# the minimum fitted value occurs at execs = 0 (i.e., the intercept)
min_predicted_murders <- predict(model, newdata = data.frame(execs = 0))
# Calculate residual for a county with zero executions and zero murders
residual_zero <- 0 - min_predicted_murders
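# Display the prediction and residual (an added display step; output not shown)
cat("Smallest predicted murders (at execs = 0):", min_predicted_murders, "\n")
cat("Residual for a county with zero executions and zero murders:", residual_zero, "\n")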
# A simple regression of murders on execs cannot credibly measure the deterrent
# effect of capital punishment: executions are not randomly assigned across
# counties, so omitted variables (population, poverty, urbanization) bias the
# estimate, and reverse causality is likely because high-murder counties carry
# out more executions. The large positive coefficient on execs illustrates this.
## Chapter 3
## 5
# Assuming each student spends a total of 168 hours per week across study, sleep, work, and leisure.
set.seed(123) # For reproducibility
# Create a dataset with 100 students
n <- 100
study_hours <- runif(n, 10, 40) # Random study hours between 10 and 40
sleep_hours <- runif(n, 40, 60) # Random sleep hours between 40 and 60
work_hours <- runif(n, 10, 50) # Random work hours between 10 and 50
# Calculate leisure hours to ensure the sum is 168 hours
leisure_hours <- 168 - (study_hours + sleep_hours + work_hours)
# Generate random GPA values based on the hours, adding some noise
GPA <- 2.5 + 0.01 * study_hours - 0.005 * sleep_hours + 0.003 * work_hours + rnorm(n, 0, 0.2)
# Combine into a data frame
data <- data.frame(GPA, study_hours, sleep_hours, work_hours, leisure_hours)
# Rename columns for easier referencing
names(data) <- c("GPA", "study", "sleep", "work", "leisure")
# Display the first few rows of the dataset to understand the structure
head(data)
## GPA study sleep work leisure
## 1 2.642469 18.62733 51.99978 19.54904 77.82385
## 2 2.902501 33.64915 46.65647 48.49436 39.20002
## 3 2.642436 22.26931 49.77226 34.05463 61.90380
## 4 2.459586 36.49052 59.08948 30.60119 41.81881
## 5 2.688268 38.21402 49.65805 26.10293 54.02500
## 6 2.404182 11.36669 57.80700 45.20986 53.61644
# Part (iii): Drop "leisure" from the model to avoid perfect collinearity
# (study + sleep + work + leisure always equals 168, so all four cannot be
# included; see the added check after the interpretation below).
# This lets us estimate the effect of study, sleep, and work relative to leisure.
model <- lm(GPA ~ study + sleep + work, data = data)
# Summary of the model to see the coefficients and significance
summary(model)
##
## Call:
## lm(formula = GPA ~ study + sleep + work, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30905 -0.12573 -0.02513 0.11220 0.58041
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.140964 0.205409 10.423 < 2e-16 ***
## study 0.009698 0.002203 4.403 2.77e-05 ***
## sleep 0.001941 0.003581 0.542 0.5891
## work 0.003700 0.001602 2.309 0.0231 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1861 on 96 degrees of freedom
## Multiple R-squared: 0.1952, Adjusted R-squared: 0.17
## F-statistic: 7.76 on 3 and 96 DF, p-value: 0.0001079
# Interpretation:
# With leisure dropped, each coefficient measures the effect on GPA of one more
# hour of that activity in place of an hour of leisure, since the four
# categories are constrained to sum to 168 hours.
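# Added check: why leisure had to be dropped. Because the four activity
# variables always sum to 168, including all of them makes the design matrix
# singular; lm() flags the aliased variable and returns NA for its coefficient.
full_model <- lm(GPA ~ study + sleep + work + leisure, data = data)
coef(full_model) # the coefficient on leisure is NA (perfect collinearity)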
## 10
# Set seed for reproducibility
set.seed(123)
# Create mock data
n <- 100 # Number of observations
# Generate x1, x2, and x3 based on different correlation scenarios
# x1, x2, x3 represent attendance, GPA, and test scores, respectively.
# For demonstration purposes, let's create the variables with different levels of correlation
# (i) x1 highly correlated with x2 and x3, and x2 and x3 have large partial effects on y
x1_i <- rnorm(n, mean = 75, sd = 10)
x2_i <- x1_i + rnorm(n, mean = 0, sd = 2) # x2 highly correlated with x1
x3_i <- x1_i + rnorm(n, mean = 0, sd = 2) # x3 highly correlated with x1
y_i <- 0.5 * x1_i + 0.6 * x2_i + 0.4 * x3_i + rnorm(n, mean = 0, sd = 5) # Large partial effects
# (ii) x1 almost uncorrelated with x2 and x3, but x2 and x3 highly correlated
x1_ii <- rnorm(n, mean = 75, sd = 10)
x2_ii <- rnorm(n, mean = 70, sd = 10)
x3_ii <- x2_ii + rnorm(n, mean = 0, sd = 2) # x3 highly correlated with x2
y_ii <- 0.5 * x1_ii + 0.6 * x2_ii + 0.4 * x3_ii + rnorm(n, mean = 0, sd = 5)
# (iii) x1 highly correlated with x2 and x3, but x2 and x3 have small partial effects on y
x1_iii <- rnorm(n, mean = 75, sd = 10)
x2_iii <- x1_iii + rnorm(n, mean = 0, sd = 2)
x3_iii <- x1_iii + rnorm(n, mean = 0, sd = 2)
y_iii <- 0.5 * x1_iii + 0.05 * x2_iii + 0.03 * x3_iii + rnorm(n, mean = 0, sd = 5) # Small partial effects
# (iv) x1 almost uncorrelated with x2 and x3, x2 and x3 have large partial effects
x1_iv <- rnorm(n, mean = 75, sd = 10)
x2_iv <- rnorm(n, mean = 70, sd = 10)
x3_iv <- x2_iv + rnorm(n, mean = 0, sd = 2) # x3 highly correlated with x2
y_iv <- 0.5 * x1_iv + 0.6 * x2_iv + 0.4 * x3_iv + rnorm(n, mean = 0, sd = 5) # Large partial effects
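# Added sanity check (output not shown): verify the designed correlation
# structure before running the regressions
round(cor(cbind(x1_i, x2_i, x3_i)), 2)    # case (i): all pairwise correlations high
round(cor(cbind(x1_ii, x2_ii, x3_ii)), 2) # case (ii): only cor(x2, x3) high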
# Function to run simple and multiple regression and get summaries
run_regression_analysis <- function(y, x1, x2, x3) {
  # Simple regression of y on x1 only
  simple_model <- lm(y ~ x1)
  cat("\nSimple Regression of y on x1 only:\n")
  print(summary(simple_model))
  # Multiple regression of y on x1, x2, and x3
  multiple_model <- lm(y ~ x1 + x2 + x3)
  cat("\nMultiple Regression of y on x1, x2, and x3:\n")
  print(summary(multiple_model))
}
# (i) Run regression for case (i): x1 highly correlated with x2 and x3, large partial effects
cat("Case (i): x1 highly correlated with x2 and x3, large partial effects on y\n")
## Case (i): x1 highly correlated with x2 and x3, large partial effects on y
run_regression_analysis(y_i, x1_i, x2_i, x3_i)
##
## Simple Regression of y on x1 only:
##
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.274 -3.169 -0.114 3.404 12.470
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.98413 4.55419 0.655 0.514
## x1 1.45787 0.05957 24.471 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.411 on 98 degrees of freedom
## Multiple R-squared: 0.8594, Adjusted R-squared: 0.8579
## F-statistic: 598.9 on 1 and 98 DF, p-value: < 2.2e-16
##
##
## Multiple Regression of y on x1, x2, and x3:
##
## Call:
## lm(formula = y ~ x1 + x2 + x3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.4569 -3.2696 0.2832 3.3516 12.6605
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.9828 4.4736 0.443 0.6586
## x1 0.5002 0.3842 1.302 0.1961
## x2 0.7155 0.2736 2.615 0.0104 *
## x3 0.2565 0.2806 0.914 0.3628
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.258 on 96 degrees of freedom
## Multiple R-squared: 0.8699, Adjusted R-squared: 0.8659
## F-statistic: 214 on 3 and 96 DF, p-value: < 2.2e-16
# (ii) Run regression for case (ii): x1 uncorrelated with x2 and x3, x2 and x3 highly correlated
cat("\nCase (ii): x1 uncorrelated with x2 and x3, x2 and x3 highly correlated\n")
##
## Case (ii): x1 uncorrelated with x2 and x3, x2 and x3 highly correlated
run_regression_analysis(y_ii, x1_ii, x2_ii, x3_ii)
##
## Simple Regression of y on x1 only:
##
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.150 -6.007 0.597 6.713 39.452
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.5774 8.6041 7.041 2.65e-10 ***
## x1 0.6237 0.1122 5.559 2.34e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.04 on 98 degrees of freedom
## Multiple R-squared: 0.2398, Adjusted R-squared: 0.232
## F-statistic: 30.91 on 1 and 98 DF, p-value: 2.338e-07
##
##
## Multiple Regression of y on x1, x2, and x3:
##
## Call:
## lm(formula = y ~ x1 + x2 + x3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.2006 -2.4961 0.2769 3.3389 14.0425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.19395 4.94876 0.443 0.6585
## x1 0.41460 0.05229 7.929 4.01e-12 ***
## x2 0.65100 0.24923 2.612 0.0104 *
## x3 0.41851 0.24664 1.697 0.0930 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.036 on 96 degrees of freedom
## Multiple R-squared: 0.8451, Adjusted R-squared: 0.8403
## F-statistic: 174.6 on 3 and 96 DF, p-value: < 2.2e-16
# (iii) Run regression for case (iii): x1 highly correlated with x2 and x3, small partial effects
cat("\nCase (iii): x1 highly correlated with x2 and x3, small partial effects on y\n")
##
## Case (iii): x1 highly correlated with x2 and x3, small partial effects on y
run_regression_analysis(y_iii, x1_iii, x2_iii, x3_iii)
##
## Simple Regression of y on x1 only:
##
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.8194 -3.6396 -0.1729 2.6429 16.1017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.51429 3.66613 0.14 0.889
## x1 0.57203 0.04783 11.96 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.005 on 98 degrees of freedom
## Multiple R-squared: 0.5934, Adjusted R-squared: 0.5893
## F-statistic: 143.1 on 1 and 98 DF, p-value: < 2.2e-16
##
##
## Multiple Regression of y on x1, x2, and x3:
##
## Call:
## lm(formula = y ~ x1 + x2 + x3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.6351 -3.4572 -0.3161 2.9107 16.6733
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3943 3.7063 0.106 0.9155
## x1 0.6738 0.3214 2.097 0.0386 *
## x2 -0.2207 0.2517 -0.877 0.3827
## x3 0.1200 0.2471 0.486 0.6283
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.034 on 96 degrees of freedom
## Multiple R-squared: 0.5972, Adjusted R-squared: 0.5846
## F-statistic: 47.44 on 3 and 96 DF, p-value: < 2.2e-16
# (iv) Run regression for case (iv): x1 uncorrelated with x2 and x3, x2 and x3 large partial effects
cat("\nCase (iv): x1 uncorrelated with x2 and x3, x2 and x3 large partial effects on y\n")
##
## Case (iv): x1 uncorrelated with x2 and x3, x2 and x3 large partial effects on y
run_regression_analysis(y_iv, x1_iv, x2_iv, x3_iv)
##
## Simple Regression of y on x1 only:
##
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.949 -10.341 -0.202 8.593 32.713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.5087 9.2668 8.148 1.22e-12 ***
## x1 0.4219 0.1227 3.439 0.000859 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.33 on 98 degrees of freedom
## Multiple R-squared: 0.1077, Adjusted R-squared: 0.09857
## F-statistic: 11.83 on 1 and 98 DF, p-value: 0.0008593
##
##
## Multiple Regression of y on x1, x2, and x3:
##
## Call:
## lm(formula = y ~ x1 + x2 + x3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.1807 -3.1910 0.4197 3.0325 11.3661
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3962 4.8176 0.705 0.48254
## x1 0.4394 0.0490 8.968 2.46e-14 ***
## x2 0.7774 0.2377 3.270 0.00149 **
## x3 0.2421 0.2309 1.048 0.29712
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.47 on 96 degrees of freedom
## Multiple R-squared: 0.8639, Adjusted R-squared: 0.8596
## F-statistic: 203 on 3 and 96 DF, p-value: < 2.2e-16
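# The key comparison across the four cases is how correlation among the
# regressors inflates the standard error of the x1 coefficient in the multiple
# regression. A compact way to collect them (an added sketch; output not shown):
se_x1 <- sapply(
  list(i   = lm(y_i ~ x1_i + x2_i + x3_i),
       ii  = lm(y_ii ~ x1_ii + x2_ii + x3_ii),
       iii = lm(y_iii ~ x1_iii + x2_iii + x3_iii),
       iv  = lm(y_iv ~ x1_iv + x2_iv + x3_iv)),
  function(m) summary(m)$coefficients[2, "Std. Error"]
)
se_x1 # large in cases (i) and (iii), small in (ii) and (iv)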
## C8
# Load the DISCRIM dataset
data("discrim")
# Part (i): Find the average values of prpblck and income and their standard deviations
mean_prpblck <- mean(discrim$prpblck, na.rm = TRUE)
sd_prpblck <- sd(discrim$prpblck, na.rm = TRUE)
mean_income <- mean(discrim$income, na.rm = TRUE)
sd_income <- sd(discrim$income, na.rm = TRUE)
cat("Average proportion of black residents (prpblck):", mean_prpblck, "\n")
## Average proportion of black residents (prpblck): 0.1134864
cat("Standard deviation of prpblck:", sd_prpblck, "\n")
## Standard deviation of prpblck: 0.1824165
cat("Average median income (income):", mean_income, "\n")
## Average median income (income): 47053.78
cat("Standard deviation of income:", sd_income, "\n")
## Standard deviation of income: 13179.29
# Part (ii): Estimate the model psoda = β0 + β1*prpblck + β2*income + u
model1 <- lm(psoda ~ prpblck + income, data = discrim)
summary(model1)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
# Interpretation of the coefficient on prpblck
cat("Coefficient on prpblck:", coef(model1)["prpblck"], "\n")
## Coefficient on prpblck: 0.1149882
cat("Interpretation: This coefficient represents the impact of a 1-unit increase in the proportion of black residents on the price of soda.\n")
## Interpretation: This coefficient represents the impact of a 1-unit increase in the proportion of black residents on the price of soda.
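# On a more natural scale (an added calculation; output not shown): a 0.10
# increase in prpblck, i.e., 10 percentage points, raises the predicted price
# by about a cent
cat("Effect of a 0.10 increase in prpblck:", coef(model1)["prpblck"] * 0.10, "dollars\n")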
# Part (iii): Simple regression of psoda on prpblck
model2 <- lm(psoda ~ prpblck, data = discrim)
summary(model2)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
# Compare the estimate of prpblck from model1 and model2
cat("Coefficient on prpblck in model with income:", coef(model1)["prpblck"], "\n")
## Coefficient on prpblck in model with income: 0.1149882
cat("Coefficient on prpblck in simple regression:", coef(model2)["prpblck"], "\n")
## Coefficient on prpblck in simple regression: 0.06492687
# Part (iv): Log-log model to estimate constant elasticity
model3 <- lm(log(psoda) ~ prpblck + log(income), data = discrim)
summary(model3)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33563 -0.04695 0.00658 0.04334 0.35413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.79377 0.17943 -4.424 1.25e-05 ***
## prpblck 0.12158 0.02575 4.722 3.24e-06 ***
## log(income) 0.07651 0.01660 4.610 5.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0821 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06809, Adjusted R-squared: 0.06341
## F-statistic: 14.54 on 2 and 398 DF, p-value: 8.039e-07
# Calculate the estimated percentage change in psoda for a 20 percentage point increase in prpblck
percent_change <- coef(model3)["prpblck"] * 0.2 * 100
cat("Estimated percentage change in psoda for a 20 percentage point increase in prpblck:", percent_change, "%\n")
## Estimated percentage change in psoda for a 20 percentage point increase in prpblck: 2.431605 %
# Part (v): Add prppov to the regression in part (iv)
model4 <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
summary(model4)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08137 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08696, Adjusted R-squared: 0.08006
## F-statistic: 12.6 on 3 and 397 DF, p-value: 6.917e-08
# Observe what happens to β_hat on prpblck
cat("Coefficient on prpblck in model with prppov:", coef(model4)["prpblck"], "\n")
## Coefficient on prpblck in model with prppov: 0.07280726
# Part (vi): Find the correlation between log(income) and prppov
correlation <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")
cat("Correlation between log(income) and prppov:", correlation, "\n")
## Correlation between log(income) and prppov: -0.838467
# Part (vii): Evaluate the statement about log(income) and prppov correlation
cat("Since log(income) and prppov are highly correlated, including both in the same regression could lead to multicollinearity issues, making it difficult to interpret the individual effects of these variables.\n")
## Since log(income) and prppov are highly correlated, including both in the same regression could lead to multicollinearity issues, making it difficult to interpret the individual effects of these variables.
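# An added check of this concern: the variance inflation factor for prppov,
# computed by hand from the auxiliary regression of prppov on the other regressors
aux_r2 <- summary(lm(prppov ~ prpblck + log(income), data = discrim))$r.squared
cat("VIF for prppov:", 1 / (1 - aux_r2), "\n")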
## Chapter 4
## 3
# Load the rdchem dataset
data("rdchem", package = "wooldridge")
# Run the regression model
model <- lm(rdintens ~ log(sales) + profmarg, data = rdchem)
summary(model)
##
## Call:
## lm(formula = rdintens ~ log(sales) + profmarg, data = rdchem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3016 -1.2707 -0.6895 0.8785 6.0369
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.47225 1.67606 0.282 0.780
## log(sales) 0.32135 0.21557 1.491 0.147
## profmarg 0.05004 0.04578 1.093 0.283
##
## Residual standard error: 1.839 on 29 degrees of freedom
## Multiple R-squared: 0.09847, Adjusted R-squared: 0.0363
## F-statistic: 1.584 on 2 and 29 DF, p-value: 0.2224
# (i) Estimated change in rdintens for a 10% increase in sales:
# Δrdintens = β1 * Δlog(sales), and a 10% increase in sales means
# Δlog(sales) = log(1.1) ≈ 0.10, so the change is roughly β1 / 10
# (rdintens is already measured in percentage points)
coef_log_sales <- coef(model)["log(sales)"]
change_rdintens <- coef_log_sales / 10
cat("Estimated change in rdintens for a 10% increase in sales:", change_rdintens, "percentage points\n")
## Estimated change in rdintens for a 10% increase in sales: 0.03213484 percentage points
# (ii) Test the significance of log(sales)
p_value_log_sales <- summary(model)$coefficients["log(sales)", "Pr(>|t|)"]
cat("P-value for log(sales):", p_value_log_sales, "\n")
## P-value for log(sales): 0.1468382
# (iii) Interpret the coefficient on profmarg
coef_profmarg <- coef(model)["profmarg"]
cat("Coefficient on profmarg:", coef_profmarg, "\n")
## Coefficient on profmarg: 0.0500367
# (iv) Check if profmarg is statistically significant
p_value_profmarg <- summary(model)$coefficients["profmarg", "Pr(>|t|)"]
cat("P-value for profmarg:", p_value_profmarg, "\n")
## P-value for profmarg: 0.2833658
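# Neither regressor is individually significant at the 5% level; 95% confidence
# intervals (an added check, output not shown) make this concrete, as both
# intervals contain zero
confint(model, level = 0.95)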
## C8
# Load the k401ksubs dataset and filter for single-person households
data("k401ksubs", package = "wooldridge")
single_person <- subset(k401ksubs, fsize == 1)
# (i) Count the number of single-person households
num_single_person <- nrow(single_person)
cat("Number of single-person households:", num_single_person, "\n")
## Number of single-person households: 2017
# (ii) Estimate the model nettfa = β0 + β1 * inc + β2 * age + u
model <- lm(nettfa ~ inc + age, data = single_person)
summary(model)
##
## Call:
## lm(formula = nettfa ~ inc + age, data = single_person)
##
## Residuals:
## Min 1Q Median 3Q Max
## -179.95 -14.16 -3.42 6.03 1113.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.03981 4.08039 -10.548 <2e-16 ***
## inc 0.79932 0.05973 13.382 <2e-16 ***
## age 0.84266 0.09202 9.158 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1185
## F-statistic: 136.5 on 2 and 2014 DF, p-value: < 2.2e-16
# (iii) Interpret the intercept
intercept <- coef(model)["(Intercept)"]
cat("Intercept (β0):", intercept, "\n")
## Intercept (β0): -43.03981
# The intercept is the predicted nettfa (in $1000s) for a person with zero
# income and age zero, a point far outside the sample, so it has no useful
# interpretation on its own.
# (iv) Test H0: β2 = 1 against H1: β2 < 1
beta2 <- coef(model)["age"]
se_beta2 <- summary(model)$coefficients["age", "Std. Error"]
t_stat <- (beta2 - 1) / se_beta2
p_value <- pt(t_stat, df = model$df.residual) # One-sided test
cat("t-statistic:", t_stat, "\n")
## t-statistic: -1.709944
cat("p-value for H0: β2 = 1 against H1: β2 < 1:", p_value, "\n")
## p-value for H0: β2 = 1 against H1: β2 < 1: 0.04371514
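# Compare the t statistic with the 5% one-sided critical value (an added check;
# output not shown)
crit_val <- qt(0.05, df = model$df.residual)
cat("5% one-sided critical value:", crit_val, "\n") # about -1.65, so t = -1.71 rejects H0 at the 5% level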
# (v) Simple regression of nettfa on inc
simple_model <- lm(nettfa ~ inc, data = single_person)
summary(simple_model)
##
## Call:
## lm(formula = nettfa ~ inc, data = single_person)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185.12 -12.85 -4.85 1.78 1112.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.5709 2.0607 -5.13 3.18e-07 ***
## inc 0.8207 0.0609 13.48 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.59 on 2015 degrees of freedom
## Multiple R-squared: 0.08267, Adjusted R-squared: 0.08222
## F-statistic: 181.6 on 1 and 2015 DF, p-value: < 2.2e-16
# Compare coefficients on inc
coef_simple_inc <- coef(simple_model)["inc"]
coef_full_inc <- coef(model)["inc"]
cat("Coefficient on inc in simple model:", coef_simple_inc, "\n")
## Coefficient on inc in simple model: 0.8206815
cat("Coefficient on inc in full model:", coef_full_inc, "\n")
## Coefficient on inc in full model: 0.7993167
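# The coefficient on inc barely changes when age is added, which suggests inc
# and age are only weakly correlated in this sample; a direct check (added,
# output not shown):
cat("Correlation between inc and age:", cor(single_person$inc, single_person$age), "\n")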
## Chapter 5
# Load the econmath dataset and compute summary statistics
data("econmath", package = "wooldridge")
summary(econmath)
## age work study econhs
## Min. :18.00 Min. : 0.000 Min. : 0.00 Min. :0.0000
## 1st Qu.:19.00 1st Qu.: 0.000 1st Qu.: 8.50 1st Qu.:0.0000
## Median :19.00 Median : 8.000 Median :12.00 Median :0.0000
## Mean :19.41 Mean : 8.626 Mean :13.92 Mean :0.3703
## 3rd Qu.:20.00 3rd Qu.:15.000 3rd Qu.:18.00 3rd Qu.:1.0000
## Max. :29.00 Max. :37.500 Max. :50.00 Max. :1.0000
##
## colgpa hsgpa acteng actmth
## Min. :0.875 Min. :2.355 Min. :12.00 Min. :12.00
## 1st Qu.:2.446 1st Qu.:3.110 1st Qu.:20.00 1st Qu.:20.00
## Median :2.813 Median :3.321 Median :23.00 Median :23.00
## Mean :2.815 Mean :3.342 Mean :22.59 Mean :23.21
## 3rd Qu.:3.207 3rd Qu.:3.589 3rd Qu.:25.00 3rd Qu.:26.00
## Max. :4.000 Max. :4.260 Max. :34.00 Max. :36.00
## NA's :42 NA's :42
## act mathscr male calculus
## Min. :13.00 Min. : 0.000 Min. :0.0 Min. :0.0000
## 1st Qu.:21.00 1st Qu.: 7.000 1st Qu.:0.0 1st Qu.:0.0000
## Median :23.00 Median : 8.000 Median :0.5 Median :1.0000
## Mean :23.12 Mean : 7.875 Mean :0.5 Mean :0.6764
## 3rd Qu.:25.00 3rd Qu.: 9.000 3rd Qu.:1.0 3rd Qu.:1.0000
## Max. :33.00 Max. :10.000 Max. :1.0 Max. :1.0000
## NA's :42
## attexc attgood fathcoll mothcoll
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :1.0000 Median :1.0000
## Mean :0.2967 Mean :0.5864 Mean :0.5245 Mean :0.6285
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## score
## Min. :19.53
## 1st Qu.:64.06
## Median :74.22
## Mean :72.60
## 3rd Qu.:82.79
## Max. :98.44
##
# Histogram of score
hist(econmath$score, breaks = 30, freq = FALSE)
curve(dnorm(x, mean = mean(econmath$score), sd = sd(econmath$score)), add = TRUE)
# Q1(i)
# Under a normal distribution with the sample mean and standard deviation,
# P(score > 100) is small but strictly positive. Since score cannot exceed 100
# by construction, this contradicts the normality assumption: the normal
# distribution has unbounded support, while score is bounded above at 100.
# Q1(ii)
# The histogram is left-skewed (the mean, 72.60, lies below the median, 74.22),
# with a longer left tail than a normal density would imply, so the normal
# approximation fits poorly in the left tail.
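# A quick numeric check of Q1(i) under the fitted normal (added; output not shown):
# P(score > 100) is strictly positive even though score cannot exceed 100
m <- mean(econmath$score)
s <- sd(econmath$score)
1 - pnorm(100, mean = m, sd = s)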
data("wage1")
# Estimate the model
model <- lm(wage ~ educ + exper + tenure, data = wage1)
summary(model)
##
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6068 -1.7747 -0.6279 1.1969 14.6536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.87273 0.72896 -3.941 9.22e-05 ***
## educ 0.59897 0.05128 11.679 < 2e-16 ***
## exper 0.02234 0.01206 1.853 0.0645 .
## tenure 0.16927 0.02164 7.820 2.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared: 0.3064, Adjusted R-squared: 0.3024
## F-statistic: 76.87 on 3 and 522 DF, p-value: < 2.2e-16
# Extract residuals
residuals <- model$residuals
# Plot histogram of residuals
hist(residuals, main = "Histogram of Residuals", xlab = "Residuals", col = "lightblue", border = "black")
# Estimate model with log(wage)
log_model <- lm(log(wage) ~ educ + exper + tenure, data = wage1)
summary(log_model)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05802 -0.29645 -0.03265 0.28788 1.42809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.284360 0.104190 2.729 0.00656 **
## educ 0.092029 0.007330 12.555 < 2e-16 ***
## exper 0.004121 0.001723 2.391 0.01714 *
## tenure 0.022067 0.003094 7.133 3.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared: 0.316, Adjusted R-squared: 0.3121
## F-statistic: 80.39 on 3 and 522 DF, p-value: < 2.2e-16
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Breusch-Pagan test for original model
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 43.096, df = 3, p-value = 2.349e-09
# Breusch-Pagan test for log-transformed model
bptest(log_model)
##
## studentized Breusch-Pagan test
##
## data: log_model
## BP = 10.761, df = 3, p-value = 0.01309
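# Both Breusch-Pagan tests reject homoskedasticity at the 5% level, but the
# evidence is far weaker for the log(wage) model (p = 0.013 vs. p = 2.3e-09),
# so the log transformation substantially reduces heteroskedasticity.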