## Chapter 2 Question C9
# Load necessary libraries
library(wooldridge)
library(dplyr)
# Load the dataset and filter for the year 1996
data("countymurders", package = "wooldridge")
dataset_1996 <- subset(countymurders, year == 1996)

# (i) Count the number of counties with zero murders and those with at least one execution
zero_murders <- sum(dataset_1996$murders == 0)
at_least_one_execution <- sum(dataset_1996$execs >= 1)

cat("Counties with zero murders in 1996:", zero_murders, "\n")
## Counties with zero murders in 1996: 1051
cat("Counties with at least one execution in 1996:", at_least_one_execution, "\n")
## Counties with at least one execution in 1996: 31
# (ii) Estimate the equation 'murders = β0 + β1 * execs + u' by OLS
# Using the lm() function for linear regression
model <- lm(murders ~ execs, data = dataset_1996)

# Display the model summary, which includes coefficients and R-squared
summary(model)
## 
## Call:
## lm(formula = murders ~ execs, data = dataset_1996)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -149.12   -5.46   -4.46   -2.46 1338.99 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.4572     0.8348   6.537 7.79e-11 ***
## execs        58.5555     5.8333  10.038  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38.89 on 2195 degrees of freedom
## Multiple R-squared:  0.04389,    Adjusted R-squared:  0.04346 
## F-statistic: 100.8 on 1 and 2195 DF,  p-value: < 2.2e-16
# (iii) Interpret the slope coefficient from the model output
# The slope on execs is about 58.56: one additional execution is associated with
# roughly 59 more murders. The counterintuitive positive sign reflects reverse
# causality (counties with more murders carry out more executions), not a deterrent
# effect. The R-squared is only about 0.044, so execs explains little of the
# variation in murders.
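# A minimal sketch pulling these values straight from the fitted model
# (they should match the summary output above):
slope_execs <- coef(model)["execs"]       # estimated slope on execs
r2_model <- summary(model)$r.squared      # R-squared of the regression
cat("Slope on execs:", slope_execs, " R-squared:", r2_model, "\n")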

# (iv) Calculate the smallest number of murders predicted by the model
min_predicted_murders <- min(predict(model))

# Calculate residuals for a county with zero executions and zero murders
predicted_for_zero <- predict(model, newdata = data.frame(execs = 0))
residual_for_zero <- 0 - predicted_for_zero

cat("Smallest number of predicted murders:", min_predicted_murders, "\n")
## Smallest number of predicted murders: 5.457241
cat("Residual for county with zero executions and zero murders:", residual_for_zero, "\n")
## Residual for county with zero executions and zero murders: -5.457241
# (v) Discuss limitations of using a simple regression model for causality
# A simple regression of murders on execs cannot identify a deterrent effect:
# causality likely runs in reverse (counties with more murders hand down more
# executions), and confounders such as county population are omitted, so the
# error term u is correlated with execs.

## Chapter 3 Question 5

# Load necessary library
library(MASS)
# Set the total hours in a week for the four activities
total_hours <- 168

# Set sample size
n <- 100

# Simulate hours for study, sleep, and work, and calculate leisure as the remaining time
set.seed(123)  # For reproducibility
study <- round(runif(n, 10, 50))  # Random hours between 10 and 50 for studying
sleep <- round(runif(n, 40, 70))  # Random hours between 40 and 70 for sleeping
work <- round(runif(n, 10, 60))   # Random hours between 10 and 60 for working
leisure <- total_hours - study - sleep - work  # Leisure is the remaining time

# Combine into a data frame
data <- data.frame(study, sleep, work, leisure)

# Create a dependent variable GPA based on hours spent (with some noise)
# Hypothetical model: GPA = 0.5*study - 0.2*sleep + 0.1*work + 0.3*leisure + error
# (the resulting values are not on a real GPA scale; the data are synthetic and
# serve only to illustrate perfect collinearity)
data$GPA <- 0.5 * data$study - 0.2 * data$sleep + 0.1 * data$work + 0.3 * data$leisure + rnorm(n, mean = 0, sd = 0.5)

# View the data structure
head(data)
##   study sleep work leisure       GPA
## 1    22    58   22      66 21.793869
## 2    42    50   58      18 22.584521
## 3    26    55   40      47 20.266101
## 4    45    69   36      18 17.195812
## 5    48    54   30      36 26.940274
## 6    12    67   54      35  8.359802
# (ii) Fit the model with all four variables (study, sleep, work, leisure)
# This model violates MLR.3 (no perfect collinearity) because
# study + sleep + work + leisure = 168 for every observation;
# see the sanity check after the summary below
cat("\n### Model with Perfect Collinearity (all four predictors) ###\n")
## 
## ### Model with Perfect Collinearity (all four predictors) ###
model_all <- lm(GPA ~ study + sleep + work + leisure, data = data)
summary(model_all)
## 
## Call:
## lm(formula = GPA ~ study + sleep + work + leisure, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.76505 -0.31234 -0.06369  0.28443  1.45949 
## 
## Coefficients: (1 not defined because of singularities)
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 49.750382   0.396190  125.57   <2e-16 ***
## study        0.199563   0.004144   48.16   <2e-16 ***
## sleep       -0.488770   0.005973  -81.82   <2e-16 ***
## work        -0.198605   0.003209  -61.90   <2e-16 ***
## leisure            NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4658 on 96 degrees of freedom
## Multiple R-squared:  0.9928, Adjusted R-squared:  0.9925 
## F-statistic:  4392 on 3 and 96 DF,  p-value: < 2.2e-16
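# Sanity check (a sketch using the simulated vectors defined above): the four
# time-use variables sum to exactly total_hours in every row -- this exact linear
# dependence is why lm() reports a singularity and drops 'leisure'
stopifnot(all(study + sleep + work + leisure == total_hours))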
# (iii) Reformulate the model by dropping one variable to satisfy MLR.3
# Let's drop 'leisure' and use only study, sleep, and work as predictors
cat("\n### Reformulated Model (dropping 'leisure' to avoid collinearity) ###\n")
## 
## ### Reformulated Model (dropping 'leisure' to avoid collinearity) ###
model_reformulated <- lm(GPA ~ study + sleep + work, data = data)
summary(model_reformulated)
## 
## Call:
## lm(formula = GPA ~ study + sleep + work, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.76505 -0.31234 -0.06369  0.28443  1.45949 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 49.750382   0.396190  125.57   <2e-16 ***
## study        0.199563   0.004144   48.16   <2e-16 ***
## sleep       -0.488770   0.005973  -81.82   <2e-16 ***
## work        -0.198605   0.003209  -61.90   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4658 on 96 degrees of freedom
## Multiple R-squared:  0.9928, Adjusted R-squared:  0.9925 
## F-statistic:  4392 on 3 and 96 DF,  p-value: < 2.2e-16
## Chapter 3 Question 10

# Load required libraries
library(MASS) # For generating correlated data
library(dplyr)

# Set seed for reproducibility
set.seed(42)

# Define the number of observations
n <- 100

# Part (i): High correlation between x1 and (x2, x3) with large partial effects
# Define a covariance matrix for high correlation between x1, x2, and x3
sigma_high_corr <- matrix(c(1, 0.8, 0.8, 
                            0.8, 1, 0.8, 
                            0.8, 0.8, 1), 
                          nrow = 3)

# Generate correlated predictors x1, x2, and x3
data_high_corr <- mvrnorm(n, mu = c(0, 0, 0), Sigma = sigma_high_corr)
x1_high <- data_high_corr[, 1]
x2_high <- data_high_corr[, 2]
x3_high <- data_high_corr[, 3]
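# Optional check (a sketch): the realized sample correlations should sit near the
# target value of 0.8 used in sigma_high_corr
round(cor(cbind(x1_high, x2_high, x3_high)), 2)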

# Define a response y with large effects of x2 and x3
y_high <- 0.5 * x1_high + 2 * x2_high + 2 * x3_high + rnorm(n)

# Run simple and multiple regressions
model_simple_high <- lm(y_high ~ x1_high)
model_multiple_high <- lm(y_high ~ x1_high + x2_high + x3_high)

# Print results
cat("Part (i) Results:\n")
## Part (i) Results:
summary(model_simple_high)
## 
## Call:
## lm(formula = y_high ~ x1_high)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4487 -1.3673  0.1559  1.2785  3.7071 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.0603     0.2084  -0.289    0.773    
## x1_high       3.5402     0.1942  18.233   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.084 on 98 degrees of freedom
## Multiple R-squared:  0.7723, Adjusted R-squared:   0.77 
## F-statistic: 332.4 on 1 and 98 DF,  p-value: < 2.2e-16
summary(model_multiple_high)
## 
## Call:
## lm(formula = y_high ~ x1_high + x2_high + x3_high)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7944 -0.5867 -0.1038  0.6188  2.3280 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.03150    0.08914   0.353   0.7246    
## x1_high      0.42130    0.17108   2.463   0.0156 *  
## x2_high      2.02130    0.16715  12.093   <2e-16 ***
## x3_high      1.99536    0.18447  10.817   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8867 on 96 degrees of freedom
## Multiple R-squared:  0.9596, Adjusted R-squared:  0.9583 
## F-statistic: 760.2 on 3 and 96 DF,  p-value: < 2.2e-16
# Part (ii): x1 uncorrelated with x2 and x3, x2 and x3 highly correlated
# Define a covariance matrix with x1 uncorrelated with x2 and x3, but high correlation between x2 and x3
sigma_x1_uncorr <- matrix(c(1, 0, 0, 
                            0, 1, 0.8, 
                            0, 0.8, 1), 
                          nrow = 3)

# Generate data
data_x1_uncorr <- mvrnorm(n, mu = c(0, 0, 0), Sigma = sigma_x1_uncorr)
x1_uncorr <- data_x1_uncorr[, 1]
x2_uncorr <- data_x1_uncorr[, 2]
x3_uncorr <- data_x1_uncorr[, 3]
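# Optional check (a sketch): x1 should be roughly uncorrelated with x2 and x3,
# while the x2-x3 correlation should sit near 0.8
round(cor(cbind(x1_uncorr, x2_uncorr, x3_uncorr)), 2)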

# Define response with effects from x2 and x3
y_uncorr <- 0.5 * x1_uncorr + 2 * x2_uncorr + 2 * x3_uncorr + rnorm(n)

# Run regressions
model_simple_uncorr <- lm(y_uncorr ~ x1_uncorr)
model_multiple_uncorr <- lm(y_uncorr ~ x1_uncorr + x2_uncorr + x3_uncorr)

# Print results
cat("\nPart (ii) Results:\n")
## 
## Part (ii) Results:
summary(model_simple_uncorr)
## 
## Call:
## lm(formula = y_uncorr ~ x1_uncorr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6370 -2.5296 -0.0481  2.2480 11.5249 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -0.6178     0.3946  -1.566  0.12068   
## x1_uncorr     1.0624     0.3730   2.849  0.00535 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.946 on 98 degrees of freedom
## Multiple R-squared:  0.07647,    Adjusted R-squared:  0.06705 
## F-statistic: 8.115 on 1 and 98 DF,  p-value: 0.005351
summary(model_multiple_uncorr)
## 
## Call:
## lm(formula = y_uncorr ~ x1_uncorr + x2_uncorr + x3_uncorr)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.18619 -0.55754  0.00533  0.49589  2.04906 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.16512    0.09052  -1.824   0.0712 .  
## x1_uncorr    0.51332    0.08631   5.947 4.41e-08 ***
## x2_uncorr    2.07699    0.15415  13.474  < 2e-16 ***
## x3_uncorr    1.92276    0.15459  12.438  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8955 on 96 degrees of freedom
## Multiple R-squared:  0.9534, Adjusted R-squared:  0.952 
## F-statistic: 654.8 on 3 and 96 DF,  p-value: < 2.2e-16
# Part (iii): High correlation between x1 and (x2, x3) with small partial effects
# Define response y with small effects from x2 and x3
y_small_effects <- 0.5 * x1_high + 0.2 * x2_high + 0.2 * x3_high + rnorm(n)

# Run regressions
model_simple_small <- lm(y_small_effects ~ x1_high)
model_multiple_small <- lm(y_small_effects ~ x1_high + x2_high + x3_high)

# Print results
cat("\nPart (iii) Results:\n")
## 
## Part (iii) Results:
summary(model_simple_small)
## 
## Call:
## lm(formula = y_small_effects ~ x1_high)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.69632 -0.81329 -0.04124  0.72521  3.06668 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.05009    0.10717   0.467    0.641    
## x1_high      0.68739    0.09986   6.884 5.58e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.072 on 98 degrees of freedom
## Multiple R-squared:  0.3259, Adjusted R-squared:  0.3191 
## F-statistic: 47.39 on 1 and 98 DF,  p-value: 5.58e-10
summary(model_multiple_small)
## 
## Call:
## lm(formula = y_small_effects ~ x1_high + x2_high + x3_high)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.41668 -0.76752 -0.08112  0.66795  2.90409 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.07485    0.10380   0.721   0.4726  
## x1_high      0.15908    0.19921   0.799   0.4265  
## x2_high      0.14720    0.19463   0.756   0.4513  
## x3_high      0.52075    0.21480   2.424   0.0172 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.033 on 96 degrees of freedom
## Multiple R-squared:  0.3869, Adjusted R-squared:  0.3678 
## F-statistic:  20.2 on 3 and 96 DF,  p-value: 3.141e-10
# Part (iv): x1 uncorrelated with x2 and x3 with large partial effects, x2 and x3 highly correlated
# Define response y with large effects from x2 and x3
y_large_uncorr <- 0.5 * x1_uncorr + 2 * x2_uncorr + 2 * x3_uncorr + rnorm(n)

# Run regressions
model_simple_large <- lm(y_large_uncorr ~ x1_uncorr)
model_multiple_large <- lm(y_large_uncorr ~ x1_uncorr + x2_uncorr + x3_uncorr)

# Print results
cat("\nPart (iv) Results:\n")
## 
## Part (iv) Results:
summary(model_simple_large)
## 
## Call:
## lm(formula = y_large_uncorr ~ x1_uncorr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7812 -2.2486 -0.2126  1.8324 12.5450 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  -0.3639     0.3969  -0.917   0.3615  
## x1_uncorr     0.9096     0.3752   2.425   0.0172 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.969 on 98 degrees of freedom
## Multiple R-squared:  0.05659,    Adjusted R-squared:  0.04696 
## F-statistic: 5.878 on 1 and 98 DF,  p-value: 0.01716
summary(model_multiple_large)
## 
## Call:
## lm(formula = y_large_uncorr ~ x1_uncorr + x2_uncorr + x3_uncorr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2860 -0.8163  0.1298  0.8934  3.1473 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.0762     0.1200   0.635  0.52689    
## x1_uncorr     0.3597     0.1144   3.144  0.00222 ** 
## x2_uncorr     1.9335     0.2043   9.463 2.12e-15 ***
## x3_uncorr     2.0109     0.2049   9.813 3.74e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.187 on 96 degrees of freedom
## Multiple R-squared:  0.9174, Adjusted R-squared:  0.9148 
## F-statistic: 355.2 on 3 and 96 DF,  p-value: < 2.2e-16
## Chapter 3 Question C8

# Load the discrim dataset from the wooldridge package
data("discrim", package = "wooldridge")

# Preview the data structure
str(discrim)
## 'data.frame':    410 obs. of  37 variables:
##  $ psoda   : num  1.12 1.06 1.06 1.12 1.12 ...
##  $ pfries  : num  1.06 0.91 0.91 1.02 NA ...
##  $ pentree : num  1.02 0.95 0.98 1.06 0.49 ...
##  $ wagest  : num  4.25 4.75 4.25 5 5 ...
##  $ nmgrs   : num  3 3 3 4 3 4 3 3 4 3 ...
##  $ nregs   : int  5 3 5 5 3 4 2 5 4 5 ...
##  $ hrsopen : num  16 16.5 18 16 16 15 16 17 17 18 ...
##  $ emp     : num  27.5 21.5 30 27.5 5 17.5 22.5 18.5 17 27 ...
##  $ psoda2  : num  1.11 1.05 1.05 1.15 1.04 ...
##  $ pfries2 : num  1.11 0.89 0.94 1.05 1.01 ...
##  $ pentree2: num  1.05 0.95 0.98 1.05 0.58 ...
##  $ wagest2 : num  5.05 5.05 5.05 5.05 5.05 ...
##  $ nmgrs2  : num  5 4 4 4 3 3 3 3 4 6 ...
##  $ nregs2  : int  5 3 5 5 3 4 2 5 4 5 ...
##  $ hrsopen2: num  15 17.5 17.5 16 16 15 16 16 18 17 ...
##  $ emp2    : num  27 24.5 25 NA 12 28 18.5 17 34 22 ...
##  $ compown : int  1 0 0 0 0 0 0 1 0 1 ...
##  $ chain   : int  3 1 1 3 1 1 1 3 1 3 ...
##  $ density : num  4030 4030 11400 8345 720 ...
##  $ crmrte  : num  0.0529 0.0529 0.036 0.0484 0.0616 ...
##  $ state   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ prpblck : num  0.1712 0.1712 0.0474 0.0528 0.0345 ...
##  $ prppov  : num  0.0366 0.0366 0.0879 0.0591 0.0254 ...
##  $ prpncar : num  0.0788 0.0788 0.2694 0.1367 0.0738 ...
##  $ hseval  : num  148300 148300 169200 171600 249100 ...
##  $ nstores : int  3 3 3 3 1 2 1 1 5 5 ...
##  $ income  : num  44534 44534 41164 50366 72287 ...
##  $ county  : int  18 18 12 10 10 18 10 24 10 10 ...
##  $ lpsoda  : num  0.1133 0.0583 0.0583 0.1133 0.1133 ...
##  $ lpfries : num  0.0583 -0.0943 -0.0943 0.0198 NA ...
##  $ lhseval : num  11.9 11.9 12 12.1 12.4 ...
##  $ lincome : num  10.7 10.7 10.6 10.8 11.2 ...
##  $ ldensity: num  8.3 8.3 9.34 9.03 6.58 ...
##  $ NJ      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ BK      : int  0 1 1 0 1 1 1 0 1 0 ...
##  $ KFC     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ RR      : int  1 0 0 1 0 0 0 1 0 1 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
# Part (i): Find average values and standard deviations of prpblck and income
mean_prpblck <- mean(discrim$prpblck, na.rm = TRUE)
sd_prpblck <- sd(discrim$prpblck, na.rm = TRUE)
mean_income <- mean(discrim$income, na.rm = TRUE)
sd_income <- sd(discrim$income, na.rm = TRUE)

cat("Part (i) Results:\n")
## Part (i) Results:
cat("Mean of prpblck:", mean_prpblck, "\n")
## Mean of prpblck: 0.1134864
cat("Standard deviation of prpblck:", sd_prpblck, "\n")
## Standard deviation of prpblck: 0.1824165
cat("Mean of income:", mean_income, "\n")
## Mean of income: 47053.78
cat("Standard deviation of income:", sd_income, "\n")
## Standard deviation of income: 13179.29
# Part (ii): OLS model to predict psoda using prpblck and income
model_ii <- lm(psoda ~ prpblck + income, data = discrim)
cat("\nPart (ii) OLS Regression Results:\n")
## 
## Part (ii) OLS Regression Results:
summary(model_ii)
## 
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29401 -0.05242  0.00333  0.04231  0.44322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.563e-01  1.899e-02  50.354  < 2e-16 ***
## prpblck     1.150e-01  2.600e-02   4.423 1.26e-05 ***
## income      1.603e-06  3.618e-07   4.430 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08611 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06422,    Adjusted R-squared:  0.05952 
## F-statistic: 13.66 on 2 and 398 DF,  p-value: 1.835e-06
# Part (iii): Simple regression of psoda on prpblck
model_simple <- lm(psoda ~ prpblck, data = discrim)
cat("\nPart (iii) Simple Regression Results:\n")
## 
## Part (iii) Simple Regression Results:
summary(model_simple)
## 
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30884 -0.05963  0.01135  0.03206  0.44840 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.03740    0.00519  199.87  < 2e-16 ***
## prpblck      0.06493    0.02396    2.71  0.00702 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0881 on 399 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.01808,    Adjusted R-squared:  0.01561 
## F-statistic: 7.345 on 1 and 399 DF,  p-value: 0.007015
# Compare the coefficient of prpblck from Parts (ii) and (iii): the estimated
# effect is smaller when income is omitted, consistent with prpblck and income
# being negatively correlated across zip codes
cat("\nComparison of prpblck coefficient:\n")
## 
## Comparison of prpblck coefficient:
cat("Model with income:", coef(model_ii)["prpblck"], "\n")
## Model with income: 0.1149882
cat("Simple model:", coef(model_simple)["prpblck"], "\n")
## Simple model: 0.06492687
# Part (iv): Log transformation model
model_iv <- lm(log(psoda) ~ prpblck + log(income), data = discrim)
cat("\nPart (iv) Log-Transformed Model Results:\n")
## 
## Part (iv) Log-Transformed Model Results:
summary(model_iv)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33563 -0.04695  0.00658  0.04334  0.35413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.79377    0.17943  -4.424 1.25e-05 ***
## prpblck      0.12158    0.02575   4.722 3.24e-06 ***
## log(income)  0.07651    0.01660   4.610 5.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0821 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06809,    Adjusted R-squared:  0.06341 
## F-statistic: 14.54 on 2 and 398 DF,  p-value: 8.039e-07
# Calculate the percentage change in psoda for a 0.20 increase in prpblck
percentage_change <- coef(model_iv)["prpblck"] * 0.20 * 100
cat("Estimated percentage change in psoda for a 20 percentage point increase in prpblck:", percentage_change, "%\n")
## Estimated percentage change in psoda for a 20 percentage point increase in prpblck: 2.431605 %
# Part (v): Add prppov to the regression model in Part (iv)
model_v <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
cat("\nPart (v) Model Results with prppov included:\n")
## 
## Part (v) Model Results with prppov included:
summary(model_v)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32218 -0.04648  0.00651  0.04272  0.35622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.46333    0.29371  -4.982  9.4e-07 ***
## prpblck      0.07281    0.03068   2.373   0.0181 *  
## log(income)  0.13696    0.02676   5.119  4.8e-07 ***
## prppov       0.38036    0.13279   2.864   0.0044 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08137 on 397 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.08696,    Adjusted R-squared:  0.08006 
## F-statistic:  12.6 on 3 and 397 DF,  p-value: 6.917e-08
# Part (vi): Find correlation between log(income) and prppov
correlation <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")
cat("\nPart (vi) Correlation between log(income) and prppov:", correlation, "\n")
## 
## Part (vi) Correlation between log(income) and prppov: -0.838467
# Part (vii): Comment on multicollinearity
cat("\nPart (vii) Comment:\n")
## 
## Part (vii) Comment:
cat("Because log(income) and prppov are highly correlated, multicollinearity may inflate the standard errors for the regression coefficients in the model. This can make it difficult to interpret the individual effects of these variables.\n")
## Because log(income) and prppov are highly correlated, multicollinearity may inflate the standard errors for the regression coefficients in the model. This can make it difficult to interpret the individual effects of these variables.
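# A rough numeric supplement (a sketch using only base R): the variance inflation
# factor for log(income), computed from an auxiliary regression on the other regressors
aux <- lm(log(income) ~ prpblck + prppov, data = discrim)
vif_log_income <- 1 / (1 - summary(aux)$r.squared)
cat("Approximate VIF for log(income):", vif_log_income, "\n")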
## Chapter 4 Question 3
data("rdchem", package = "wooldridge")
head(rdchem)
##      rd  sales profits rdintens  profmarg     salessq   lsales       lrd
## 1 430.6 4570.2   186.9 9.421906  4.089536 20886730.00 8.427312 6.0651798
## 2  59.0 2830.0   467.0 2.084806 16.501766  8008900.00 7.948032 4.0775375
## 3  23.5  596.8   107.4 3.937668 17.995979   356170.22 6.391582 3.1570003
## 4   3.5  133.6    -4.3 2.619760 -3.218563    17848.96 4.894850 1.2527629
## 5   1.7   42.0     8.0 4.047619 19.047619     1764.00 3.737670 0.5306283
## 6   8.4  390.0    47.3 2.153846 12.128205   152100.00 5.966147 2.1282318
model <- lm(rdintens ~ log(sales) + profmarg, data = rdchem)
summary(model)
## 
## Call:
## lm(formula = rdintens ~ log(sales) + profmarg, data = rdchem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3016 -1.2707 -0.6895  0.8785  6.0369 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.47225    1.67606   0.282    0.780
## log(sales)   0.32135    0.21557   1.491    0.147
## profmarg     0.05004    0.04578   1.093    0.283
## 
## Residual standard error: 1.839 on 29 degrees of freedom
## Multiple R-squared:  0.09847,    Adjusted R-squared:  0.0363 
## F-statistic: 1.584 on 2 and 29 DF,  p-value: 0.2224
# (i) Estimated change in rdintens for a 10% increase in sales
# rdintens = b0 + b1*log(sales) + ..., so a 10% rise in sales changes rdintens by
# approximately b1 * (10/100) percentage points
log_sales_coeff <- coef(model)["log(sales)"]
percentage_point_change <- log_sales_coeff * 0.10
percentage_point_change
## log(sales) 
## 0.03213484
# (ii) Test whether log(sales) is statistically significant
# (its p-value appears in the summary output above)
p_value_log_sales <- summary(model)$coefficients["log(sales)", "Pr(>|t|)"]
# Check significance at 5% and 10% levels
significant_5_percent <- p_value_log_sales < 0.05
significant_10_percent <- p_value_log_sales < 0.10
significant_5_percent
## [1] FALSE
significant_10_percent
## [1] FALSE
# (iii) Practical (economic) significance of profmarg
profmarg_coeff <- coef(model)["profmarg"]
profmarg_coeff
##  profmarg 
## 0.0500367
# A one percentage point increase in profmarg is associated with only about a 0.05
# percentage point increase in rdintens, so profitability has a modest effect on
# R&D intensity.
# (iv) Statistical significance of profmarg
p_value_profmarg <- summary(model)$coefficients["profmarg", "Pr(>|t|)"]
# Check significance at 5% and 10% levels
significant_profmarg_5_percent <- p_value_profmarg < 0.05
significant_profmarg_10_percent <- p_value_profmarg < 0.10
significant_profmarg_5_percent
## [1] FALSE
significant_profmarg_10_percent
## [1] FALSE
## Chapter 4 Question C8

# Load the k401ksubs dataset and filter for single-person households
data("k401ksubs", package = "wooldridge")
single_person <- subset(k401ksubs, fsize == 1)

# (i) Count the number of single-person households
num_single_person <- nrow(single_person)
cat("Number of single-person households:", num_single_person, "\n")
## Number of single-person households: 2017
# (ii) Estimate the model nettfa = β0 + β1 * inc + β2 * age + u
model <- lm(nettfa ~ inc + age, data = single_person)
summary(model)
## 
## Call:
## lm(formula = nettfa ~ inc + age, data = single_person)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -179.95  -14.16   -3.42    6.03 1113.94 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -43.03981    4.08039 -10.548   <2e-16 ***
## inc           0.79932    0.05973  13.382   <2e-16 ***
## age           0.84266    0.09202   9.158   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared:  0.1193, Adjusted R-squared:  0.1185 
## F-statistic: 136.5 on 2 and 2014 DF,  p-value: < 2.2e-16
# (iii) Interpret the intercept: it is the predicted nettfa for a person with
# inc = 0 and age = 0, a point far outside the sample, so on its own it has no
# useful interpretation
intercept <- coef(model)["(Intercept)"]
cat("Intercept (β0):", intercept, "\n")
## Intercept (β0): -43.03981
# (iv) Test H0: β2 = 1 against H1: β2 < 1
beta2 <- coef(model)["age"]
se_beta2 <- summary(model)$coefficients["age", "Std. Error"]
t_stat <- (beta2 - 1) / se_beta2
p_value <- pt(t_stat, df = model$df.residual)  # One-sided test
cat("t-statistic:", t_stat, "\n")
## t-statistic: -1.709944
cat("p-value for H0: β2 = 1 against H1: β2 < 1:", p_value, "\n")
## p-value for H0: β2 = 1 against H1: β2 < 1: 0.04371514
# (v) Simple regression of nettfa on inc
simple_model <- lm(nettfa ~ inc, data = single_person)
summary(simple_model)
## 
## Call:
## lm(formula = nettfa ~ inc, data = single_person)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -185.12  -12.85   -4.85    1.78 1112.66 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -10.5709     2.0607   -5.13 3.18e-07 ***
## inc           0.8207     0.0609   13.48  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.59 on 2015 degrees of freedom
## Multiple R-squared:  0.08267,    Adjusted R-squared:  0.08222 
## F-statistic: 181.6 on 1 and 2015 DF,  p-value: < 2.2e-16
# Compare coefficients on inc
coef_simple_inc <- coef(simple_model)["inc"]
coef_full_inc <- coef(model)["inc"]
cat("Coefficient on inc in simple model:", coef_simple_inc, "\n")
## Coefficient on inc in simple model: 0.8206815
cat("Coefficient on inc in full model:", coef_full_inc, "\n")
## Coefficient on inc in full model: 0.7993167
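# The gap between the two estimates follows the omitted-variable-bias formula
# (a sketch): coef_simple = coef_full + b_age * delta, where delta is the slope
# from regressing the omitted variable age on inc
delta <- coef(lm(age ~ inc, data = single_person))["inc"]
cat("Implied simple-model coefficient:", coef_full_inc + coef(model)["age"] * delta, "\n")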
## Chapter 5 Question 5
# Load necessary library
library(ggplot2)

# Simulate illustrative data: suppose scores are roughly normal around 70 with a
# standard deviation of 15 (actual course scores are bounded above by 100, so
# exact normality is impossible; the normal curve can only be an approximation)
set.seed(42) # for reproducibility
scores <- rnorm(1000, mean = 70, sd = 15)

# Plot histogram with normal curve
ggplot(data.frame(scores), aes(x = scores)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "skyblue", color = "black") +
  stat_function(fun = dnorm, args = list(mean = mean(scores), sd = sd(scores)), color = "blue", linewidth = 1) +
  labs(x = "Course score (in percentage form)", y = "Density") +
  theme_minimal()

## Chapter 5 Question C1
# Load necessary libraries
library(wooldridge) # For the wage1 dataset
library(ggplot2)    # For plotting
data("wage1", package = "wooldridge")
# (i) Estimate the equation: wage = β0 + β1*educ + β2*exper + β3*tenure + u
model1 <- lm(wage ~ educ + exper + tenure, data = wage1)

# Print summary of the model
summary(model1)
## 
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6068 -1.7747 -0.6279  1.1969 14.6536 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.87273    0.72896  -3.941 9.22e-05 ***
## educ         0.59897    0.05128  11.679  < 2e-16 ***
## exper        0.02234    0.01206   1.853   0.0645 .  
## tenure       0.16927    0.02164   7.820 2.93e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared:  0.3064, Adjusted R-squared:  0.3024 
## F-statistic: 76.87 on 3 and 522 DF,  p-value: < 2.2e-16
# Save the residuals and plot a histogram
residuals1 <- residuals(model1)

# Plot histogram of residuals for the level-level model
ggplot(data.frame(residuals1), aes(x = residuals1)) +
    geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "skyblue", color = "black") +
    labs(x = "Residuals", y = "Density", title = "Histogram of Residuals (Level-Level Model)") +
    theme_minimal()

# (ii) Estimate the equation with log(wage) as the dependent variable
model2 <- lm(log(wage) ~ educ + exper + tenure, data = wage1)

# Print summary of the model
summary(model2)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.05802 -0.29645 -0.03265  0.28788  1.42809 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.284360   0.104190   2.729  0.00656 ** 
## educ        0.092029   0.007330  12.555  < 2e-16 ***
## exper       0.004121   0.001723   2.391  0.01714 *  
## tenure      0.022067   0.003094   7.133 3.29e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared:  0.316,  Adjusted R-squared:  0.3121 
## F-statistic: 80.39 on 3 and 522 DF,  p-value: < 2.2e-16
# Save the residuals and plot a histogram
residuals2 <- residuals(model2)

# Plot histogram of residuals for the log-level model
ggplot(data.frame(residuals2), aes(x = residuals2)) +
    geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "skyblue", color = "black") +
    labs(x = "Residuals", y = "Density", title = "Histogram of Residuals (Log-Level Model)") +
    theme_minimal()

# (iii) Compare the residuals to assess Assumption MLR.6 (normality of errors)
# The level-level residuals are clearly right-skewed, while the log-level residuals
# are more symmetric, so MLR.6 comes closer to holding for the log(wage) model.
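# A numeric supplement to the visual comparison (a sketch; shapiro.test() is in
# base R's stats package): smaller W statistics and tiny p-values indicate
# stronger departures from normality
shapiro.test(residuals1)  # residuals from the level-level model
shapiro.test(residuals2)  # residuals from the log-level model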