library(wooldridge)
#Chapter2 C9
data("countymurders", package = "wooldridge")
dataset_1996 <- subset(countymurders, year == 1996)
#how many counties had zero murders in 1996
zero_murders_count <- sum(dataset_1996$murders == 0)
cat("Counties with zero murders:", zero_murders_count, "\n")
## Counties with zero murders: 1051
#how many counties had at least one execution
at_least_one_execution <- sum(dataset_1996$execs > 0)
cat("Counties with at least one execution:", at_least_one_execution, "\n")
## Counties with at least one execution: 31
#the largest number of executions
max_executions <- max(dataset_1996$execs)
cat("Largest number of executions:", max_executions, "\n")
## Largest number of executions: 3
# (ii) 
model <- lm(murders ~ execs, data = dataset_1996)
summary(model)
## 
## Call:
## lm(formula = murders ~ execs, data = dataset_1996)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -149.12   -5.46   -4.46   -2.46 1338.99 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.4572     0.8348   6.537 7.79e-11 ***
## execs        58.5555     5.8333  10.038  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38.89 on 2195 degrees of freedom
## Multiple R-squared:  0.04389,    Adjusted R-squared:  0.04346 
## F-statistic: 100.8 on 1 and 2195 DF,  p-value: < 2.2e-16
# (iii)
slope_coeff <- coef(model)["execs"]
cat("Slope coefficient (β1) for executions:", slope_coeff, "\n")
## Slope coefficient (β1) for executions: 58.55548
if (slope_coeff < 0) {
  cat("The negative slope coefficient suggests a potential deterrent effect of executions on murders.\n")
} else if (slope_coeff > 0) {
  cat("The positive slope coefficient suggests that executions are not deterring murders and may be associated with a higher murder rate.\n")
} else {
  cat("A slope coefficient close to zero suggests no relationship between executions and the murder rate.\n")
}
## The positive slope coefficient suggests that executions do not deter murders; counties with more executions also tend to have more murders.
# (iv) 
predicted_murders <- predict(model)
smallest_predicted_murders <- min(predicted_murders)
cat("Smallest predicted number of murders:", smallest_predicted_murders, "\n")
## Smallest predicted number of murders: 5.457241
predicted_zero_exec <- predict(model, newdata = data.frame(execs = 0))
cat("Predicted murders for zero executions:", predicted_zero_exec, "\n")
## Predicted murders for zero executions: 5.457241
residual_zero_exec <- 0 - predicted_zero_exec
cat("Residual for zero executions and zero murders:", residual_zero_exec, "\n")
## Residual for zero executions and zero murders: -5.457241
# (v)
#A simple regression is poorly suited to answering whether capital punishment deters murder because it controls for nothing else: counties differ greatly in population, demographics, and economic conditions, all of which affect the number of murders and may be correlated with the use of executions (see the quick comparison below).
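#A quick, hedged check of this concern (not part of the textbook question): counties that carried out executions already differ sharply from those that did not.
mean_murders_by_exec <- tapply(dataset_1996$murders, dataset_1996$execs > 0, mean)
mean_murders_by_exec  # average murders in counties without vs. with executions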


#Chapter3 5
# (i)
total_hours <- 168
study <- 40
sleep <- 56
work <- 40
leisure <- total_hours - study - sleep - work
cat("Initial allocation:\n")
## Initial allocation:
cat("Study:", study, "Sleep:", sleep, "Work:", work, "Leisure:", leisure, "\n")
## Study: 40 Sleep: 56 Work: 40 Leisure: 32
study <- 45
leisure <- total_hours - study - sleep - work
cat("\nNew allocation after increasing study hours:\n")
## 
## New allocation after increasing study hours:
cat("Study:", study, "Sleep:", sleep, "Work:", work, "Leisure:", leisure, "\n")
## Study: 45 Sleep: 56 Work: 40 Leisure: 27
#(ii)
data <- data.frame(
  GPA = c(3.5, 3.8, 2.9, 3.2),
  study = c(40, 35, 30, 45),
  sleep = c(56, 60, 54, 50),
  work = c(40, 45, 50, 40),
  leisure = c(32, 28, 34, 33) # This ensures sum is 168
)
model <- lm(GPA ~ study + sleep + work + leisure, data = data)
model_matrix <- model.matrix(model)
rank_deficiency <- ncol(model_matrix) - qr(model_matrix)$rank
cat("Rank deficiency (0 means no multicollinearity):", rank_deficiency, "\n")
## Rank deficiency (0 means no multicollinearity): 1
#A rank deficiency greater than 0 indicates perfect multicollinearity, confirming the violation of Assumption MLR.3.
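#An alternative (hedged) diagnostic: alias() from base R reports the exact linear dependency; here it should show that leisure is determined by the other time uses, since they always sum to 168.
alias(model)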
# (iii) Reformulate the model by dropping 'leisure' to break the exact linear dependence
model_reformulated <- lm(GPA ~ study + sleep + work, data = data)
summary(model_reformulated)
## 
## Call:
## lm(formula = GPA ~ study + sleep + work, data = data)
## 
## Residuals:
## ALL 4 residuals are 0: no residual degrees of freedom!
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -8.500        NaN     NaN      NaN
## study          0.084        NaN     NaN      NaN
## sleep          0.120        NaN     NaN      NaN
## work           0.048        NaN     NaN      NaN
## 
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 3 and 0 DF,  p-value: NA
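#Note: with four observations and four estimated coefficients there are no residual degrees of freedom, so the standard errors, t statistics, and p-values above are NaN. Dropping leisure fixes the perfect collinearity, but a larger sample would be needed for inference.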
#Chapter3 10
# (i) x1 is highly correlated with x2 and x3, and all three have sizable effects on y
set.seed(123)
n <- 100  # Sample size
x2 <- rnorm(n, mean = 50, sd = 10)
x3 <- rnorm(n, mean = 50, sd = 10)
x1 <- 0.8 * x2 + 0.8 * x3 + rnorm(n, mean = 0, sd = 5)
y <- 3 + 2 * x1 + 1.5 * x2 + 1.5 * x3 + rnorm(n)
model_simple <- lm(y ~ x1)
summary(model_simple)
## 
## Call:
## lm(formula = y ~ x1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.057  -5.537   1.373   5.271  16.219 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.4382     6.2049    4.10 8.54e-05 ***
## x1            3.5817     0.0764   46.88  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.445 on 98 degrees of freedom
## Multiple R-squared:  0.9573, Adjusted R-squared:  0.9569 
## F-statistic:  2198 on 1 and 98 DF,  p-value: < 2.2e-16
model_multiple <- lm(y ~ x1 + x2 + x3)
summary(model_multiple)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.49138 -0.65392  0.05664  0.67033  2.53210 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.02684    0.82643   3.663 0.000409 ***
## x1           1.98852    0.02245  88.591  < 2e-16 ***
## x2           1.50364    0.02013  74.682  < 2e-16 ***
## x3           1.51380    0.02126  71.215  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.052 on 96 degrees of freedom
## Multiple R-squared:  0.9994, Adjusted R-squared:  0.9993 
## F-statistic: 4.933e+04 on 3 and 96 DF,  p-value: < 2.2e-16
#The simple-regression and multiple-regression estimates of the slope on x1 are very different (about 3.58 vs. 1.99, with a true value of 2). Because x1 is highly correlated with x2 and x3, the simple regression attributes part of their effects to x1, inflating its estimate; the multiple regression controls for x2 and x3 (the omitted-variable-bias check below makes this explicit).
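#A hedged check of the omitted-variable-bias algebra behind this comparison (a sketch, not part of the assignment): the simple-regression slope equals the multiple-regression slope on x1 plus the other slopes weighted by auxiliary regressions of x2 and x3 on x1.
delta2 <- coef(lm(x2 ~ x1))["x1"]  # slope from regressing x2 on x1
delta3 <- coef(lm(x3 ~ x1))["x1"]  # slope from regressing x3 on x1
implied_simple_slope <- coef(model_multiple)["x1"] +
  coef(model_multiple)["x2"] * delta2 +
  coef(model_multiple)["x3"] * delta3
c(implied = unname(implied_simple_slope), actual = unname(coef(model_simple)["x1"]))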
#(ii) x1 is uncorrelated with x2 and x3; x2 and x3 are highly correlated with each other
x1 <- rnorm(n, mean = 50, sd = 10)
x2 <- rnorm(n, mean = 50, sd = 10)
x3 <- rnorm(n, mean = 50, sd = 10)
x3 <- 0.9 * x2 + rnorm(n, mean = 0, sd = 5)
y <- 3 + 2 * x1 + 1.5 * x2 + 1.5 * x3 + rnorm(n)
model_simple <- lm(y ~ x1)
summary(model_simple)
## 
## Call:
## lm(formula = y ~ x1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -76.848 -15.788   1.373  14.381  98.649 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 122.3676    14.9890   8.164 1.13e-12 ***
## x1            2.4468     0.2883   8.488 2.28e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.38 on 98 degrees of freedom
## Multiple R-squared:  0.4237, Adjusted R-squared:  0.4178 
## F-statistic: 72.05 on 1 and 98 DF,  p-value: 2.278e-13
model_multiple <- lm(y ~ x1 + x2 + x3)
summary(model_multiple)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.55372 -0.76607 -0.04026  0.65924  2.67217 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.93155    0.72692   4.033 0.000111 ***
## x1           1.99358    0.01119 178.123  < 2e-16 ***
## x2           1.51322    0.02391  63.293  < 2e-16 ***
## x3           1.49633    0.02155  69.439  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.063 on 96 degrees of freedom
## Multiple R-squared:  0.9992, Adjusted R-squared:  0.9992 
## F-statistic: 4.034e+04 on 3 and 96 DF,  p-value: < 2.2e-16
#The two estimates of the slope on x1 should be similar: since x1 is (essentially) uncorrelated with x2 and x3, omitting them causes little or no bias, so both regressions estimate the effect of x1 on y. The simple-regression estimate is simply much noisier, which is why it drifts somewhat from the true value of 2 in this particular sample.
#(iii) x1 is highly correlated with x2 and x3, which have only small effects on y
x2 <- rnorm(n, mean = 50, sd = 10)
x3 <- rnorm(n, mean = 50, sd = 10)
x1 <- 0.8 * x2 + 0.8 * x3 + rnorm(n, mean = 0, sd = 5)
y <- 3 + 2 * x1 + 0.1 * x2 + 0.1 * x3 + rnorm(n)
model_simple <- lm(y ~ x1)
summary(model_simple)
## 
## Call:
## lm(formula = y ~ x1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9547 -0.7549  0.1422  0.7136  2.1417 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.785213   0.638447   5.929 4.57e-08 ***
## x1          2.115388   0.007806 271.008  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.036 on 98 degrees of freedom
## Multiple R-squared:  0.9987, Adjusted R-squared:  0.9987 
## F-statistic: 7.345e+04 on 1 and 98 DF,  p-value: < 2.2e-16
model_multiple <- lm(y ~ x1 + x2 + x3)
summary(model_multiple)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4459 -0.5747  0.1297  0.6005  1.9012 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.81166    0.60450   4.651 1.05e-05 ***
## x1           2.02651    0.01882 107.687  < 2e-16 ***
## x2           0.07488    0.01683   4.450 2.32e-05 ***
## x3           0.08628    0.01788   4.825 5.27e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9283 on 96 degrees of freedom
## Multiple R-squared:  0.999,  Adjusted R-squared:  0.9989 
## F-statistic: 3.048e+04 on 3 and 96 DF,  p-value: < 2.2e-16
#The standard error of the x1 slope is smaller in the simple regression than in the multiple regression (about 0.008 vs. 0.019). Because x2 and x3 contribute little to y, leaving them out costs almost nothing, while including them makes the high correlation between x1 and (x2, x3) inflate the variance of the x1 estimate (see the sketch below).
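#A hedged sketch of why the multiple-regression standard error is larger here: the R-squared from regressing x1 on x2 and x3 feeds the 1/(1 - R1^2) variance-inflation factor.
r2_x1 <- summary(lm(x1 ~ x2 + x3))$r.squared
c(R2_x1 = r2_x1, VIF_x1 = 1 / (1 - r2_x1))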
# (iv) x1 is uncorrelated with x2 and x3; x2 and x3 are highly correlated and have large effects on y (x1 does not enter the true model)
x1 <- rnorm(n, mean = 50, sd = 10)
x2 <- rnorm(n, mean = 50, sd = 10)
x3 <- rnorm(n, mean = 50, sd = 10)
x3 <- 0.9 * x2 + rnorm(n, mean = 0, sd = 5)
y <- 3 + 2 * x2 + 2 * x3 + rnorm(n)
model_simple <- lm(y ~ x1)
summary(model_simple)
## 
## Call:
## lm(formula = y ~ x1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -109.437  -26.369    3.274   30.181   97.046 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 190.4264    19.9683   9.536 1.23e-15 ***
## x1            0.1423     0.3961   0.359     0.72    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.95 on 98 degrees of freedom
## Multiple R-squared:  0.001315,   Adjusted R-squared:  -0.008875 
## F-statistic: 0.1291 on 1 and 98 DF,  p-value: 0.7202
model_multiple <- lm(y ~ x1 + x2 + x3)
summary(model_multiple)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1827 -0.8165 -0.0002  0.5229  2.5236 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.242297   0.726690   4.462 2.21e-05 ***
## x1          -0.006857   0.010330  -0.664    0.508    
## x2           2.006994   0.020856  96.233  < 2e-16 ***
## x3           1.995058   0.019112 104.389  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.038 on 96 degrees of freedom
## Multiple R-squared:  0.9993, Adjusted R-squared:  0.9993 
## F-statistic: 4.847e+04 on 3 and 96 DF,  p-value: < 2.2e-16
#The standard error of the x1 slope is much larger in the simple regression than in the multiple regression (about 0.40 vs. 0.01). Omitting x2 and x3 does not bias the x1 estimate, since x1 is uncorrelated with them, but their large contribution to y is left in the error term, which greatly inflates the error variance and hence the standard error; the multiple regression removes that variation (see the sketch below).
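#A hedged sketch of the mechanism here: leaving x2 and x3 out dumps their large contribution into the error term, so the estimated error standard deviation is far larger in the simple regression.
c(sigma_simple = summary(model_simple)$sigma, sigma_multiple = summary(model_multiple)$sigma)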


#Chapter3 C8
library(car)  # loaded for vif(), used in part (vii)
## Loading required package: carData
data("discrim")
#(i)
mean_prpblck <- mean(discrim$prpblck, na.rm = TRUE)
mean_income <- mean(discrim$income, na.rm = TRUE)
sd_prpblck <- sd(discrim$prpblck, na.rm = TRUE)
sd_income <- sd(discrim$income, na.rm = TRUE)
mean_prpblck
## [1] 0.1134864
mean_income
## [1] 47053.78
sd_prpblck
## [1] 0.1824165
sd_income
## [1] 13179.29
#(ii)
model_ols <- lm(psoda ~ prpblck + income, data = discrim)
summary(model_ols)
## 
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29401 -0.05242  0.00333  0.04231  0.44322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.563e-01  1.899e-02  50.354  < 2e-16 ***
## prpblck     1.150e-01  2.600e-02   4.423 1.26e-05 ***
## income      1.603e-06  3.618e-07   4.430 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08611 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06422,    Adjusted R-squared:  0.05952 
## F-statistic: 13.66 on 2 and 398 DF,  p-value: 1.835e-06
#The coefficient on prpblck (about 0.115) is not only statistically significant but also economically meaningful: moving prpblck from 0 to 1 is associated with an 11.5-cent higher soda price, roughly 10% of the average price (a rough magnitude check follows).
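#A hedged way to gauge "economically large" (not required by the exercise): the predicted price change for a one-standard-deviation increase in prpblck, in dollars and relative to the average soda price.
effect_1sd <- coef(model_ols)["prpblck"] * sd_prpblck
c(dollar_change = unname(effect_1sd),
  share_of_mean_price = unname(effect_1sd / mean(discrim$psoda, na.rm = TRUE)))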
#(iii)
model_simple <- lm(psoda ~ prpblck, data = discrim)
summary(model_simple)
## 
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30884 -0.05963  0.01135  0.03206  0.44840 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.03740    0.00519  199.87  < 2e-16 ***
## prpblck      0.06493    0.02396    2.71  0.00702 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0881 on 399 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.01808,    Adjusted R-squared:  0.01561 
## F-statistic: 7.345 on 1 and 399 DF,  p-value: 0.007015
#The estimated discrimination effect is larger when income is controlled for: the coefficient on prpblck is 0.115 in the multiple regression versus only 0.065 in the simple regression.
#(iv)
model_log <- lm(log(psoda) ~ prpblck + log(income), data = discrim)
summary(model_log)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33563 -0.04695  0.00658  0.04334  0.35413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.79377    0.17943  -4.424 1.25e-05 ***
## prpblck      0.12158    0.02575   4.722 3.24e-06 ***
## log(income)  0.07651    0.01660   4.610 5.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0821 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06809,    Adjusted R-squared:  0.06341 
## F-statistic: 14.54 on 2 and 398 DF,  p-value: 8.039e-07
coef_prpblck <- coef(model_log)["prpblck"]
percentage_change <- 0.20 * coef_prpblck
percentage_change
##    prpblck 
## 0.02431605
#For a 0.20 increase in prpblck, the estimated change in psoda is 0.20 * 0.1216 = 0.0243, i.e., roughly a 2.4% higher soda price.
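#An optional, hedged refinement: the exact percentage change implied by the log-linear model, next to the usual 100*beta*delta approximation used above.
c(approx_pct = unname(100 * 0.20 * coef_prpblck),
  exact_pct  = unname(100 * (exp(0.20 * coef_prpblck) - 1)))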
#(v)
model_prppov <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
summary(model_prppov)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32218 -0.04648  0.00651  0.04272  0.35622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.46333    0.29371  -4.982  9.4e-07 ***
## prpblck      0.07281    0.03068   2.373   0.0181 *  
## log(income)  0.13696    0.02676   5.119  4.8e-07 ***
## prppov       0.38036    0.13279   2.864   0.0044 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08137 on 397 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.08696,    Adjusted R-squared:  0.08006 
## F-statistic:  12.6 on 3 and 397 DF,  p-value: 6.917e-08
#The coefficient on prpblck falls (from about 0.122 to 0.073) once prppov is added.
#(vi)
correlation <- cor(log(discrim$income), discrim$prppov, use = "complete.obs")
correlation
## [1] -0.838467
#Yes; as expected, log(income) and prppov are strongly negatively correlated (about -0.84).
#(vii)
vif(model_prppov)
##     prpblck log(income)      prppov 
##    1.922160    3.518721    4.915220
#The statement is misleading: high correlation between log(income) and prppov inflates their standard errors, but it does not invalidate the regression. Each variable captures a different aspect of the income distribution, and including both can still yield useful estimates of the ceteris paribus effect of prpblck.


#Chapter4 3
data("rdchem", package = "wooldridge")
head(rdchem)
##      rd  sales profits rdintens  profmarg     salessq   lsales       lrd
## 1 430.6 4570.2   186.9 9.421906  4.089536 20886730.00 8.427312 6.0651798
## 2  59.0 2830.0   467.0 2.084806 16.501766  8008900.00 7.948032 4.0775375
## 3  23.5  596.8   107.4 3.937668 17.995979   356170.22 6.391582 3.1570003
## 4   3.5  133.6    -4.3 2.619760 -3.218563    17848.96 4.894850 1.2527629
## 5   1.7   42.0     8.0 4.047619 19.047619     1764.00 3.737670 0.5306283
## 6   8.4  390.0    47.3 2.153846 12.128205   152100.00 5.966147 2.1282318
model <- lm(rdintens ~ log(sales) + profmarg, data = rdchem)
summary(model)
## 
## Call:
## lm(formula = rdintens ~ log(sales) + profmarg, data = rdchem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3016 -1.2707 -0.6895  0.8785  6.0369 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.47225    1.67606   0.282    0.780
## log(sales)   0.32135    0.21557   1.491    0.147
## profmarg     0.05004    0.04578   1.093    0.283
## 
## Residual standard error: 1.839 on 29 degrees of freedom
## Multiple R-squared:  0.09847,    Adjusted R-squared:  0.0363 
## F-statistic: 1.584 on 2 and 29 DF,  p-value: 0.2224
#(i)
log_sales_coeff <- coef(model)["log(sales)"]
percentage_point_change <- log_sales_coeff * (10 / 100)  # beta1/100 gives the effect of a 1% change in sales
percentage_point_change
## log(sales) 
## 0.03213484 
#The coefficient on log(sales) is about 0.321, so a 1% increase in sales is estimated to raise rdintens by 0.321/100 = 0.0032 percentage points; a 10% increase in sales therefore raises rdintens by only about 0.032 percentage points. Relative to typical R&D intensities of a few percent in this sample (visible in head(rdchem) above), this is an economically small effect.
#(ii)
summary(model)
## 
## Call:
## lm(formula = rdintens ~ log(sales) + profmarg, data = rdchem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3016 -1.2707 -0.6895  0.8785  6.0369 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.47225    1.67606   0.282    0.780
## log(sales)   0.32135    0.21557   1.491    0.147
## profmarg     0.05004    0.04578   1.093    0.283
## 
## Residual standard error: 1.839 on 29 degrees of freedom
## Multiple R-squared:  0.09847,    Adjusted R-squared:  0.0363 
## F-statistic: 1.584 on 2 and 29 DF,  p-value: 0.2224
p_value_log_sales <- summary(model)$coefficients["log(sales)", "Pr(>|t|)"]
# Check significance at 5% and 10% levels
significant_5_percent <- p_value_log_sales < 0.05
significant_10_percent <- p_value_log_sales < 0.10
significant_5_percent
## [1] FALSE
significant_10_percent
## [1] FALSE
#With a two-sided p-value of 0.147, log(sales) is not statistically significant at the 5% level (0.147 > 0.05) or at the 10% level (0.147 > 0.10), so we do not reject the null hypothesis against a two-sided alternative (see the one-sided follow-up below).
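#Optional, hedged follow-up: the textbook alternative is one-sided (rdintens increases with sales), so the one-sided p-value is also worth reporting.
t_log_sales <- summary(model)$coefficients["log(sales)", "t value"]
p_one_sided <- pt(t_log_sales, df = model$df.residual, lower.tail = FALSE)
p_one_sided  # roughly half the two-sided p-value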
#(iii)
profmarg_coeff <- coef(model)["profmarg"]
profmarg_coeff
##  profmarg 
## 0.0500367
#A one-percentage-point increase in the profit margin is associated with only about a 0.05 percentage-point increase in rdintens, so profitability has a modest estimated impact on R&D intensity.
#(iv)
p_value_profmarg <- summary(model)$coefficients["profmarg", "Pr(>|t|)"]
# Check significance at 5% and 10% levels
significant_profmarg_5_percent <- p_value_profmarg < 0.05
significant_profmarg_10_percent <- p_value_profmarg < 0.10
significant_profmarg_5_percent
## [1] FALSE
significant_profmarg_10_percent
## [1] FALSE
#With a p-value of 0.283, the effect of profmarg is not statistically significant at the 5% level (0.283 > 0.05) or at the 10% level (0.283 > 0.10).


#Chapter4 C8
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data("k401ksubs", package = "wooldridge")
single_person <- subset(k401ksubs, fsize == 1)
#(i)
num_single_person <- nrow(single_person)
cat("Number of single-person households:", num_single_person, "\n")
## Number of single-person households: 2017
#number of single-person households is 2017
#(ii)
model <- lm(nettfa ~ inc + age, data = single_person)
summary_model <- summary(model)
print(summary_model)
## 
## Call:
## lm(formula = nettfa ~ inc + age, data = single_person)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -179.95  -14.16   -3.42    6.03 1113.94 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -43.03981    4.08039 -10.548   <2e-16 ***
## inc           0.79932    0.05973  13.382   <2e-16 ***
## age           0.84266    0.09202   9.158   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared:  0.1193, Adjusted R-squared:  0.1185 
## F-statistic: 136.5 on 2 and 2014 DF,  p-value: < 2.2e-16
#Coefficient on inc: about 0.80. Holding age fixed, an additional $1,000 of annual income is associated with about $799 more net financial wealth (nettfa and inc are both measured in thousands of dollars). Coefficient on age: about 0.84. Holding income fixed, each additional year of age is associated with about $843 more net financial wealth.
#(iii)
intercept <- coef(model)["(Intercept)"]
cat("Intercept (β0):", intercept, "\n")
## Intercept (β0): -43.03981
#The intercept says that a single-person household with zero income and age zero would have predicted net financial wealth of about -$43,040. This has no practical meaning, since no one in the sample is close to those values.
#(iv)
beta2 <- coef(model)["age"]
se_beta2 <- summary_model$coefficients["age", "Std. Error"]
t_stat <- (beta2 - 1) / se_beta2
p_value <- pt(t_stat, df = model$df.residual)  # One-sided test
cat("t-statistic:", t_stat, "\n")
## t-statistic: -1.709944
cat("p-value for H0: β2 = 1 against H1: β2 < 1:", p_value, "\n")
## p-value for H0: β2 = 1 against H1: β2 < 1: 0.04371514
#Since the one-sided p-value (about 0.044) is greater than 0.01, we fail to reject H0: beta2 = 1 at the 1% significance level, although we would reject at the 5% level (the critical-value check below confirms this).
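#Optional, hedged check: compare the t statistic with the one-sided 1% critical value; t does not fall below it, which is another way to see that H0 is not rejected at the 1% level.
crit_1pct <- qt(0.01, df = model$df.residual)  # lower-tail critical value (about -2.33)
c(t_stat = unname(t_stat), critical_value = crit_1pct)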
#(v)
simple_model <- lm(nettfa ~ inc, data = single_person)
summary_simple_model <- summary(simple_model)
print(summary_simple_model)
## 
## Call:
## lm(formula = nettfa ~ inc, data = single_person)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -185.12  -12.85   -4.85    1.78 1112.66 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -10.5709     2.0607   -5.13 3.18e-07 ***
## inc           0.8207     0.0609   13.48  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.59 on 2015 degrees of freedom
## Multiple R-squared:  0.08267,    Adjusted R-squared:  0.08222 
## F-statistic: 181.6 on 1 and 2015 DF,  p-value: < 2.2e-16
coef_simple_inc <- coef(simple_model)["inc"]
coef_full_inc <- coef(model)["inc"]
cat("Coefficient on inc in simple model:", coef_simple_inc, "\n")
## Coefficient on inc in simple model: 0.8206815
cat("Coefficient on inc in full model:", coef_full_inc, "\n")
## Coefficient on inc in full model: 0.7993167
#Coefficient on inc: about 0.821 in the simple model versus 0.799 in the full model. The two estimates are very close, so adding age barely changes the estimated effect of income; this suggests that inc and age are only weakly correlated in this sample.


#Chapter5 5
data("econmath")
library(ggplot2)
ggplot(econmath, aes(x = score)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "skyblue", color = "black", alpha = 0.7) +
  stat_function(fun = dnorm, 
                args = list(mean = mean(econmath$score, na.rm = TRUE), 
                            sd = sd(econmath$score, na.rm = TRUE)), 
                color = "blue", linewidth = 1) +
  labs(title = "Histogram of Course Scores with Normal Distribution Fit",
       x = "Course Score (in percentage form)",
       y = "Density") +
  theme_minimal()

p_over_100 <- 1 - pnorm(100, mean = mean(econmath$score, na.rm = TRUE), sd = sd(econmath$score, na.rm = TRUE))
print(p_over_100)
## [1] 0.02044288
#(i)
#No, the probability would not be zero: a fitted normal distribution assigns a small positive probability (about 2% here) to scores above 100. This contradicts the normality assumption for score, because course scores are bounded above at 100% and no student can exceed that.
#The histogram shows a thicker left tail than the fitted normal curve, indicating more low scores (20–50) than expected for a normal distribution. This mismatch suggests that the data might be skewed, with more extreme low scores than a normal distribution would predict. Consequently, a normal distribution may not accurately model this data, particularly in the lower range.
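#A hedged way to quantify the asymmetry described above (not required by the exercise): the sample skewness of score, computed by hand to avoid extra packages. A clearly negative value would be consistent with the thicker left tail in the histogram.
s <- econmath$score[!is.na(econmath$score)]
mean(((s - mean(s)) / sd(s))^3)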


#Chapter5 C1
library(ggplot2)
data("wage1")
#(i)
model1 <- lm(wage ~ educ + exper + tenure, data = wage1)
residuals1 <- resid(model1)
ggplot(data = data.frame(residuals = residuals1), aes(x = residuals)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Residuals", x = "Residuals", y = "Frequency") +
  theme_minimal()

#(ii) log(wage)
model2 <- lm(log(wage) ~ educ + exper + tenure, data = wage1)
summary(model2)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.05802 -0.29645 -0.03265  0.28788  1.42809 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.284360   0.104190   2.729  0.00656 ** 
## educ        0.092029   0.007330  12.555  < 2e-16 ***
## exper       0.004121   0.001723   2.391  0.01714 *  
## tenure      0.022067   0.003094   7.133 3.29e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared:  0.316,  Adjusted R-squared:  0.3121 
## F-statistic: 80.39 on 3 and 522 DF,  p-value: < 2.2e-16
residuals2 <- residuals(model2)
ggplot(data.frame(residuals2), aes(x = residuals2)) +
  geom_histogram(bins = 20, fill = "green", color = "black") +
  labs(title = "Histogram of Residuals for Log(Wage) Model", x = "Residuals", y = "Frequency")

#(iii) Assumption MLR.6 (normality of the errors) is closer to being satisfied for the log(wage) model: its residual histogram is more symmetric and bell-shaped than that of the level model (see the optional Q-Q plots below).
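#Optional, hedged visual check of (iii): normal Q-Q plots of the two residual series make the MLR.6 comparison more direct than histograms.
par(mfrow = c(1, 2))
qqnorm(residuals1, main = "wage model residuals"); qqline(residuals1)
qqnorm(residuals2, main = "log(wage) model residuals"); qqline(residuals2)
par(mfrow = c(1, 1))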