install.packages("wooldridge")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

#Chapter2

library(wooldridge)
data("countymurders")
data("countymurders")
countymurders_1996 <- subset(countymurders, year == 1996)
#i. How many counties had zero murders in 1996?
count_zero_murders <- sum(countymurders_1996$murders == 0)
print(count_zero_murders)
## [1] 1051
#How many counties had at least one execution?
count_at_least_one_execution <- sum(countymurders_1996$execs >= 1)
print(count_at_least_one_execution)
## [1] 31
#What is the largest number of executions?
largest_execution_count <- max(countymurders_1996$execs)
print(largest_execution_count)
## [1] 3
#ii. Estimate the model murders = B0 + B1execs + u by OLS.
# (Note: this regression is run on the full panel, as the output below shows; the 1996 cross-section is in countymurders_1996.)
model <- lm(murders ~ execs, data = countymurders)
summary(model)
## 
## Call:
## lm(formula = murders ~ execs, data = countymurders)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -202.23   -6.84   -5.84   -3.84 1937.16 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.8382     0.2418   28.28   <2e-16 ***
## execs        65.4650     2.1463   30.50   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.64 on 37347 degrees of freedom
## Multiple R-squared:  0.02431,    Adjusted R-squared:  0.02428 
## F-statistic: 930.4 on 1 and 37347 DF,  p-value: < 2.2e-16
#iii. The slope says that one additional execution is associated with about 65.47 more murders. This does not suggest deterrence: counties with many murders are also the counties most likely to carry out executions, so the positive slope reflects reverse causality and omitted factors rather than a causal effect of executions on murders.
#iv.
min_executions <- min(countymurders_1996$execs)
min_predicted_murders <- coef(model)[1] + coef(model)[2] * min_executions
cat("Smallest number of murders predicted:", min_predicted_murders, "\n")
## Smallest number of murders predicted: 6.838201
# What is the residual for a county with zero executions and zero murders?
predicted_zero_execs <- predict(model, newdata = data.frame(execs = 0))
zero_exec_zero_murders_residual <- 0 - predicted_zero_execs  # residual = actual - fitted
cat("Residual for a county with zero executions and zero murders:", zero_exec_zero_murders_residual, "\n")
## Residual for a county with zero executions and zero murders: -6.838201
#v. Simple regression establishes only a correlation between murders and executions; it cannot determine causation, so it cannot be used to establish a deterrent effect of capital punishment.

#Chapter3

#5.i. In the model GPA = ß0 + ß1study + ß2sleep + ß3work + ß4leisure + u, does it make sense to hold sleep, work, and leisure fixed while changing study?
#It does not make sense: the four activities must sum to 168 hours for every student, so increasing study necessarily reduces the time spent on at least one of the other activities.
#ii. The model violates Assumption MLR.3 (no perfect collinearity): study + sleep + work + leisure = 168 is an exact linear relationship among the regressors, so OLS cannot separately estimate all four coefficients (see the sketch below).
#iii. Drop one activity (say, leisure) and estimate GPA = ß0 + ß1study + ß2sleep + ß3work + u. MLR.3 is then satisfied, and ß1 measures the effect of an extra hour of study with that hour taken away from leisure.
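# A minimal sketch (simulated data; all names hypothetical) of the MLR.3 failure described in
# part (ii): when study + sleep + work + leisure must equal 168, lm() cannot estimate all four
# coefficients and reports NA for the redundant one.
set.seed(1)
n <- 50
study   <- runif(n, 10, 40)
sleep   <- runif(n, 40, 60)
work    <- runif(n, 0, 30)
leisure <- 168 - study - sleep - work        # exact linear dependence
gpa     <- 2 + 0.02 * study + rnorm(n, sd = 0.2)
summary(lm(gpa ~ study + sleep + work + leisure))  # leisure's coefficient comes back NA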
#10.
#i. If x1 is highly correlated with x2 and x3, and x2 and x3 have large partial effects on y, the simple-regression estimate ß1(tilde) and the multiple-regression estimate ß1(hat) are likely to be very different: leaving x2 and x3 out creates substantial omitted-variable bias in ß1(tilde).
#ii. If x1 is almost uncorrelated with x2 and x3, ß1(tilde) and ß1(hat) should be similar, regardless of how correlated x2 and x3 are with each other: with little correlation between x1 and the omitted variables, there is little omitted-variable bias.
#iii. If x1 is highly correlated with x2 and x3, and x2 and x3 have small partial effects on y, we would expect se(ß1(tilde)) to be smaller: multicollinearity inflates se(ß1(hat)), while omitting x2 and x3 adds little to the error variance because their effects are small (see the simulation sketch below).
#iv. If x1 is almost uncorrelated with x2 and x3, x2 and x3 have large partial effects on y, and x2 and x3 are highly correlated with each other, we would expect se(ß1(hat)) to be smaller: including x2 and x3 greatly reduces the error variance without creating collinearity with x1.
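# Simulation sketch of part (iii) (simulated data; all names hypothetical): when x1 is highly
# correlated with x2 and x3 but their partial effects are small, the simple-regression se tends
# to be smaller than the multiple-regression se, because multicollinearity inflates the latter.
set.seed(42)
n  <- 500
x2 <- rnorm(n)
x3 <- x2 + rnorm(n, sd = 0.1)               # x2 and x3 highly correlated
x1 <- 0.9 * x2 + rnorm(n, sd = 0.3)         # x1 highly correlated with x2 (and so with x3)
y  <- 1 + 0.5 * x1 + 0.05 * x2 + 0.05 * x3 + rnorm(n)  # small partial effects for x2, x3
se_simple   <- summary(lm(y ~ x1))$coefficients["x1", "Std. Error"]
se_multiple <- summary(lm(y ~ x1 + x2 + x3))$coefficients["x1", "Std. Error"]
c(se_simple = se_simple, se_multiple = se_multiple)   # expect se_simple < se_multiple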

#Chapter3, C8

library(wooldridge)
data("discrim")
#i. Find the average values of prpblck and income, along with their standard deviations.
# Without na.rm = TRUE, these all return NA because prpblck and income have missing observations.
mean_prpblck <- mean(discrim$prpblck, na.rm = TRUE)
sd_prpblck <- sd(discrim$prpblck, na.rm = TRUE)
mean_income <- mean(discrim$income, na.rm = TRUE)
sd_income <- sd(discrim$income, na.rm = TRUE)
cat("Average prpblck:", mean_prpblck, "\n")
cat("Standard deviation of prpblck:", sd_prpblck, "\n")
cat("Average income:", mean_income, "\n")
cat("Standard deviation of income:", sd_income, "\n\n")
cat("Units of measurement: prpblck is the proportion of the zip code's population that is black (a number between 0 and 1, not a percentage),\n")
## Units of measurement: prpblck is the proportion of the zip code's population that is black (a number between 0 and 1, not a percentage),
cat("and income is median family income in the zip code, measured in dollars.\n\n")
## and income is median family income in the zip code, measured in dollars.
#ii. Estimate the model psoda = B0 + B1prpblck + B2income + u by OLS
model <- lm(psoda ~ prpblck + income, data = discrim)
summary(model)
## 
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29401 -0.05242  0.00333  0.04231  0.44322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.563e-01  1.899e-02  50.354  < 2e-16 ***
## prpblck     1.150e-01  2.600e-02   4.423 1.26e-05 ***
## income      1.603e-06  3.618e-07   4.430 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08611 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06422,    Adjusted R-squared:  0.05952 
## F-statistic: 13.66 on 2 and 398 DF,  p-value: 1.835e-06
#The coefficient on prpblck is 0.115 (1.150e-01), the estimated change in psoda for a one-unit increase in prpblck, holding income fixed. Since prpblck runs from 0 to 1, a one-unit increase means moving from a zip code with no black residents to an all-black zip code, raising the predicted soda price by about 11.5 cents. Equivalently, a 0.10 (10 percentage point) increase in prpblck is associated with about a 1.15-cent higher price; relative to prices of roughly a dollar, this is modest but not trivial.
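# Follow-up sketch for part (ii): the predicted psoda gap between a zip code with
# prpblck = 0.10 and one with prpblck = 0, income held fixed (about $0.0115, i.e., 1.15 cents).
price_gap <- coef(model)["prpblck"] * 0.10
cat("Predicted price difference for a 0.10 higher prpblck:", price_gap, "\n")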
#iii. Compare the estimate from part (ii) with the simple regression estimate from psoda on prpblck. (The multiple-regression estimate, 0.115, is larger than the simple-regression estimate below, 0.065: controlling for income matters because prpblck and income are correlated.)
simple_model <- lm(psoda ~ prpblck, data = discrim)
summary(simple_model)
## 
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30884 -0.05963  0.01135  0.03206  0.44840 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.03740    0.00519  199.87  < 2e-16 ***
## prpblck      0.06493    0.02396    2.71  0.00702 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0881 on 399 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.01808,    Adjusted R-squared:  0.01561 
## F-statistic: 7.345 on 1 and 399 DF,  p-value: 0.007015
#iv. Estimate the model log(psoda) = B0 + B1prpblck + B2log(income) + u
log_model <- lm(log(psoda) ~ prpblck + log(income), data = discrim)
summary(log_model)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33563 -0.04695  0.00658  0.04334  0.35413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.79377    0.17943  -4.424 1.25e-05 ***
## prpblck      0.12158    0.02575   4.722 3.24e-06 ***
## log(income)  0.07651    0.01660   4.610 5.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0821 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06809,    Adjusted R-squared:  0.06341 
## F-statistic: 14.54 on 2 and 398 DF,  p-value: 8.039e-07
# Estimated percentage change in psoda when prpblck increases by 0.20 (i.e., by 20 percentage points)
percentage_change <- exp(coef(log_model)[2] * 0.20) * 100 - 100
# (the simple log approximation, 100 * 0.12158 * 0.20, gives about 2.43%)
cat("Estimated percentage change in psoda for a 0.20 increase in prpblck:", percentage_change, "\n\n")
## Estimated percentage change in psoda for a 0.20 increase in prpblck: 2.46141
#v. Add the variable prppov to the regression in part (iv)
model_with_prppov <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
summary(model_with_prppov)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32218 -0.04648  0.00651  0.04272  0.35622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.46333    0.29371  -4.982  9.4e-07 ***
## prpblck      0.07281    0.03068   2.373   0.0181 *  
## log(income)  0.13696    0.02676   5.119  4.8e-07 ***
## prppov       0.38036    0.13279   2.864   0.0044 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08137 on 397 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.08696,    Adjusted R-squared:  0.08006 
## F-statistic:  12.6 on 3 and 397 DF,  p-value: 6.917e-08
#vi. Find the correlation between log(income) and prppov
# use = "complete.obs" drops the rows with missing values that previously made this NA
correlation_log_income_prppov <- cor(discrim$lincome, discrim$prppov, use = "complete.obs")
cat("Correlation between lincome and prppov:", correlation_log_income_prppov, "\n\n")
#vii. Evaluate the following statement
cat("The statement 'Because lincome and prppov are so highly correlated, they have no business being in the same regression.'\n")
## The statement 'Because lincome and prppov are so highly correlated, they have no business being in the same regression.'
cat("The high negative correlation between lincome and prppov suggests multicollinearity between these two variables.\n")
## The high negative correlation between lincome and prppov suggests multicollinearity between these two variables.
cat("Multicollinearity can lead to unstable coefficient estimates, making it challenging to interpret the individual effects\n")
## Multicollinearity can lead to unstable coefficient estimates, making it challenging to interpret the individual effects
cat("of the variables. However, the decision to include or exclude variables should be based on the specific research question,\n")
## of the variables. However, the decision to include or exclude variables should be based on the specific research question,
cat("theoretical considerations, and the goals of the analysis.\n")
## theoretical considerations, and the goals of the analysis.
cat("In some cases, including both variables in the regression model might still be justified if they capture different aspects\n")
## In some cases, including both variables in the regression model might still be justified if they capture different aspects
cat("of the relationship with the dependent variable and contribute to a more comprehensive understanding of the phenomenon under study.\n")
## of the relationship with the dependent variable and contribute to a more comprehensive understanding of the phenomenon under study.
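# Optional diagnostic sketch (assumes the 'car' package is installed; output not shown):
# variance inflation factors quantify how much the lincome/prppov correlation inflates the
# coefficient variances in the part (v) regression.
library(car)  # install.packages("car") first if needed
vif(model_with_prppov)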

#Chapter4_3

# Given coefficients and standard errors
coeff_log_sales <- 0.321
se_log_sales <- 0.216
coeff_profmarg <- 0.50
se_profmarg <- 0.46
#i. Estimated percentage-point change in rdintens for a 10% increase in sales:
# sales enters in logs, so a 10% increase raises log(sales) by about 0.10, and
# rdintens changes by about 0.321 * 0.10 = 0.032 percentage points (a small effect).
point_change_rdintens <- coeff_log_sales * 0.10
#ii. Test for log(sales) coefficient
t_stat_log_sales <- coeff_log_sales / se_log_sales
p_value_log_sales <- 2 * (1 - pt(abs(t_stat_log_sales), df = 29))  # two-tailed test
cat("b) p-value for the test on log(sales) coefficient:", p_value_log_sales, "\n")
## b) p-value for the test on log(sales) coefficient: 0.1480413
cat("   (At 5% level):", ifelse(p_value_log_sales < 0.05, "Reject H0", "Fail to reject H0"), "\n")
##    (At 5% level): Fail to reject H0
cat("   (At 10% level):", ifelse(p_value_log_sales < 0.10, "Reject H0", "Fail to reject H0"), "\n")
##    (At 10% level): Fail to reject H0
#iii. Interpretation of the coefficient on profmarg
cat("c) Coefficient on profmarg:", coeff_profmarg, "\n")
## c) Coefficient on profmarg: 0.5
cat("   Holding sales fixed, a one percentage point increase in the profit margin is associated with a 0.5 percentage point increase in R&D intensity; with t = 1.09 (below), the estimate is not statistically significant.\n")
##    Holding sales fixed, a one percentage point increase in the profit margin is associated with a 0.5 percentage point increase in R&D intensity; with t = 1.09 (below), the estimate is not statistically significant.
#iv. Test for profmarg coefficient
t_stat_profmarg <- coeff_profmarg / se_profmarg
p_value_profmarg <- 2 * (1 - pt(abs(t_stat_profmarg), df = 29))  # two-tailed test
cat("d) p-value for the test on profmarg coefficient:", p_value_profmarg, "\n")
## d) p-value for the test on profmarg coefficient: 0.2860082
cat("   (At 5% level):", ifelse(p_value_profmarg < 0.05, "Reject H0", "Fail to reject H0"), "\n")
##    (At 5% level): Fail to reject H0
cat("   (At 10% level):", ifelse(p_value_profmarg < 0.10, "Reject H0", "Fail to reject H0"), "\n")
##    (At 10% level): Fail to reject H0
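# Equivalent check using critical values instead of p-values (sketch; output not shown):
# compare |t| with qt(0.975, 29) ~ 2.05 at the 5% level and qt(0.95, 29) ~ 1.70 at the 10% level.
cat("t for log(sales):", t_stat_log_sales, "| t for profmarg:", t_stat_profmarg, "\n")
cat("Two-tailed critical values:", qt(0.975, df = 29), "(5%),", qt(0.95, df = 29), "(10%)\n")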

#Chapter4_C8

library(wooldridge)
data("k401ksubs")
#i. How many single-person households are there in the data set?
single_person_households <- subset(k401ksubs, fsize == 1)
num_single_person_households <- nrow(single_person_households)
cat("Number of single-person households:", num_single_person_households, "\n\n")
## Number of single-person households: 2017
#ii. Use OLS to estimate the model: nettfa = B0 + B1inc + B2age + u
model <- lm(nettfa ~ inc + age, data = single_person_households)
summary(model)
## 
## Call:
## lm(formula = nettfa ~ inc + age, data = single_person_households)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -179.95  -14.16   -3.42    6.03 1113.94 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -43.03981    4.08039 -10.548   <2e-16 ***
## inc           0.79932    0.05973  13.382   <2e-16 ***
## age           0.84266    0.09202   9.158   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared:  0.1193, Adjusted R-squared:  0.1185 
## F-statistic: 136.5 on 2 and 2014 DF,  p-value: < 2.2e-16
# Interpret the slope coefficients
cat("Interpretation of slope coefficients (nettfa and inc are both measured in $1,000s):\n")
## Interpretation of slope coefficients (nettfa and inc are both measured in $1,000s):
cat("B1 (inc): holding age fixed, $1,000 more annual income is associated with about $799 more net financial wealth.\n")
## B1 (inc): holding age fixed, $1,000 more annual income is associated with about $799 more net financial wealth.
cat("B2 (age): holding income fixed, one more year of age is associated with about $843 more net financial wealth.\n")
## B2 (age): holding income fixed, one more year of age is associated with about $843 more net financial wealth.
cat("Neither sign is surprising: wealth accumulates with both income and age.\n\n")
## Neither sign is surprising: wealth accumulates with both income and age.
#iii. Does the intercept from the regression in part (ii) have an interesting meaning? Explain.
cat("The intercept (B0) represents the estimated net financial wealth (nettfa) when both inc and age are zero.\n")
## The intercept (B0) represents the estimated net financial wealth (nettfa) when both inc and age are zero.
cat("In this context, it may not have a meaningful interpretation, as having zero income and age is not practically meaningful.\n\n")
## In this context, it may not have a meaningful interpretation, as having zero income and age is not practically meaningful.
#iv. Find the p-value for the test H0: B₂ = 1 against H₁: B₂ < 1. Do you reject H0 at the 1% significance level?
# Note: the p-value reported by summary() is for H0: B₂ = 0, so we build the t statistic for B₂ = 1 by hand.
b2 <- summary(model)$coefficients["age", "Estimate"]
se_b2 <- summary(model)$coefficients["age", "Std. Error"]
t_stat_b2 <- (b2 - 1) / se_b2  # (0.84266 - 1) / 0.09202
p_value_b2 <- pt(t_stat_b2, df = df.residual(model))  # one-sided, left tail
cat("t statistic for H0: B₂ = 1 vs H₁: B₂ < 1:", round(t_stat_b2, 3), "\n")
## t statistic for H0: B₂ = 1 vs H₁: B₂ < 1: -1.71
cat("One-sided p-value:", round(p_value_b2, 3), "\n")
## One-sided p-value: 0.044
cat("At the 1% significance level, we reject H0 only if the p-value is below 0.01.\n")
## At the 1% significance level, we reject H0 only if the p-value is below 0.01.
if (p_value_b2 < 0.01) {
  cat("We reject H0; there is evidence that B₂ is less than 1.\n\n")
} else {
  cat("We do not reject H0 at the 1% level (we would at the 5% level, since p is about 0.044).\n\n")
}
## We do not reject H0 at the 1% level (we would at the 5% level, since p is about 0.044).
#v. If you do a simple regression of nettfa on inc, is the estimated coefficient on inc much different from the estimate in part (ii)? Why or why not?
simple_model <- lm(nettfa ~ inc, data = single_person_households)
summary(simple_model)
## 
## Call:
## lm(formula = nettfa ~ inc, data = single_person_households)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -185.12  -12.85   -4.85    1.78 1112.66 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -10.5709     2.0607   -5.13 3.18e-07 ***
## inc           0.8207     0.0609   13.48  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.59 on 2015 degrees of freedom
## Multiple R-squared:  0.08267,    Adjusted R-squared:  0.08222 
## F-statistic: 181.6 on 1 and 2015 DF,  p-value: < 2.2e-16
cat("Comparison of the estimated coefficient on inc:\n")
## Comparison of the estimated coefficient on inc:
cat("The estimated coefficient on inc in the simple regression is compared to the estimate in part (ii).\n")
## The estimated coefficient on inc in the simple regression is compared to the estimate in part (ii).
cat("Differences may arise due to the inclusion of age in the multiple regression model, which may affect\n")
## Differences may arise due to the inclusion of age in the multiple regression model, which may affect
cat("the relationship between nettfa and inc. The context and goals of the analysis will determine\n")
## the relationship between nettfa and inc. The context and goals of the analysis will determine
cat("whether the inclusion of age improves the model.\n")
## whether the inclusion of age improves the model.
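# One way to back up this claim (sketch; output not shown): check how weakly inc and age are
# correlated among the single-person households.
cor(single_person_households$inc, single_person_households$age)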

#Chapter5_5

library(wooldridge)
data('econmath')
library(ggplot2)
data <- econmath$score  # course scores, measured in percentage points
# Fit a normal distribution
mu <- mean(data)
sigma <- sd(data)
x <- seq(min(data), max(data), length = 100)
y <- dnorm(x, mean = mu, sd = sigma)
# Plot the histogram and the fitted normal distribution
# after_stat(density) and linewidth replace the deprecated ..density.. and size aesthetics
hist_plot <- ggplot() +
  geom_histogram(aes(x = data, y = after_stat(density)), bins = 30, fill = "lightyellow", color = "black") +
  geom_line(aes(x = x, y = y), color = "purple", linewidth = 1) +
  labs(title = "Histogram and Fitted Normal Distribution",
       x = "Score",
       y = "Density") +
  theme_minimal()
print(hist_plot)

# i. Probability that "score" exceeds 100 using the normal distribution
prob_exceed_100 <- 1 - pnorm(100, mean = mu, sd = sigma)
cat("i. Probability that 'score' exceeds 100 using the normal distribution:", prob_exceed_100, "\n")
## i. Probability that 'score' exceeds 100 using the normal distribution: 0.02044288
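# A direct check of the right tail (sketch): course scores cannot exceed 100, so the sample
# frequency of score > 100 is exactly 0, while the fitted normal assigns it probability ~0.02.
# That mismatch is one concrete way the normal approximation fails in the tails.
empirical_prob_100 <- mean(data > 100, na.rm = TRUE)
cat("Empirical P(score > 100):", empirical_prob_100, "\n")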
# ii. Assess the fit in the left tail: the Q-Q plot bends away from the reference line in the tails, and the Shapiro-Wilk test below rejects normality decisively.
qqnorm(data)
qqline(data)

shapiro.test(data)  # Perform a Shapiro-Wilk test for normality
## 
##  Shapiro-Wilk normality test
## 
## data:  data
## W = 0.96973, p-value = 2.454e-12

#Chapter5_C1

data("wage1")
wage_data <- wage1
# (i) Estimate the equation wage = b0 + b1educ + b2exper + b3tenure + u.
model_level <- lm(wage ~ educ + exper + tenure, data = wage_data)
residuals_level <- residuals(model_level)
summary(model_level)
## 
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6068 -1.7747 -0.6279  1.1969 14.6536 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.87273    0.72896  -3.941 9.22e-05 ***
## educ         0.59897    0.05128  11.679  < 2e-16 ***
## exper        0.02234    0.01206   1.853   0.0645 .  
## tenure       0.16927    0.02164   7.820 2.93e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared:  0.3064, Adjusted R-squared:  0.3024 
## F-statistic: 76.87 on 3 and 522 DF,  p-value: < 2.2e-16
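# Part (i) also asks for a histogram of the level-model residuals; a parallel sketch to the
# log-level histogram below:
hist(residuals_level, main = "Histogram of Residuals (Level Model)", col = "lightblue", border = "black")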
# (ii) Estimate the log-level equation log(wage) = b0 + b1educ + b2exper + b3tenure + u.
model_log_level <- lm(log(wage) ~ educ + exper + tenure, data = wage_data)
residuals_log_level <- residuals(model_log_level)
# Plot histogram of residuals for the log-level model
hist(residuals_log_level, main = "Histogram of Residuals (Log-Level Model)", col = "pink", border = "black")

summary(model_log_level)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.05802 -0.29645 -0.03265  0.28788  1.42809 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.284360   0.104190   2.729  0.00656 ** 
## educ        0.092029   0.007330  12.555  < 2e-16 ***
## exper       0.004121   0.001723   2.391  0.01714 *  
## tenure      0.022067   0.003094   7.133 3.29e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared:  0.316,  Adjusted R-squared:  0.3121 
## F-statistic: 80.39 on 3 and 522 DF,  p-value: < 2.2e-16
# (iii) Q-Q plots for normality assessment
par(mfrow = c(2, 2))  # Set up a 2x2 grid for Q-Q plots

qqnorm(residuals_level, main = "Q-Q Plot - Level-Level Model")
qqline(residuals_level)

qqnorm(residuals_log_level, main = "Q-Q Plot - Log-Level Model")
qqline(residuals_log_level)

# Reset the plotting layout
par(mfrow = c(1, 1))

# (iii) Conclusion from the Q-Q plots: the log-level residuals track the normal reference line
# much more closely than the level residuals, so the normality assumption (MLR.6) is closer to
# being satisfied with log(wage) as the dependent variable.

#Chapter6_3

library(wooldridge)
data("rdchem")
#i. At what point does the marginal effect of sales on rdintens become negative?
model_quad <- lm(rdintens ~ sales + I(sales^2), data = rdchem)  # sales is in millions of dollars
sales_turning_point <- -coef(model_quad)["sales"] / (2 * coef(model_quad)["I(sales^2)"])
cat("The marginal effect of sales on rdintens becomes negative when sales exceeds", sales_turning_point, "million dollars\n")
## The marginal effect of sales on rdintens becomes negative when sales exceeds 21636.45 million dollars
# (about $21.6 billion, near the upper end of the sample's sales distribution)
#ii. Would you keep the quadratic term in the model? Explain.
cat("The quadratic term is only marginally significant (t = -1.86, p = 0.0725 in the salesbil regression below),\n")
## The quadratic term is only marginally significant (t = -1.86, p = 0.0725 in the salesbil regression below),
cat("and the turning point lies near the top of the sample's sales range. Keeping it is defensible if we expect the effect of sales to level off; dropping it costs little fit.\n")
## and the turning point lies near the top of the sample's sales range. Keeping it is defensible if we expect the effect of sales to level off; dropping it costs little fit.
#iii. Define salesbil as sales measured in billions of dollars
rdchem$salesbil <- rdchem$sales / 1000
model_salesbil <- lm(rdintens ~ salesbil + I(salesbil^2), data = rdchem)
summary(model_salesbil)
## 
## Call:
## lm(formula = rdintens ~ salesbil + I(salesbil^2), data = rdchem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1418 -1.3630 -0.2257  1.0688  5.5808 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.612512   0.429442   6.084 1.27e-06 ***
## salesbil       0.300571   0.139295   2.158   0.0394 *  
## I(salesbil^2) -0.006946   0.003726  -1.864   0.0725 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.788 on 29 degrees of freedom
## Multiple R-squared:  0.1484, Adjusted R-squared:  0.08969 
## F-statistic: 2.527 on 2 and 29 DF,  p-value: 0.09733
# (iv) For the purpose of reporting the results, which equation do you prefer? The two fits are
# identical up to rescaling, so the choice is about readability: the salesbil equation is easier
# to report, since 0.301 and -0.0069 are far more digestible than 3.0e-04 and -6.9e-09.
# First Model
model1 <- lm(rdintens ~ sales + I(sales^2), data = rdchem)

# Second Model
model2 <- lm(rdintens ~ salesbil + I(salesbil^2), data = rdchem)
# Compare Adjusted R-squared
adjusted_r_squared_model1 <- summary(model1)$adj.r.squared
adjusted_r_squared_model2 <- summary(model2)$adj.r.squared
# Compare Coefficients
coefficients_model1 <- coef(model1)
coefficients_model2 <- coef(model2)

# Compare Residuals (optional, for additional diagnostics)
residuals_model1 <- residuals(model1)
residuals_model2 <- residuals(model2)
# Decision
if (adjusted_r_squared_model1 == adjusted_r_squared_model2) {
  preference <- "Both models have the same adjusted R-squared."
} else if (adjusted_r_squared_model1 > adjusted_r_squared_model2) {
  preference <- "Model 1 is preferred based on adjusted R-squared."
} else {
  preference <- "Model 2 is preferred based on adjusted R-squared."
}

# Report coefficients from both models
report_model1 <- coef(model1)
report_model2 <- coef(model2)
# Print results
cat("Adjusted R-squared - Model 1:", adjusted_r_squared_model1, "\n")
## Adjusted R-squared - Model 1: 0.08969224
cat("Adjusted R-squared - Model 2:", adjusted_r_squared_model2, "\n")
## Adjusted R-squared - Model 2: 0.08969224
cat("Preference:", preference, "\n")
## Preference: Both models have the same adjusted R-squared.
cat("\nCoefficients - Model 1:\n")
## 
## Coefficients - Model 1:
print(report_model1)
##   (Intercept)         sales    I(sales^2) 
##  2.612512e+00  3.005713e-04 -6.945939e-09
# Print Coefficients - Model 2
cat("\nCoefficients - Model 2:\n")
## 
## Coefficients - Model 2:
print(report_model2)
##   (Intercept)      salesbil I(salesbil^2) 
##   2.612512085   0.300571301  -0.006945939
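# Sketch verifying the rescaling algebra behind the identical fits: dividing sales by 1,000
# multiplies the linear coefficient by 1,000 and the quadratic coefficient by 1,000^2, while
# fitted values, t statistics, and R-squared are unchanged.
all.equal(unname(coef(model2)["salesbil"]), unname(coef(model1)["sales"]) * 1000)
all.equal(unname(coef(model2)["I(salesbil^2)"]), unname(coef(model1)["I(sales^2)"]) * 1e6)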

#Chapter6_10

library(wooldridge)
data("meapsingle")
#i. If you are a policy maker trying to estimate the causal effect of per-student spending on math test performance,
# explain why the first equation is more relevant than the second. What is the estimated effect of a 10% increase in expenditures per student?
cat("The first equation is more relevant because it provides a direct estimate of the effect of per-student spending on math test performance.\n")
## The first equation is more relevant because it provides a direct estimate of the effect of per-student spending on math test performance.
cat("In the first equation, the coefficient on lexppp represents the estimated effect of a one-unit increase in log expenditures per student.\n")
## In the first equation, the coefficient on lexppp represents the estimated effect of a one-unit increase in log expenditures per student.
cat("To get the estimated effect of a 10% increase, you can multiply the coefficient by 0.1.\n")
## To get the estimated effect of a 10% increase, you can multiply the coefficient by 0.1.
#ii. Does adding read4 to the regression have strange effects on coefficients and statistical significance other than B1exppp? Yes: the lexppp coefficient falls from 9.01 (significant) to 1.93 (insignificant), free loses its significance, and lmedinc becomes large, negative, and significant. This is what we expect when we condition on a variable (read4) that is highly correlated with both the outcome and the regressors.
# iii. How would you explain to someone with only basic knowledge of regression why, in this case, you prefer the equation with the smaller adjusted R-squared? Adjusted R-squared only measures how well we predict math4, not whether a coefficient answers our question. Because read4 is determined by the same inputs as math4, putting it on the right-hand side soaks up the spending effect; the first equation, despite fitting worse, gives the more credible estimate of what spending does.
# Fit the first equation
model1 <- lm(math4 ~ lexppp + free + lmedinc + pctsgle, data = meapsingle)
summary(model1)
## 
## Call:
## lm(formula = math4 ~ lexppp + free + lmedinc + pctsgle, data = meapsingle)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.259  -7.422   1.615   7.274  49.524 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.48949   59.23781   0.413   0.6797    
## lexppp       9.00648    4.03530   2.232   0.0266 *  
## free        -0.42164    0.07064  -5.969 9.27e-09 ***
## lmedinc     -0.75221    5.35816  -0.140   0.8885    
## pctsgle     -0.27444    0.16086  -1.706   0.0894 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.59 on 224 degrees of freedom
## Multiple R-squared:  0.4716, Adjusted R-squared:  0.4622 
## F-statistic: 49.98 on 4 and 224 DF,  p-value: < 2.2e-16
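# Worked number for part (i) (sketch): a 10% spending increase changes log(exppp) by
# log(1.1), about 0.0953, so math4 is predicted to rise by about 9.01 * 0.0953 = 0.86 points.
effect_10pct <- coef(model1)["lexppp"] * log(1.1)
cat("Estimated effect of a 10% spending increase on math4:", effect_10pct, "points\n")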
# Fit the second equation
model2 <- lm(math4 ~ lexppp + free + lmedinc + pctsgle + read4, data = meapsingle)
summary(model2)
## 
## Call:
## lm(formula = math4 ~ lexppp + free + lmedinc + pctsgle + read4, 
##     data = meapsingle)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -29.5690  -4.6729  -0.0349   4.3644  24.8425 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 149.37870   41.70293   3.582 0.000419 ***
## lexppp        1.93215    2.82480   0.684 0.494688    
## free         -0.06004    0.05399  -1.112 0.267297    
## lmedinc     -10.77595    3.75746  -2.868 0.004529 ** 
## pctsgle      -0.39663    0.11143  -3.559 0.000454 ***
## read4         0.66656    0.04249  15.687  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.012 on 223 degrees of freedom
## Multiple R-squared:  0.7488, Adjusted R-squared:  0.7432 
## F-statistic: 132.9 on 5 and 223 DF,  p-value: < 2.2e-16
# Comparing the two equations side by side, the key change is the lexppp coefficient: 9.01
# (t = 2.23) without read4 versus 1.93 (t = 0.68) with read4, which is why the first equation
# is the one to report for the policy question.

#Chapter6_C3

library(wooldridge)
data("wage2")
# Fit the regression model
model <- lm(log(wage) ~ educ + exper + educ * exper, data = wage2)

#i. Show that the return to another year of education, holding exper fixed, is b1 + b3*exper.
# Differentiating log(wage) = b0 + b1*educ + b2*exper + b3*(educ*exper) with respect to educ gives b1 + b3*exper.
coefs <- coef(model)
return_to_education_at_1 <- coefs["educ"] + coefs["educ:exper"] * 1  # evaluated at exper = 1 for illustration
cat("Return to another year of education at exper = 1:", return_to_education_at_1, "\n")
## Return to another year of education at exper = 1: 0.04725277
# (ii) State the null hypothesis that the return to education does not depend on the level of exper.
# The null hypothesis (H0) is: B3 = 0 (No interaction effect)
# Appropriate alternative: The return to education depends on the level of exper (B3 ≠ 0)

#iii. Test the null hypothesis in (ii) against the stated alternative
summary(model)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + educ * exper, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.88558 -0.24553  0.03558  0.26171  1.28836 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.949455   0.240826  24.704   <2e-16 ***
## educ         0.044050   0.017391   2.533   0.0115 *  
## exper       -0.021496   0.019978  -1.076   0.2822    
## educ:exper   0.003203   0.001529   2.095   0.0365 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3923 on 931 degrees of freedom
## Multiple R-squared:  0.1349, Adjusted R-squared:  0.1321 
## F-statistic: 48.41 on 3 and 931 DF,  p-value: < 2.2e-16
#iv. Return to education when exper = 10: theta = b1 + 10*b3.
B_educ <- coef(model)["educ"]
B_interact <- coef(model)["educ:exper"]
theta <- B_educ + 10 * B_interact  # 0.044050 + 10 * 0.003203
cat("Theta:", theta, "\n")
## Theta: 0.07608
# se(theta) is not the standard error of any single coefficient above. The usual trick is to
# reparameterize so theta becomes the coefficient on educ, then read its se off the summary:
model_repar <- lm(log(wage) ~ educ + I(exper - 10) + educ:I(exper - 10), data = wage2)
se_theta <- summary(model_repar)$coefficients["educ", "Std. Error"]
confidence_interval <- theta + c(-1.96, 1.96) * se_theta
cat("95% Confidence Interval:", confidence_interval, "\n")
# (The refit gives se(theta) of roughly 0.008, so the interval is roughly 0.061 to 0.091.)

#Chapter 6 C12.

install.packages("wooldridge")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(wooldridge)
data("k401ksubs")
subsetted_data <- k401ksubs[k401ksubs$fsize == 1, ]  # (not used below; the analysis is run on the full sample, as the outputs show)
#i.What is the youngest age of people in this sample? How many people are at that age?
youngest_age <- min(k401ksubs$age)
count_youngest_age <- sum(k401ksubs$age == youngest_age)
cat("Youngest age:", youngest_age, "\n")
## Youngest age: 25
cat("Number of people at the youngest age:", count_youngest_age, "\n")
## Number of people at the youngest age: 211
#ii. What is the literal interpretation of b2? By itself, is it of much interest?
model <- lm(nettfa ~ inc + age + agesq , data = k401ksubs)
summary(model)
## 
## Call:
## lm(formula = nettfa ~ inc + age + agesq, data = k401ksubs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -504.93  -18.61   -3.08    9.96 1464.26 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.680388  10.080986   0.464    0.642    
## inc          0.978252   0.025489  38.379  < 2e-16 ***
## age         -2.231489   0.489712  -4.557 5.26e-06 ***
## agesq        0.037722   0.005621   6.710 2.05e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 58.18 on 9271 degrees of freedom
## Multiple R-squared:  0.1731, Adjusted R-squared:  0.1728 
## F-statistic: 646.8 on 3 and 9271 DF,  p-value: < 2.2e-16
#Answer to ii: literally, b2 = -2.23 says that, holding inc and agesq fixed, one more year of age lowers nettfa by about $2,230 (nettfa is measured in $1,000s). By itself this is of little interest, because age and agesq cannot change separately.
#iii. The full marginal effect of age is b2 + 2*b3*age = -2.231 + 2(0.0377)age: it is negative at young ages and turns positive at roughly age 29.6 (computed below), after which predicted net financial assets rise with age.
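# Follow-up sketch: with b2 < 0 and b3 > 0, predicted nettfa declines in age only up to the
# turning point -b2 / (2*b3); from the estimates above that is about age 29.6.
turning_age <- -coef(model)["age"] / (2 * coef(model)["agesq"])
cat("Predicted nettfa is minimized at age:", turning_age, "\n")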