This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
htv <- read_excel("htv.xlsx")
educ_range <- range(htv$educ)
educ_range
## [1] 6 20
twelfth_only <- sum(htv$educ == 12)
percent_twelfth_only <- (twelfth_only / nrow(htv)) * 100
percent_twelfth_only
## [1] 41.62602
mean_educ <- mean(htv$educ)
mean_fatheduc <- mean(htv$fatheduc)
mean_motheduc <- mean(htv$motheduc)
mean_parents <- (mean_fatheduc + mean_motheduc) / 2
mean_educ
## [1] 13.0374
mean_fatheduc
## [1] 12.44715
mean_motheduc
## [1] 12.17805
mean_parents
## [1] 12.3126
if (mean_educ > mean_parents) {
cat("Men have higher average education than their parents.")
} else {
cat("Parents have higher average education than their sons.")
}
## Men have higher average education than their parents.
model <- lm(educ ~ motheduc + fatheduc, data = htv)
summary(model)
##
## Call:
## lm(formula = educ ~ motheduc + fatheduc, data = htv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2898 -0.9400 -0.4037 1.1239 8.1672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.96435 0.31982 21.776 <2e-16 ***
## motheduc 0.30420 0.03193 9.528 <2e-16 ***
## fatheduc 0.19029 0.02228 8.539 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.042 on 1227 degrees of freedom
## Multiple R-squared: 0.2493, Adjusted R-squared: 0.248
## F-statistic: 203.7 on 2 and 1227 DF, p-value: < 2.2e-16
tidy(model)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.96 0.320 21.8 3.75e-89
## 2 motheduc 0.304 0.0319 9.53 8.23e-21
## 3 fatheduc 0.190 0.0223 8.54 3.95e-17
motheduc
shows the
expected change in the man’s years of education for each additional year
of his mother’s education, assuming father’s education is constant.model_abil <- lm(educ ~ motheduc + fatheduc + abil, data = htv)
summary(model_abil)
##
## Call:
## lm(formula = educ ~ motheduc + fatheduc + abil, data = htv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.407 -1.195 -0.199 1.076 7.012
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.44869 0.28954 29.180 < 2e-16 ***
## motheduc 0.18913 0.02851 6.635 4.87e-11 ***
## fatheduc 0.11109 0.01988 5.586 2.85e-08 ***
## abil 0.50248 0.02572 19.538 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.784 on 1226 degrees of freedom
## Multiple R-squared: 0.4275, Adjusted R-squared: 0.4261
## F-statistic: 305.2 on 3 and 1226 DF, p-value: < 2.2e-16
tidy(model_abil)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 8.45 0.290 29.2 1.41e-142
## 2 motheduc 0.189 0.0285 6.63 4.87e- 11
## 3 fatheduc 0.111 0.0199 5.59 2.85e- 8
## 4 abil 0.502 0.0257 19.5 3.18e- 74
\[ \text{educ} = \beta_0 + \beta_1 \cdot \text{motheduc} + \beta_2 \cdot \text{fatheduc} + \beta_3 \cdot \text{abil} + u \]
\[ \text{educ} = 8.449 + 0.189 \cdot \text{motheduc} + 0.111 \cdot \text{fatheduc} + 0.503 \cdot \text{abil} \]
abil
) is statistically
significant (p < 0.001) and positively associated
with education as it has positive coefficient:0.503 and
t-statistic: 19.54.motheduc
), (fatheduc
), and
(abil
)par(mfrow = c(2, 2))
plot(model_abil)
model_quad <- lm(educ ~ motheduc + fatheduc + abil + I(abil^2), data = htv)
summary(model_quad)
##
## Call:
## lm(formula = educ ~ motheduc + fatheduc + abil + I(abil^2), data = htv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2506 -1.1274 -0.1355 1.0223 7.0482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.240226 0.287410 28.671 < 2e-16 ***
## motheduc 0.190126 0.028096 6.767 2.03e-11 ***
## fatheduc 0.108939 0.019601 5.558 3.35e-08 ***
## abil 0.401462 0.030288 13.255 < 2e-16 ***
## I(abil^2) 0.050599 0.008304 6.093 1.48e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.758 on 1225 degrees of freedom
## Multiple R-squared: 0.4444, Adjusted R-squared: 0.4425
## F-statistic: 244.9 on 4 and 1225 DF, p-value: < 2.2e-16
tidy(model_quad)
## # A tibble: 5 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 8.24 0.287 28.7 9.51e-139
## 2 motheduc 0.190 0.0281 6.77 2.03e- 11
## 3 fatheduc 0.109 0.0196 5.56 3.35e- 8
## 4 abil 0.401 0.0303 13.3 1.42e- 37
## 5 I(abil^2) 0.0506 0.00830 6.09 1.48e- 9
# Extract coefficients
b0 <- coef(model_quad)["(Intercept)"]
b1 <- coef(model_quad)["motheduc"]
b2 <- coef(model_quad)["fatheduc"]
b3 <- coef(model_quad)["abil"]
b4 <- coef(model_quad)["I(abil^2)"]
# Find abil* where educ is minimized (assuming parents education is fixed)
abil_star <- -b3 / (2 * b4)
second_derivative <- 2 * b4
abil_star
## abil
## -3.967098
second_derivative
## I(abil^2)
## 0.101198
# Check how many individuals have abil < abil*
count_below_star <- sum(htv$abil < abil_star, na.rm = TRUE)
total_non_missing <- sum(!is.na(htv$abil))
percent_below_star <- (count_below_star / total_non_missing) * 100
percent_below_star
## [1] 1.219512
\[ \text{educ} = \beta_0 + \beta_1 \cdot \text{motheduc} + \beta_2 \cdot \text{fatheduc} + \beta_3 \cdot \text{abil} + \beta_4 \cdot \text{abil}^2 + u \]
We calculate \(abil^* = -\beta_3 /
(2\beta_4)\) to find the value of abil
that
minimizes education.
Second derivative is \(2\beta_4\). If positive, this confirms a minimum.
abil* = -3.9670977, and second derivative = 0.101198.
This confirms the point is a minimum since the second derivative > 0 and positive.
cat("Percentage of individuals with abil less than abil*: ", round(percent_below_star, 2), "%")
## Percentage of individuals with abil less than abil*: 1.22 %
ggplot(htv, aes(x = abil, y = educ)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", formula = y ~ x + I(x^2), color = "blue") +
geom_vline(xintercept = abil_star, linetype = "dashed", color = "red") +
annotate("text", x = abil_star, y = max(htv$educ, na.rm = TRUE), label = "abil*", vjust = -1, color = "red") +
labs(title = "Education vs. Ability (Quadratic Fit)",
x = "Cognitive Ability (abil)",
y = "Years of Education") +
theme_minimal()
abil
score
below abil*
.educ
and
abil
. But since most individuals have abil
> abil*
(as seen on above Visualization), we’re mostly
observing the increasing portion of the curve.# Generate sequence of abil values
abil_seq <- seq(min(htv$abil, na.rm = TRUE), max(htv$abil, na.rm = TRUE), length.out = 200)
# Set average parental education
mean_motheduc <- 12.18
mean_fatheduc <- 12.45
# Predict education
predicted_educ <- b0 + b1 * mean_motheduc + b2 * mean_fatheduc + b3 * abil_seq + b4 * abil_seq^2
# Create dataframe for plotting
plot_df <- data.frame(abil = abil_seq, predicted_educ = predicted_educ)
# Plot
ggplot(plot_df, aes(x = abil, y = predicted_educ)) +
geom_line(color = "darkgreen", linewidth = 1) +
geom_vline(xintercept = abil_star, linetype = "dashed", color = "red") +
annotate("text", x = abil_star, y = max(predicted_educ), label = "abil*", vjust = -1, color = "red") +
labs(title = "Predicted Education vs. Ability",
subtitle = "Holding motheduc = 12.18 and fatheduc = 12.45",
x = "Cognitive Ability (abil)",
y = "Predicted Years of Education") +
theme_minimal()