library(knitr) # Optional helpers: suppressPackageStartupMessages({ library(dplyr) library(broom) })
# Q1 data ----
# Five paired observations. The table caption used for this data labels X as
# education (years) and Y as earnings ($1,000s).
# Each assignment is on its own line: R cannot parse several statements on one
# line unless they are separated by `;`.
X <- c(4, 3, 5, 4, 7)
Y <- c(85, 65, 100, 120, 93)
dat <- data.frame(X, Y)
# Render the Q1 data as a table.
# The caption is delimited with plain ASCII double quotes; typographic quotes
# are not string delimiters in R and make the call unparseable.
kable(dat, caption = "Q1 Data: Education (Years) and Earnings ($1,000s)")
# Q1: OLS slope and intercept from the closed-form formulas ----
xbar <- mean(X)
ybar <- mean(Y)
Sxx <- sum((X - xbar)^2)             # sum of squared deviations of X
Sxy <- sum((X - xbar) * (Y - ybar))  # sum of cross-products of deviations

# Simple-regression estimates: slope = Sxy / Sxx, intercept = ybar - slope * xbar.
beta1_hat_formula <- Sxy / Sxx
beta0_hat_formula <- ybar - beta1_hat_formula * xbar

# cat() does not append a newline, so every line carries its own "\n";
# without them the whole report is printed as one run-on line.
cat("\nQ1 (formula):\n")
cat(sprintf("x̄ = %.3f, ȳ = %.3f\n", xbar, ybar))
cat(sprintf("Sxx = %.3f, Sxy = %.3f\n", Sxx, Sxy))
cat(sprintf("β̂1 = Sxy/Sxx = %.6f\n", beta1_hat_formula))
cat(sprintf("β̂0 = ȳ − β̂1·x̄ = %.6f\n", beta0_hat_formula))
# Q1: cross-check the hand-computed estimates against lm() ----
fit <- lm(Y ~ X, data = dat)

cat("\nQ1 (lm check):\n")
print(coef(fit))

# The full summary is printed only as confirmation of the coefficients above.
cat("\nsummary (for confirmation):\n")
print(summary(fit))
# Q1: final answer ----
# Each assignment sits on its own line. With everything on one line, the first
# `#` comment swallowed the q1_beta0 assignment and the cat() call, so
# q1_beta0 was never defined and nothing was printed.
q1_beta1 <- as.numeric(beta1_hat_formula)  # ≈ 3.9347826
q1_beta0 <- as.numeric(beta0_hat_formula)  # ≈ 74.5
cat(sprintf("\nQ1 ANSWER: β̂0 ≈ %.3f β̂1 ≈ %.3f\n", q1_beta0, q1_beta1))
# Q2: interpretations of the reported regression results ----
# writeLines() prints one element per line, so no explicit "\n" is needed.
# All strings use ASCII double quotes; typographic quotes do not parse in R.
writeLines(c(
  "",
  "Q2) Interpretations",
  "(a)",
  "* β̂0 = 123,456: Predicted votes if a candidate gave 0 campaign speeches.",
  "* β̂1 = 3,245: Each additional campaign speech is associated with +3,245 votes, on average.",
  "* R^2 = 0.72: 72% of the variance in votes is explained by the number of speeches.",
  "  Goodness of fit: High for a single-predictor model; speeches explain much of votes’ variation.",
  "(b)",
  "* β̂0 = 52: Predicted life expectancy when satisfaction score is 0 (baseline intercept).",
  "* β̂1 = 1.35: A 1-point increase in satisfaction is associated with +1.35 years life expectancy, on average.",
  "* R^2 = 0.68: 68% of cross-country life expectancy variation is explained by satisfaction.",
  "  Goodness of fit: Strong; model explains a large share of variation.",
  "(c)",
  "* β̂0 = 124,000: Predicted crimes when population is 0 (intercept; not directly meaningful but needed for the line).",
  "* β̂1 = 26: Each additional person is associated with +26 crimes, on average (units imply total crimes per county).",
  "* R^2 = 0.34: 34% of variation in crime is explained by population.",
  "  Goodness of fit: Moderate/weak; much variation remains unexplained, suggesting other key predictors matter."
))
# Q3: interpreting Pearson correlation coefficients ----
# NOTE(review): the heading had lost its question label (it read ") Pearson r
# interpretations"); "Q3" is restored from the Q1/Q2 sequence. Confirm.
writeLines(c(
  "",
  "Q3) Pearson r interpretations",
  "(a) r = -0.065: Very weak negative linear relationship (essentially ~0).",
  "(b) r = -0.83 : Strong negative linear relationship.",
  "(c) r = 0.70 : Strong positive linear relationship."
))
# Q4: limitations of Pearson's r ----
# NOTE(review): the heading had lost its question label (it read ") Limitation
# of Pearson r"); "Q4" is restored from the question sequence. Confirm.
writeLines(c(
  "",
  "Q4) Limitation of Pearson r",
  "- r only measures linear association: it can miss strong nonlinear relationships.",
  "- It is sensitive to outliers and does not imply causation.",
  "- Proper inference for r relies on assumptions (e.g., approximate bivariate normality for tests/CI)."
))
# Q5: Gauss–Markov assumptions ----
# NOTE(review): the heading had lost its question label (it read ") Gauss–Markov
# assumptions"); "Q5" is restored from the question sequence. Confirm.
writeLines(c(
  "",
  "Q5) Gauss–Markov assumptions",
  "1) Linearity in parameters (model is linear in β).",
  "2) Random sampling (and correctly specified model).",
  "3) No perfect multicollinearity (predictors not exact linear combos).",
  "4) Zero conditional mean: E[u | X] = 0 (exogeneity; no omitted-variable bias).",
  "5) Homoskedasticity: Var(u | X) = constant.",
  "   (Normality is not required for BLUE, but helps with small-sample inference.)"
))