#PROBLEM 2-1 Omitted Variable Bias #install libraries

library(dplyr)

## 
## 次のパッケージを付け加えます: 'dplyr'

## 以下のオブジェクトは 'package:stats' からマスクされています:
## 
##     filter, lag

## 以下のオブジェクトは 'package:base' からマスクされています:
## 
##     intersect, setdiff, setequal, union

library(modelsummary)
library(estimatr)
library(AER) # for CASchools data set

## 要求されたパッケージ car をロード中です

## 要求されたパッケージ carData をロード中です

## 
## 次のパッケージを付け加えます: 'car'

## 以下のオブジェクトは 'package:dplyr' からマスクされています:
## 
##     recode

## 要求されたパッケージ lmtest をロード中です

## 要求されたパッケージ zoo をロード中です

## 
## 次のパッケージを付け加えます: 'zoo'

## 以下のオブジェクトは 'package:base' からマスクされています:
## 
##     as.Date, as.Date.numeric

## 要求されたパッケージ sandwich をロード中です

## 要求されたパッケージ survival をロード中です

#data import

data("CASchools")

create the variable

CASchools <- CASchools %>% 
  mutate(STR= students/teachers) %>% 
  mutate(score = (read + math)/2)

formula for the omitted variable regression

ovreg <- english ~ STR

regression model using the formula from ‘ovreg’

model_ovreg <- lm_robust(ovreg, data = CASchools)

tabulate with function ‘modelsummary’

modelsummary(model_ovreg, stars = TRUE)

	(1)
+ p < 0.1, * p < 0.05, p < 0.01, * p < 0.001
(Intercept)	-19.854*
	(8.698)
STR	1.814***
	(0.448)
Num.Obs.	420
R2	0.035
R2 Adj.	0.033
AIC	3623.0
BIC	3635.1
RMSE	17.94

long regression:

long <- score ~ STR + english

formula for the omitted variable regression from (a)

ovreg <- english ~ STR

fill in the below to complete the function

ovb2 <- function(data, treatment, omitted_variable) {
  
# regression model using the formula from 'long'
model_long <- lm_robust(long, data = CASchools)
  

# regression model using the formula from 'ovreg' from (a)

model_ovreg <- lm_robust(ovreg, data = CASchools)


# extract the coefficients

lambda <- coef[["coefficients"]][omitted_variable]
pi <- coef[["coefficients"]][omitted_variable]

# compute the omitted variable bias
bias <- lambda * pi

paste("The magnitude of the bias is", bias)
}

#Problem 2-2 #formulas for model

model_list <- list()

reg1 <- score ~ STR
reg2 <- score ~ STR + english
reg3 <- score ~ STR + english + lunch 
reg4 <- score ~ STR + english + calworks
reg5 <- score ~ STR + english + lunch + calworks

model1 <- lm_robust(reg1, data = CASchools)
model2 <- lm_robust(reg2, data = CASchools)
model3 <- lm_robust(reg3, data = CASchools)
model4 <- lm_robust(reg4, data = CASchools)
model5 <- lm_robust(reg5, data = CASchools)

model_list[["model1"]] <- model1
model_list[["model2"]] <- model2
model_list[["model3"]] <- model3
model_list[["model4"]] <- model4
model_list[["model5"]] <- model5

modelsummary(model_list,  stars=TRUE)

	model1	model2	model3	model4	model5
+ p < 0.1, * p < 0.05, p < 0.01, * p < 0.001
(Intercept)	698.933***	686.032***	700.150***	697.999***	700.392***
	(10.400)	(8.754)	(5.591)	(6.946)	(5.559)
STR	-2.280***	-1.101*	-0.998***	-1.308***	-1.014***
	(0.521)	(0.434)	(0.271)	(0.340)	(0.270)
english		-0.650***	-0.122***	-0.488***	-0.130***
		(0.031)	(0.033)	(0.030)	(0.036)
lunch			-0.547***		-0.529***
			(0.024)		(0.038)
calworks				-0.790***	-0.048
				(0.069)	(0.060)
Num.Obs.	420	420	420	420	420
R2	0.051	0.426	0.775	0.629	0.775
R2 Adj.	0.049	0.424	0.773	0.626	0.773
AIC	3650.5	3441.1	3051.0	3260.7	3052.4
BIC	3662.6	3457.3	3071.2	3280.9	3076.6
RMSE	18.54	14.41	9.04	11.60	9.03

print(modelsummary)

## function (models, output = getOption("modelsummary_output", default = "default"), 
##     fmt = getOption("modelsummary_fmt", default = 3), estimate = getOption("modelsummary_estimate", 
##         default = "estimate"), statistic = getOption("modelsummary_statistic", 
##         default = "std.error"), vcov = getOption("modelsummary_vcov", 
##         default = NULL), conf_level = getOption("modelsummary_conf_level", 
##         default = 0.95), exponentiate = getOption("modelsummary_exponentiate", 
##         default = FALSE), stars = getOption("modelsummary_stars", 
##         default = FALSE), shape = getOption("modelsummary_shape", 
##         default = term + statistic ~ model), coef_map = getOption("modelsummary_coef_map", 
##         default = NULL), coef_omit = getOption("modelsummary_coef_omit", 
##         default = NULL), coef_rename = getOption("modelsummary_coef_rename", 
##         default = FALSE), gof_map = getOption("modelsummary_gof_map", 
##         default = NULL), gof_omit = getOption("modelsummary_gof_omit", 
##         default = NULL), gof_function = getOption("modelsummary_gof_function", 
##         default = NULL), group_map = getOption("modelsummary_group_map", 
##         default = NULL), add_columns = getOption("modelsummary_add_columns", 
##         default = NULL), add_rows = getOption("modelsummary_add_rows", 
##         default = NULL), align = getOption("modelsummary_align", 
##         default = NULL), notes = getOption("modelsummary_notes", 
##         default = NULL), title = getOption("modelsummary_title", 
##         default = NULL), escape = getOption("modelsummary_escape", 
##         default = TRUE), ...) 
## {
##     checkmate::assert(checkmate::check_formula(shape), checkmate::check_choice(shape, 
##         c("cbind", "rbind", "rcollapse")), checkmate::check_null(shape))
##     if (isTRUE(checkmate::check_choice(shape, c("rbind", "rcollapse")))) {
##         out <- modelsummary_rbind(models, output = output, fmt = fmt, 
##             estimate = estimate, statistic = statistic, vcov = vcov, 
##             conf_level = conf_level, exponentiate = exponentiate, 
##             stars = stars, coef_map = coef_map, coef_omit = coef_omit, 
##             coef_rename = coef_rename, gof_map = gof_map, gof_omit = gof_omit, 
##             gof_function = gof_function, add_columns = add_columns, 
##             add_rows = add_rows, align = align, shape = shape, 
##             group_map = NULL, notes = notes, title = title, escape = escape, 
##             ...)
##         return(out)
##     }
##     dots <- list(...)
##     if (!settings_equal("function_called", "modelsummary_rbind")) {
##         settings_init(settings = list(function_called = "modelsummary"))
##     }
##     checkmate::assert_string(gof_omit, null.ok = TRUE)
##     tmp <- sanitize_output(output)
##     output_format <- tmp$output_format
##     output_factory <- tmp$output_factory
##     output_file <- tmp$output_file
##     tmp <- get_span_cbind(models, shape)
##     shape <- tmp$shape
##     models <- tmp$models
##     span_cbind <- tmp$span_cbind
##     sanitize_escape(escape)
##     sanity_ellipsis(vcov, ...)
##     models <- sanitize_models(models, ...)
##     vcov <- sanitize_vcov(vcov, models, ...)
##     number_of_models <- max(length(models), length(vcov))
##     estimate <- sanitize_estimate(estimate, number_of_models)
##     exponentiate <- sanitize_exponentiate(exponentiate, number_of_models)
##     shape <- sanitize_shape(shape)
##     statistic <- sanitize_statistic(statistic, shape, conf_level)
##     gof_map <- sanitize_gof_map(gof_map)
##     fmt <- sanitize_fmt(fmt, calling_function = "modelsummary")
##     sanity_group_map(group_map)
##     conf_level <- sanitize_conf_level(conf_level, estimate, statistic)
##     sanity_coef(coef_map, coef_rename, coef_omit)
##     sanity_stars(stars)
##     sanity_align(align, estimate = estimate, statistic = statistic, 
##         stars = stars)
##     checkmate::assert_function(gof_function, null.ok = TRUE)
##     if (!any(grepl("conf", c(estimate, statistic)))) {
##         conf_level <- NULL
##     }
##     modelsummary_model_labels <- getOption("modelsummary_model_labels", 
##         default = "(arabic)")
##     if (is.null(names(models))) {
##         checkmate::assert_choice(modelsummary_model_labels, choices = c("model", 
##             "arabic", "letters", "roman", "(arabic)", "(letters)", 
##             "(roman)"))
##         if (modelsummary_model_labels == "model") {
##             model_names <- paste("Model", 1:number_of_models)
##         }
##         else if (grepl("arabic", modelsummary_model_labels)) {
##             model_names <- as.character(1:number_of_models)
##         }
##         else if (grepl("letters", modelsummary_model_labels)) {
##             model_names <- LETTERS[1:number_of_models]
##         }
##         else if (grepl("roman", modelsummary_model_labels)) {
##             model_names <- as.character(utils::as.roman(1:number_of_models))
##         }
##         if (grepl("\\(", modelsummary_model_labels)) {
##             model_names <- sprintf("(%s)", model_names)
##         }
##     }
##     else {
##         model_names <- names(models)
##     }
##     model_names <- pad(model_names, output_format = output_format)
##     if (!settings_equal("function_called", "modelsummary_rbind") && 
##         all(grepl("^\\(\\d+\\)$", model_names)) && identical(output_format, 
##         "kableExtra")) {
##         model_names <- paste0("&nbsp;", model_names)
##     }
##     msl <- get_list_of_modelsummary_lists(models = models, conf_level = conf_level, 
##         vcov = vcov, gof_map = gof_map, gof_function = gof_function, 
##         shape = shape, coef_rename = coef_rename, output_format = output_format, 
##         ...)
##     names(msl) <- model_names
##     if (identical(output_format, "modelsummary_list")) {
##         if (length(msl) == 1) {
##             return(msl[[1]])
##         }
##         else {
##             return(msl)
##         }
##     }
##     est <- list()
##     for (i in seq_along(msl)) {
##         tmp <- format_estimates(est = msl[[i]]$tidy, fmt = fmt, 
##             estimate = estimate[[i]], estimate_label = names(estimate)[1], 
##             statistic = statistic, vcov = vcov[[i]], conf_level = conf_level, 
##             stars = stars, shape = shape, group_name = shape$group_name, 
##             exponentiate = exponentiate[[i]], ...)
##         tmp <- map_estimates(tmp, coef_rename = coef_rename, 
##             coef_map = coef_map, coef_omit = coef_omit, group_map = group_map)
##         colnames(tmp)[match("modelsummary_value", colnames(tmp))] <- model_names[i]
##         est[[model_names[i]]] <- tmp
##     }
##     term_order <- unique(unlist(lapply(est, function(x) x$term)))
##     statistic_order <- unique(unlist(lapply(est, function(x) x$statistic)))
##     bycols <- c(list(c(shape$group_name, "group", "term", "statistic")), 
##         lapply(est, colnames))
##     bycols <- Reduce(intersect, bycols)
##     f <- function(x, y) {
##         merge(x, y, all = TRUE, sort = FALSE, by = bycols)
##     }
##     est <- Reduce(f, est)
##     if (is.null(shape$group_name)) {
##         idx <- paste(est$term, est$statistic)
##         if (anyDuplicated(idx) > 0) {
##             candidate_groups <- sapply(msl, function(x) colnames(x[["tidy"]]))
##             candidate_groups <- unlist(candidate_groups)
##             candidate_groups <- setdiff(candidate_groups, c("term", 
##                 "type", "estimate", "std.error", "conf.level", 
##                 "conf.low", "conf.high", "statistic", "df.error", 
##                 "p.value"))
##             msg <- c("There are duplicate term names in the table.", 
##                 "The `shape` argument of the `modelsummary` function can be used to print related terms together. The `group_map` argument can be used to reorder, subset, and rename group identifiers. See `?modelsummary` for details.", 
##                 "You can find the group identifier to use in the `shape` argument by calling `get_estimates()` on one of your models. Candidates include:", 
##                 paste(candidate_groups, collapse = ", "))
##             insight::format_warning(msg)
##         }
##     }
##     est <- shape_estimates(est, shape, conf_level = conf_level, 
##         statistic = statistic, estimate = estimate)
##     est$part <- "estimates"
##     est <- est[, unique(c("part", names(est)))]
##     est[is.na(est)] <- ""
##     if ("term" %in% colnames(est)) {
##         if (!is.null(coef_map)) {
##             term_order <- coef_map
##         }
##         est$term <- factor(est$term, unique(term_order))
##         if ("group" %in% colnames(est)) {
##             if (!is.null(group_map)) {
##                 est$group <- factor(est$group, group_map)
##             }
##             else {
##                 est$group <- factor(est$group, unique(est$group))
##             }
##         }
##     }
##     else if ("model" %in% colnames(est)) {
##         est$model <- factor(est$model, model_names)
##     }
##     if ("statistic" %in% colnames(est)) {
##         est$statistic <- factor(est$statistic, statistic_order)
##     }
##     est <- est[do.call(order, as.list(est)), ]
##     if (is.numeric(coef_omit)) {
##         coef_omit <- unique(round(coef_omit))
##         if (length(unique(sign(coef_omit))) != 1) {
##             insight::format_error("All elements of `coef_omit` must have the same sign.")
##         }
##         if (!"term" %in% shape$lhs) {
##             msg <- "`term` must be on the left-hand side of the `shape` formula when `coef_omit` is a numeric vector."
##             insight::format_error(msg)
##         }
##         term_idx <- paste(est$group, est$term)
##         if (max(abs(coef_omit)) > length(unique(term_idx))) {
##             msg <- sprintf("There are %s unique terms, but `coef_omit` tried to omit more than that.", 
##                 length(term_idx))
##             insight::format_error(msg)
##         }
##         idx <- !term_idx %in% unique(term_idx)[abs(coef_omit)]
##         if (any(coef_omit > 0)) {
##             est <- est[idx, , drop = FALSE]
##         }
##         else {
##             est <- est[!idx, , drop = FALSE]
##         }
##     }
##     if (isTRUE(checkmate::check_character(coef_rename, names = "unnamed"))) {
##         nterms <- length(unique(est$term))
##         if (length(coef_rename) != nterms) {
##             msg <- "`coef_rename` must be a named character vector or an unnamed vector of length %s"
##             insight::format_error(sprintf(msg, nterms))
##         }
##         dict <- stats::setNames(coef_rename, as.character(unique(est$term)))
##         tmp <- replace_dict(as.character(est$term), dict)
##         est$term <- factor(tmp, unique(tmp))
##     }
##     if (is.null(shape$group_name)) {
##         est[["group"]] <- NULL
##     }
##     cols <- intersect(colnames(est), c("term", shape$group_name, 
##         "model", "statistic"))
##     for (col in cols) {
##         est[[col]] <- as.character(est[[col]])
##     }
##     if (all(est[["term"]] == "cross")) {
##         est[["term"]] <- NULL
##     }
##     gof <- list()
##     for (i in seq_along(msl)) {
##         if (is.data.frame(msl[[i]]$glance)) {
##             gof[[i]] <- format_gof(msl[[i]]$glance, fmt = fmt, 
##                 gof_map = gof_map, ...)
##             colnames(gof[[i]])[2] <- model_names[i]
##         }
##         else {
##             gof[[i]] <- NULL
##         }
##     }
##     f <- function(x, y) merge(x, y, all = TRUE, sort = FALSE, 
##         by = "term")
##     gof <- Reduce(f, gof)
##     gof <- map_gof(gof, gof_omit, gof_map)
##     tab <- bind_est_gof(est, gof)
##     tab[is.na(tab)] <- ""
##     if (is.null(coef_map) && isFALSE(coef_rename) && "term" %in% 
##         colnames(tab) && !identical(output_format, "rtf")) {
##         idx <- tab$part != "gof"
##         tab$term <- ifelse(idx, gsub("::", " = ", tab$term), 
##             tab$term)
##         tab$term <- ifelse(idx, gsub(":", " × ", tab$term), 
##             tab$term)
##     }
##     hrule <- match("gof", tab$part)
##     if (!is.na(hrule) && !is.null(add_rows) && !is.null(attr(add_rows, 
##         "position"))) {
##         hrule <- hrule + sum(length(attr(add_rows, "position") < 
##             hrule)) - 1
##     }
##     if (is.na(hrule)) {
##         hrule <- NULL
##     }
##     stars_note <- settings_get("stars_note")
##     if (isTRUE(stars_note) && !isFALSE(stars) && !any(grepl("\\{stars\\}", 
##         c(estimate, statistic)))) {
##         stars_note <- make_stars_note(stars, output_format = output_format, 
##             output_factory = output_factory)
##         if (is.null(notes)) {
##             notes <- stars_note
##         }
##         else {
##             notes <- c(stars_note, notes)
##         }
##     }
##     if (settings_equal("function_called", "modelsummary_rbind")) {
##         tab <- redundant_labels(tab, "term")
##     }
##     if (!identical(output_format, "dataframe") && !settings_equal("function_called", 
##         "modelsummary_rbind")) {
##         dups <- c("term", "model", shape$group_name)
##         for (d in dups) {
##             tab <- redundant_labels(tab, d)
##         }
##         tab$statistic <- tab$part <- NULL
##         if ("term" %in% colnames(tab)) 
##             colnames(tab)[colnames(tab) == "term"] <- "       "
##         if ("model" %in% colnames(tab)) 
##             colnames(tab)[colnames(tab) == "model"] <- "         "
##         if ("group" %in% colnames(tab)) 
##             colnames(tab)[colnames(tab) == "model"] <- "          "
##     }
##     if (length(unique(tab$group)) == 1) {
##         tab$group <- NULL
##     }
##     tmp <- setdiff(shape$lhs, c("model", "term"))
##     if (length(tmp) == 0) {
##         tab$group <- NULL
##     }
##     else if (!identical(output_format, "dataframe")) {
##         colnames(tab)[colnames(tab) == "group"] <- "        "
##     }
##     if (is.null(align)) {
##         n_stub <- sum(grepl("^ *$", colnames(tab))) + sum(colnames(tab) %in% 
##             c(" ", shape$group_name))
##         align <- paste0(strrep("l", n_stub), strrep("c", ncol(tab) - 
##             n_stub))
##         if (isTRUE(checkmate::check_data_frame(add_columns))) {
##             align <- paste0(align, strrep("c", ncol(add_columns)))
##         }
##     }
##     flag <- any(sapply(models, inherits, c("marginaleffects", 
##         "comparisons", "marginalmeans")))
##     if (isTRUE(flag)) {
##         colnames(tab) <- gsub("^value$", " ", colnames(tab))
##         colnames(tab) <- gsub("^contrast_", "", colnames(tab))
##         colnames(tab) <- gsub("^contrast$", " ", colnames(tab))
##     }
##     for (i in seq_along(tab)) {
##         tab[[i]] <- gsub("\\(\\s*\\)", "", tab[[i]])
##         tab[[i]] <- gsub("\\(\\\\num\\{NA\\}\\)", "", tab[[i]])
##         tab[[i]] <- gsub("\\[,\\s*\\]", "", tab[[i]])
##         tab[[i]] <- gsub("\\[\\\\num\\{NA\\}, \\\\num\\{NA\\}\\]", 
##             "", tab[[i]])
##     }
##     idx <- apply(tab, 1, function(x) any(x != ""))
##     tab <- tab[idx, ]
##     out <- factory(tab, align = align, fmt = fmt, hrule = hrule, 
##         gof_idx = hrule, notes = notes, output = output, title = title, 
##         add_rows = add_rows, add_columns = add_columns, escape = escape, 
##         output_factory = output_factory, output_format = output_format, 
##         output_file = output_file, ...)
##     out <- set_span_cbind(out, span_cbind)
##     if (settings_equal("function_called", "modelsummary_rbind")) {
##         return(out)
##     }
##     else if (!is.null(output_file) || isTRUE(output == "jupyter") || 
##         (isTRUE(output == "default") && settings_equal("output_default", 
##             "jupyter"))) {
##         settings_rm()
##         return(invisible(out))
##     }
##     else {
##         settings_rm()
##         return(out)
##     }
## }
## <bytecode: 0x0000020345b60468>
## <environment: namespace:modelsummary>

Percent on public income assistance(calworks) would be unnecessary. This is becasse the coefficient of STR variable increases once calworks added in the model4.

Homework2

Asahi

2025-05-20