library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
# Load the first workbook into a tibble; the path is relative to the
# working directory.
enrollment <- read_excel(path = "unenrolled_data.xlsx")
Dependent variable: enrollment (1 = individual enrolls, 0 = individual does not enroll). Independent variables: region (NW = 1, N = 2, NE = 3, C = 4, W = 5, S = 6, E = 7) and race (Hispanic = 1, non-Hispanic = 0).
# Add a 0/1 `hispanic` indicator: 1 when the `race` text contains
# "hispanic", 0 otherwise. grepl() is case-sensitive and matches
# substrings.
# NOTE(review): a value such as "non-hispanic" would also be coded 1 --
# verify against the actual `race` values.
enrollment_race <- mutate(
  enrollment,
  hispanic = ifelse(
    test = grepl(pattern = "hispanic", x = race),
    yes = 1,
    no = 0
  )
)

# Ordinary least squares fit of `enrollee` on `region_code`.
# NOTE(review): `hispanic` is created above but is not a predictor here,
# and `region_code` enters as one numeric slope rather than as a
# categorical factor -- confirm both are intended.
enrollment_model <- lm(
  formula = enrollee ~ region_code,
  data = enrollment_race
)

# Print coefficients, residual summary and fit statistics.
summary(enrollment_model)
##
## Call:
## lm(formula = enrollee ~ region_code, data = enrollment_race)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8175 0.1825 0.1946 0.2189 0.2553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.82967 0.04114 20.169 <2e-16 ***
## region_code -0.01214 0.01215 -0.998 0.319
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4056 on 481 degrees of freedom
## Multiple R-squared: 0.002068, Adjusted R-squared: -6.25e-06
## F-statistic: 0.997 on 1 and 481 DF, p-value: 0.3185
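The coefficient on region_code is not statistically significant (p = 0.319), and the adjusted R-squared is -6.25e-06, so region code explains essentially none of the variation in enrollment.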
# Load the second workbook; the path is relative to the working directory.
enrollment_2 <- read_excel(path = "enrolled.xlsx")

# Add a 0/1 `hispanic` indicator: 1 when the `race` text contains
# "hispanic", 0 otherwise. grepl() is case-sensitive and matches
# substrings.
# NOTE(review): a value such as "non-hispanic" would also be coded 1 --
# verify against the actual `race` values.
enrollment_race_2 <- mutate(
  enrollment_2,
  hispanic = ifelse(
    test = grepl(pattern = "hispanic", x = race),
    yes = 1,
    no = 0
  )
)

# Ordinary least squares fit of the `hispanic` indicator on `region_code`.
# NOTE(review): despite the object name, this model has a single
# predictor, and `region_code` enters as one numeric slope -- confirm
# that is intended.
enrolled_multiple <- lm(
  formula = hispanic ~ region_code,
  data = enrollment_race_2
)

# Print coefficients, residual summary and fit statistics.
summary(enrolled_multiple)
##
## Call:
## lm(formula = hispanic ~ region_code, data = enrollment_race_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5153 -0.4452 -0.4101 0.5548 0.5899
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.39253 0.05503 7.133 5e-12 ***
## region_code 0.01755 0.01631 1.076 0.283
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4975 on 380 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.003038, Adjusted R-squared: 0.0004144
## F-statistic: 1.158 on 1 and 380 DF, p-value: 0.2826
# Draw lm diagnostic plot number 3 (the scale-location plot) for the
# second model.
plot(x = enrolled_multiple, which = 3)