library(readxl)
library(readr)
library(mosaic)
library(ggplot2)
library(dplyr)
# Import the Colleges workbook from the local Downloads folder.
Colleges <- read_excel(path = "C:/Users/casey/Downloads/Colleges.xlsx")

# Count how many rows carry each REGION code.
tally(Colleges$REGION)
## X
## 1 2 3 4 5 6 7 8 9
## 215 628 560 335 1054 433 130 536 92
# Build the analysis data set from the raw Colleges table:
#   * Setting  - text label derived from the numeric LOCALE code
#   * Region   - text label derived from the numeric REGION code
#   * FirstGen - multiplied by 100 (presumably proportion -> percent; confirm)
# and keep only the rows where HighDeg > 2.
df1 <- Colleges %>%
  mutate(
    # case_when() treats an NA condition as "no match", so missing codes are
    # sent to NA explicitly; this keeps the NA-in / NA-out behaviour that the
    # earlier nested ifelse() chain had.
    Setting = case_when(
      is.na(LOCALE) ~ NA_character_,
      LOCALE > 40 ~ "Rural",
      LOCALE %in% c(31, 32, 33) ~ "Town",
      LOCALE %in% c(21, 22, 23) ~ "Suburb",
      LOCALE %in% c(11, 12, 13) ~ "City",
      # NOTE(review): -3 looks like a "not available" sentinel; confirm that
      # labelling it "Town" is intended.
      LOCALE == -3 ~ "Town",
      # Any other code falls through to the placeholder label.
      TRUE ~ "."
    ),
    Region = case_when(
      is.na(REGION) ~ NA_character_,
      REGION == 1 ~ "NewEngland",
      REGION == 2 ~ "MidEast",
      REGION == 3 ~ "GreatLakes",
      REGION == 4 ~ "Plains",
      REGION == 5 ~ "Southeast",
      REGION == 6 ~ "Southwest",
      REGION == 7 ~ "RockyMountains",
      REGION == 8 ~ "FarWest",
      REGION == 9 ~ "Others",
      # Any other code falls through to the placeholder label.
      TRUE ~ "."
    ),
    FirstGen = 100 * FirstGen
  ) %>%
  # Rows with a missing HighDeg are dropped as well (filter() drops NA).
  filter(HighDeg > 2)
# Simple linear regression of MDEARNP10 on FirstGen, fitted on df1.
model1 <- lm(
  formula = MDEARNP10 ~ FirstGen,
  data = df1
)

# Show the coefficient table and the overall fit statistics.
summary(model1)
##
## Call:
## lm(formula = MDEARNP10 ~ FirstGen, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41793 -7956 -2405 5501 76190
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73611.41 995.42 73.95 <2e-16 ***
## FirstGen -635.93 26.66 -23.85 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13870 on 1964 degrees of freedom
## Multiple R-squared: 0.2246, Adjusted R-squared: 0.2242
## F-statistic: 568.8 on 1 and 1964 DF, p-value: < 2.2e-16
##tally(df1)
# Strongly penalise scientific notation when numbers are formatted
# (session-wide option; it stays in effect after this line).
options(scipen = 999)

# Scatter plot of MDEARNP10 against FirstGen with a fitted linear trend line.
ggplot(data = df1, aes(x = FirstGen, y = MDEARNP10)) +
  geom_point() +
  stat_smooth(method = "lm") +
  labs(
    title = "Schools with more First Gen Students Produce Lower Income Graduates?",
    x = "First Gen Students (Percent of Student Body)",
    y = "Median Earnings After 10 Years (USD)"
  )
