library(readxl)
library(readr)
library(mosaic)
library(ggplot2)
library(dplyr)
# Load the college-level spreadsheet into a tibble.
# NOTE(review): absolute, machine-specific path; the script will only run
# where this exact file location exists.
Colleges <- read_excel(path = "C:/Users/casey/Downloads/Colleges.xlsx")

# Frequency table of the raw REGION codes, as a quick check of which codes
# occur before they are relabelled.
tally(Colleges$REGION)
## X
##    1    2    3    4    5    6    7    8    9 
##  215  628  560  335 1054  433  130  536   92
# Build the analysis table from Colleges:
#   * Setting  - text label derived from the numeric LOCALE code
#   * Region   - text label derived from the numeric REGION code
#   * FirstGen - original FirstGen multiplied by 100
#                (assumes the raw column is a proportion - TODO confirm)
# For both labels, a missing (NA) code stays NA and any code not listed
# below becomes ".".
df1 <- Colleges %>%
  mutate(
    Setting = case_when(
      is.na(LOCALE) ~ NA_character_,
      LOCALE > 40 ~ "Rural",
      LOCALE %in% c(31, 32, 33) ~ "Town",
      LOCALE %in% c(21, 22, 23) ~ "Suburb",
      LOCALE %in% c(11, 12, 13) ~ "City",
      # NOTE(review): -3 is labelled "Town" here; confirm this is intended
      # and not a placeholder/missing-value code that should be excluded.
      LOCALE == -3 ~ "Town",
      TRUE ~ "."
    ),
    Region = case_when(
      is.na(REGION) ~ NA_character_,
      REGION == 1 ~ "NewEngland",
      REGION == 2 ~ "MidEast",
      REGION == 3 ~ "GreatLakes",
      REGION == 4 ~ "Plains",
      REGION == 5 ~ "Southeast",
      REGION == 6 ~ "Southwest",
      REGION == 7 ~ "RockyMountains",
      REGION == 8 ~ "FarWest",
      REGION == 9 ~ "Others",
      TRUE ~ "."
    ),
    FirstGen = 100 * FirstGen
  )
# Keep only the rows whose HighDeg code is greater than 2.
# NOTE(review): presumably this restricts the sample to bachelor's-and-above
# institutions - confirm against the data dictionary.
df1 <- df1 %>%
  filter(HighDeg > 2)

# Simple linear regression of MDEARNP10 on FirstGen (already scaled by 100),
# followed by the usual coefficient / fit summary.
model1 <- lm(formula = MDEARNP10 ~ FirstGen, data = df1)
summary(model1)
## 
## Call:
## lm(formula = MDEARNP10 ~ FirstGen, data = df1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -41793  -7956  -2405   5501  76190 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 73611.41     995.42   73.95   <2e-16 ***
## FirstGen     -635.93      26.66  -23.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13870 on 1964 degrees of freedom
## Multiple R-squared:  0.2246, Adjusted R-squared:  0.2242 
## F-statistic: 568.8 on 1 and 1964 DF,  p-value: < 2.2e-16
##tally(df1)
# Strongly penalise scientific notation so numbers are printed in fixed
# notation (this is a session-wide option and stays set after the plot).
options(scipen = 999)

# Scatter plot of MDEARNP10 against FirstGen with a fitted linear-model
# smoother overlaid. Label fixes: "Afer" -> "After" on the y axis, and the
# title now says "First Gen" to match the variable actually plotted on x.
ggplot(data = df1, aes(x = FirstGen, y = MDEARNP10)) +
  geom_point() +
  stat_smooth(method = lm) +
  labs(
    title = "Schools with more First Gen Students Produce Lower Income Graduates?",
    x = "First Gen Students (Percent of Student Body)",
    y = "Median Earnings After 10 Years (USD)"
  )