## To install ipeds: devtools::install_github('jbryer/ipeds')E 
## To install unzip: install.packages("utils")
library(ipeds)
library(tidyverse)
library(dplyr)
library(tidycensus)
library(knitr)
library(RODBC)
library(questionr)
library(tibble)

###Be sure to enter cencus API key ###census_api_key(enter key here)

###Get Access file path IPEDSDatabase <- odbcDriverConnect(“Driver={Microsoft Access Driver (.mdb, .accdb)};DBQ=FILE PATH HERE”)

institutioninformation <-  sqlFetch(IPEDSDatabase, "HD2021")

instituioninformation<- subset(institutioninformation, CONTROL == 1)%>%
  subset(C21SZSET %in% c(1, 2, 3, 4, 5))%>%
  select(UNITID, INSTNM, COUNTYCD, C21SZSET)


graduationinformation <-  sqlFetch(IPEDSDatabase, "GR200_21")
graduationinformation <-  graduationinformation %>%
 select(UNITID, L4GR100, L4GR150, L4GR200)

transferinformation<- sqlFetch(IPEDSDatabase, "DRVGR2021")
transferinformation<- transferinformation %>%
  select(UNITID, TRRTTOT)

retentioninformation <-  sqlFetch(IPEDSDatabase, "EF2021D")
retentioninformation <-  retentioninformation %>%
 select(UNITID, RET_PCF, RET_PCP)

ipeds<- left_join(instituioninformation, graduationinformation, by = "UNITID")
ipeds<- left_join(ipeds, transferinformation, by = "UNITID")
ipeds<- left_join(ipeds, retentioninformation)
ipeds<- na.omit(ipeds)
ipeds<- ipeds%>%
  mutate(gradtransf = .01*(TRRTTOT + L4GR150))%>%
  mutate(RET_PCF = .01*RET_PCF)%>%
  mutate(RET_PCP = .01*RET_PCP)

ipeds$COUNTYCD<-paste0("0",as.character(ipeds$COUNTYCD))


write.csv(ipeds, "ipeds.csv")
#Capture the aligned cesus data and export to an excel file
key <- load_variables(2020, "acs5", cache = TRUE)

write.csv(key, "censuskey.csv")
years<- lst(2015, 2020)

my_vars<- c(total_pop = "B01003_001")

population<-  map_dfr(years, ~ get_acs(geography = "county", variables = my_vars, year = .x, survey = "acs5", geometry = FALSE), .id = "year"  # when combining results, add id var (name of list item)
  ) %>%
  select(-moe) %>%  # shhhh
  arrange(variable, NAME)

population<- select(population, GEOID, year, estimate)

population<- population %>%
  spread(year, estimate)

population<- population %>%
  mutate("popchange" = population$'2020'-population$'2015')

population<- population %>%
  mutate("percentpopchange" = popchange/population$'2015')

population<- select(population, GEOID, popchange, percentpopchange)
#Estimated median household income in past 12 months, inflation adjusted 2020
income <- get_acs( geography = "county", variables = c(MedIncome="B19013_001"))
income<- rename.variable(income, "estimate","med_income")
income<- select(income, GEOID, med_income)


#Total in Population

HealthPro<-get_acs(geography = "county", variables = c(White="C27001A_005", Black="C27001B_005", Native="C27001C_005", Asian = "C27001D_005", Pacific="C27001E_005", Other= "C27001F_005", TwoMore="C27001G_005", WhiteNot="C27001H_005", Hispanic="C27001I_005"))
HealthPro<- HealthPro %>%
  select(GEOID, NAME, variable, estimate)
HealthPro<- HealthPro %>%
  spread(variable, estimate)
HealthPro<- HealthPro %>%
  replace_na(list(White= 0, Black=0, Native=0, Asian=0, Pacific=0, Other=0, TwoMore=0, WhiteNot=0, Hispanic=0))
HealthPro<- HealthPro %>%
  mutate("HPro" = White + Black + Native + Asian + Pacific + Other + TwoMore + WhiteNot + Hispanic)

#Total with health

HealthTot<-get_acs(geography = "county", variables = c(Whitet="C27001A_007", Blackt="C27001B_007", Nativet="C27001C_007", Asiant = "C27001D_007", Pacifict="C27001E_007", Othert= "C27001F_007", TwoMoret="C27001G_007", WhiteNott="C27001H_007", Hispanict="C27001I_007"))
HealthTot<- HealthTot %>%
  select(GEOID, NAME, variable, estimate)
HealthTot<- HealthTot %>%
  spread(variable, estimate)
HealthTot<- HealthTot %>%
  replace_na(list(Whitet= 0, Blackt=0, Nativet=0, Asiant=0, Pacifict=0, Othert=0, TwoMoret=0, WhiteNott=0, Hispanict=0))
HealthTot<- HealthTot %>%
  mutate("HTot" = Whitet + Blackt + Nativet + Asiant + Pacifict + Othert + TwoMoret + WhiteNott + Hispanict)


#Join the health variables
Health<-left_join(HealthTot, HealthPro, by = "GEOID")
Health<-rename.variable(Health, "NAME.x","County")
Health<- select(Health, GEOID, County, HPro, HTot)
Health<- Health%>%
  mutate("NoHealth"=HTot/HPro)

#Total in Population
MalePop<-get_acs(geography = "county", variables = c(WhMalePop="C23002A_004", BMalePop="C23002B_004", NMalePop="C23002C_004", AMalePop="C23002D_004", PIPop="C23002E_004", OMalePop="C23002F_004", TwoMalePop="C23002G_004", WAMalePop="C23002H_004", HMalePopl="C23002I_004"))
MalePop<- MalePop %>%
  select(GEOID, NAME, variable, estimate)
MalePop<- MalePop %>%
  spread(variable, estimate)
MalePop<- MalePop %>%
  replace_na(list(WhMalePop = 0, BMalePop = 0, NMalePop = 0, AMalePop= 0, PIPop = 0, OMalePop = 0, TwoMalePop = 0, WAMalePop = 0, HMalePopl= 0))
MalePop<- MalePop %>%
  mutate("MPop" = WhMalePop +  BMalePop +  NMalePop +  AMalePop + PIPop + OMalePop + TwoMalePop +  WAMalePop +  HMalePopl)
MalePop<- MalePop %>%
select(GEOID, NAME, MPop)


#Total Unemployed
MaleEmp<-get_acs(geography = "county", variables = c(WhMaleEmp="C23002A_008", BMaleEmp="C23002B_008", NMaleEmp="C23002C_008", AMaleEmp="C23002D_008", PIMaleEmpl="C23002E_008", OMaleEmp="C23002F_008", TwoMaleEmp="C23002G_008", WAMaleEmp="C23002H_008", HMaleEmpl="C23002I_008"))
MaleEmp<- MaleEmp %>%
  select(GEOID, NAME, variable, estimate)
MaleEmp<- MaleEmp %>%
  spread(variable, estimate)
MaleEmp<- MaleEmp %>%
  replace_na(list(WhMaleEmp = 0, BMaleEmp = 0, NMaleEmp = 0, AMaleEmp= 0, PMaleEmpl = 0, OMaleEmp = 0, TwoMaleEmp = 0, WAMaleEmp = 0, HMaleEmpl= 0))
MaleEmp<- MaleEmp %>%
  mutate("MUnEmp" = WhMaleEmp +  BMaleEmp +  NMaleEmp +  AMaleEmp + PIMaleEmpl + OMaleEmp + TwoMaleEmp +  WAMaleEmp +  HMaleEmpl)

MaleEmp<- MaleEmp %>%
select(GEOID,MUnEmp)

MaleEmp<- left_join(MalePop, MaleEmp, by = "GEOID")

MaleEmp<- MaleEmp%>%
mutate("MUnEmploy"= MUnEmp/MPop)

#Total in Population
FemalePop<-get_acs(geography = "county", variables = c(WhFemalePop="C23002A_016", BFemalePop="C23002B_016", NFemalePop="C23002C_016", AFemalePop="C23002D_016", PIPop="C23002E_016", OFemalePop="C23002F_016", TwoFemalePop="C23002G_016", WAFemalePop="C23002H_016", HFemalePopl="C23002I_016"))
FemalePop<- FemalePop %>%
  select(GEOID, NAME, variable, estimate)
FemalePop<- FemalePop %>%
  spread(variable, estimate)
FemalePop<- FemalePop %>%
  replace_na(list(WhFemalePop = 0, BFemalePop = 0, NFemalePop = 0, AFemalePop= 0, PIPop = 0, OFemalePop = 0, TwoFemalePop = 0, WAFemalePop = 0, HFemalePopl= 0))
FemalePop<- FemalePop %>%
  mutate("FPop" = WhFemalePop +  BFemalePop +  NFemalePop +  AFemalePop + PIPop + OFemalePop + TwoFemalePop +  WAFemalePop +  HFemalePopl)
FemalePop<- FemalePop %>% 
select(GEOID, NAME, FPop)

#Total Unemployed
FemaleEmp<-get_acs(geography = "county", variables = c(WhFemaleEmp="C23002A_021", BFemaleEmp="C23002B_021", NFemaleEmp="C23002C_021", AFemaleEmp="C23002D_021", PIFemaleEmpl="C23002E_021", OFemaleEmp="C23002F_021", TwoFemaleEmp="C23002G_021", WAFemaleEmp="C23002H_021", HFemaleEmpl="C23002I_021"))
FemaleEmp<- FemaleEmp %>%
  select(GEOID, NAME, variable, estimate)
FemaleEmp<- FemaleEmp %>%
  spread(variable, estimate)
FemaleEmp<- FemaleEmp %>%
  replace_na(list(WhFemaleEmp = 0, BFemaleEmp = 0, NFemaleEmp = 0, AFemaleEmp= 0, PFemaleEmpl = 0, OFemaleEmp = 0, TwoFemaleEmp = 0, WAFemaleEmp = 0, HFemaleEmpl= 0))
FemaleEmp<- FemaleEmp %>%
  mutate("FUnEmp" = WhFemaleEmp +  BFemaleEmp +  NFemaleEmp +  AFemaleEmp + PIFemaleEmpl + OFemaleEmp + TwoFemaleEmp +  WAFemaleEmp +  HFemaleEmpl)

FemaleEmp<- FemaleEmp %>%
select(GEOID,FUnEmp)

FemaleEmp<- left_join(FemalePop, FemaleEmp, by = "GEOID")

FemaleEmp<- FemaleEmp%>%
mutate("FUnEmploy"= FUnEmp/FPop)

employ<-left_join(MaleEmp, FemaleEmp, by="GEOID")
employ<- select(employ, GEOID, NAME.x, MPop, MUnEmp, FPop, FUnEmp)
employ<-rename.variable(employ, "NAME.x","County")
employ<- employ %>%
  mutate("total"= MPop+FPop)
employ<- employ %>%
  mutate("popemployed"= MUnEmp+FUnEmp)
employ<- employ %>%
  mutate ("Percent_Unemployed" = (popemployed/total))

#Total in Population
RacePop<-get_acs(geography = "county", variables = c(RacePop="B01001_001"))
RacePop<- rename.variable(RacePop, "estimate","RacePop")

#Total White
WhiteTot<-get_acs(geography = "county", variables = c(WhiteTot="B01001A_001"))
WhiteTot<-rename.variable(WhiteTot, "estimate","WhiteTot")

#Join the Race/Ethnicity
TotalWhite<-left_join(RacePop, WhiteTot, by = "GEOID")
TotalWhite<-rename.variable(TotalWhite, "NAME.x","County")
TotalWhite<- select(TotalWhite, GEOID, County, RacePop, WhiteTot)
TotalWhite<- TotalWhite%>%
mutate("PercentWhite"= WhiteTot/RacePop)

#Total Veterans
VetPop<-get_acs(geography = "county", variables = c(VetPop="B21001_002"))
VetPop<- rename.variable(VetPop, "estimate","VetPop")

#Total NonVeterans
VetNon<-get_acs(geography = "county", variables = c(VetNon="B21001_003"))
VetNon<-rename.variable(VetNon, "estimate","VetNon")

#Join the Veterans
Veteran<-left_join(VetPop, VetNon, by = "GEOID")
Veteran<-rename.variable(Veteran, "NAME.x","County")
Veteran<- select(Veteran, GEOID, County, VetNon, VetPop)
Veteran<- Veteran%>%
mutate("TotalPopV"= VetNon + VetPop)
Veteran<- Veteran%>%
mutate("PercentVet"= VetPop/TotalPopV)

#Total Pop
House<-get_acs(geography = "county", variables = c(House="B07401_001"))
House<- rename.variable(House, "estimate","House")
House<- select(House, GEOID, House)

#Total Same House
HouseSame<-get_acs(geography = "county", variables = c(HouseSame="B07401_017"))
HouseSame<-rename.variable(HouseSame, "estimate","HouseSame")
HouseSame<-  select(HouseSame, GEOID, HouseSame)

HousePercent<- left_join(HouseSame, House, by = "GEOID")

HousePercent<- 
  HousePercent%>%
  mutate("HousePercent" = HouseSame/House)
HousePercent<- select(HousePercent, GEOID, HousePercent)

#Get Married Statuses
Married<-get_acs(geography = "county", variables = c(MarriedPop="B07408_001",NeverMarried="B07408_002", Married="B07408_003", Divorced="B07408_004", Separated="B07408_005", Widowed="B07408_006"))
Married<- select(Married, GEOID, variable, estimate)

#Tranverse the data with spread
Married<- Married %>%
  spread(variable, estimate)

#Calculate Proportions and clean dataset
Married<- Married %>%
  mutate("percentnevermarried"=NeverMarried/MarriedPop)

Married<- Married %>%
  mutate("percentmarried"=Married/MarriedPop)

Married<- Married %>%
  mutate("percentdivorced"=Divorced/MarriedPop)

Married<- Married %>%
  mutate("percentseparated"=Separated/MarriedPop)

Married<- Married %>%
  mutate("percentwidowed"=Widowed/MarriedPop)

Married<- Married %>%
  mutate("percentsingle"=(1- percentmarried)+percentwidowed)

#Clean the dataset
Married<- select(Married, GEOID, percentnevermarried, percentmarried, percentdivorced, percentseparated, percentwidowed, percentsingle)
Education<-get_acs(geography = "county", variables = c(EduPop="B07409_001",LessHS="B07409_002", HSorEquiv="B07409_003", SomeCollegeAss="B07409_004", Bach="B07409_005", GradorProf="B07409_006"))
Education<- select(Education, GEOID, variable, estimate)

#Tranverse the data with spread
Education<- Education %>%
  spread(variable, estimate)

#Calculate Proportions and clean dataset
Education<- Education %>%
  mutate("PercentLessthanHS"=LessHS/EduPop)

Education<- Education %>%
  mutate("PercentHS"=(HSorEquiv+SomeCollegeAss+Bach+GradorProf)/EduPop)

Education<- Education %>%
  mutate("PercentSomeorASS"=(SomeCollegeAss+Bach+GradorProf)/EduPop)

Education<- Education %>%
  mutate("PercentBach"=(Bach+GradorProf)/EduPop)

Education<- Education %>%
  mutate("PercentGradorPro"=GradorProf/EduPop)

Education<- select(Education, GEOID, PercentLessthanHS, PercentHS, PercentSomeorASS, PercentBach, PercentGradorPro)
Tribes<-get_acs(geography = "county", variables = c(pop = "B01003_001", tribes = "B02014_002"))
Tribes<- select(Tribes, GEOID, variable, estimate)

# Transpose with spread
Tribes<- Tribes %>%
  spread(variable,estimate)

# Calculate the proportion
Tribes<- Tribes %>%
  mutate(percenttribe = tribes/pop)

Tribes<- select(Tribes, GEOID, percenttribe)
Parenting<-get_acs(geography = "county", variables = c(totkids = "B05009_001", undersixoneparent="B05009_013", sixtoseventeeenoneparent="B05009_031"))
Parenting<- select(Parenting, GEOID, variable, estimate)

#Tranverse the data with spread
Parenting<- Parenting %>%
  spread(variable, estimate)

#Calculate proportion
Parenting<- Parenting %>%
  mutate(SingleParPercent = (sixtoseventeeenoneparent + undersixoneparent)/totkids)
Parenting<- select(Parenting, GEOID, SingleParPercent)
Citizen<-get_acs(geography = "county", variables = c(totalpop= "B01003_001", totalmig = "B05011_001", notcitizen="B05011_002", naturalized="B05011_003"))
Citizen<- select(Citizen, GEOID, variable, estimate)

#Tranverse the data with spread
Citizen<- Citizen %>%
  spread(variable, estimate)

#Calculate proportion
Citizen<- Citizen %>%
  mutate(PercentNotCitizen = notcitizen/totalpop)

Citizen<- Citizen %>%
  mutate(PercentImmigrant    = totalmig/totalpop)

Citizen<- select(Citizen, GEOID, PercentNotCitizen, PercentImmigrant)
Renters<- get_acs(geography = "county", variables = c(totalh="B07013_001", renters = "B07013_003"))
Renters<- select(Renters, GEOID, variable, estimate)

#Transverse the data with spread
Renters<- Renters %>%
  spread(variable, estimate)

#Calculate proportions
Renters<- Renters %>%
  mutate(PercentRent = renters/totalh)

Renters<- select(Renters, GEOID, PercentRent)
census<- left_join(employ, Health, by = "GEOID")
census<- rename.variable(census, "GEOID.x", "GEOID")
census<- left_join(census, income, by = "GEOID")
census<- left_join(census, TotalWhite, by = "GEOID")
census<- left_join(census, Veteran, by = "GEOID")
census<- left_join(census, population, by = "GEOID")
census<- left_join(census, HousePercent, by = "GEOID")
census<- left_join(census, Married, by = "GEOID")
census<- left_join(census, Education, by =  "GEOID")
census<- left_join(census, Tribes, by = "GEOID")
census<- left_join(census, Parenting, by= "GEOID")
census<- left_join(census, Citizen, by = "GEOID")
census<- left_join(census, Renters, by = "GEOID")
census<-rename.variable(census, "County.x","County")
census<- census %>%
  mutate("WithHealth" = 1-NoHealth)
census<- select(census, GEOID, County,  Percent_Unemployed, HPro, HTot, NoHealth, WithHealth, med_income, RacePop, WhiteTot, PercentWhite, VetNon, VetPop, PercentVet, popchange, percentpopchange, HousePercent, percentnevermarried, percentmarried, percentdivorced, percentseparated, percentwidowed, percentsingle, PercentLessthanHS, PercentHS, PercentSomeorASS, PercentBach, PercentGradorPro, percenttribe, SingleParPercent, PercentNotCitizen, PercentImmigrant, PercentRent)

write.csv(census, "census.csv")
census<- read.csv("census.csv")

ipeds<- read.csv("ipeds.csv")

ipeds$COUNTYCD<- as.numeric(ipeds$COUNTYCD)


ipedsgraddata<-left_join(ipeds, census, by = c("COUNTYCD" = "GEOID"))


write.csv(ipedsgraddata, "ipedsgradmassive2.csv")

##Results may change slightly with each iteration of the model.

library(randomForest)
library(tidyverse)
library(ROCR)

set.seed(71)

gf<- read.csv('ipedsgradmassive2.csv')

gf<-  select(gf, gradtransf, Percent_Unemployed, WithHealth, med_income, PercentWhite, percentpopchange, HousePercent, percentmarried, percentdivorced, percentseparated, PercentSomeorASS, PercentBach, percenttribe, SingleParPercent, PercentNotCitizen, PercentImmigrant, PercentRent)


gf<- na.omit(gf)

descriptives<- psych::describe(gf)
descriptives
##                    vars   n     mean       sd   median  trimmed      mad
## gradtransf            1 775     0.46     0.12     0.45     0.46     0.10
## Percent_Unemployed    2 775     0.05     0.01     0.04     0.04     0.01
## WithHealth            3 775     0.88     0.06     0.89     0.88     0.06
## med_income            4 775 65613.25 19449.81 61310.00 63379.26 16028.39
## PercentWhite          5 775     0.72     0.18     0.75     0.73     0.20
## percentpopchange      6 775     0.01     0.04     0.00     0.01     0.04
## HousePercent          7 775     0.87     0.03     0.88     0.87     0.03
## percentmarried        8 775     0.48     0.05     0.48     0.48     0.05
## percentdivorced       9 775     0.11     0.02     0.11     0.11     0.02
## percentseparated     10 775     0.02     0.01     0.02     0.02     0.01
## PercentSomeorASS     11 775     0.60     0.09     0.60     0.60     0.09
## PercentBach          12 775     0.29     0.11     0.26     0.28     0.11
## percenttribe         13 775     0.01     0.03     0.00     0.00     0.00
## SingleParPercent     14 775     0.36     0.09     0.35     0.35     0.08
## PercentNotCitizen    15 775     0.05     0.05     0.03     0.04     0.03
## PercentImmigrant     16 775     0.10     0.09     0.06     0.08     0.06
## PercentRent          17 775     0.31     0.09     0.31     0.31     0.09
##                         min       max     range  skew kurtosis     se
## gradtransf             0.17      0.95      0.78  0.55     0.98   0.00
## Percent_Unemployed     0.00      0.11      0.11  0.80     2.04   0.00
## WithHealth             0.59      0.97      0.39 -0.92     0.92   0.00
## med_income         24958.00 140258.00 115300.00  1.22     1.88 698.66
## PercentWhite           0.15      0.98      0.83 -0.70    -0.33   0.01
## percentpopchange      -0.11      0.22      0.33  0.77     1.75   0.00
## HousePercent           0.74      0.94      0.21 -0.75     1.07   0.00
## percentmarried         0.26      0.62      0.36 -0.72     1.40   0.00
## percentdivorced        0.05      0.19      0.14  0.22    -0.32   0.00
## percentseparated       0.00      0.06      0.06  0.96     1.75   0.00
## PercentSomeorASS       0.33      0.83      0.49 -0.02    -0.43   0.00
## PercentBach            0.09      0.63      0.54  0.68    -0.15   0.00
## percenttribe           0.00      0.42      0.42  8.86    88.18   0.00
## SingleParPercent       0.10      0.75      0.66  0.90     1.99   0.00
## PercentNotCitizen      0.00      0.21      0.21  1.26     0.84   0.00
## PercentImmigrant       0.00      0.47      0.47  1.41     1.32   0.00
## PercentRent            0.13      0.79      0.66  0.89     1.97   0.00
write.csv(descriptives, "descriptives.csv")



mtry <- tuneRF(gf[-1],gf$gradtransf, ntreeTry=500,  stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
## mtry = 5  OOB error = 0.01051121 
## Searching left ...
## mtry = 4     OOB error = 0.01060431 
## -0.008857718 0.01 
## Searching right ...
## mtry = 7     OOB error = 0.01036733 
## 0.01368802 0.01 
## mtry = 10    OOB error = 0.01044787 
## -0.007768348 0.01

best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]

print(mtry)
##    mtry   OOBError
## 4     4 0.01060431
## 5     5 0.01051121
## 7     7 0.01036733
## 10   10 0.01044787
ind <- sample(2, nrow(gf), replace = TRUE, prob = c(0.7, 0.3))
train <- gf[ind==1,]
test <- gf[ind==2,]

Graduation_Transfer_Model <-randomForest(gradtransf~.,data=train, mtry=best.m, importance=TRUE, ntree=500, proximity = TRUE)

print(Graduation_Transfer_Model)
## 
## Call:
##  randomForest(formula = gradtransf ~ ., data = train, mtry = best.m,      importance = TRUE, ntree = 500, proximity = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##           Mean of squared residuals: 0.009788583
##                     % Var explained: 26.75
## Prediction with test data
pred_test <- predict(Graduation_Transfer_Model, newdata = test)

## Calculate RMSE of the difference between training model and test model
RMSE<-sqrt(mean((pred_test-test$gradtransf)^2))

RMSE
## [1] 0.1111422
## Importance table and plot
randomForest::importance(Graduation_Transfer_Model)
##                      %IncMSE IncNodePurity
## Percent_Unemployed 20.859259     0.7349980
## WithHealth          9.313247     0.2803900
## med_income         12.104970     0.3274085
## PercentWhite       12.336679     0.6518768
## percentpopchange   11.214726     0.3943554
## HousePercent        7.487493     0.2686669
## percentmarried     12.133233     0.4272672
## percentdivorced    13.583127     0.4092022
## percentseparated   12.372320     0.3511591
## PercentSomeorASS    8.169681     0.2852537
## PercentBach        12.549303     0.2802166
## percenttribe        7.032034     0.4353853
## SingleParPercent   10.989753     0.3003422
## PercentNotCitizen  14.810841     0.5628101
## PercentImmigrant   13.283750     0.4941443
## PercentRent         9.066181     0.3429072
importance_gr<- rownames_to_column(as.data.frame(importance(Graduation_Transfer_Model)))%>%
  mutate(Category = "Grad_Plus_Trans")
 
varImpPlot(Graduation_Transfer_Model)

library(randomForest)
library(tidyverse)
library(ROCR)

set.seed(71)

gf1<- read.csv('ipedsgradmassive2.csv')

gf1<-  select(gf1, RET_PCF, Percent_Unemployed, WithHealth, med_income, PercentWhite, percentpopchange, HousePercent, percentmarried, percentdivorced, percentseparated, PercentSomeorASS, PercentBach, percenttribe, SingleParPercent, PercentNotCitizen, PercentImmigrant, PercentRent)


gf1<- na.omit(gf1)

descriptives2<- psych::describe(gf1)
descriptives2
##                    vars   n     mean       sd   median  trimmed      mad
## RET_PCF               1 775     0.60     0.08     0.60     0.60     0.07
## Percent_Unemployed    2 775     0.05     0.01     0.04     0.04     0.01
## WithHealth            3 775     0.88     0.06     0.89     0.88     0.06
## med_income            4 775 65613.25 19449.81 61310.00 63379.26 16028.39
## PercentWhite          5 775     0.72     0.18     0.75     0.73     0.20
## percentpopchange      6 775     0.01     0.04     0.00     0.01     0.04
## HousePercent          7 775     0.87     0.03     0.88     0.87     0.03
## percentmarried        8 775     0.48     0.05     0.48     0.48     0.05
## percentdivorced       9 775     0.11     0.02     0.11     0.11     0.02
## percentseparated     10 775     0.02     0.01     0.02     0.02     0.01
## PercentSomeorASS     11 775     0.60     0.09     0.60     0.60     0.09
## PercentBach          12 775     0.29     0.11     0.26     0.28     0.11
## percenttribe         13 775     0.01     0.03     0.00     0.00     0.00
## SingleParPercent     14 775     0.36     0.09     0.35     0.35     0.08
## PercentNotCitizen    15 775     0.05     0.05     0.03     0.04     0.03
## PercentImmigrant     16 775     0.10     0.09     0.06     0.08     0.06
## PercentRent          17 775     0.31     0.09     0.31     0.31     0.09
##                         min       max     range  skew kurtosis     se
## RET_PCF                0.22      0.89      0.67 -0.31     1.24   0.00
## Percent_Unemployed     0.00      0.11      0.11  0.80     2.04   0.00
## WithHealth             0.59      0.97      0.39 -0.92     0.92   0.00
## med_income         24958.00 140258.00 115300.00  1.22     1.88 698.66
## PercentWhite           0.15      0.98      0.83 -0.70    -0.33   0.01
## percentpopchange      -0.11      0.22      0.33  0.77     1.75   0.00
## HousePercent           0.74      0.94      0.21 -0.75     1.07   0.00
## percentmarried         0.26      0.62      0.36 -0.72     1.40   0.00
## percentdivorced        0.05      0.19      0.14  0.22    -0.32   0.00
## percentseparated       0.00      0.06      0.06  0.96     1.75   0.00
## PercentSomeorASS       0.33      0.83      0.49 -0.02    -0.43   0.00
## PercentBach            0.09      0.63      0.54  0.68    -0.15   0.00
## percenttribe           0.00      0.42      0.42  8.86    88.18   0.00
## SingleParPercent       0.10      0.75      0.66  0.90     1.99   0.00
## PercentNotCitizen      0.00      0.21      0.21  1.26     0.84   0.00
## PercentImmigrant       0.00      0.47      0.47  1.41     1.32   0.00
## PercentRent            0.13      0.79      0.66  0.89     1.97   0.00
write.csv(descriptives2, "descriptives2.csv")


mtry <- tuneRF(gf1[-1],gf1$RET_PCF, ntreeTry=500,  stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
## mtry = 5  OOB error = 0.006336744 
## Searching left ...
## mtry = 4     OOB error = 0.006285523 
## 0.008083153 0.01 
## Searching right ...
## mtry = 7     OOB error = 0.006351955 
## -0.002400447 0.01

best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]

print(mtry)
##   mtry    OOBError
## 4    4 0.006285523
## 5    5 0.006336744
## 7    7 0.006351955
ind <- sample(2, nrow(gf1), replace = TRUE, prob = c(0.7, 0.3))
train <- gf1[ind==1,]
test <- gf1[ind==2,]

Full_Time_Retention_Model <-randomForest(RET_PCF~.,data=train, mtry=best.m, importance=TRUE, ntree=500, proximity = TRUE)

print(Full_Time_Retention_Model)
## 
## Call:
##  randomForest(formula = RET_PCF ~ ., data = train, mtry = best.m,      importance = TRUE, ntree = 500, proximity = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.007033668
##                     % Var explained: 6.75
## Prediction with test data
pred_test <- predict(Full_Time_Retention_Model, newdata = test)

## Calculate RMSE of the difference between training model and test model
RMSE<-sqrt(mean((pred_test-test$RET_PCF)^2))

RMSE
## [1] 0.07212177
## Importance table and plot
importance(Full_Time_Retention_Model)
##                      %IncMSE IncNodePurity
## Percent_Unemployed  5.624713     0.2116929
## WithHealth          7.643891     0.2064366
## med_income         16.440263     0.2758204
## PercentWhite        9.187872     0.1630955
## percentpopchange    5.398700     0.2038181
## HousePercent        2.699104     0.2283913
## percentmarried     10.553436     0.2312817
## percentdivorced    11.058296     0.2920554
## percentseparated   10.802755     0.1830417
## PercentSomeorASS    7.550398     0.1637116
## PercentBach         9.992177     0.1845852
## percenttribe        5.657944     0.1801108
## SingleParPercent    8.994367     0.1904701
## PercentNotCitizen  14.185082     0.2684800
## PercentImmigrant   13.864259     0.2715811
## PercentRent         7.874369     0.1902822
importance_ret<- rownames_to_column(as.data.frame(importance(Full_Time_Retention_Model)))%>%
  mutate(Category = "Full_Time_Retention")

varImpPlot(Full_Time_Retention_Model)

tabledata<- left_join(importance_gr, importance_ret, by = "rowname")

write.csv(tabledata, "tabledata.csv")
library(ggplot2)
im<- rbind(importance_gr, importance_ret)%>%
  rename("Variable" = rowname)%>%
  rename("Increase_MSE" = "%IncMSE")%>%
  rename("Increase_Node_Purity" = IncNodePurity)%>%
  select(Variable, Category, Increase_MSE, Increase_Node_Purity)%>%
  pivot_longer(cols = c("Increase_MSE", "Increase_Node_Purity"), 
                        names_to = "Measure", 
                        values_to = "Rate")%>%
  mutate(Rate = round(Rate, 2))


im1<- im%>%
  subset(Measure == "Increase_MSE")

im2<- im%>%
  subset(Measure == "Increase_Node_Purity")

ggplot(im1, aes(x = Variable, y = Rate, fill = Category))+
  geom_bar(stat='identity', position = position_dodge())+
  coord_flip()+
   geom_text(aes(label=Rate), color="black",hjust = 1.6,  position = position_dodge(1), size=3)+
   scale_fill_brewer(palette="Set2")+
   labs(title="Effects on MSE of Predictors by Dependent Variable", x = "Predictor", y ="Percent Increase MSE")

ggplot(im2, aes(x = Variable, y = Rate, fill = Category))+
  geom_bar(stat='identity', position = position_dodge())+
  coord_flip()+
   geom_text(aes(label=Rate), color="black",hjust = 1.6,  position = position_dodge(1), size=3)+
   scale_fill_brewer(palette="Accent")+
   labs(title="Effects on Node Purity of Predictors by Dependent Variable", x = "Predictor", y ="Increase Node Purity")

library(ggpubr)
formula <- ipedsgraddata$RET_PCF ~ poly(ipedsgraddata$gradtransf, 3, raw = TRUE)
ggplot(ipedsgraddata, aes(x=RET_PCF, y=gradtransf)) + 
  geom_point()+
  geom_smooth(method = lm)+
  labs(title="Full-Time Retention Predicting Graduation Plus Transfer",
       x="Full-Time Retention", y = "Graduation Plus Transfer")+
  stat_regline_equation(
    aes(label =  paste(..adj.rr.label..)),
    formula = formula)